├── .dockerignore ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ └── question.md ├── stale.yml └── workflows │ ├── ci.yml │ └── cml.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .travis.yml ├── Dockerfile ├── Dockerfile-GPU ├── Dockerfile-SageMaker ├── Dockerfile-onnxruntime ├── LICENSE ├── MANIFEST.in ├── docker-compose.yml ├── docs ├── Makefile ├── _static │ └── custom.css ├── api │ ├── data_handling.rst │ ├── modeling.rst │ └── running.rst ├── basic_usage.rst ├── conf.py ├── data_handling.rst ├── examples.rst ├── img │ ├── adaptive_model_no_bg.jpg │ ├── adaptive_model_no_bg_small.jpg │ ├── code_snippet_building_blocks.png │ ├── code_snippet_experiment.png │ ├── code_snippet_inference.png │ ├── data_silo_no_bg.jpg │ ├── data_silo_no_bg_small.jpg │ ├── deepset_logo.png │ ├── farm_logo_text_right_wide.png │ ├── inference-api-screen.png │ ├── logo.png │ └── sample_basket_no_bg.jpg ├── index.rst ├── installation.rst ├── make.bat ├── modeling.rst └── qa_formats.py ├── examples ├── conversion_huggingface_models.py ├── conversion_huggingface_models_classification.py ├── doc_classification.py ├── doc_classification_cola.py ├── doc_classification_crossvalidation.py ├── doc_classification_custom_optimizer.py ├── doc_classification_fasttext_LM.py ├── doc_classification_holdout.py ├── doc_classification_multilabel.py ├── doc_classification_multilabel_roberta.py ├── doc_classification_with_earlystopping.py ├── doc_classification_word_embedding_LM.py ├── doc_regression.py ├── dpr_encoder.py ├── embeddings_extraction.py ├── embeddings_extraction_s3e_pooling.py ├── evaluation.py ├── lm_finetuning.py ├── mtl01_tclass_tclass.py ├── natural_questions.py ├── ner.py ├── onnx_question_answering.py ├── passage_ranking.py ├── question_answering.py ├── question_answering_confidence.py ├── question_answering_crossvalidation.py ├── streaming_inference.py ├── text_pair_classification.py ├── train_from_scratch.py ├── train_from_scratch_with_sagemaker.py └── wordembedding_inference.py ├── experiments ├── electra_eval │ └── conll2003_en_config.json ├── german-bert2.0-eval │ ├── germEval14_config.json │ ├── germEval18Coarse_config.json │ └── germEval18Fine_config.json ├── lm_finetuning │ └── finetune_sample_config.json ├── ner │ ├── conll2003_de_config.json │ ├── conll2003_en_config.json │ └── germEval14_config.json ├── qa │ └── squad20_config.json ├── text_classification │ ├── cola_config.json │ ├── germEval18Coarse_config.json │ ├── germEval18Fine_config.json │ └── gnad_config.json ├── text_pair_classification │ └── asnq_binary_config.json └── xlm_roberta_eval │ ├── conll2003_de_config.json │ ├── germEval14_config.json │ └── germEval18Coarse_config.json ├── farm ├── __init__.py ├── _version.py ├── conversion │ ├── __init__.py │ ├── convert_tf_checkpoint_to_pytorch.py │ └── transformers.py ├── data_handler │ ├── __init__.py │ ├── data_silo.py │ ├── dataloader.py │ ├── dataset.py │ ├── input_features.py │ ├── inputs.py │ ├── nq_utils.py │ ├── processor.py │ ├── samples.py │ └── utils.py ├── eval.py ├── evaluation │ ├── __init__.py │ ├── metrics.py │ ├── msmarco_passage_farm.py │ ├── msmarco_passage_official.py │ ├── semantic_answer_similarity_evaluation.py │ └── squad_evaluation.py ├── experiment.py ├── file_utils.py ├── infer.py ├── inference_rest_api.py ├── modeling │ ├── __init__.py │ ├── adaptive_model.py │ ├── biadaptive_model.py │ ├── language_model.py │ ├── optimization.py │ ├── prediction_head.py │ ├── predictions.py │ ├── tokenization.py │ └── wordembedding_utils.py ├── 
train.py ├── utils.py └── visual │ ├── __init__.py │ └── ascii │ ├── __init__.py │ ├── images.py │ └── text.py ├── readme.rst ├── requirements.txt ├── run_all_experiments.py ├── setup.cfg ├── setup.py ├── test ├── benchmarks │ ├── README.md │ ├── conftest.py │ ├── convert_result_to_csv.py │ ├── question_answering.py │ ├── question_answering_accuracy.py │ ├── question_answering_components.html │ ├── question_answering_components.py │ ├── sample_file.txt │ └── samples │ │ ├── question_answering_questions.txt │ │ └── question_answering_sample.txt ├── conftest.py ├── create_testdata.py ├── modeling │ └── test_optimization.py ├── samples │ ├── doc_class │ │ ├── test-sample.tsv │ │ └── train-sample.tsv │ ├── doc_class_other_text_column_name │ │ ├── test-sample.tsv │ │ └── train-sample.tsv │ ├── doc_regr │ │ ├── test-sample.tsv │ │ └── train-sample.tsv │ ├── doc_regr_other_text_column_name │ │ ├── test-sample.tsv │ │ └── train-sample.tsv │ ├── dpr │ │ └── sample.json │ ├── lm_finetuning │ │ ├── test-sample.txt │ │ └── train-sample.txt │ ├── ner │ │ ├── dev-sample.txt │ │ └── train-sample.txt │ ├── nq │ │ ├── dev_sample.jsonl │ │ └── train_sample.jsonl │ ├── qa │ │ ├── answer-offset-wrong.json │ │ ├── answer-wrong.json │ │ ├── dev-sample.json │ │ ├── eval-sample.json │ │ ├── noanswer.json │ │ ├── train-sample.json │ │ └── vanilla.json │ ├── s3e │ │ ├── fitted_s3e │ │ │ ├── language_model_config.json │ │ │ ├── processor_config.json │ │ │ ├── s3e_stats.pkl │ │ │ ├── vectors.txt │ │ │ └── vocab.txt │ │ ├── tiny_corpus.txt │ │ └── tiny_fasttext_model │ │ │ ├── language_model_config.json │ │ │ ├── vectors.txt │ │ │ └── vocab.txt │ ├── text_pair │ │ └── sample.tsv │ └── tokenizer │ │ ├── bert-base-cased-vocab.txt │ │ └── custom_vocab.txt ├── test_conversion.py ├── test_data_silo.py ├── test_doc_classification_distilbert.py ├── test_doc_regression.py ├── test_dpr.py ├── test_evaluation_metrics.py ├── test_inference.py ├── test_lm_finetuning.py ├── test_model_versioning.py ├── test_natural_questions.py ├── test_ner.py ├── test_ner_amp.py ├── test_onnx_conversion.py ├── test_prediction_head.py ├── test_processor_qa.py ├── test_processor_saving_loading.py ├── test_question_answering.py ├── test_s3e_pooling.py ├── test_text_pair.py └── test_tokenization.py └── tutorials ├── 1_farm_building_blocks.ipynb ├── 2_Build_a_processor_for_your_own_dataset.ipynb └── sagemaker ├── 3_train_with_sagemaker.ipynb └── source ├── doc_classification.py └── requirements.txt /.dockerignore: -------------------------------------------------------------------------------- 1 | saved_models/** 2 | data/** -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: We love animals, but bugs need to be reported. 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **Error message** 14 | Error that was thrown (if available) 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Additional context** 20 | Add any other context about the problem here, like type of downstream task, part of etc.. 
21 | 22 | **To Reproduce** 23 | Steps to reproduce the behavior 24 | 25 | **System:** 26 | - OS: 27 | - GPU/CPU: 28 | - FARM version: 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: 'Got an idea for improving FARM? ' 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem or particular use case?** 11 | A clear and concise description of what the problem or use case is. 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Additional context** 17 | Add any other context or screenshots about the feature request here. 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Not sure how to use a component? Just ask :) 4 | title: '' 5 | labels: question 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Question** 11 | Put your question here 12 | 13 | **Additional context** 14 | Add any other context or screenshots about the question (optional). 15 | -------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 120 3 | # Number of days of inactivity before a stale issue is closed 4 | daysUntilClose: 21 5 | # Issues with these labels will never be considered stale 6 | exemptLabels: 7 | - pinned 8 | - security 9 | # Label to use when marking an issue as stale 10 | staleLabel: stale 11 | # Comment to post when marking an issue as stale. Set to `false` to disable 12 | markComment: > 13 | This issue has been automatically marked as stale because it has not had 14 | recent activity. It will be closed in 21 days if no further activity occurs. 15 | 16 | # Comment to post when closing a stale issue. Set to `false` to disable 17 | closeComment: false 18 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-20.04 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | 17 | - name: Set up Python 3.8 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: 3.8 21 | 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | pip install pytest 26 | pip install -r requirements.txt 27 | pip install onnxruntime 28 | pip install -e . 
29 | 30 | - name: Run pytest - only "conversion" marker 31 | run: cd test && pytest -m "conversion" 32 | 33 | - name: Run Pytest - all except conversion marker 34 | run: cd test && pytest -m "not conversion" -------------------------------------------------------------------------------- /.github/workflows/cml.yaml: -------------------------------------------------------------------------------- 1 | name: benchmarks 2 | 3 | on: 4 | workflow_dispatch: 5 | pull_request: 6 | types: [labeled] 7 | jobs: 8 | deploy-cloud-runner: 9 | if: ${{ (github.event.action == 'labeled' && github.event.label.name == 'benchmark') || github.event.action == 'workflow_dispatch' }} 10 | runs-on: [ubuntu-latest] 11 | container: docker://dvcorg/cml 12 | steps: 13 | - name: deploy 14 | env: 15 | repo_token: ${{ secrets.HAYSTACK_BOT_TOKEN }} 16 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_CI_ACCESS_KEY }} 17 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_CI_SECRET_ACCESS_KEY }} 18 | VPC: ${{ secrets.AWS_CI_VPC }} 19 | run: | 20 | echo "Deploying..." 21 | RUNNER_LABELS="cml,aws" 22 | RUNNER_REPO="https://github.com/${GITHUB_REPOSITORY}" 23 | MACHINE="cml$(date +%s)" 24 | docker-machine create \ 25 | --driver amazonec2 \ 26 | --amazonec2-instance-type p3.8xlarge \ 27 | --amazonec2-vpc-id $VPC \ 28 | --amazonec2-region us-east-1 \ 29 | --amazonec2-zone c \ 30 | --amazonec2-ssh-user ubuntu \ 31 | --amazonec2-ami ami-06a25ee8966373068 \ 32 | --amazonec2-root-size 150 \ 33 | $MACHINE 34 | eval "$(docker-machine env --shell sh $MACHINE)" 35 | 36 | ( 37 | docker-machine ssh $MACHINE "sudo mkdir -p \ 38 | /docker_machine && \ 39 | sudo chmod 777 /docker_machine" && \ 40 | docker-machine scp -r -q ~/.docker/machine/ \ 41 | $MACHINE:/docker_machine && \ 42 | docker run --name runner -d \ 43 | --gpus all \ 44 | -v /docker_machine/machine:/root/.docker/machine \ 45 | --net host \ 46 | --ipc host \ 47 | -e DOCKER_MACHINE=$MACHINE \ 48 | -e repo_token=$repo_token \ 49 | -e RUNNER_LABELS=$RUNNER_LABELS \ 50 | -e RUNNER_REPO=$RUNNER_REPO \ 51 | -e RUNNER_IDLE_TIMEOUT=120 \ 52 | dvcorg/cml-py3:latest && \ 53 | sleep 20 && echo "Deployed $MACHINE" 54 | ) || (echo "Shut down machine" && docker-machine rm -y -f $MACHINE && exit 1) 55 | run-benchmark: 56 | if: ${{ (github.event.action == 'labeled' && github.event.label.name == 'benchmark') || github.event.action == 'workflow_dispatch' }} 57 | needs: deploy-cloud-runner 58 | runs-on: [self-hosted,cml] 59 | steps: 60 | - uses: actions/checkout@v2 61 | - name: cml_run 62 | env: 63 | repo_token: ${{ secrets.HAYSTACK_BOT_TOKEN }} 64 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_CI_ACCESS_KEY }} 65 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_CI_SECRET_ACCESS_KEY }} 66 | run: | 67 | apt-get update -y 68 | apt-get install python3-dev -y 69 | pip install -r requirements.txt 70 | pip install . 
71 | cd test/benchmarks && python question_answering_accuracy.py 72 | echo -en "## Benchmarks: QA Accuracy\n" >> accuracy_report.md 73 | cat results_accuracy.md >> accuracy_report.md 74 | cml-send-comment accuracy_report.md 75 | python question_answering_components.py 76 | echo -en "## Benchmarks: QA per component\n" >> components_report.md 77 | cat results_per_component.md >> components_report.md 78 | cml-send-comment components_report.md 79 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Initially taken from Github's Python gitignore file 2 | 3 | apex 4 | 5 | Pipfile* 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # celery beat schedule file 90 | celerybeat-schedule 91 | 92 | # SageMath parsed files 93 | *.sage.py 94 | 95 | # Environments 96 | .env 97 | .venv 98 | env/ 99 | venv/ 100 | ENV/ 101 | env.bak/ 102 | venv.bak/ 103 | 104 | # Spyder project settings 105 | .spyderproject 106 | .spyproject 107 | 108 | # Rope project settings 109 | .ropeproject 110 | 111 | # mkdocs documentation 112 | /site 113 | 114 | # mypy 115 | .mypy_cache/ 116 | .dmypy.json 117 | dmypy.json 118 | 119 | # Pyre type checker 120 | .pyre/ 121 | 122 | # vscode 123 | .vscode 124 | 125 | # pycharm 126 | .idea/ 127 | 128 | # TF code 129 | tensorflow_code 130 | 131 | # training data 132 | data/ 133 | 134 | # models 135 | models/ 136 | save/ 137 | testsave/ 138 | saved_models/ 139 | 140 | # mlruns 141 | mlruns/ 142 | .DS_Store 143 | 144 | # cache 145 | *cache* 146 | 147 | sandbox/ 148 | 149 | 150 | # files created by example scripts 151 | examples/doc_classification_holdout.results.json 152 | examples/doc_classification_xval.results.json 153 | 154 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/python/black 3 | rev: stable 4 | hooks: 5 | - id: black 6 | language_version: python3 7 | -------------------------------------------------------------------------------- /.travis.yml: 
-------------------------------------------------------------------------------- 1 | language: python 2 | sudo: false 3 | cache: pip 4 | python: 5 | - "3.7" 6 | install: 7 | - "pip install -e ." 8 | - "pip install sphinx==2.1.2" 9 | - "pip install sphinx-rtd-theme==0.4.3" 10 | script: 11 | - "cd test && pytest" 12 | - "cd ../docs && sphinx-build -W -b html -d _build/doctrees . _build/html" 13 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7.4-stretch 2 | 3 | WORKDIR /home/user 4 | 5 | COPY setup.py requirements.txt readme.rst /home/user/ 6 | RUN pip install -r requirements.txt 7 | RUN pip install -e . 8 | 9 | COPY farm /home/user/farm 10 | # optionally: copy some base models into the image to allow simple demos / comparisons 11 | #COPY saved_models /home/user/base_models 12 | 13 | CMD FLASK_APP=farm.inference_rest_api flask run --host 0.0.0.0 14 | -------------------------------------------------------------------------------- /Dockerfile-GPU: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.5-cuda10.1-cudnn7-devel 2 | 3 | RUN apt-get update && apt-get install -y git 4 | 5 | # Setup locales 6 | RUN apt-get update \ 7 | && apt-get install -y --no-install-recommends \ 8 | locales 9 | RUN locale-gen en_US.UTF-8 10 | ENV LANG en_US.UTF-8 11 | ENV LANGUAGE en_US:en 12 | ENV LC_ALL en_US.UTF-8 13 | 14 | WORKDIR /home/user 15 | 16 | # Install apex 17 | RUN git clone https://github.com/NVIDIA/apex \ 18 | && cd apex \ 19 | && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ 20 | 21 | 22 | 23 | # Install FARM 24 | COPY setup.py requirements.txt readme.rst /home/user/ 25 | RUN pip install -r requirements.txt 26 | COPY farm farm 27 | RUN pip install -e . 
28 | 29 | 30 | # Copy Training Scripts 31 | COPY examples examples 32 | 33 | CMD FLASK_APP=farm.inference_rest_api flask run --host 0.0.0.0 34 | -------------------------------------------------------------------------------- /Dockerfile-SageMaker: -------------------------------------------------------------------------------- 1 | FROM deepset/farm-gpu:latest 2 | COPY examples examples 3 | #COPY data/test data/test 4 | 5 | # ENV SAGEMAKER_PROGRAM train.py 6 | ENTRYPOINT ["python3","-m", "torch.distributed.launch", "--nproc_per_node=4", "examples/train_from_scratch_with_sagemaker.py"] 7 | -------------------------------------------------------------------------------- /Dockerfile-onnxruntime: -------------------------------------------------------------------------------- 1 | # Adapted from ONNXRuntime CUDA Dockerfile at https://github.com/microsoft/onnxruntime/blob/master/dockerfiles/Dockerfile.cuda 2 | 3 | FROM nvidia/cuda:10.1-cudnn7-devel 4 | 5 | ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime 6 | ARG ONNXRUNTIME_BRANCH=master 7 | 8 | RUN apt-get update &&\ 9 | apt-get install -y sudo git bash 10 | 11 | WORKDIR /code 12 | ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/code/cmake-3.14.3-Linux-x86_64/bin:/opt/miniconda/bin:${PATH} 13 | ENV LD_LIBRARY_PATH /opt/miniconda/lib:$LD_LIBRARY_PATH 14 | 15 | # Prepare onnxruntime repository & build onnxruntime with CUDA 16 | RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\ 17 | /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh &&\ 18 | cp onnxruntime/docs/Privacy.md /code/Privacy.md &&\ 19 | cp onnxruntime/ThirdPartyNotices.txt /code/ThirdPartyNotices.txt &&\ 20 | cp onnxruntime/dockerfiles/LICENSE-IMAGE.txt /code/LICENSE-IMAGE.txt &&\ 21 | cd onnxruntime &&\ 22 | /bin/sh ./build.sh --cuda_home /usr/local/cuda --cudnn_home /usr/lib/x86_64-linux-gnu/ --use_cuda --config Release --build_wheel --update --build --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) &&\ 23 | pip install /code/onnxruntime/build/Linux/Release/dist/*.whl &&\ 24 | cd .. &&\ 25 | rm -rf onnxruntime cmake-3.14.3-Linux-x86_64 26 | 27 | # Clone FARM repositry and install the requirements 28 | RUN git clone --depth 1 --branch 0.4.3 https://github.com/deepset-ai/farm.git 29 | RUN pip install -e FARM 30 | RUN pip install -r FARM/test/requirements.txt -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include requirements.txt 3 | include readme.rst 4 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | inference-api: 4 | # this Docker image comes with preloaded models. 
5 | image: "deepset/farm-inference-api:base-models-0.4.2" 6 | ports: 7 | - "5000:5000" 8 | # (optional) mount your own models 9 | volumes: 10 | - "./saved_models:/home/user/saved_models" 11 | inference-ui: 12 | image: "deepset/farm-inference-ui:latest" 13 | ports: 14 | - "3000:80" 15 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/custom.css: -------------------------------------------------------------------------------- 1 | .wy-side-nav-search{ background-color: #FFEFDB } 2 | a.icon-home { color: #18A063 } 3 | .icon-home:before{ display:none } -------------------------------------------------------------------------------- /docs/api/data_handling.rst: -------------------------------------------------------------------------------- 1 | Data Handling 2 | ============= 3 | 4 | 5 | Processor 6 | --------- 7 | 8 | .. automodule:: farm.data_handler.processor 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | :exclude-members: subclasses 13 | 14 | Data Silo 15 | ---------- 16 | 17 | .. automodule:: farm.data_handler.data_silo 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | Dataset 23 | ------- 24 | 25 | .. automodule:: farm.data_handler.dataset 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | DataLoader 31 | ---------- 32 | 33 | .. automodule:: farm.data_handler.dataloader 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | Samples 39 | ------- 40 | 41 | .. automodule:: farm.data_handler.samples 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | Input Features 47 | --------------- 48 | 49 | .. automodule:: farm.data_handler.input_features 50 | :members: 51 | :undoc-members: 52 | :show-inheritance: 53 | -------------------------------------------------------------------------------- /docs/api/modeling.rst: -------------------------------------------------------------------------------- 1 | Modeling 2 | ======== 3 | 4 | Adaptive Model 5 | ------------------ 6 | 7 | .. automodule:: farm.modeling.adaptive_model 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | BiAdaptive Model 13 | ------------------ 14 | 15 | .. automodule:: farm.modeling.biadaptive_model 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | Language Model 21 | -------------- 22 | 23 | .. automodule:: farm.modeling.language_model 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :exclude-members: 28 | 29 | Prediction Head 30 | --------------- 31 | 32 | .. 
automodule:: farm.modeling.prediction_head 33 | :members: 34 | :undoc-members: 35 | :show-inheritance: 36 | :exclude-members: subclasses 37 | 38 | 39 | Optimization 40 | ------------ 41 | 42 | .. automodule:: farm.modeling.optimization 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | Tokenization 48 | ------------ 49 | 50 | .. automodule:: farm.modeling.tokenization 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /docs/api/running.rst: -------------------------------------------------------------------------------- 1 | Running 2 | ======= 3 | 4 | 5 | Train 6 | ----- 7 | 8 | .. automodule:: farm.train 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | 13 | Eval 14 | ---- 15 | 16 | .. automodule:: farm.eval 17 | :members: 18 | :undoc-members: 19 | :show-inheritance: 20 | 21 | Infer 22 | ----- 23 | 24 | .. automodule:: farm.infer 25 | :members: 26 | :undoc-members: 27 | :show-inheritance: 28 | 29 | Experiment 30 | ---------- 31 | 32 | .. automodule:: farm.experiment 33 | :members: 34 | :undoc-members: 35 | :show-inheritance: 36 | 37 | Metrics 38 | ------- 39 | 40 | .. automodule:: farm.evaluation.metrics 41 | :members: 42 | :undoc-members: 43 | :show-inheritance: 44 | 45 | File utils 46 | ----------- 47 | 48 | .. automodule:: farm.file_utils 49 | :members: 50 | :undoc-members: 51 | :show-inheritance: 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /docs/basic_usage.rst: -------------------------------------------------------------------------------- 1 | Basic Usage 2 | ############ 3 | 4 | 1. Train a downstream model 5 | **************************** 6 | FARM offers two modes for model training: 7 | 8 | **Option 1: Run experiment(s) from config**:: 9 | 10 | from farm.experiment import run_experiment, load_experiments 11 | experiments = load_experiments(Path("experiments/ner/conll2003_de_config.json")) 12 | run_experiment(experiments[0]) 13 | 14 | *Use cases:* Training your first model, hyperparameter optimization, evaluating a language model on multiple down-stream tasks. 15 | 16 | **Option 2: Stick together your own building blocks**:: 17 | 18 | # Choose a language model (e.g. from transformers' model hub: https://huggingface.co/models) 19 | language_model = "bert-base-german-cased" 20 | 21 | # Basic building blocks for data handling 22 | tokenizer = Tokenizer.load(pretrained_model_name_or_path=language_model) 23 | processor = NERProcessor(tokenizer=tokenizer, data_dir=Path("../data/conll03-de"), max_seq_len=128) 24 | ... 25 | 26 | # AdaptiveModel = LanguageModel + PredictionHead(s) 27 | language_model = LanguageModel.load(language_model) 28 | prediction_head = TokenClassificationHead(num_labels=13) 29 | model = AdaptiveModel(language_model=language_model, prediction_heads=[prediction_head], ...) 30 | ... 31 | 32 | # Feed it to a Trainer, which takes care of growing our model 33 | trainer = Trainer( 34 | model=model, 35 | optimizer=optimizer, 36 | data_silo=data_silo, 37 | epochs=n_epochs, 38 | lr_schedule=lr_schedule, 39 | evaluate_every=evaluate_every, 40 | n_gpu=n_gpu, 41 | device=device) 42 | 43 | # 7. Let it grow 44 | model = trainer.train() 45 | 46 | See this `tutorial `_ for details 47 | 48 | *Use cases:* Custom datasets, language models, prediction heads ... 49 | 50 | 2. 
Run Inference 51 | ***************** 52 | Use a `public model `__ or your own to get predictions:: 53 | 54 | # Load model, tokenizer & processor (local or any from https://huggingface.co/models) 55 | nlp = Inferencer.load("deepset/bert-large-uncased-whole-word-masking-squad2", task_type="question_answering") 56 | 57 | # Run predictions 58 | QA_input = [{"questions": ["Why is model conversion important?"], 59 | "text": "Model conversion lets people easily switch between frameworks."}] 60 | result = nlp.inference_from_dicts(dicts=QA_input) 61 | 62 | 3. Showcase your model (API + UI) 63 | ********************************** 64 | 65 | Quick start 66 | =============== 67 | 68 | * Run :code:`docker-compose up` 69 | * Open http://localhost:3000 in your browser 70 | 71 | .. image:: img/inference-api-screen.png 72 | :alt: FARM Inferennce UI 73 | 74 | One docker container exposes a REST API (localhost:5000) and another one runs a simple demo UI (localhost:3000). 75 | You can use both of them individually and mount your own models. 76 | 77 | API Docker 78 | ============== 79 | *(deepset/farm-inference-api)* 80 | 81 | The API container includes FARM and is made for running trained (multiple) down-stream models in inference mode. It exposes a REST API on port 5000. 82 | 83 | You can either start the docker via docker-compose (recommended) or manually via: 84 | :code:`docker run -d -p 5000:5000 deepset/farm-inference-api:base-models` 85 | 86 | **What models are loaded?** 87 | 88 | The container is loading all models located in the docker's directory :code:`/home/user/saved_models`. 89 | We have one image version with some exemplary models stored in this directory: :code:`farm-inference-api:base-models`. 90 | This might be helpful if you just want to try the API/UI or compare your own model to some other baselines. 91 | If you only want to run your own models, you can also use the smaller image with tag :code:`farm-inference-api:lastest` 92 | 93 | **How can I add my own models?** 94 | 95 | Just mount them from your disk into the docker directory :code:`/home/user/saved_models`. 96 | The easiest way of doing this is to edit the :code:`docker-compose.yml`. Just put your own path with the model folder(s):: 97 | 98 | volumes: 99 | - "./your_path/some_folder:/home/user/saved_models" 100 | 101 | If you don't run via docker-compose you can also supply the mounted volume to :code:`docker run`:: 102 | 103 | docker run -d \ 104 | -p 5000:5000 \ 105 | -v /your_path/some_folder:/home/user/saved_models \ 106 | deepset/farm-inference-api:base-models 107 | 108 | UI Docker 109 | ============= 110 | *(deepset/farm-inference-ui)* 111 | 112 | The UI container can be launched in addition to provide a frontend that queries the API exposed on port 5000 by the other container. 113 | Start the container via docker-compose or individually via 114 | 115 | :code:`docker run -d -p 3000:80 deepset/farm-inference-ui` 116 | 117 | Open localhost:3000 in your browser. Then simply select the tab with your task on the left (e.g. QA), one of the models 118 | exposed by the API and enter some text that you want to feed to the model. 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
For a full 4 | # list see the documentation: 5 | # http://www.sphinx-doc.org/en/master/config 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | 16 | sys.path.insert(0, os.path.abspath("..")) 17 | 18 | 19 | # -- Project information ----------------------------------------------------- 20 | 21 | project = "FARM" 22 | copyright = "2019, deepset" 23 | author = "deepset" 24 | 25 | 26 | # -- General configuration --------------------------------------------------- 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.viewcode"] 32 | 33 | # Add any paths that contain templates here, relative to this directory. 34 | templates_path = ["_templates"] 35 | 36 | # List of patterns, relative to source directory, that match files and 37 | # directories to ignore when looking for source files. 38 | # This pattern also affects html_static_path and html_extra_path. 39 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 40 | 41 | autodoc_member_order = "bysource" 42 | 43 | # -- Options for HTML output ------------------------------------------------- 44 | 45 | # The theme to use for HTML and HTML Help pages. See the documentation for 46 | # a list of builtin themes. 47 | # 48 | html_theme = "sphinx_rtd_theme" 49 | 50 | # Add any paths that contain custom static files (such as style sheets) here, 51 | # relative to this directory. They are copied after the builtin static files, 52 | # so a file named "default.css" will overwrite the builtin "default.css". 53 | html_static_path = ["_static"] 54 | 55 | 56 | html_logo = "img/logo.png" 57 | 58 | html_context = {"css_files": ["_static/custom.css"]} 59 | 60 | # -- Add autodocs for __init__() methods ------------------------------------- 61 | 62 | 63 | def skip(app, what, name, obj, would_skip, options): 64 | if name == "__init__": 65 | return False 66 | return would_skip 67 | 68 | 69 | def setup(app): 70 | app.connect("autodoc-skip-member", skip) 71 | -------------------------------------------------------------------------------- /docs/data_handling.rst: -------------------------------------------------------------------------------- 1 | Data Handling 2 | ================================ 3 | 4 | 5 | Design Philosophy 6 | ################## 7 | In many cases adapting a language model to your own NLP problem requires heavy lifting on the preprocessing side. 8 | To lessen this burden, we have designed the data handling with a few goals in mind. 
We want: 9 | 10 | * Customization of preprocessing components to be easy 11 | * Inspection of the inputs and outputs of different preprocessing stages to be possible 12 | * A structure that is general enough to handle the requirements of different NLP tasks 13 | 14 | As such, you will find the following features in our code: 15 | 16 | * The functions that we expect the user to customize are grouped together 17 | * Many of the generic pipeline components are easily reusable 18 | * There is a clear separation of generic and dataset/task/model specific components in the pipeline 19 | * Processing goes stage by stage rather than sample by sample so that you are able to inspect the full dataset at any point in the processing 20 | * Powerful debugging that allows inspecting a sample in different phases of the pipeline (raw, tokenized, featurized, tensors ...) 21 | 22 | Building Blocks 23 | ################# 24 | 25 | .. image:: img/data_silo_no_bg.jpg 26 | :alt: FARM Data Silo 27 | 28 | In FARM the **Processor** contains the functions which handle the **conversion from file or request to PyTorch Datasets**. 29 | In essence, it prepares data to be consumed by the modelling components. 30 | This is done in stages to allow for easier debugging. 31 | It should be able to handle file input or requests. 32 | This class contains everything that needs to be customized when adapting a new dataset. 33 | Custom datasets can be handled by extending the Processor (e.g. see CONLLProcessor). 34 | 35 | The **DataSilo** is a generic class that stores the train, dev and test data sets. 36 | It calls upon the methods from the Processor to do the loading and then exposes a DataLoader for each set. 37 | In cases where there is not a separate dev file, it will create one by slicing the train set. 38 | 39 | .. image:: img/sample_basket_no_bg.jpg 40 | :alt: FARM Sample Basket 41 | 42 | The **Sample** and **SampleBasket** objects allow powerful debugging and logging capabilities as they store different views on the same sample (raw, tokenized, featurized ...) 43 | The **SampleBasket** stores one string sample as well as the one or more **Samples** that that string sample might generate. 44 | These data structures are design like this since a single document only generates one sample when performing document classification but can generate multiple samples for question answering. 45 | 46 | 47 | -------------------------------------------------------------------------------- /docs/examples.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | ================================ 3 | 4 | You can find exemplary scripts for the major down-stream tasks in :code:`examples/` 5 | 6 | Document Classification 7 | ########################## 8 | (see :code:`examples/doc_classification.py` for full script) 9 | 10 | 1.Create a tokenizer:: 11 | 12 | tokenizer = Tokenizer.load( 13 | pretrained_model_name_or_path=lang_model, 14 | do_lower_case=False) 15 | 16 | 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset:: 17 | 18 | processor = GermEval18CoarseProcessor(tokenizer=tokenizer, 19 | max_seq_len=128, 20 | data_dir="../data/germeval18") 21 | 22 | 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets:: 23 | 24 | data_silo = DataSilo( 25 | processor=processor, 26 | batch_size=batch_size) 27 | 28 | 4. 
Create an AdaptiveModel 29 | a) which consists of a pretrained language model as a basis:: 30 | 31 | language_model = LanguageModel.load(lang_model) 32 | 33 | b) and a prediction head on top that is suited for our task => Text classification:: 34 | 35 | prediction_head = TextClassificationHead(layer_dims=[768, len(processor.label_list)]) 36 | 37 | model = AdaptiveModel( 38 | language_model=language_model, 39 | prediction_heads=[prediction_head], 40 | embeds_dropout_prob=0.1, 41 | lm_output_types=["per_sequence"], 42 | device=device) 43 | 44 | 5. Create an optimizer and optionally optimize model and optimizer with AMP:: 45 | 46 | model, optimizer, warmup_linear = initialize_optimizer( 47 | model=model, 48 | learning_rate=2e-5, 49 | warmup_proportion=0.1, 50 | n_examples=data_silo.n_samples("train"), 51 | batch_size=batch_size, 52 | n_epochs=1) 53 | 54 | 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time:: 55 | 56 | trainer = Trainer( 57 | optimizer=optimizer, 58 | data_silo=data_silo, 59 | epochs=n_epochs, 60 | n_gpu=1, 61 | warmup_linear=warmup_linear, 62 | evaluate_every=evaluate_every, 63 | device=device) 64 | 65 | 7. Let it grow:: 66 | 67 | model = trainer.train(model) 68 | 69 | 8. Hooray! You have a model. Store it:: 70 | 71 | save_dir = "save/bert-german-GNAD-tutorial" 72 | model.save(save_dir) 73 | processor.save(save_dir) 74 | 75 | 9. Load it & harvest your fruits (Inference):: 76 | 77 | basic_texts = [ 78 | {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot ist"}, 79 | {"text": "Martin Müller spielt Fussball"}, 80 | ] 81 | model = Inferencer(save_dir) 82 | result = model.inference_from_dicts(dicts=basic_texts) 83 | print(result) 84 | -------------------------------------------------------------------------------- /docs/img/adaptive_model_no_bg.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/docs/img/adaptive_model_no_bg.jpg -------------------------------------------------------------------------------- /docs/img/adaptive_model_no_bg_small.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/docs/img/adaptive_model_no_bg_small.jpg -------------------------------------------------------------------------------- /docs/img/code_snippet_building_blocks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/docs/img/code_snippet_building_blocks.png -------------------------------------------------------------------------------- /docs/img/code_snippet_experiment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/docs/img/code_snippet_experiment.png -------------------------------------------------------------------------------- /docs/img/code_snippet_inference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/docs/img/code_snippet_inference.png -------------------------------------------------------------------------------- /docs/img/data_silo_no_bg.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/docs/img/data_silo_no_bg.jpg -------------------------------------------------------------------------------- /docs/img/data_silo_no_bg_small.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/docs/img/data_silo_no_bg_small.jpg -------------------------------------------------------------------------------- /docs/img/deepset_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/docs/img/deepset_logo.png -------------------------------------------------------------------------------- /docs/img/farm_logo_text_right_wide.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/docs/img/farm_logo_text_right_wide.png -------------------------------------------------------------------------------- /docs/img/inference-api-screen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/docs/img/inference-api-screen.png -------------------------------------------------------------------------------- /docs/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/docs/img/logo.png -------------------------------------------------------------------------------- /docs/img/sample_basket_no_bg.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/docs/img/sample_basket_no_bg.jpg -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ############# 3 | Recommended (because of highly active development):: 4 | 5 | git clone https://github.com/deepset-ai/FARM.git 6 | cd FARM 7 | pip install -r requirements.txt 8 | pip install --editable . 9 | 10 | If problems occur, please do a git pull. the --editable flag will update changes immediately. 11 | 12 | With pip:: 13 | 14 | pip install farm 15 | 16 | We recommend using Python 3.7. 17 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/modeling.rst: -------------------------------------------------------------------------------- 1 | Modeling 2 | ================================ 3 | 4 | 5 | Design Philosophy 6 | ################## 7 | We live in exciting times for NLP and see new publications on language models, adaptation strategies and down-stream applications on a weekly basis. 8 | However, keeping up with recent developments is not easy. Switching between pretrained models or adaptation strategies is also hard in practice, since most researchers publish their models in individual repos and do not always implement the desired down-stream tasks. 9 | 10 | FARM offers a more flexible and general approach to transfer learning by abstracting away the underlying pretrained language models and their prediction heads. 11 | With FARM you can stick together any pretrained language model (BERT, XLNet or whatever comes next) with one or multiple prediction heads (NER, Doc classification ...) to form an AdaptiveModel. 12 | This allows fast and easy comparison between different language models and simplifies changes in your production system if you want to migrate to a new model. 13 | 14 | Building Blocks 15 | ################# 16 | 17 | .. image:: img/adaptive_model_no_bg.jpg 18 | :alt: FARM Adaptive Model 19 | 20 | 1. Language Model 21 | ******************** 22 | * Standardized parent class for all language models out there (BERT, XLNet ...). 23 | * A pretrained language model converts tokens to vector representations 24 | 25 | 2. Prediction Head 26 | ******************** 27 | * Standardized parent class for all types of down-stream tasks (NER, Text classification, QA ...). 28 | * A prediction head retrieves vector representations from the language model and converts them into down-stream predictions (e.g. class probabilities) 29 | 30 | 31 | 3. AdaptiveModel 32 | ******************** 33 | * Standardized parent class for end-to-end transfer learning models 34 | * Combines the language model with one or multiple prediction heads. 35 | * An AdaptiveModel propagates the input to the language model, feeds its output to the prediction head(s) and then consolidates the loss(es) / predictions. During training the loss is backpropagated through the entire neural network (incl. the language model). We will soon provide further adaptation strategies here, such as Adapter Modules or Discriminative Finetuning. 36 | 37 | All three classes provide standardized interfaces for all kinds of model functions like retrieving logits, loss or formatted predictions.
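For illustration, here is a minimal sketch of how the three building blocks fit together in code. It mirrors the pattern used in ``examples/doc_classification_cola.py``; the model name and the number of labels are placeholder assumptions::

    from farm.modeling.adaptive_model import AdaptiveModel
    from farm.modeling.language_model import LanguageModel
    from farm.modeling.prediction_head import TextClassificationHead
    from farm.utils import initialize_device_settings

    device, n_gpu = initialize_device_settings(use_cuda=True)

    # 1. Language Model: converts tokens into vector representations
    language_model = LanguageModel.load("bert-base-cased")

    # 2. Prediction Head: turns those vectors into task-specific predictions
    prediction_head = TextClassificationHead(num_labels=2)

    # 3. AdaptiveModel: combines the language model with the prediction head(s)
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)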
-------------------------------------------------------------------------------- /docs/qa_formats.py: -------------------------------------------------------------------------------- 1 | #################################### 2 | ###### JSON (REST API) FORMAT ###### 3 | #################################### 4 | 5 | # INPUT 6 | 7 | input = [{"questions": ["What is X?"], "text": "Some context containing the answer"}] 8 | 9 | # OUTPUT 10 | 11 | output= { 12 | "task": "qa", 13 | "predictions": [ 14 | { 15 | "question": question, 16 | "question_id": id, 17 | "ground_truth": None, 18 | "answers": answers, 19 | "no_ans_gap": no_ans_gap # Add no_ans_gap to current no_ans_boost for switching top prediction 20 | } 21 | ], 22 | } 23 | 24 | answer = {"score": score, 25 | "probability": -1, 26 | "answer": string, 27 | "offset_answer_start": ans_start_ch, 28 | "offset_answer_end": ans_end_ch, 29 | "context": context_string, 30 | "offset_context_start": context_start_ch, 31 | "offset_context_end": context_end_ch, 32 | "document_id": document_id} 33 | 34 | 35 | ############################### 36 | ###### SQUAD EVAL FORMAT ###### 37 | ############################### 38 | 39 | # INPUT 40 | 41 | input = [{"qas": ["What is X?"], "context": "Some context containing the answer"}] 42 | 43 | # OUTPUT 44 | 45 | output = {"id": basket_id, 46 | "preds": [[pred_str, start_t, end_t, score, sample_idx], ...]} 47 | -------------------------------------------------------------------------------- /examples/conversion_huggingface_models.py: -------------------------------------------------------------------------------- 1 | from farm.modeling.adaptive_model import AdaptiveModel 2 | from farm.modeling.tokenization import Tokenizer 3 | from farm.conversion.transformers import Converter 4 | from farm.infer import Inferencer 5 | import pprint 6 | from transformers.pipelines import pipeline 7 | import os 8 | from pathlib import Path 9 | 10 | ############################################## 11 | ### From Transformers -> FARM 12 | ############################################## 13 | def convert_from_transformers(): 14 | # CASE 1: MODEL 15 | # Load model from transformers model hub (-> continue training / compare models / ...) 16 | model = Converter.convert_from_transformers("deepset/bert-large-uncased-whole-word-masking-squad2", device="cpu") 17 | #Alternative way to load from transformers model hub: 18 | #model = AdaptiveModel.convert_from_transformers("deepset/bert-large-uncased-whole-word-masking-squad2", device="cpu", task_type="question_answering") 19 | # ... continue as in the other examples e.g. to fine-tune this QA model on your own data 20 | 21 | # CASE 2: INFERENCER 22 | # Load Inferencer from transformers, incl. 
model & tokenizer (-> just get predictions) 23 | nlp = Inferencer.load("deepset/bert-large-uncased-whole-word-masking-squad2", task_type="question_answering") 24 | 25 | # run predictions 26 | QA_input = [{"questions": ["Why is model conversion important?"], 27 | "text": "The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks."}] 28 | result = nlp.inference_from_dicts(dicts=QA_input) 29 | pprint.pprint(result) 30 | nlp.close_multiprocessing_pool() 31 | 32 | # save it 33 | farm_model_dir = Path("../saved_models/bert-english-qa-large") 34 | nlp.save(farm_model_dir) 35 | 36 | ############################################## 37 | ### From FARM -> Transformers 38 | ############################################## 39 | def convert_to_transformers(): 40 | farm_model_dir = Path("../saved_models/bert-english-qa-large") 41 | 42 | # load from FARM format 43 | model = AdaptiveModel.load(farm_model_dir, device="cpu") 44 | tokenizer = Tokenizer.load(farm_model_dir) 45 | 46 | # convert to transformers 47 | transformer_model = Converter.convert_to_transformers(model)[0] 48 | #Alternative way to convert to transformers: 49 | #transformer_model = model.convert_to_transformers()[0] 50 | 51 | # save it (Note: transformers uses strings rather than Path objects) 52 | model_dir = "../saved_models/bert-large-uncased-whole-word-masking-squad2" 53 | os.makedirs(model_dir, exist_ok=True) 54 | transformer_model.save_pretrained(model_dir) 55 | tokenizer.save_pretrained(model_dir) 56 | 57 | # run predictions (using transformers) 58 | nlp = pipeline('question-answering', model=model_dir, tokenizer=model_dir) 59 | res = nlp({ 60 | 'question': 'Why is model conversion important?', 61 | 'context': 'The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.' 62 | }) 63 | pprint.pprint(res) 64 | 65 | # To upload to transformer's model hub run this in bash: 66 | # transformers-cli upload ../saved_models/bert-large-uncased-whole-word-masking-squad2 67 | 68 | 69 | if __name__ == "__main__": 70 | convert_from_transformers() 71 | convert_to_transformers() -------------------------------------------------------------------------------- /examples/conversion_huggingface_models_classification.py: -------------------------------------------------------------------------------- 1 | from farm.modeling.adaptive_model import AdaptiveModel 2 | from farm.conversion.transformers import Converter 3 | from farm.data_handler.processor import Processor 4 | 5 | from farm.infer import Inferencer 6 | import pprint 7 | from transformers.pipelines import pipeline 8 | from pathlib import Path 9 | 10 | ############################################## 11 | ### From Transformers -> FARM 12 | ############################################## 13 | def convert_from_transformers(): 14 | transformers_input_name = "deepset/bert-base-german-cased-hatespeech-GermEval18Coarse" 15 | farm_output_dir = Path("../saved_models/farm-bert-base-german-cased-hatespeech-GermEval18Coarse") 16 | 17 | # # CASE 1: MODEL 18 | # # Load model from transformers model hub (-> continue training / compare models / ...) 19 | model = Converter.convert_from_transformers(transformers_input_name, device="cpu") 20 | 21 | # # Alternative way to load from transformers model hub: 22 | #model = AdaptiveModel.convert_from_transformers(transformers_input_name, device="cpu", task_type="text_classification") 23 | # # ... 
continue as in the other examples e.g. to fine-tune this QA model on your own data 24 | # 25 | # # CASE 2: INFERENCER 26 | # # Load Inferencer from transformers, incl. model & tokenizer (-> just get predictions) 27 | nlp = Inferencer.load(transformers_input_name, task_type="text_classification") 28 | # 29 | # # run predictions 30 | result = nlp.inference_from_dicts(dicts=[{"text": "Was ein scheiß Nazi!"}]) 31 | pprint.pprint(result) 32 | nlp.close_multiprocessing_pool() 33 | 34 | # save it 35 | nlp.save(farm_output_dir) 36 | 37 | # ############################################## 38 | # ### From FARM -> Transformers 39 | # ############################################## 40 | def convert_to_transformers(): 41 | farm_input_dir = Path("../saved_models/farm-bert-base-german-cased-hatespeech-GermEval18Coarse") 42 | transformers_output_dir = "../saved_models/bert-base-german-cased-hatespeech-GermEval18Coarse" 43 | # 44 | # # # load from FARM format 45 | model = AdaptiveModel.load(farm_input_dir, device="cpu") 46 | processor = Processor.load_from_dir(farm_input_dir) 47 | model.connect_heads_with_processor(processor.tasks) 48 | 49 | # convert to transformers 50 | transformer_model = Converter.convert_to_transformers(model)[0] 51 | # # Alternative way to convert to transformers: 52 | #transformer_model = model.convert_to_transformers()[0] 53 | 54 | # save it (note: transformers use str instead of Path objects) 55 | Path(transformers_output_dir).mkdir(parents=True, exist_ok=True) 56 | transformer_model.save_pretrained(transformers_output_dir) 57 | processor.tokenizer.save_pretrained(transformers_output_dir) 58 | 59 | # run predictions (using transformers) 60 | nlp = pipeline('sentiment-analysis', model=str(transformers_output_dir), tokenizer=str(transformers_output_dir)) 61 | res = nlp("Was ein scheiß Nazi!") 62 | pprint.pprint(res) 63 | 64 | # # To upload to transformer's model hub run this in bash: 65 | # # transformers-cli upload ../saved_models/bert-large-uncased-whole-word-masking-squad2 66 | 67 | if __name__ == "__main__": 68 | convert_from_transformers() 69 | convert_to_transformers() -------------------------------------------------------------------------------- /examples/doc_classification_cola.py: -------------------------------------------------------------------------------- 1 | # fmt: off 2 | import logging 3 | from pathlib import Path 4 | 5 | from farm.data_handler.data_silo import DataSilo 6 | from farm.data_handler.processor import TextClassificationProcessor 7 | from farm.modeling.optimization import initialize_optimizer 8 | from farm.infer import Inferencer 9 | from farm.modeling.adaptive_model import AdaptiveModel 10 | from farm.modeling.language_model import LanguageModel 11 | from farm.modeling.prediction_head import TextClassificationHead 12 | from farm.modeling.tokenization import Tokenizer 13 | from farm.train import Trainer 14 | from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings 15 | 16 | def doc_classification_cola(): 17 | logging.basicConfig( 18 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 19 | datefmt="%m/%d/%Y %H:%M:%S", 20 | level=logging.INFO) 21 | 22 | ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") 23 | ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_cola") 24 | 25 | ########################## 26 | ########## Settings 27 | ########################## 28 | set_all_seeds(seed=42) 29 | device, n_gpu = initialize_device_settings(use_cuda=True) 30 | n_epochs = 5 31 | batch_size 
= 100 32 | evaluate_every = 20 33 | lang_model = "bert-base-cased" 34 | do_lower_case = False 35 | 36 | # 1. Create a tokenizer 37 | tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case) 38 | 39 | # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset 40 | # Here we load the CoLA dataset automatically if it is not available. 41 | # CoLA comes with train.tsv and dev.tsv, so we evaluate on dev.tsv during training and do not use a test set here 42 | 43 | label_list = ["0", "1"] 44 | metric = "mcc" 45 | 46 | processor = TextClassificationProcessor(tokenizer=tokenizer, 47 | max_seq_len=64, 48 | data_dir=Path("../data/cola"), 49 | dev_filename=Path("dev.tsv"), 50 | dev_split=None, 51 | test_filename=None, 52 | label_list=label_list, 53 | metric=metric, 54 | label_column_name="label" 55 | ) 56 | 57 | # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets 58 | data_silo = DataSilo( 59 | processor=processor, 60 | batch_size=batch_size) 61 | 62 | # 4. Create an AdaptiveModel 63 | # a) which consists of a pretrained language model as a basis 64 | language_model = LanguageModel.load(lang_model) 65 | 66 | # language_model = Roberta.load(lang_model) 67 | # b) and a prediction head on top that is suited for our task => Text classification 68 | prediction_head = TextClassificationHead( 69 | num_labels=len(label_list), 70 | class_weights=data_silo.calculate_class_weights(task_name="text_classification")) 71 | 72 | model = AdaptiveModel( 73 | language_model=language_model, 74 | prediction_heads=[prediction_head], 75 | embeds_dropout_prob=0.1, 76 | lm_output_types=["per_sequence"], 77 | device=device) 78 | 79 | # 5. Create an optimizer 80 | model, optimizer, lr_schedule = initialize_optimizer( 81 | model=model, 82 | learning_rate=2e-5, 83 | device=device, 84 | n_batches=len(data_silo.loaders["train"]), 85 | n_epochs=n_epochs) 86 | 87 | # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time 88 | trainer = Trainer( 89 | model=model, 90 | optimizer=optimizer, 91 | data_silo=data_silo, 92 | epochs=n_epochs, 93 | n_gpu=n_gpu, 94 | lr_schedule=lr_schedule, 95 | evaluate_every=evaluate_every, 96 | device=device) 97 | 98 | # 7. Let it grow 99 | trainer.train() 100 | 101 | # 8. Hooray! You have a model. Store it: 102 | save_dir = Path("saved_models/bert-doc-tutorial") 103 | model.save(save_dir) 104 | processor.save(save_dir) 105 | 106 | # 9. 
Load it & harvest your fruits (Inference) 107 | basic_texts = [ 108 | {"text": "The box contained the ball from the tree."}, 109 | {"text": "I'll fix you a drink."}, 110 | ] 111 | model = Inferencer.load(save_dir) 112 | result = model.inference_from_dicts(dicts=basic_texts) 113 | print(result) 114 | model.close_multiprocessing_pool() 115 | 116 | if __name__ == "__main__": 117 | doc_classification_cola() 118 | 119 | # fmt: on 120 | -------------------------------------------------------------------------------- /examples/doc_classification_multilabel.py: -------------------------------------------------------------------------------- 1 | # fmt: off 2 | import logging 3 | from pathlib import Path 4 | 5 | from farm.data_handler.data_silo import DataSilo 6 | from farm.data_handler.processor import TextClassificationProcessor 7 | from farm.modeling.optimization import initialize_optimizer 8 | from farm.infer import Inferencer 9 | from farm.modeling.adaptive_model import AdaptiveModel 10 | from farm.modeling.language_model import LanguageModel 11 | from farm.modeling.prediction_head import MultiLabelTextClassificationHead 12 | from farm.modeling.tokenization import Tokenizer 13 | from farm.train import Trainer 14 | from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings 15 | 16 | def doc_classification_multilabel(): 17 | logging.basicConfig( 18 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 19 | datefmt="%m/%d/%Y %H:%M:%S", 20 | level=logging.INFO) 21 | 22 | ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") 23 | ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification") 24 | 25 | ########################## 26 | ########## Settings 27 | ########################## 28 | set_all_seeds(seed=42) 29 | device, n_gpu = initialize_device_settings(use_cuda=True) 30 | n_epochs = 1 31 | batch_size = 32 32 | 33 | evaluate_every = 500 34 | lang_model = "bert-base-uncased" 35 | do_lower_case = True 36 | 37 | # 1.Create a tokenizer 38 | tokenizer = Tokenizer.load( 39 | pretrained_model_name_or_path=lang_model, 40 | do_lower_case=do_lower_case) 41 | 42 | # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset 43 | # Here we load Toxic Comments Data automaticaly if it is not available. 44 | 45 | label_list = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"] 46 | metric = "acc" 47 | 48 | processor = TextClassificationProcessor(tokenizer=tokenizer, 49 | max_seq_len=128, 50 | data_dir=Path("../data/toxic-comments"), 51 | label_list=label_list, 52 | label_column_name="label", 53 | metric=metric, 54 | quote_char='"', 55 | multilabel=True, 56 | train_filename="train.tsv", 57 | dev_filename="val.tsv", 58 | test_filename=None, 59 | dev_split=0, 60 | ) 61 | 62 | # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets 63 | data_silo = DataSilo( 64 | processor=processor, 65 | batch_size=batch_size) 66 | 67 | # 4. 
Create an AdaptiveModel 68 | # a) which consists of a pretrained language model as a basis 69 | language_model = LanguageModel.load(lang_model) 70 | # b) and a prediction head on top that is suited for our task => Text classification 71 | prediction_head = MultiLabelTextClassificationHead(num_labels=len(label_list)) 72 | 73 | model = AdaptiveModel( 74 | language_model=language_model, 75 | prediction_heads=[prediction_head], 76 | embeds_dropout_prob=0.1, 77 | lm_output_types=["per_sequence"], 78 | device=device) 79 | 80 | # 5. Create an optimizer 81 | model, optimizer, lr_schedule = initialize_optimizer( 82 | model=model, 83 | learning_rate=3e-5, 84 | device=device, 85 | n_batches=len(data_silo.loaders["train"]), 86 | n_epochs=n_epochs) 87 | 88 | # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time 89 | trainer = Trainer( 90 | model=model, 91 | optimizer=optimizer, 92 | data_silo=data_silo, 93 | epochs=n_epochs, 94 | n_gpu=n_gpu, 95 | lr_schedule=lr_schedule, 96 | evaluate_every=evaluate_every, 97 | device=device) 98 | 99 | # 7. Let it grow 100 | trainer.train() 101 | 102 | # 8. Hooray! You have a model. Store it: 103 | save_dir = Path("../saved_models/bert-german-multi-doc-tutorial") 104 | model.save(save_dir) 105 | processor.save(save_dir) 106 | 107 | # 9. Load it & harvest your fruits (Inference) 108 | basic_texts = [ 109 | {"text": "You fucking bastards"}, 110 | {"text": "What a lovely world"}, 111 | ] 112 | model = Inferencer.load(save_dir) 113 | result = model.inference_from_dicts(dicts=basic_texts) 114 | print(result) 115 | model.close_multiprocessing_pool() 116 | 117 | 118 | if __name__ == "__main__": 119 | doc_classification_multilabel() 120 | 121 | # fmt: on 122 | -------------------------------------------------------------------------------- /examples/doc_classification_multilabel_roberta.py: -------------------------------------------------------------------------------- 1 | # fmt: off 2 | import logging 3 | from pathlib import Path 4 | 5 | from farm.data_handler.data_silo import DataSilo 6 | from farm.data_handler.processor import TextClassificationProcessor 7 | from farm.modeling.optimization import initialize_optimizer 8 | from farm.infer import Inferencer 9 | from farm.modeling.adaptive_model import AdaptiveModel 10 | from farm.modeling.language_model import Roberta 11 | from farm.modeling.prediction_head import MultiLabelTextClassificationHead 12 | from farm.modeling.tokenization import Tokenizer 13 | from farm.train import Trainer 14 | from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings 15 | 16 | 17 | def doc_classification_multilabel_roberta(): 18 | logging.basicConfig( 19 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 20 | datefmt="%m/%d/%Y %H:%M:%S", 21 | level=logging.INFO) 22 | 23 | ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") 24 | ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification") 25 | 26 | ########################## 27 | ########## Settings 28 | ########################## 29 | set_all_seeds(seed=42) 30 | device, n_gpu = initialize_device_settings(use_cuda=False) 31 | n_epochs = 1 32 | batch_size = 32 33 | 34 | evaluate_every = 500 35 | lang_model = "roberta-base" 36 | do_lower_case = False # roberta is a cased model 37 | 38 | # 1.Create a tokenizer 39 | tokenizer = Tokenizer.load( 40 | pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case 41 | ) 42 | 43 | # 2. 
Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset 44 | # Here we load Toxic Comments Data automaticaly if it is not available. 45 | 46 | label_list = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"] 47 | metric = "acc" 48 | 49 | processor = TextClassificationProcessor(tokenizer=tokenizer, 50 | max_seq_len=128, 51 | data_dir=Path("../data/toxic-comments"), 52 | label_list=label_list, 53 | label_column_name="label", 54 | metric=metric, 55 | quote_char='"', 56 | multilabel=True, 57 | train_filename=Path("train.tsv"), 58 | dev_filename=Path("val.tsv"), 59 | test_filename=None, 60 | dev_split=0, 61 | max_samples=1000 62 | ) 63 | 64 | # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets 65 | data_silo = DataSilo( 66 | processor=processor, 67 | batch_size=batch_size) 68 | 69 | # 4. Create an AdaptiveModel 70 | # a) which consists of a pretrained language model as a basis 71 | language_model = Roberta.load(lang_model) 72 | # b) and a prediction head on top that is suited for our task => Text classification 73 | prediction_head = MultiLabelTextClassificationHead(num_labels=len(label_list)) 74 | 75 | model = AdaptiveModel( 76 | language_model=language_model, 77 | prediction_heads=[prediction_head], 78 | embeds_dropout_prob=0.1, 79 | lm_output_types=["per_sequence"], 80 | device=device) 81 | 82 | # 5. Create an optimizer 83 | model, optimizer, lr_schedule = initialize_optimizer( 84 | model=model, 85 | learning_rate=3e-5, 86 | device=device, 87 | n_batches=len(data_silo.loaders["train"]), 88 | n_epochs=n_epochs) 89 | 90 | # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time 91 | trainer = Trainer( 92 | model=model, 93 | optimizer=optimizer, 94 | data_silo=data_silo, 95 | epochs=n_epochs, 96 | n_gpu=n_gpu, 97 | lr_schedule=lr_schedule, 98 | evaluate_every=evaluate_every, 99 | device=device) 100 | 101 | # 7. Let it grow 102 | trainer.train() 103 | 104 | # 8. Hooray! You have a model. Store it: 105 | save_dir = Path("saved_models/bert-multi-doc-roberta") 106 | model.save(save_dir) 107 | processor.save(save_dir) 108 | 109 | # 9. 
Load it & harvest your fruits (Inference) 110 | basic_texts = [ 111 | {"text": "You fucking bastards"}, 112 | {"text": "What a lovely world"}, 113 | ] 114 | model = Inferencer.load(save_dir) 115 | result = model.run_inference(dicts=basic_texts) 116 | print(result) 117 | model.close_multiprocessing_pool() 118 | 119 | 120 | if __name__ == "__main__": 121 | doc_classification_multilabel_roberta() 122 | 123 | # fmt: on 124 | -------------------------------------------------------------------------------- /examples/doc_classification_word_embedding_LM.py: -------------------------------------------------------------------------------- 1 | # fmt: off 2 | import logging 3 | from pathlib import Path 4 | import time 5 | 6 | from farm.data_handler.data_silo import DataSilo, StreamingDataSilo 7 | from farm.data_handler.processor import TextClassificationProcessor 8 | from farm.modeling.optimization import initialize_optimizer 9 | from farm.infer import Inferencer 10 | from farm.modeling.adaptive_model import AdaptiveModel 11 | from farm.modeling.language_model import LanguageModel 12 | from farm.modeling.prediction_head import TextClassificationHead 13 | from farm.modeling.tokenization import Tokenizer 14 | from farm.train import Trainer 15 | from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings 16 | 17 | def doc_classifcation(): 18 | logging.basicConfig( 19 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 20 | datefmt="%m/%d/%Y %H:%M:%S", 21 | level=logging.INFO) 22 | 23 | ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") 24 | ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification_glove") 25 | 26 | ########################## 27 | ########## Settings 28 | ########################## 29 | set_all_seeds(seed=42) 30 | n_epochs = 3 31 | batch_size = 32 32 | evaluate_every = 100 33 | # load from a local path: 34 | lang_model = Path("../saved_models/glove-german-uncased") 35 | # or through s3 36 | #lang_model = "glove-german-uncased" 37 | do_lower_case = True 38 | 39 | device, n_gpu = initialize_device_settings(use_cuda=True) 40 | 41 | # 1.Create a tokenizer 42 | tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case) 43 | 44 | # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset 45 | # Here we load GermEval 2018 Data automaticaly if it is not available. 46 | # GermEval 2018 only has train.tsv and test.tsv dataset - no dev.tsv 47 | label_list = ["OTHER", "OFFENSE"] 48 | metric = "f1_macro" 49 | 50 | processor = TextClassificationProcessor( 51 | tokenizer=tokenizer, 52 | max_seq_len=128, 53 | data_dir=Path("../data/germeval18"), 54 | label_list=label_list, 55 | dev_split=0, 56 | test_filename="test.tsv", 57 | train_filename="train.tsv", 58 | metric=metric, 59 | label_column_name="coarse_label") 60 | 61 | 62 | # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a 63 | data_silo = DataSilo( 64 | processor=processor, 65 | batch_size=batch_size, 66 | max_processes=1) 67 | 68 | # 4. Create an AdaptiveModel 69 | # a) which consists of an embedding model as a basis. 70 | # Word embedding models only converts words it has seen during training to embedding vectors. 
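# A minimal, illustrative sketch (not part of the original script): tokens missing from the embedding
# vocabulary are typically mapped to a single unknown-token vector, so it can be worth estimating the
# out-of-vocabulary share of your corpus before training. The "[UNK]" symbol below is an assumption;
# the actual token depends on the embedding vocabulary that is loaded.
# sample_tokens = tokenizer.tokenize("Willst du einen Kaffee trinken?")
# unk_share = sample_tokens.count("[UNK]") / max(len(sample_tokens), 1)
# print(f"Estimated out-of-vocabulary share: {unk_share:.1%}")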
71 | language_model = LanguageModel.load(lang_model) 72 | # b) and a prediction head on top that is suited for our task => Text classification 73 | prediction_head = TextClassificationHead( 74 | layer_dims=[300,600,len(label_list)], 75 | class_weights=data_silo.calculate_class_weights(task_name="text_classification"), 76 | num_labels=len(label_list)) 77 | 78 | model = AdaptiveModel( 79 | language_model=language_model, 80 | prediction_heads=[prediction_head], 81 | embeds_dropout_prob=0.1, 82 | lm_output_types=["per_sequence"], 83 | device=device) 84 | 85 | # 5. Create an optimizer 86 | model, optimizer, lr_schedule = initialize_optimizer( 87 | model=model, 88 | learning_rate=3e-5, 89 | device=device, 90 | n_batches=len(data_silo.loaders["train"]), 91 | n_epochs=n_epochs) 92 | 93 | # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time 94 | trainer = Trainer( 95 | model=model, 96 | optimizer=optimizer, 97 | data_silo=data_silo, 98 | epochs=n_epochs, 99 | n_gpu=n_gpu, 100 | lr_schedule=lr_schedule, 101 | evaluate_every=evaluate_every, 102 | device=device) 103 | 104 | # 7. Let it grow 105 | trainer.train() 106 | 107 | 108 | if __name__ == "__main__": 109 | doc_classifcation() 110 | 111 | # fmt: on 112 | -------------------------------------------------------------------------------- /examples/doc_regression.py: -------------------------------------------------------------------------------- 1 | # fmt: off 2 | import logging 3 | from pathlib import Path 4 | 5 | from farm.data_handler.data_silo import DataSilo 6 | from farm.data_handler.processor import RegressionProcessor 7 | from farm.experiment import initialize_optimizer 8 | from farm.infer import Inferencer 9 | from farm.modeling.adaptive_model import AdaptiveModel 10 | from farm.modeling.language_model import LanguageModel 11 | from farm.modeling.prediction_head import RegressionHead 12 | from farm.modeling.tokenization import Tokenizer 13 | from farm.train import Trainer 14 | from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings 15 | 16 | 17 | def doc_regression(): 18 | logging.basicConfig( 19 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 20 | datefmt="%m/%d/%Y %H:%M:%S", 21 | level=logging.INFO) 22 | 23 | ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") 24 | ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_regression") 25 | 26 | ########################## 27 | ########## Settings 28 | ########################## 29 | set_all_seeds(seed=42) 30 | device, n_gpu = initialize_device_settings(use_cuda=True) 31 | n_epochs = 5 32 | batch_size = 32 33 | evaluate_every = 30 34 | lang_model = "bert-base-cased" 35 | do_lower_case = False 36 | 37 | # 1.Create a tokenizer 38 | tokenizer = Tokenizer.load( 39 | pretrained_model_name_or_path=lang_model, 40 | do_lower_case=do_lower_case) 41 | 42 | # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset 43 | # We do not have a sample dataset for regression yet, add your own dataset to run the example 44 | processor = RegressionProcessor(tokenizer=tokenizer, 45 | max_seq_len=128, 46 | data_dir=Path("../data/"), 47 | label_column_name="label" 48 | ) 49 | 50 | # 3. 
Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets 51 | data_silo = DataSilo( 52 | processor=processor, 53 | batch_size=batch_size) 54 | 55 | # 4. Create an AdaptiveModel 56 | # a) which consists of a pretrained language model as a basis 57 | language_model = LanguageModel.load(lang_model) 58 | # b) and a prediction head on top that is suited for our task => Text regression 59 | prediction_head = RegressionHead() 60 | 61 | model = AdaptiveModel( 62 | language_model=language_model, 63 | prediction_heads=[prediction_head], 64 | embeds_dropout_prob=0.1, 65 | lm_output_types=["per_sequence_continuous"], 66 | device=device) 67 | 68 | # 5. Create an optimizer 69 | model, optimizer, lr_schedule = initialize_optimizer( 70 | model=model, 71 | learning_rate=2e-5, 72 | device=device, 73 | n_batches=len(data_silo.loaders["train"]), 74 | n_epochs=n_epochs) 75 | 76 | # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time 77 | trainer = Trainer( 78 | model=model, 79 | optimizer=optimizer, 80 | data_silo=data_silo, 81 | epochs=n_epochs, 82 | n_gpu=n_gpu, 83 | lr_schedule=lr_schedule, 84 | evaluate_every=evaluate_every, 85 | device=device) 86 | 87 | # 7. Let it grow 88 | trainer.train() 89 | 90 | # 8. Hooray! You have a model. Store it: 91 | save_dir = Path("saved_models/bert-doc-regression-tutorial") 92 | model.save(save_dir) 93 | processor.save(save_dir) 94 | 95 | # 9. Load it & harvest your fruits (Inference) 96 | # Add your own text adapted to the dataset you provide 97 | basic_texts = [ 98 | {"text": ""}, 99 | {"text": ""}, 100 | ] 101 | model = Inferencer.load(save_dir) 102 | result = model.inference_from_dicts(dicts=basic_texts) 103 | 104 | print(result) 105 | model.close_multiprocessing_pool() 106 | 107 | 108 | if __name__ == "__main__": 109 | doc_regression() 110 | 111 | # fmt: on 112 | -------------------------------------------------------------------------------- /examples/embeddings_extraction.py: -------------------------------------------------------------------------------- 1 | from farm.infer import Inferencer 2 | from farm.utils import set_all_seeds 3 | from pathlib import Path 4 | 5 | def embeddings_extraction(): 6 | set_all_seeds(seed=42) 7 | batch_size = 32 8 | use_gpu = False 9 | lang_model = "bert-base-german-cased" 10 | # or local path: 11 | # lang_model = Path("../saved_models/farm-bert-base-cased-squad2") 12 | 13 | # Input 14 | basic_texts = [ 15 | {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot ist"}, 16 | {"text": "Martin Müller spielt Fussball"}, 17 | ] 18 | 19 | # Load model, tokenizer and processor directly into Inferencer 20 | model = Inferencer.load(lang_model, task_type="embeddings", gpu=use_gpu, batch_size=batch_size, 21 | extraction_strategy="reduce_mean", extraction_layer=-2, num_processes=0) 22 | 23 | # Get embeddings for input text (you can vary the strategy and layer) 24 | result = model.inference_from_dicts(dicts=basic_texts) 25 | print(result) 26 | model.close_multiprocessing_pool() 27 | 28 | 29 | if __name__ == "__main__": 30 | embeddings_extraction() 31 | -------------------------------------------------------------------------------- /examples/embeddings_extraction_s3e_pooling.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pickle 3 | from pathlib import Path 4 | 5 | from farm.data_handler.processor 
import InferenceProcessor 6 | from farm.infer import Inferencer 7 | from farm.modeling.adaptive_model import AdaptiveModel 8 | from farm.modeling.language_model import LanguageModel 9 | from farm.modeling.tokenization import Tokenizer 10 | from farm.utils import set_all_seeds, initialize_device_settings 11 | from farm.modeling.wordembedding_utils import fit_s3e_on_corpus 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | """ 16 | Example for generating sentence embeddings via the S3E pooling approach as described by Wang et al in the paper 17 | "Efficient Sentence Embedding via Semantic Subspace Analysis" 18 | (https://arxiv.org/abs/2002.09620) 19 | 20 | You can use classical models like fasttext, glove or word2vec and apply S3E on top. 21 | This can be a powerful benchmark for plain transformer-based embeddings. 22 | 23 | First, we fit the required stats on a custom corpus. This includes the derivation of token_weights depending on 24 | token occurences in the corpus, creation of the semantic clusters via k-means and a couple of 25 | pre-/post-processing steps to normalize the embeddings. 26 | 27 | Second, we feed the resulting objects into our Inferencer to extract the actual sentence embeddings for our sentences. 28 | """ 29 | 30 | def fit(language_model, corpus_path, save_dir, do_lower_case, batch_size=4, use_gpu=False): 31 | # Fit S3E on a corpus 32 | set_all_seeds(seed=42) 33 | device, n_gpu = initialize_device_settings(use_cuda=use_gpu, use_amp=False) 34 | 35 | # Create a InferenceProcessor 36 | tokenizer = Tokenizer.load(pretrained_model_name_or_path=language_model, do_lower_case=do_lower_case) 37 | processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128) 38 | 39 | # Create an AdaptiveModel 40 | language_model = LanguageModel.load(language_model) 41 | 42 | model = AdaptiveModel( 43 | language_model=language_model, 44 | prediction_heads=[], 45 | embeds_dropout_prob=0.1, 46 | lm_output_types=["per_sequence"], 47 | device=device) 48 | 49 | model, processor, s3e_stats = fit_s3e_on_corpus(processor=processor, 50 | model=model, 51 | corpus=corpus_path, 52 | n_clusters=10, 53 | pca_n_components=300, 54 | svd_postprocessing=True, 55 | min_token_occurrences=1) 56 | 57 | # save everything to allow inference without fitting everything again 58 | model.save(save_dir) 59 | processor.save(save_dir) 60 | with open(save_dir / "s3e_stats.pkl", "wb") as f: 61 | pickle.dump(s3e_stats, f) 62 | 63 | # Load model, tokenizer and processor directly into Inferencer 64 | inferencer = Inferencer(model=model, processor=processor, task_type="embeddings", gpu=use_gpu, 65 | batch_size=batch_size, extraction_strategy="s3e", extraction_layer=-1, 66 | s3e_stats=s3e_stats) 67 | 68 | # Input 69 | basic_texts = [ 70 | {"text": "a man is walking on the street."}, 71 | {"text": "a woman is walking on the street."}, 72 | ] 73 | 74 | # Get embeddings for input text (you can vary the strategy and layer) 75 | result = inferencer.inference_from_dicts(dicts=basic_texts) 76 | print(result) 77 | inferencer.close_multiprocessing_pool() 78 | 79 | 80 | def extract_embeddings(load_dir, use_gpu, batch_size): 81 | with open(load_dir / "s3e_stats.pkl", "rb") as f: 82 | s3e_stats = pickle.load(f) 83 | 84 | # Init inferencer 85 | inferencer = Inferencer.load(model_name_or_path=load_dir, task_type="embeddings", gpu=use_gpu, 86 | batch_size=batch_size, extraction_strategy="s3e", extraction_layer=-1, 87 | s3e_stats=s3e_stats) 88 | 89 | # Input 90 | basic_texts = [ 91 | {"text": "a man is walking on the street."}, 92 | 
{"text": "a woman is walking on the street."}, 93 | ] 94 | 95 | # Get embeddings for input text 96 | result = inferencer.inference_from_dicts(dicts=basic_texts) 97 | print(result) 98 | inferencer.close_multiprocessing_pool() 99 | 100 | 101 | if __name__ == "__main__": 102 | lang_model = "glove-english-uncased-6B" 103 | do_lower_case = True 104 | 105 | # You can download this from: 106 | # "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/lm_finetune_nips.tar.gz" 107 | corpus_path = Path("../data/lm_finetune_nips/train.txt") 108 | 109 | s3e_dir = Path("../saved_models/fitted_s3e/") 110 | 111 | fit(language_model=lang_model, 112 | do_lower_case=do_lower_case, 113 | corpus_path=corpus_path, 114 | save_dir=s3e_dir 115 | ) 116 | 117 | extract_embeddings(load_dir=s3e_dir, use_gpu=False, batch_size=10) -------------------------------------------------------------------------------- /examples/evaluation.py: -------------------------------------------------------------------------------- 1 | from farm.utils import initialize_device_settings 2 | from farm.modeling.tokenization import Tokenizer 3 | from farm.data_handler.processor import TextClassificationProcessor, SquadProcessor 4 | from farm.data_handler.data_silo import DataSilo 5 | from farm.eval import Evaluator 6 | from farm.modeling.adaptive_model import AdaptiveModel 7 | from pathlib import Path 8 | 9 | def evaluate_classification(): 10 | ########################## 11 | ########## Settings 12 | ########################## 13 | device, n_gpu = initialize_device_settings(use_cuda=True) 14 | lang_model = "deepset/bert-base-german-cased-sentiment-Germeval17" 15 | do_lower_case = False 16 | batch_size = 100 17 | 18 | data_dir = Path("../data/germeval17") 19 | evaluation_filename = "test_TIMESTAMP1.tsv" 20 | label_list = ["negative", "neutral", "positive"] 21 | metric = "f1_macro" 22 | 23 | # 1.Create a tokenizer 24 | tokenizer = Tokenizer.load( 25 | pretrained_model_name_or_path=lang_model, 26 | do_lower_case=do_lower_case) 27 | 28 | # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset 29 | # Here we load GermEval 2017 Data automaticaly if it is not available. 30 | 31 | processor = TextClassificationProcessor( 32 | tokenizer=tokenizer, 33 | max_seq_len=384, 34 | label_list=label_list, 35 | metric=metric, 36 | train_filename=None, 37 | dev_filename=None, 38 | dev_split=0, 39 | test_filename=evaluation_filename, 40 | data_dir=data_dir, 41 | ) 42 | 43 | # 3. Create a DataSilo that loads dataset, provides DataLoaders for them and calculates a few descriptive statistics of our datasets 44 | data_silo = DataSilo( 45 | processor=processor, 46 | batch_size=batch_size) 47 | 48 | # 4. Create an Evaluator 49 | evaluator = Evaluator( 50 | data_loader=data_silo.get_data_loader("test"), 51 | tasks=data_silo.processor.tasks, 52 | device=device 53 | ) 54 | 55 | # 5. Load model 56 | model = AdaptiveModel.convert_from_transformers(lang_model, device=device, task_type="text_classification") 57 | # use "load" if you want to use a local model that was trained with FARM 58 | # model = AdaptiveModel.load(lang_model, device=device) 59 | model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True) 60 | 61 | # 6. 
Run the Evaluator 62 | results = evaluator.eval(model) 63 | f1_score = results[0]["f1_macro"] 64 | print("Macro-averaged F1-Score:", f1_score) 65 | 66 | 67 | def evaluate_question_answering(): 68 | ########################## 69 | ########## Settings 70 | ########################## 71 | device, n_gpu = initialize_device_settings(use_cuda=True) 72 | lang_model = "deepset/roberta-base-squad2" 73 | do_lower_case = True 74 | 75 | data_dir = Path("../data/squad20") 76 | evaluation_filename = "dev-v2.0.json" 77 | 78 | batch_size = 50 79 | no_ans_boost = 0 80 | accuracy_at = 3 # accuracy at n is useful for answers inside long documents 81 | 82 | # 1.Create a tokenizer 83 | tokenizer = Tokenizer.load( 84 | pretrained_model_name_or_path=lang_model, 85 | do_lower_case=do_lower_case) 86 | 87 | # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset 88 | processor = SquadProcessor( 89 | tokenizer=tokenizer, 90 | max_seq_len=256, 91 | label_list= ["start_token", "end_token"], 92 | metric="squad", 93 | train_filename=None, 94 | dev_filename=None, 95 | dev_split=0, 96 | test_filename=evaluation_filename, 97 | data_dir=data_dir, 98 | doc_stride=128, 99 | ) 100 | 101 | # 3. Create a DataSilo that loads dataset, provides DataLoaders for them and calculates a few descriptive statistics of our datasets 102 | data_silo = DataSilo( 103 | processor=processor, 104 | batch_size=batch_size) 105 | 106 | # 4. Create an Evaluator 107 | evaluator = Evaluator( 108 | data_loader=data_silo.get_data_loader("test"), 109 | tasks=data_silo.processor.tasks, 110 | device=device 111 | ) 112 | 113 | # 5. Load model 114 | model = AdaptiveModel.convert_from_transformers(lang_model, device=device, task_type="question_answering") 115 | # use "load" if you want to use a local model that was trained with FARM 116 | #model = AdaptiveModel.load(lang_model, device=device) 117 | model.prediction_heads[0].no_ans_boost = no_ans_boost 118 | model.prediction_heads[0].n_best = accuracy_at 119 | model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True) 120 | 121 | # 6. 
Run the Evaluator 122 | results = evaluator.eval(model) 123 | f1_score = results[0]["f1"] 124 | em_score = results[0]["EM"] 125 | tnacc = results[0]["top_n_accuracy"] 126 | print("F1-Score:", f1_score) 127 | print("Exact Match Score:", em_score) 128 | print(f"top_{accuracy_at}_accuracy:", tnacc) 129 | 130 | 131 | if __name__ == "__main__": 132 | #evaluate_classification() 133 | evaluate_question_answering() 134 | -------------------------------------------------------------------------------- /examples/lm_finetuning.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | 4 | from farm.data_handler.data_silo import DataSilo 5 | from farm.data_handler.processor import BertStyleLMProcessor 6 | from farm.modeling.adaptive_model import AdaptiveModel 7 | from farm.modeling.language_model import LanguageModel 8 | from farm.modeling.prediction_head import BertLMHead, NextSentenceHead 9 | from farm.modeling.tokenization import Tokenizer 10 | from farm.train import Trainer 11 | from farm.modeling.optimization import initialize_optimizer 12 | 13 | from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings 14 | 15 | 16 | def lm_finetuning(): 17 | logging.basicConfig( 18 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 19 | datefmt="%m/%d/%Y %H:%M:%S", 20 | level=logging.INFO, 21 | ) 22 | next_sent_pred_style = "bert-style" 23 | next_sent_pred=True 24 | set_all_seeds(seed=42) 25 | ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") 26 | ml_logger.init_experiment( 27 | experiment_name="LM_refactoring", run_name=f"new, nsp: {next_sent_pred}, {next_sent_pred_style}" 28 | ) 29 | ########################## 30 | ########## Settings 31 | ########################## 32 | device, n_gpu = initialize_device_settings(use_cuda=True) 33 | n_epochs = 1 34 | batch_size = 32 35 | evaluate_every = 1000 36 | lang_model = "bert-base-cased" 37 | do_lower_case = False 38 | 39 | # 1.Create a tokenizer 40 | tokenizer = Tokenizer.load( 41 | pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case 42 | ) 43 | 44 | # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset 45 | processor = BertStyleLMProcessor( 46 | data_dir=Path("../data/lm_finetune_nips"), 47 | tokenizer=tokenizer, 48 | max_seq_len=128, 49 | max_docs=None, # You can have set max_docs here to limit the number of docs in the dataset and speed up this example 50 | next_sent_pred_style=next_sent_pred_style 51 | ) 52 | 53 | # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets 54 | data_silo = DataSilo(processor=processor, batch_size=batch_size, max_multiprocessing_chunksize=20) 55 | 56 | # 4. Create an AdaptiveModel 57 | # a) which consists of a pretrained language model as a basis 58 | language_model = LanguageModel.load(lang_model) 59 | # b) and *two* prediction heads on top that are suited for our task => Language Model finetuning 60 | lm_prediction_head = BertLMHead.load(lang_model) 61 | next_sentence_head = NextSentenceHead.load(lang_model) 62 | 63 | model = AdaptiveModel( 64 | language_model=language_model, 65 | prediction_heads=[lm_prediction_head, next_sentence_head], 66 | embeds_dropout_prob=0.1, 67 | lm_output_types=["per_token", "per_sequence"], 68 | device=device, 69 | ) 70 | 71 | # 5. 
Create an optimizer 72 | model, optimizer, lr_schedule = initialize_optimizer( 73 | model=model, 74 | learning_rate=2e-5, 75 | device=device, 76 | n_batches=len(data_silo.loaders["train"]), 77 | n_epochs=n_epochs 78 | ) 79 | 80 | # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time 81 | trainer = Trainer( 82 | model=model, 83 | optimizer=optimizer, 84 | data_silo=data_silo, 85 | epochs=n_epochs, 86 | n_gpu=n_gpu, 87 | lr_schedule=lr_schedule, 88 | evaluate_every=evaluate_every, 89 | device=device, 90 | eval_report=False 91 | ) 92 | 93 | # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai 94 | trainer.train() 95 | 96 | # 8. Hooray! You have a model. Store it: 97 | save_dir = Path("saved_models/bert-english-lm-tutorial") 98 | model.save(save_dir) 99 | processor.save(save_dir) 100 | 101 | 102 | if __name__ == "__main__": 103 | lm_finetuning() 104 | -------------------------------------------------------------------------------- /examples/ner.py: -------------------------------------------------------------------------------- 1 | # fmt: off 2 | import logging 3 | from pathlib import Path 4 | 5 | from farm.data_handler.data_silo import DataSilo 6 | from farm.data_handler.processor import NERProcessor 7 | from farm.modeling.optimization import initialize_optimizer 8 | from farm.infer import Inferencer 9 | from farm.modeling.adaptive_model import AdaptiveModel 10 | from farm.modeling.language_model import LanguageModel 11 | from farm.modeling.prediction_head import TokenClassificationHead 12 | from farm.modeling.tokenization import Tokenizer 13 | from farm.train import Trainer 14 | from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings 15 | 16 | def ner(): 17 | logging.basicConfig( 18 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 19 | datefmt="%m/%d/%Y %H:%M:%S", 20 | level=logging.INFO, 21 | ) 22 | 23 | ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") 24 | ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_ner") 25 | 26 | ########################## 27 | ########## Settings 28 | ########################## 29 | set_all_seeds(seed=42) 30 | device, n_gpu = initialize_device_settings(use_cuda=True) 31 | n_epochs = 4 32 | batch_size = 32 33 | evaluate_every = 400 34 | lang_model = "bert-base-german-cased" 35 | do_lower_case = False 36 | 37 | # 1.Create a tokenizer 38 | tokenizer = Tokenizer.load( 39 | pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case 40 | ) 41 | 42 | # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset 43 | # See test/sample/ner/train-sample.txt for an example of the data format that is expected by the Processor 44 | ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"] 45 | 46 | processor = NERProcessor( 47 | tokenizer=tokenizer, max_seq_len=128, data_dir=Path("../data/conll03-de"), delimiter=" ", metric="seq_f1", label_list=ner_labels 48 | ) 49 | 50 | # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets 51 | data_silo = DataSilo(processor=processor, batch_size=batch_size) 52 | 53 | # 4. 
Create an AdaptiveModel 54 | # a) which consists of a pretrained language model as a basis 55 | language_model = LanguageModel.load(lang_model) 56 | # b) and a prediction head on top that is suited for our task => NER 57 | prediction_head = TokenClassificationHead(num_labels=len(ner_labels)) 58 | 59 | model = AdaptiveModel( 60 | language_model=language_model, 61 | prediction_heads=[prediction_head], 62 | embeds_dropout_prob=0.1, 63 | lm_output_types=["per_token"], 64 | device=device, 65 | ) 66 | 67 | # 5. Create an optimizer 68 | model, optimizer, lr_schedule = initialize_optimizer( 69 | model=model, 70 | learning_rate=1e-5, 71 | n_batches=len(data_silo.loaders["train"]), 72 | n_epochs=n_epochs, 73 | device=device, 74 | ) 75 | 76 | # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time 77 | trainer = Trainer( 78 | model=model, 79 | optimizer=optimizer, 80 | data_silo=data_silo, 81 | epochs=n_epochs, 82 | n_gpu=n_gpu, 83 | lr_schedule=lr_schedule, 84 | evaluate_every=evaluate_every, 85 | device=device, 86 | ) 87 | 88 | # 7. Let it grow 89 | trainer.train() 90 | 91 | # 8. Hooray! You have a model. Store it: 92 | save_dir = "saved_models/bert-german-ner-tutorial" 93 | model.save(save_dir) 94 | processor.save(save_dir) 95 | 96 | 97 | # 9. Load it & harvest your fruits (Inference) 98 | basic_texts = [ 99 | {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"}, 100 | {"text": "Martin Müller spielt Handball in Berlin"}, 101 | ] 102 | model = Inferencer.load(save_dir) 103 | result = model.inference_from_dicts(dicts=basic_texts) 104 | print(result) 105 | 106 | model.close_multiprocessing_pool() 107 | 108 | 109 | if __name__ == "__main__": 110 | ner() 111 | -------------------------------------------------------------------------------- /examples/onnx_question_answering.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from farm.infer import Inferencer 4 | from farm.modeling.adaptive_model import AdaptiveModel 5 | 6 | 7 | def onnx_runtime_example(): 8 | """ 9 | This example shows conversion of a transformers model from the Model Hub to 10 | ONNX format & inference using ONNXRuntime. 11 | """ 12 | 13 | model_name_or_path = "deepset/roberta-base-squad2" 14 | onnx_model_export_path = Path("./roberta-onnx") 15 | 16 | AdaptiveModel.convert_to_onnx(model_name_or_path, onnx_model_export_path, task_type="question_answering") 17 | 18 | # for ONNX models, the Inferencer uses ONNXRuntime under-the-hood 19 | inferencer = Inferencer.load(model_name_or_path=onnx_model_export_path) 20 | 21 | qa_input = [ 22 | { 23 | "questions": ["Who counted the game among the best ever made?"], 24 | "text": "Twilight Princess was released to universal critical acclaim and commercial success. " 25 | "It received perfect scores from major publications such as 1UP.com, Computer and Video Games, " 26 | "Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators " 27 | "GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii " 28 | "version and scores of 95% and 96 for the GameCube version. 
GameTrailers in their review called " 29 | "it one of the greatest games ever created.", 30 | } 31 | ] 32 | 33 | results = inferencer.inference_from_dicts(qa_input) 34 | print(results) 35 | inferencer.close_multiprocessing_pool() 36 | 37 | 38 | if __name__ == "__main__": 39 | onnx_runtime_example() 40 | -------------------------------------------------------------------------------- /examples/streaming_inference.py: -------------------------------------------------------------------------------- 1 | from farm.infer import Inferencer 2 | 3 | 4 | def streaming_inference_example(): 5 | """ 6 | The FARM Inferencer has a high performance non-blocking streaming mode for large scale inference use cases. With 7 | this mode, the dicts parameter can optionally be a Python generator object that yield dicts, thus avoiding loading 8 | dicts in memory. The inference_from_dicts() method returns a generator that yield predictions. To use streaming, 9 | set the streaming param to True and determine optimal multiprocessing_chunksize by performing speed benchmarks. 10 | """ 11 | 12 | model_name_or_path = "deepset/bert-base-cased-squad2" 13 | inferencer = Inferencer.load(model_name_or_path=model_name_or_path, task_type="question_answering", num_processes=8) 14 | 15 | dicts = sample_dicts_generator() # it can be a list of dicts or a generator object 16 | results = inferencer.inference_from_dicts(dicts, streaming=True, multiprocessing_chunksize=20) 17 | 18 | for prediction in results: # results is a generator object that yields predictions 19 | print(prediction) 20 | 21 | inferencer.close_multiprocessing_pool() 22 | 23 | 24 | def sample_dicts_generator(): 25 | """ 26 | This is a sample dicts generator. Some exemplary use cases: 27 | 28 | * read chunks of text from large files iteratively and generate inference predictions 29 | * connect with external datasources, eg, a Elasticsearch Scroll API that reads all documents from a given index 30 | * building a streaming microservice that reads from Kafka 31 | 32 | :return: a generator that yield dicts 33 | :rtype: iter 34 | """ 35 | qa_input = { 36 | "questions": ["Who counted the game among the best ever made?"], 37 | "text": "Twilight Princess was released to universal critical acclaim and commercial success. " 38 | "It received perfect scores from major publications such as 1UP.com, Computer and Video Games, " 39 | "Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators " 40 | "GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii " 41 | "version and scores of 95% and 96 for the GameCube version. 
GameTrailers in their review called " 42 | "it one of the greatest games ever created.", 43 | } 44 | 45 | for i in range(100000): 46 | yield qa_input 47 | 48 | 49 | if __name__ == "__main__": 50 | streaming_inference_example() 51 | -------------------------------------------------------------------------------- /examples/wordembedding_inference.py: -------------------------------------------------------------------------------- 1 | # fmt: off 2 | import logging 3 | from pathlib import Path 4 | 5 | 6 | from farm.data_handler.processor import InferenceProcessor 7 | from farm.infer import Inferencer 8 | from farm.modeling.adaptive_model import AdaptiveModel 9 | from farm.modeling.language_model import LanguageModel 10 | from farm.modeling.tokenization import Tokenizer 11 | from farm.utils import set_all_seeds, initialize_device_settings 12 | 13 | def embedding_extraction(): 14 | logging.basicConfig( 15 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 16 | datefmt="%m/%d/%Y %H:%M:%S", 17 | level=logging.INFO) 18 | 19 | ########################## 20 | ########## Settings 21 | ########################## 22 | set_all_seeds(seed=42) 23 | # load from a local path: 24 | #lang_model = Path("../saved_models/glove-german-uncased") 25 | # or through s3 26 | lang_model = "glove-german-uncased" #only glove or word2vec or converted fasttext (fixed vocab) embeddings supported 27 | do_lower_case = True 28 | use_amp = None 29 | device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp) 30 | 31 | # Create a InferenceProcessor 32 | tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case) 33 | processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128) 34 | 35 | # Create an AdaptiveModel 36 | language_model = LanguageModel.load(lang_model) 37 | model = AdaptiveModel( 38 | language_model=language_model, 39 | prediction_heads=[], 40 | embeds_dropout_prob=0.1, 41 | lm_output_types=["per_sequence"], 42 | device=device) 43 | 44 | 45 | # Create Inferencer for embedding extraction 46 | inferencer = Inferencer( 47 | model=model, 48 | processor=processor, 49 | task_type="embeddings" 50 | ) 51 | 52 | 53 | # Extract vectors 54 | basic_texts = [ 55 | {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"}, 56 | {"text": "Martin Müller spielt Handball in Berlin"}, 57 | ] 58 | 59 | result = inferencer.extract_vectors( 60 | dicts=basic_texts, 61 | extraction_strategy="cls_token", 62 | extraction_layer=-1 63 | ) 64 | print(result) 65 | inferencer.close_multiprocessing_pool() 66 | 67 | 68 | if __name__ == "__main__": 69 | embedding_extraction() 70 | 71 | # fmt: on 72 | -------------------------------------------------------------------------------- /experiments/lm_finetuning/finetune_sample_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "general": { 3 | "cache_dir": {"value": null, "default": "", "desc": "Path for storing pre-trained models downloaded from s3."}, 4 | "data_dir": {"value": null, "default": "data/lm_finetune_nips", "desc": "Input directory for downstream task. 
Should contain train + test (+ dev) files."}, 5 | "output_dir": {"value": null, "default": "saved_models", "desc": "Output directory where model predictions and checkpoints will be saved."}, 6 | 7 | "cuda": {"value": false, "default": true, "desc": "CUDA flag, uses CUDA if available."}, 8 | "local_rank": {"value": null, "default": -1, "desc": "If local_rank == -1 -> multiGPU mode on one machine, other values signal distributed computation across several nodes (apex install required)."}, 9 | "use_amp": {"value": null, "default": null, "desc": "Automatic mixed precision with APEX. Must be set to null to disable or to any optimisation level (see apex documentation). 'O1' is recommended."}, 10 | "seed": {"value": null, "default": 42, "desc": "Random seed for initializations."} 11 | }, 12 | 13 | "task": { 14 | "name": {"value": null, "default": "test_lm_finetuning", "desc": "Description."}, 15 | "output_mode": {"value": null, "default": "lm", "desc": "Used for data loading and evaluation. Choices: classification, ner, lm TBD"}, 16 | "prediction_head": {"value": null, "default": "lm", "desc": "Prediction head on top of vanilla LM Model, must correspond to task and data."}, 17 | 18 | "do_eval": {"value": null, "default": false, "desc": "Whether to run eval on the dev set."}, 19 | "do_train": {"value": null, "default": true, "desc": "Whether to run training. Can be used to only evaluate on an already trained model."}, 20 | 21 | "processor_name": {"value": null, "default": "BertStyleLMProcessor", "desc": "Class name of DataProcessor."}, 22 | "dev_split": {"value": null, "default": 0.1, "desc": "Split a dev set from the training set using dev_split as proportion."}, 23 | "train_filename": {"value": null, "default": "train.txt", "desc": "Filename for training."}, 24 | "dev_filename": {"value": null, "default": "dev.txt", "desc": "Filename for development."}, 25 | "test_filename": {"value": null, "default": "test.txt", "desc": "Filename for testing."} 26 | }, 27 | 28 | "parameter": { 29 | "model": {"value": "bert-base-cased", "default": null, "desc": "Bert pre-trained model selected in the list: bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese."}, 30 | "lower_case": {"value": null, "default": false, "desc": "Set to true if you are using an uncased model."}, 31 | "max_seq_len": {"value": null, "default": 128, "desc": "The maximum total input sequence length after WordPiece tokenization. 128 was too short for some texts"}, 32 | "balance_classes": {"value": null, "default": true, "desc": "Balance classes using weighted CrossEntropyLoss. 
Original train set from GermEval18 is skewed and the final evaluation is macro averaged, so we need to balance for optimal performance.."}, 33 | 34 | "num_train_epochs": {"value": null, "default": 1.0, "desc": "Total number of training epochs to perform."}, 35 | "batch_size": {"value": null, "default": 64, "desc": ""}, 36 | "gradient_accumulation_steps": {"value": null, "default": 1, "desc": "Number of updates steps (batches) to accumulate before performing a backward/update pass."} 37 | }, 38 | "optimizer": { 39 | "learning_rate": {"value": null, "default": 2e-5, "desc": "The learning rate for the optimizer."}, 40 | "optimizer_opts": {"value": null, "default": null, "desc": "Additional optimizer config."}, 41 | "schedule_opts": {"value": null, "default": {"name": "LinearWarmup", "warmup_proportion": 0.1}, "desc": "opts for lr schedule"} 42 | }, 43 | "logging": { 44 | "eval_every": {"value": null, "default": 30, "desc": "Steps per training loop (batches) required for evaluation on dev set. Set to 0 when you do not want to do evaluation on dev set during training."}, 45 | "mlflow_url": {"value": "https://public-mlflow.deepset.ai/", "default": null, "desc": "Mlflow server for tracking experiments (e.g. http://80.123.45.167:5000/)"}, 46 | "mlflow_nested": {"value": null, "default": true, "desc": "Nesting mlflow experiments. For doing multiple runs across a set of hyperparameters."}, 47 | 48 | "mlflow_experiment": {"value": "debug_lm_finetuning", "default": null, "desc": "Experiment name used for mlflow"}, 49 | "mlflow_run_name": {"value": "lm finetuning example", "default": null, "desc": "Name of the particular run for mlflow"} 50 | } 51 | } 52 | 53 | 54 | -------------------------------------------------------------------------------- /farm/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import torch.multiprocessing as mp 4 | from farm._version import __version__ 5 | 6 | logging.basicConfig( 7 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 8 | datefmt="%m/%d/%Y %H:%M:%S", 9 | level=logging.INFO, 10 | ) 11 | 12 | # reduce verbosity from transformers library 13 | logging.getLogger('transformers.configuration_utils').setLevel(logging.WARNING) 14 | 15 | # https://pytorch.org/docs/stable/multiprocessing.html#sharing-strategies 16 | if "file_descriptor" in mp.get_all_sharing_strategies(): 17 | import resource 18 | 19 | mp.set_sharing_strategy("file_descriptor") 20 | 21 | rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) 22 | # seting soft limit to hard limit (=rlimit[1]) minus a small amount to be safe 23 | resource.setrlimit(resource.RLIMIT_NOFILE, (rlimit[1]-512, rlimit[1])) 24 | -------------------------------------------------------------------------------- /farm/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.8.1-snapshot" 2 | -------------------------------------------------------------------------------- /farm/conversion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/farm/conversion/__init__.py -------------------------------------------------------------------------------- /farm/conversion/convert_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import argparse 22 | 23 | import torch 24 | from transformers.modeling_bert import ( 25 | BertConfig, 26 | BertForPreTraining, 27 | load_tf_weights_in_bert, 28 | ) 29 | 30 | 31 | def convert_tf_checkpoint_to_pytorch( 32 | tf_checkpoint_path, bert_config_file, pytorch_dump_path 33 | ): 34 | # Initialise PyTorch model 35 | config = BertConfig.from_json_file(bert_config_file) 36 | print("Building PyTorch model from configuration: {}".format(str(config))) 37 | model = BertForPreTraining(config) 38 | 39 | # Load weights from tf checkpoint 40 | load_tf_weights_in_bert(model, config, tf_checkpoint_path) 41 | 42 | # Save pytorch-model 43 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 44 | torch.save(model.state_dict(), pytorch_dump_path) 45 | 46 | 47 | if __name__ == "__main__": 48 | parser = argparse.ArgumentParser() 49 | ## Required parameters 50 | parser.add_argument( 51 | "--tf_checkpoint_path", 52 | default=None, 53 | type=str, 54 | required=True, 55 | help="Path the TensorFlow checkpoint path.", 56 | ) 57 | parser.add_argument( 58 | "--bert_config_file", 59 | default=None, 60 | type=str, 61 | required=True, 62 | help="The config json file corresponding to the pre-trained BERT model. \n" 63 | "This specifies the model architecture.", 64 | ) 65 | parser.add_argument( 66 | "--pytorch_dump_path", 67 | default=None, 68 | type=str, 69 | required=True, 70 | help="Path to the output PyTorch model.", 71 | ) 72 | args = parser.parse_args() 73 | convert_tf_checkpoint_to_pytorch( 74 | args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path 75 | ) 76 | -------------------------------------------------------------------------------- /farm/data_handler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/farm/data_handler/__init__.py -------------------------------------------------------------------------------- /farm/data_handler/dataloader.py: -------------------------------------------------------------------------------- 1 | from math import ceil 2 | 3 | from torch.utils.data import DataLoader, Dataset, Sampler 4 | import torch 5 | 6 | 7 | class NamedDataLoader(DataLoader): 8 | """ 9 | A modified version of the PyTorch DataLoader that returns a dictionary where the key is 10 | the name of the tensor and the value is the tensor itself. 
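    A minimal usage sketch (illustrative only; the tensor names below are assumptions and must
    match the order of the tensors in the wrapped dataset):

        loader = NamedDataLoader(dataset=dataset, batch_size=32,
                                 tensor_names=["input_ids", "padding_mask", "segment_ids"])
        batch = next(iter(loader))  # dict of stacked tensors, e.g. batch["input_ids"]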
11 | """ 12 | 13 | def __init__(self, dataset, batch_size, sampler=None, tensor_names=None, num_workers=0, pin_memory=False): 14 | """ 15 | :param dataset: The dataset that will be wrapped by this NamedDataLoader 16 | :type dataset: Dataset 17 | :param sampler: The sampler used by the NamedDataLoader to choose which samples to include in the batch 18 | :type sampler: Sampler 19 | :param batch_size: The size of the batch to be returned by the NamedDataLoader 20 | :type batch_size: int 21 | :param tensor_names: The names of the tensor, in the order that the dataset returns them in. 22 | :type tensor_names: list 23 | :param num_workers: number of workers to use for the DataLoader 24 | :type num_workers: int 25 | :param pin_memory: argument for Data Loader to use page-locked memory for faster transfer of data to GPU 26 | :type pin_memory: bool 27 | """ 28 | 29 | def collate_fn(batch): 30 | """ 31 | A custom collate function that formats the batch as a dictionary where the key is 32 | the name of the tensor and the value is the tensor itself 33 | """ 34 | 35 | if type(dataset).__name__ == "_StreamingDataSet": 36 | _tensor_names = dataset.tensor_names 37 | else: 38 | _tensor_names = tensor_names 39 | 40 | if type(batch[0]) == list: 41 | batch = batch[0] 42 | 43 | assert len(batch[0]) == len( 44 | _tensor_names 45 | ), "Dataset contains {} tensors while there are {} tensor names supplied: {}".format( 46 | len(batch[0]), len(_tensor_names), _tensor_names 47 | ) 48 | lists_temp = [[] for _ in range(len(_tensor_names))] 49 | ret = dict(zip(_tensor_names, lists_temp)) 50 | 51 | for example in batch: 52 | for name, tensor in zip(_tensor_names, example): 53 | ret[name].append(tensor) 54 | 55 | for key in ret: 56 | ret[key] = torch.stack(ret[key]) 57 | 58 | return ret 59 | 60 | super(NamedDataLoader, self).__init__( 61 | dataset=dataset, 62 | sampler=sampler, 63 | batch_size=batch_size, 64 | collate_fn=collate_fn, 65 | pin_memory=pin_memory, 66 | num_workers=num_workers, 67 | ) 68 | 69 | def __len__(self): 70 | if type(self.dataset).__name__ == "_StreamingDataSet": 71 | num_samples = len(self.dataset) 72 | num_batches = ceil(num_samples / self.dataset.batch_size) 73 | return num_batches 74 | else: 75 | return super().__len__() 76 | 77 | 78 | def covert_dataset_to_dataloader(dataset, sampler, batch_size): 79 | """ 80 | Wraps a PyTorch Dataset with a DataLoader. 81 | 82 | :param dataset: Dataset to be wrapped. 83 | :type dataset: Dataset 84 | :param sampler: PyTorch sampler used to pick samples in a batch. 85 | :type sampler: Sampler 86 | :param batch_size: Number of samples in the batch. 87 | :return: A DataLoader that wraps the input Dataset. 88 | """ 89 | sampler_initialized = sampler(dataset) 90 | data_loader = DataLoader( 91 | dataset, sampler=sampler_initialized, batch_size=batch_size 92 | ) 93 | return data_loader 94 | -------------------------------------------------------------------------------- /farm/data_handler/dataset.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable 2 | import numpy as np 3 | import numbers 4 | import logging 5 | import torch 6 | from torch.utils.data import Dataset, ConcatDataset, TensorDataset 7 | from farm.utils import flatten_list 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def convert_features_to_dataset(features): 13 | """ 14 | Converts a list of feature dictionaries (one for each sample) into a PyTorch Dataset. 15 | 16 | :param features: A list of dictionaries. 
Each dictionary corresponds to one sample. Its keys are the 17 | names of the type of feature and the keys are the features themselves. 18 | :Return: a Pytorch dataset and a list of tensor names. 19 | """ 20 | # features can be an empty list in cases where down sampling occurs (e.g. Natural Questions 21 | # downsamples instances of is_impossible) 22 | if len(features) == 0: 23 | return None, None 24 | tensor_names = list(features[0].keys()) 25 | all_tensors = [] 26 | for t_name in tensor_names: 27 | # Conversion of floats 28 | if t_name == 'regression_label_ids': 29 | cur_tensor = torch.tensor([sample[t_name] for sample in features], dtype=torch.float32) 30 | else: 31 | try: 32 | # Checking weather a non-integer will be silently converted to torch.long 33 | check = features[0][t_name] 34 | if isinstance(check, numbers.Number): 35 | base = check 36 | # extract a base variable from a nested lists or tuples 37 | elif isinstance(check, list): 38 | base = list(flatten_list(check))[0] 39 | # extract a base variable from numpy arrays 40 | else: 41 | base = check.ravel()[0] 42 | if not np.issubdtype(type(base), np.integer): 43 | logger.warning(f"Problem during conversion to torch tensors:\n" 44 | f"A non-integer value for feature '{t_name}' with a value of: " 45 | f"'{base}' will be converted to a torch tensor of dtype long.") 46 | except: 47 | logger.warning(f"Could not determine type for feature '{t_name}'. " 48 | "Converting now to a tensor of default type long.") 49 | 50 | # Convert all remaining python objects to torch long tensors 51 | cur_tensor = torch.tensor([sample[t_name] for sample in features], dtype=torch.long) 52 | 53 | all_tensors.append(cur_tensor) 54 | 55 | dataset = TensorDataset(*all_tensors) 56 | return dataset, tensor_names 57 | 58 | 59 | class ConcatTensorDataset(ConcatDataset): 60 | r"""ConcatDataset of only TensorDatasets which supports getting slices. 61 | 62 | This dataset allows the use of slices, e.g. ds[2:4] if all concatenated 63 | datasets are either TensorDatasets or Subset or other ConcatTensorDataset instances 64 | which eventually contain only TensorDataset instances. If no slicing is needed, 65 | this class works exactly like torch.utils.data.ConcatDataset and can concatenate arbitrary 66 | (not just TensorDataset) datasets. 
67 | 68 | Args: 69 | datasets (sequence): List of datasets to be concatenated 70 | """ 71 | def __init__(self, datasets: Iterable[Dataset]) -> None: 72 | super(ConcatTensorDataset, self).__init__(datasets) 73 | 74 | def __getitem__(self, idx): 75 | if isinstance(idx, slice): 76 | rows = [super(ConcatTensorDataset, self).__getitem__(i) for i in range(self.__len__())[idx]] 77 | return tuple(map(torch.stack, zip(*rows))) 78 | elif isinstance(idx, (list, np.ndarray)): 79 | rows = [super(ConcatTensorDataset, self).__getitem__(i) for i in idx] 80 | return tuple(map(torch.stack, zip(*rows))) 81 | else: 82 | return super(ConcatTensorDataset, self).__getitem__(idx) 83 | -------------------------------------------------------------------------------- /farm/data_handler/inputs.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | 3 | 4 | class Question: 5 | def __init__(self, text: str, uid: str=None): 6 | self.text = text 7 | self.uid = uid 8 | 9 | def to_dict(self): 10 | ret = {"question": self.text, 11 | "id": self.uid, 12 | "answers": []} 13 | return ret 14 | 15 | 16 | class QAInput: 17 | def __init__(self, doc_text: str, questions: Union[List[Question], Question]): 18 | self.doc_text = doc_text 19 | if type(questions) == Question: 20 | self.questions = [questions] 21 | else: 22 | self.questions = questions 23 | 24 | def to_dict(self): 25 | questions = [q.to_dict() for q in self.questions] 26 | ret = {"qas": questions, 27 | "context": self.doc_text} 28 | return ret 29 | 30 | -------------------------------------------------------------------------------- /farm/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/farm/evaluation/__init__.py -------------------------------------------------------------------------------- /farm/evaluation/msmarco_passage_farm.py: -------------------------------------------------------------------------------- 1 | from farm.evaluation.msmarco_passage_official import compute_metrics_from_files 2 | import os 3 | import pandas as pd 4 | 5 | 6 | def msmarco_evaluation(preds_file, dev_file, qrels_file, output_file): 7 | """ 8 | Performs official msmarco passage ranking evaluation (https://github.com/microsoft/MSMARCO-Passage-Ranking) 9 | on a file containing the is_relevent prediction scores. 
It will convert the input file (qid, pid, score) 10 | into the format expected by the official eval function (compute_metrics_from_files) 11 | 12 | :param predictions_filename: File where each line is the is_relevant prediction score 13 | :param dev_filename: File in format qid, query, pid, passage, label 14 | :param qrels_filename: File in the format qid, pid when is_relevant=1 15 | :param output_file: File to write to in format qid, pid, rank 16 | 17 | :return: 18 | """ 19 | 20 | # Initialize files 21 | preds_scores = [float(l) for l in open(preds_file)] 22 | dev_lines = [l for i,l in enumerate(open(dev_file)) if i != 0] 23 | output = open(output_file, "w") 24 | 25 | # Populate a dict with all qid/pid/score triples 26 | results = dict() 27 | for i, (score, line) in enumerate(zip(preds_scores, dev_lines)): 28 | if i == 0: 29 | continue 30 | qid, _, pid, _, _ = line.split("\t") 31 | if qid not in results: 32 | results[qid] = [] 33 | results[qid].append((pid, score)) 34 | 35 | # ########## 36 | # ### NOTE: This block is to generate a view that is interpretable when debugging 37 | # ########## 38 | # interpretable = dict() 39 | # for i, (score, line) in enumerate(zip(preds_scores, dev_lines)): 40 | # if i == 0: 41 | # continue 42 | # _, query, _, passage, label = line.split("\t") 43 | # if query not in interpretable: 44 | # interpretable[query] = [] 45 | # interpretable[query].append((passage, score, label[:-1])) 46 | # for query in interpretable: 47 | # sorted_scores = sorted(interpretable[query], key= lambda x: x[1], reverse=True)[:10] 48 | # results[query] = sorted_scores 49 | # relevant = [] 50 | # for query in interpretable: 51 | # for (passage, score, label) in interpretable[query]: 52 | # if label == "1": 53 | # relevant.append((passage, score)) 54 | # rel_scores = [x[1] for x in relevant] 55 | # irrelevant = [] 56 | # for query in interpretable: 57 | # for (passage, score, label) in interpretable[query]: 58 | # if label == "0": 59 | # irrelevant.append((passage, score)) 60 | # irrel_scores = [x[1] for x in irrelevant] 61 | # print() 62 | 63 | # Sort by scores and take top 10 64 | for qid in list(results): 65 | sorted_scores = sorted(results[qid], key= lambda x: x[1], reverse=True)[:10] 66 | results[qid] = [(pid, i+1) for i, (pid, _) in enumerate(sorted_scores)] 67 | 68 | # Write to file 69 | for qid in list(results): 70 | for (pid, rank) in results[qid]: 71 | output.write(f"{qid}\t{pid}\t{rank}\n") 72 | output.close() 73 | 74 | curr_qids = list(results) 75 | df = pd.read_csv(qrels_file, sep="\t", header=None) 76 | df = df.loc[df[0].isin(curr_qids)] 77 | df.to_csv("tmp", sep="\t", header=None, index=None) 78 | 79 | path_to_reference = "tmp" 80 | path_to_candidate = output_file 81 | metrics = compute_metrics_from_files(path_to_reference, path_to_candidate) 82 | print('#####################') 83 | for metric in sorted(metrics): 84 | print('{}: {}'.format(metric, metrics[metric])) 85 | print('#####################') 86 | os.remove(path_to_reference) 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /farm/inference_rest_api.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | from flask import Flask, request, make_response 7 | from flask_cors import CORS 8 | from flask_restplus import Api, Resource 9 | 10 | from farm.infer import Inferencer 11 | 12 | logger = logging.getLogger(__name__) 13 | logging.basicConfig( 
14 | format="%(asctime)s %(levelname)-8s %(message)s", 15 | level="INFO", 16 | datefmt="%Y-%m-%d %H:%M:%S", 17 | ) 18 | 19 | MODELS_DIRS = ["saved_models", "base_models"] 20 | 21 | model_paths = [] 22 | for model_dir in MODELS_DIRS: 23 | path = Path(model_dir) 24 | if path.is_dir(): 25 | models = [f for f in path.iterdir() if f.is_dir()] 26 | model_paths.extend(models) 27 | 28 | INFERENCERS = {} 29 | for idx, model_dir in enumerate(model_paths): 30 | # refer to examples/inferencer_multiprocessing.py for using multiprocessing in the Inferencers. 31 | INFERENCERS[idx + 1] = Inferencer.load(str(model_dir), num_processes=0) 32 | 33 | app = Flask(__name__) 34 | CORS(app) 35 | api = Api(app, debug=True, validate=True, version="1.0", title="FARM NLP APIs") 36 | app.config["JSON_SORT_KEYS"] = True 37 | app.config["RESTPLUS_VALIDATE"] = True 38 | 39 | 40 | @api.route("/models") 41 | class ModelListEndpoint(Resource): 42 | def get(self): 43 | resp = [] 44 | 45 | for idx, model in INFERENCERS.items(): 46 | 47 | #TODO UI still relies on the old prediction_type attribute, but we should switch this to inferencer.task_type 48 | prediction_type = model.model.prediction_heads[0].model_type 49 | 50 | _res = { 51 | "id": idx, 52 | "name": model.name, 53 | "prediction_type": prediction_type, 54 | "language": model.language, 55 | } 56 | resp.append(_res) 57 | 58 | return resp 59 | 60 | 61 | class NumpyEncoder(json.JSONEncoder): 62 | def default(self, obj): 63 | if isinstance(obj, np.ndarray): 64 | return obj.tolist() 65 | if isinstance(obj, np.float32): 66 | return str(obj) 67 | return json.JSONEncoder.default(self, obj) 68 | 69 | 70 | @api.representation("application/json") 71 | def resp_json(data, code, headers=None): 72 | resp = make_response(json.dumps(data, cls=NumpyEncoder), code) 73 | resp.headers.extend(headers or {}) 74 | return resp 75 | 76 | 77 | @api.route("/models//inference") 78 | class InferenceEndpoint(Resource): 79 | def post(self, model_id): 80 | model = INFERENCERS.get(model_id, None) 81 | if not model: 82 | return "Model not found", 404 83 | 84 | dicts = request.get_json().get("input", None) 85 | if not dicts: 86 | return {} 87 | results = model.inference_from_dicts(dicts=dicts) 88 | return results[0] 89 | 90 | 91 | if __name__ == "__main__": 92 | app.run(host="0.0.0.0") 93 | -------------------------------------------------------------------------------- /farm/modeling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/farm/modeling/__init__.py -------------------------------------------------------------------------------- /farm/visual/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/farm/visual/__init__.py -------------------------------------------------------------------------------- /farm/visual/ascii/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/farm/visual/ascii/__init__.py -------------------------------------------------------------------------------- /farm/visual/ascii/text.py: -------------------------------------------------------------------------------- 1 | 2 | FARM_BLOCKS = """ 3 | .----------------. .----------------. .----------------. .----------------. 
4 | | .--------------. || .--------------. || .--------------. || .--------------. | 5 | | | _________ | || | __ | || | _______ | || | ____ ____ | | 6 | | | |_ ___ | | || | / \ | || | |_ __ \ | || ||_ \ / _|| | 7 | | | | |_ \_| | || | / /\ \ | || | | |__) | | || | | \/ | | | 8 | | | | _| | || | / ____ \ | || | | __ / | || | | |\ /| | | | 9 | | | _| |_ | || | _/ / \ \_ | || | _| | \ \_ | || | _| |_\/_| |_ | | 10 | | | |_____| | || ||____| |____|| || | |____| |___| | || ||_____||_____|| | 11 | | | | || | | || | | || | | | 12 | | '--------------' || '--------------' || '--------------' || '--------------' | 13 | '----------------' '----------------' '----------------' '----------------' 14 | """ 15 | 16 | FARM_DOOM = """ 17 | __ 18 | / _| 19 | | |_ __ _ _ __ _ __ ___ 20 | | _/ _` | '__| '_ ` _ \ 21 | | || (_| | | | | | | | | 22 | |_| \__,_|_| |_| |_| |_| 23 | """ 24 | 25 | FARM_MODULAR = """ 26 | _______ _______ ______ __ __ 27 | | || _ || _ | | |_| | 28 | | ___|| |_| || | || | | 29 | | |___ | || |_||_ | | 30 | | ___|| || __ || | 31 | | | | _ || | | || ||_|| | 32 | |___| |__| |__||___| |_||_| |_| 33 | """ 34 | 35 | FARM_COLOSSAL = """ 36 | .d888 37 | d88P" 38 | 888 39 | 888888 8888b. 888d888 88888b.d88b. 40 | 888 "88b 888P" 888 "888 "88b 41 | 888 .d888888 888 888 888 888 42 | 888 888 888 888 888 888 888 43 | 888 "Y888888 888 888 888 888 44 | """ 45 | 46 | FARM_DIET_COLA = """ 47 | .-._.---' 48 | (_) / 49 | /--..-. ).--.. .-. .-. 50 | / ( | / )/ ) ) 51 | .-/ `-'-'/ '/ / ( 52 | (_/ `-' 53 | """ 54 | 55 | WELCOME = """ 56 | . 57 | / / 58 | `) ( .-. / .-. .-._.. .-. .-. .-. ---/---.-._. 59 | / . )./.-'_ / ( ( ) )/ ) )./.-'_ / ( ) 60 | (_.' `-' (__.'_/_.-`---'`-' '/ / ( (__.' / `-' 61 | `-' """ 62 | 63 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # basics 2 | setuptools 3 | wheel 4 | # PyTorch 5 | # Temp. disabled the next line as it gets currently resolved to https://download.pytorch.org/whl/rocm3.8/torch-1.7.1%2Brocm3.8-cp38-cp38-linux_x86_64.whl 6 | #--find-links=https://download.pytorch.org/whl/torch_stable.html 7 | torch>1.5,<1.10 8 | # progress bars in model download and training scripts 9 | tqdm 10 | # Accessing files from S3 directly. 11 | boto3 12 | # Used for downloading models over HTTP 13 | requests 14 | # Scipy & sklearn for stats in run_classifier 15 | scipy>=1.3.2 16 | sklearn 17 | # Metrics or logging related 18 | seqeval 19 | mlflow<=1.13.1 20 | # huggingface repository 21 | transformers==4.7.0 22 | #sentence transformers 23 | sentence-transformers 24 | # accessing dictionary elements with dot notation 25 | dotmap 26 | # for inference-rest-apis 27 | Werkzeug==0.16.1 28 | flask 29 | flask-restplus 30 | flask-cors 31 | dill # pickle extension for (de-)serialization 32 | # optional for inference 33 | #fasttext==0.9.1 34 | # Inference with ONNX models. Install onnxruntime-gpu for Inference on GPUs 35 | #onnxruntime 36 | #onnxruntime_tools 37 | psutil 38 | sentencepiece 39 | -------------------------------------------------------------------------------- /run_all_experiments.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Downstream runner for all experiments in specified config files.""" 15 | 16 | from pathlib import Path 17 | from farm.experiment import run_experiment, load_experiments 18 | 19 | 20 | def main(): 21 | config_files = [ 22 | Path("experiments/ner/conll2003_de_config.json"), 23 | Path("experiments/ner/conll2003_en_config.json"), 24 | Path("experiments/ner/germEval14_config.json"), 25 | Path("experiments/text_classification/germEval18Fine_config.json"), 26 | Path("experiments/text_classification/germEval18Coarse_config.json"), 27 | Path("experiments/text_classification/gnad_config.json"), 28 | Path("experiments/text_classification/cola_config.json"), 29 | Path("experiments/qa/squad20_config.json"), 30 | ] 31 | 32 | for conf_file in config_files: 33 | experiments = load_experiments(conf_file) 34 | for experiment in experiments: 35 | run_experiment(experiment) 36 | 37 | if __name__ == "__main__": 38 | main() 39 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = readme.rst -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import re 4 | from io import open 5 | 6 | from setuptools import find_packages, setup 7 | 8 | 9 | def parse_requirements(filename): 10 | """ 11 | Parse a requirements pip file returning the list of required packages. It exclude commented lines and --find-links directives. 12 | 13 | Args: 14 | filename: pip requirements requirements 15 | 16 | Returns: 17 | list of required package with versions constraints 18 | 19 | """ 20 | with open(filename) as file: 21 | parsed_requirements = file.read().splitlines() 22 | parsed_requirements = [line.strip() 23 | for line in parsed_requirements 24 | if not ((line.strip()[0] == "#") or line.strip().startswith('--find-links'))] 25 | return parsed_requirements 26 | 27 | 28 | def get_dependency_links(filename): 29 | """ 30 | Parse a requirements pip file looking for the --find-links directive. 
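For illustration (an assumed example, not taken from the original docs): a requirements line such as
``--find-links=https://download.pytorch.org/whl/torch_stable.html`` would be returned as
``["https://download.pytorch.org/whl/torch_stable.html"]``.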
31 | Args: 32 | filename: pip requirements requirements 33 | 34 | Returns: 35 | list of find-links's url 36 | """ 37 | with open(filename) as file: 38 | parsed_requirements = file.read().splitlines() 39 | dependency_links = list() 40 | for line in parsed_requirements: 41 | line = line.strip() 42 | if line.startswith('--find-links'): 43 | dependency_links.append(line.split('=')[1]) 44 | return dependency_links 45 | 46 | 47 | dependency_links = get_dependency_links('requirements.txt') 48 | parsed_requirements = parse_requirements('requirements.txt') 49 | 50 | 51 | def versionfromfile(*filepath): 52 | infile = os.path.join(*filepath) 53 | with open(infile) as fp: 54 | version_match = re.search( 55 | r"^__version__\s*=\s*['\"]([^'\"]*)['\"]", fp.read(), re.M 56 | ) 57 | if version_match: 58 | return version_match.group(1) 59 | raise RuntimeError("Unable to find version string in {}.".format(infile)) 60 | 61 | 62 | here = os.path.abspath(os.path.dirname(__file__)) 63 | 64 | 65 | setup( 66 | name="farm", 67 | version=versionfromfile(here, "farm", "_version.py"), 68 | author="Timo Moeller, Malte Pietsch, Branden Chan, Tanay Soni, Bogdan Kostic, Julian Risch", 69 | author_email="timo.moeller@deepset.ai", 70 | description="Framework for finetuning and evaluating transformer based language models", 71 | long_description=open("readme.rst", "r", encoding="utf-8").read(), 72 | long_description_content_type="text/x-rst", 73 | keywords="BERT NLP deep-learning language-model transformer qa question-answering transfer-learning", 74 | license="Apache", 75 | url="https://github.com/deepset-ai/FARM", 76 | download_url="https://github.com/deepset-ai/FARM/archive/v"+versionfromfile(here, "farm", "_version.py")+".tar.gz", 77 | packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), 78 | dependency_links=dependency_links, 79 | install_requires=parsed_requirements, 80 | python_requires=">=3.6.0", 81 | extras_require={ 82 | "fasttext": ["fasttext==0.9.1"], 83 | "onnx": ["onnxruntime"], 84 | }, 85 | tests_require=["pytest"], 86 | classifiers=[ 87 | "Intended Audience :: Science/Research", 88 | "License :: OSI Approved :: Apache Software License", 89 | "Programming Language :: Python :: 3", 90 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 91 | ], 92 | ) 93 | -------------------------------------------------------------------------------- /test/benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Inference Speed Benchmarks 2 | 3 | FARM provides an automated speed benchmarking pipeline with options to parameterize the benchmarks with batch_size, 4 | max sequence length, document size, and so on. 5 | 6 | The pipeline is implemented using [pytest-benchmark](https://github.com/ionelmc/pytest-benchmark). The warmup/iterations for each benchmark are configurable and the 7 | results can be exported to a JSON file. 8 | 9 | 10 | 11 | ## Question Answering 12 | 13 | The `benchmarks/question_answering.py` file contains tests for inference with PyTorch(`test_question_answering_pytorch`) 14 | and ONNXRuntime(`test_question_answering_onnx`). 15 | 16 | The benchmarks are available [here](https://docs.google.com/spreadsheets/d/1ak9Cxj1zcNBDtjf7qn2j_ydKDDzpBgWiyJ7cO-7BPvA/edit?usp=sharing). 17 | 18 | ### Running Benchmark with Docker 19 | 20 | #### GPU 21 | For running benchmark on a GPU, bash into the Docker Image using ```docker run -it --gpus all deepset/farm-onnxruntime-gpu:0.4.3 bash```. 
22 | Once inside the container, execute ```cd FARM/test && pytest benchmarks/question_answering.py -k test_question_answering_pytorch --use_gpu --benchmark-json result.json```. 23 | 24 | #### CPU 25 | Bash into the Docker container with ```docker run -it deepset/farm-inference-api:0.4.3 bash``` and then execute 26 | ```cd test && pytest benchmarks/question_answering.py -k test_question_answering_pytorch --benchmark-json result.json```. 27 | 28 | ### Exporting results in CSV format 29 | 30 | The results of benchmarks are exported to a `result.json` file in the `test` folder. To convert results to csv format, 31 | execute `python benchmarks/convert_result_to_csv.py`. -------------------------------------------------------------------------------- /test/benchmarks/conftest.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from farm.infer import Inferencer 6 | from farm.modeling.adaptive_model import AdaptiveModel 7 | 8 | 9 | @pytest.fixture(scope="session") 10 | def onnx_adaptive_model_qa(use_gpu, num_processes, model_name_or_path="deepset/bert-base-cased-squad2"): 11 | if (Path(model_name_or_path) / "model.onnx").is_file(): # load model directly if in ONNX format 12 | onnx_model_path = model_name_or_path 13 | else: # convert to ONNX format 14 | onnx_model_path = Path("benchmarks/onnx-export") 15 | model = AdaptiveModel.convert_from_transformers( 16 | model_name_or_path, device="cpu", task_type="question_answering" 17 | ) 18 | model.convert_to_onnx(onnx_model_path) 19 | 20 | try: 21 | model = Inferencer.load( 22 | onnx_model_path, task_type="question_answering", batch_size=1, num_processes=num_processes, gpu=use_gpu 23 | ) 24 | yield model 25 | finally: 26 | if num_processes != 0: 27 | model.close_multiprocessing_pool() 28 | 29 | -------------------------------------------------------------------------------- /test/benchmarks/convert_result_to_csv.py: -------------------------------------------------------------------------------- 1 | import json 2 | import csv 3 | 4 | with open("result.json") as f: 5 | results = json.load(f) 6 | 7 | with open("result.csv", "w") as f: 8 | fieldnames = list(results["benchmarks"][0]["params"].keys()) 9 | fieldnames.append("time") 10 | writer = csv.DictWriter(f, fieldnames=fieldnames) 11 | writer.writeheader() 12 | 13 | for benchmark in results["benchmarks"]: 14 | writer.writerow({"time": benchmark["stats"]["total"], **benchmark["params"]}) 15 | -------------------------------------------------------------------------------- /test/benchmarks/question_answering.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pytest 4 | import torch 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | @pytest.mark.parametrize("max_seq_len", [128, 256, 384]) 10 | @pytest.mark.parametrize("batch_size", [1, 4, 16, 64]) 11 | @pytest.mark.parametrize("document_size", [10_000, 100_000]) 12 | @pytest.mark.parametrize("num_processes", [0], scope="session") 13 | def test_question_answering_pytorch(adaptive_model_qa, benchmark, max_seq_len, batch_size, use_gpu, document_size): 14 | if use_gpu and not torch.cuda.is_available(): 15 | pytest.skip("Skipping benchmarking on GPU as it not available.") 16 | 17 | if not use_gpu and document_size > 10_000: 18 | pytest.skip("Document size is large for CPU") 19 | 20 | with open("benchmarks/sample_file.txt") as f: 21 | context = f.read()[:document_size] 22 | QA_input = [{"qas": ["When 
were the first traces of Human life found in France?"], "context": context}] 23 | 24 | adaptive_model_qa.batch_size = batch_size 25 | adaptive_model_qa.max_seq_len = max_seq_len 26 | benchmark.pedantic( 27 | target=adaptive_model_qa.inference_from_dicts, args=(QA_input,), warmup_rounds=1, iterations=3, 28 | ) 29 | 30 | 31 | @pytest.mark.parametrize("max_seq_len", [128, 256, 384]) 32 | @pytest.mark.parametrize("batch_size", [1, 4, 16, 64]) 33 | @pytest.mark.parametrize("document_size", [10_000, 100_000]) 34 | @pytest.mark.parametrize("num_processes", [0], scope="session") 35 | def test_question_answering_onnx(onnx_adaptive_model_qa, benchmark, max_seq_len, batch_size, use_gpu, document_size): 36 | if use_gpu and not torch.cuda.is_available(): 37 | pytest.skip("Skipping benchmarking on GPU as it not available.") 38 | 39 | if not use_gpu and document_size > 10_000: 40 | pytest.skip("Document size is large for CPU") 41 | 42 | with open("benchmarks/sample_file.txt") as f: 43 | context = f.read()[:document_size] 44 | QA_input = [{"qas": ["When were the first traces of Human life found in France?"], "context": context}] 45 | 46 | onnx_adaptive_model_qa.batch_size = batch_size 47 | onnx_adaptive_model_qa.max_seq_len = max_seq_len 48 | benchmark.pedantic( 49 | target=onnx_adaptive_model_qa.inference_from_dicts, args=(QA_input,), warmup_rounds=1, iterations=3 50 | ) 51 | -------------------------------------------------------------------------------- /test/benchmarks/question_answering_components.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 34 | 35 | 36 |
37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /test/benchmarks/samples/question_answering_questions.txt: -------------------------------------------------------------------------------- 1 | When were the first traces of Human life found in France? 2 | How many pretrained models are available in Transformers? 3 | What does Transformers provide? 4 | Transformers provides interoperability between which frameworks? -------------------------------------------------------------------------------- /test/conftest.py: -------------------------------------------------------------------------------- 1 | import psutil 2 | import pytest 3 | from pathlib import Path 4 | 5 | from farm.data_handler.data_silo import DataSilo 6 | from farm.data_handler.processor import SquadProcessor 7 | from farm.modeling.adaptive_model import AdaptiveModel 8 | from farm.modeling.language_model import LanguageModel 9 | from farm.modeling.optimization import initialize_optimizer 10 | from farm.modeling.prediction_head import QuestionAnsweringHead 11 | from farm.modeling.tokenization import Tokenizer 12 | from farm.train import Trainer 13 | from farm.utils import set_all_seeds, initialize_device_settings 14 | from farm.infer import Inferencer, QAInferencer 15 | 16 | 17 | def pytest_addoption(parser): 18 | """ 19 | Hook to pass pytest-fixture arguments to tests from the command line. 20 | """ 21 | parser.addoption("--use_gpu", action="store_true", default=False) 22 | 23 | 24 | def pytest_generate_tests(metafunc): 25 | """ 26 | This method gets called for all test cases. Here, we set the arguments supplied in pytest_addoption(). 27 | """ 28 | option_value = metafunc.config.option.use_gpu 29 | if 'use_gpu' in metafunc.fixturenames: 30 | if option_value: 31 | metafunc.parametrize("use_gpu", [True], scope="session") 32 | else: 33 | metafunc.parametrize("use_gpu", [False], scope="session") 34 | 35 | 36 | def pytest_collection_modifyitems(items): 37 | for item in items: 38 | if "conversion" in item.nodeid: 39 | item.add_marker(pytest.mark.conversion) 40 | 41 | 42 | @pytest.fixture(scope="module") 43 | def adaptive_model_qa(use_gpu, num_processes): 44 | """ 45 | PyTest Fixture for a Question Answering Inferencer based on PyTorch. 
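A hypothetical way a test could consume this fixture (sketch only, not an existing test in the
suite; the question and context strings are invented for the example)::

    def test_qa_smoke(adaptive_model_qa):
        qa_input = [{"qas": ["Who lives in Berlin?"],
                     "context": "My name is Carla and I live in Berlin."}]
        # inference_from_dicts is the same entry point used by the benchmarks and REST API
        result = adaptive_model_qa.inference_from_dicts(dicts=qa_input)
        assert len(result) > 0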
46 | """ 47 | try: 48 | model = Inferencer.load( 49 | "deepset/bert-base-cased-squad2", 50 | task_type="question_answering", 51 | batch_size=16, 52 | num_processes=num_processes, 53 | gpu=use_gpu, 54 | ) 55 | yield model 56 | finally: 57 | if num_processes != 0: 58 | # close the pool 59 | # we pass join=True to wait for all sub processes to close 60 | # this is because below we want to test if all sub-processes 61 | # have exited 62 | model.close_multiprocessing_pool(join=True) 63 | 64 | # check if all workers (sub processes) are closed 65 | current_process = psutil.Process() 66 | children = current_process.children() 67 | assert len(children) == 0 68 | 69 | 70 | @pytest.fixture(scope="module") 71 | def bert_base_squad2(request): 72 | model = QAInferencer.load( 73 | "deepset/minilm-uncased-squad2", 74 | task_type="question_answering", 75 | batch_size=4, 76 | num_processes=0, 77 | multithreading_rust=False, 78 | use_fast=True # TODO parametrize this to test slow as well 79 | ) 80 | return model 81 | 82 | # TODO add other model types (roberta, xlm-r, albert) here as well 83 | 84 | @pytest.fixture(scope="module") 85 | def distilbert_squad(request): 86 | set_all_seeds(seed=42) 87 | device, n_gpu = initialize_device_settings(use_cuda=False) 88 | batch_size = 2 89 | n_epochs = 1 90 | evaluate_every = 4 91 | base_LM_model = "distilbert-base-uncased" 92 | 93 | tokenizer = Tokenizer.load( 94 | pretrained_model_name_or_path=base_LM_model, 95 | do_lower_case=True, 96 | use_fast=True # TODO parametrize this to test slow as well 97 | ) 98 | label_list = ["start_token", "end_token"] 99 | processor = SquadProcessor( 100 | tokenizer=tokenizer, 101 | max_seq_len=20, 102 | doc_stride=10, 103 | max_query_length=6, 104 | train_filename="train-sample.json", 105 | dev_filename="dev-sample.json", 106 | test_filename=None, 107 | data_dir=Path("samples/qa"), 108 | label_list=label_list, 109 | metric="squad" 110 | ) 111 | 112 | data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1) 113 | language_model = LanguageModel.load(base_LM_model) 114 | prediction_head = QuestionAnsweringHead() 115 | model = AdaptiveModel( 116 | language_model=language_model, 117 | prediction_heads=[prediction_head], 118 | embeds_dropout_prob=0.1, 119 | lm_output_types=["per_token"], 120 | device=device, 121 | ) 122 | 123 | model, optimizer, lr_schedule = initialize_optimizer( 124 | model=model, 125 | learning_rate=2e-5, 126 | #optimizer_opts={'name': 'AdamW', 'lr': 2E-05}, 127 | n_batches=len(data_silo.loaders["train"]), 128 | n_epochs=n_epochs, 129 | device=device 130 | ) 131 | trainer = Trainer( 132 | model=model, 133 | optimizer=optimizer, 134 | data_silo=data_silo, 135 | epochs=n_epochs, 136 | n_gpu=n_gpu, 137 | lr_schedule=lr_schedule, 138 | evaluate_every=evaluate_every, 139 | device=device 140 | ) 141 | trainer.train() 142 | 143 | return model, processor 144 | 145 | -------------------------------------------------------------------------------- /test/create_testdata.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import os 5 | import pprint 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | def squad_subsample(): 10 | if not os.path.exists("samples/qa"): 11 | os.makedirs("samples/qa") 12 | 13 | with open('../data/squad20/dev-v2.0.json') as json_file: 14 | data = json.load(json_file) 15 | 16 | ss = data["data"][0]["paragraphs"][:1] 17 | sample = {} 18 | sample["data"] = [{"paragraphs": ss}] 19 | # just creating 
same train and dev files 20 | with open('samples/qa/dev-sample.json', 'w') as outfile: 21 | json.dump(sample, outfile) 22 | with open('samples/qa/train-sample.json', 'w') as outfile: 23 | json.dump(sample, outfile) 24 | 25 | def germeval14_subsample(): 26 | if not os.path.exists("samples/ner"): 27 | os.makedirs("samples/ner") 28 | 29 | with open('../data/germeval14/dev.txt') as file: 30 | data = file.readlines() 31 | 32 | ss = "".join(data[:200]) 33 | with open('samples/ner/train-sample.txt', 'w') as outfile: 34 | outfile.write(ss) 35 | with open('samples/ner/dev-sample.txt', 'w') as outfile: 36 | outfile.write(ss) 37 | 38 | def germeval18_subsample(): 39 | if not os.path.exists("samples/doc_class"): 40 | os.makedirs("samples/doc_class") 41 | with open('../data/germeval18/test.tsv') as file: 42 | data = file.readlines() 43 | 44 | ss = "".join(data[:50]) 45 | with open('samples/doc_class/train-sample.tsv', 'w') as outfile: 46 | outfile.write(ss) 47 | with open('samples/doc_class/test-sample.tsv', 'w') as outfile: 48 | outfile.write(ss) 49 | 50 | if __name__=="__main__": 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument('--task', type=str, default='', help="Which task to create testdata for: qa, ner, doc_class") 53 | args = parser.parse_args() 54 | if(args.task == "qa"): 55 | logger.info("Creating test data for Question Answering, please make sure the original data is already downloaded and in data/squad20") 56 | squad_subsample() 57 | elif(args.task == "ner"): 58 | logger.info( 59 | "Creating test data for NER, please make sure the original data is already downloaded and in data/germeval14") 60 | germeval14_subsample() 61 | elif(args.task == "doc_class"): 62 | logger.info( 63 | "Creating test data for Document Classification, please make sure the original data is already downloaded and in data/germeval18") 64 | germeval18_subsample() -------------------------------------------------------------------------------- /test/modeling/test_optimization.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from farm.modeling.optimization import initialize_optimizer 4 | 5 | 6 | def test_initialize_optimizer_param_schedule_opts(): 7 | with pytest.raises(TypeError): 8 | initialize_optimizer(None, 1, 1, 'cpu', 0.4e-5, schedule_opts=[]) 9 | -------------------------------------------------------------------------------- /test/samples/doc_class/test-sample.tsv: -------------------------------------------------------------------------------- 1 | text coarse_label fine_label 2 | Meine Mutter hat mir erzählt, dass mein Vater einen Wahlkreiskandidaten nicht gewählt hat das Dreckschwein, weil der gegen die Homo-Ehe ist ☺ OFFENSE OTHER 3 | @Tom174_ @davidbest95 Meine Reaktion; |LBR| Nicht jeder Moslem ist ein Terrorist. Aber jeder Moslem glaubt an Überlieferungen, die Gewalt und Terror begünstigen. OTHER OTHER -------------------------------------------------------------------------------- /test/samples/doc_class/train-sample.tsv: -------------------------------------------------------------------------------- 1 | text coarse_label fine_label 2 | Meine Mutter hat mir erzählt, dass mein Vater einen Wahlkreiskandidaten nicht gewählt hat das Dreckschwein, weil der gegen die Homo-Ehe ist ☺ OFFENSE OTHER 3 | @Tom174_ @davidbest95 Meine Reaktion; |LBR| Nicht jeder Moslem ist ein Terrorist. Aber jeder Moslem glaubt an Überlieferungen, die Gewalt und Terror begünstigen. 
OTHER OTHER 4 | #Merkel rollt dem Emir von #Katar, der islamistischen Terror unterstützt, den roten Teppich aus.Wir brauchen einen sofortigen #Waffenstopp! OTHER OTHER 5 | „Merle ist kein junges unschuldiges Mädchen“ Kch....... 😱 #tatort OTHER OTHER -------------------------------------------------------------------------------- /test/samples/doc_class_other_text_column_name/test-sample.tsv: -------------------------------------------------------------------------------- 1 | text_other coarse_label fine_label 2 | Meine Mutter hat mir erzählt, dass mein Vater einen Wahlkreiskandidaten nicht gewählt hat das Dreckschwein, weil der gegen die Homo-Ehe ist ☺ OFFENSE OTHER 3 | @Tom174_ @davidbest95 Meine Reaktion; |LBR| Nicht jeder Moslem ist ein Terrorist. Aber jeder Moslem glaubt an Überlieferungen, die Gewalt und Terror begünstigen. OTHER OTHER 4 | -------------------------------------------------------------------------------- /test/samples/doc_class_other_text_column_name/train-sample.tsv: -------------------------------------------------------------------------------- 1 | text_other coarse_label fine_label 2 | Meine Mutter hat mir erzählt, dass mein Vater einen Wahlkreiskandidaten nicht gewählt hat das Dreckschwein, weil der gegen die Homo-Ehe ist ☺ OFFENSE OTHER 3 | @Tom174_ @davidbest95 Meine Reaktion; |LBR| Nicht jeder Moslem ist ein Terrorist. Aber jeder Moslem glaubt an Überlieferungen, die Gewalt und Terror begünstigen. OTHER OTHER 4 | #Merkel rollt dem Emir von #Katar, der islamistischen Terror unterstützt, den roten Teppich aus.Wir brauchen einen sofortigen #Waffenstopp! OTHER OTHER 5 | „Merle ist kein junges unschuldiges Mädchen“ Kch....... 😱 #tatort OTHER OTHER 6 | -------------------------------------------------------------------------------- /test/samples/doc_regr/test-sample.tsv: -------------------------------------------------------------------------------- 1 | text label 2 | I love, love this dress except for the armpits. if they had just made the armpits a normal round shape with normal openings, the dress would have been perfection. so audrey hepburn!! but i had to say no. i really wish they would redo this dress with normal arm openings. i think it would sell like crazy. 4 3 | I wanted this sweater to work but sadly it failed. first, the pink was way to sheer for my liking. the sheerness caused a weird color overlap on the stomach area. then the band at the bottom was too tight causing a weird ballooning affect. a shirt underneath could work but it takes away from the beauty of the knit. the soft pink is gorgeous but not good for medium to light skinned folks. 2 4 | Oh my! i love this tee. it is super soft. i love how it doesn't look like a sack with no shape. i can't wait to get more colors. i am tall plus have a long torso and it still is long enough for me so this is definitely a win! 5 5 | I love the style of this swimsuit on the model. when i purchased is i didn't realize that there was no support (wire/ padding/ lining) in the chest. the rest of the swimsuit was great but i did not like the look in the chest - it provided no support. i ended up returning it. 3 -------------------------------------------------------------------------------- /test/samples/doc_regr/train-sample.tsv: -------------------------------------------------------------------------------- 1 | text label 2 | The embroidery around the chest/collar is lovely. but the lower half of the shirt didn't fit my post-pregnancy bod. it's going back. 4 3 | "I am so pleased with this top! 
it is slightly fitted - i am 5'3"", 110 lbs, - and have trouble finding tops that are flattering but not too form fitting. also it is 100% cotton, which is a definite plus. as of now it is my go-to top - looks great with jeans or leggings." 5 4 | I honestly don't understand whey this top isn't sold out. i have it in both colors and love it! it's a cool, gauzy woven fabric, super soft and perfect for warm weather. the white fabric is doubled so it's not see-through, the pink (more of a pale terracotta) is doubled halfway up, so it's slightly sheer on top but your pants/skirt waistband will not show through. it is a loose-fitting top, so you may be able to size down. i usually wear size 4p, but it was sold out so i got regular size 2 and i 5 5 | How can you go wrong with soft cotton top that is neither too snug nor too loose? expect will wear these as layers under when really cold and by them selves in early spring and in the fall. great colors and love the multiple textures. 5 -------------------------------------------------------------------------------- /test/samples/doc_regr_other_text_column_name/test-sample.tsv: -------------------------------------------------------------------------------- 1 | text_other label 2 | I love, love this dress except for the armpits. if they had just made the armpits a normal round shape with normal openings, the dress would have been perfection. so audrey hepburn!! but i had to say no. i really wish they would redo this dress with normal arm openings. i think it would sell like crazy. 4 3 | I wanted this sweater to work but sadly it failed. first, the pink was way to sheer for my liking. the sheerness caused a weird color overlap on the stomach area. then the band at the bottom was too tight causing a weird ballooning affect. a shirt underneath could work but it takes away from the beauty of the knit. the soft pink is gorgeous but not good for medium to light skinned folks. 2 4 | Oh my! i love this tee. it is super soft. i love how it doesn't look like a sack with no shape. i can't wait to get more colors. i am tall plus have a long torso and it still is long enough for me so this is definitely a win! 5 5 | I love the style of this swimsuit on the model. when i purchased is i didn't realize that there was no support (wire/ padding/ lining) in the chest. the rest of the swimsuit was great but i did not like the look in the chest - it provided no support. i ended up returning it. 3 6 | -------------------------------------------------------------------------------- /test/samples/doc_regr_other_text_column_name/train-sample.tsv: -------------------------------------------------------------------------------- 1 | text_other label 2 | The embroidery around the chest/collar is lovely. but the lower half of the shirt didn't fit my post-pregnancy bod. it's going back. 4 3 | "I am so pleased with this top! it is slightly fitted - i am 5'3"", 110 lbs, - and have trouble finding tops that are flattering but not too form fitting. also it is 100% cotton, which is a definite plus. as of now it is my go-to top - looks great with jeans or leggings." 5 4 | I honestly don't understand whey this top isn't sold out. i have it in both colors and love it! it's a cool, gauzy woven fabric, super soft and perfect for warm weather. the white fabric is doubled so it's not see-through, the pink (more of a pale terracotta) is doubled halfway up, so it's slightly sheer on top but your pants/skirt waistband will not show through. 
it is a loose-fitting top, so you may be able to size down. i usually wear size 4p, but it was sold out so i got regular size 2 and i 5 5 | How can you go wrong with soft cotton top that is neither too snug nor too loose? expect will wear these as layers under when really cold and by them selves in early spring and in the fall. great colors and love the multiple textures. 5 6 | -------------------------------------------------------------------------------- /test/samples/lm_finetuning/test-sample.txt: -------------------------------------------------------------------------------- 1 | Text should be one-sentence-per-line, with empty lines between documents. 2 | A Seentence to teest whoole woord maasking, muust includio multiplee woords wiith subwoord tookens. 3 | This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত 4 | 5 | The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors. 6 | Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity. 7 | Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them. -------------------------------------------------------------------------------- /test/samples/lm_finetuning/train-sample.txt: -------------------------------------------------------------------------------- 1 | Text should be one-sentence-per-line, with empty lines between documents. 2 | A Seentence to teest whoole woord maasking, muust includio multiplee woords wiith subwoord tookens. 3 | This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত 4 | 5 | aaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbb ccccccccccccccccccccccc 6 | aaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbb cccccccccccccccccccccccccccccc 7 | aaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbb ccccccccccccccccccccccc 8 | 9 | The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors. 10 | Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity. 11 | Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them. 12 | -------------------------------------------------------------------------------- /test/samples/ner/dev-sample.txt: -------------------------------------------------------------------------------- 1 | -DOCSTART- -X- -X- -X- O 2 | 3 | Ereignis Ereignis NN I-NC O 4 | und und KON O O 5 | Erzählung Erzählung NN I-NC O 6 | oder oder KON I-NC O 7 | : : $. 
O O 8 | 9 | Albrecht Albrecht NE B-NC I-PER 10 | Lehmann Lehmann NE I-NC I-PER 11 | läßt lassen VVFIN I-VC O 12 | Flüchtlinge Flüchtling NN I-NC O 13 | und und KON O O 14 | Vertriebene Vertriebene NN I-NC O 15 | in in APPR I-PC O 16 | Westdeutschland Westdeutschland NE I-NC I-LOC 17 | , , $, I-NC O 18 | 1945-1990 @card@ CARD I-NC O 19 | , , $, O O 20 | zu zu APPR I-PC O 21 | Wort Wort NN I-NC O 22 | kommen kommen VVFIN I-VC O 23 | 24 | Einwanderungsfragen Einwanderungsfrage|Einwanderungsfragen NN I-NC O 25 | haben haben VAFIN I-VC O 26 | in in APPR I-PC O 27 | Deutschland Deutschland NE I-NC I-LOC 28 | in in APPR I-PC O 29 | den d ART I-NC O 30 | letzten letzt ADJA I-NC O 31 | Monaten Monat NN I-NC O 32 | Politik Politik NN B-NC O 33 | und und KON O O 34 | Medien Medium NN I-NC O 35 | beherrscht beherrschen VVPP I-VC O 36 | . . $. O O 37 | 38 | in in APPR I-PC O 39 | Westdeutschland Westdeutschland NE I-NC I-LOC 40 | von von APPR I-PC O 41 | 1945 1945 CARD I-NC O 42 | bis bis APPR I-PC O 43 | 1990 1990 CARD I-NC O 44 | Aufmerksamkeit Aufmerksamkeit NN I-NC O 45 | zu zu PTKZU I-VC O 46 | erregen erregen VVINF I-VC O 47 | . . $. O O 48 | -------------------------------------------------------------------------------- /test/samples/ner/train-sample.txt: -------------------------------------------------------------------------------- 1 | -DOCSTART- -X- -X- -X- O 2 | 3 | Ereignis Ereignis NN I-NC O 4 | und und KON O O 5 | Erzählung Erzählung NN I-NC O 6 | oder oder KON I-NC O 7 | : : $. O O 8 | 9 | Albrecht Albrecht NE B-NC I-PER 10 | Lehmann Lehmann NE I-NC I-PER 11 | läßt lassen VVFIN I-VC O 12 | Flüchtlinge Flüchtling NN I-NC O 13 | und und KON O O 14 | Vertriebene Vertriebene NN I-NC O 15 | in in APPR I-PC O 16 | Westdeutschland Westdeutschland NE I-NC I-LOC 17 | , , $, I-NC O 18 | 1945-1990 @card@ CARD I-NC O 19 | , , $, O O 20 | zu zu APPR I-PC O 21 | Wort Wort NN I-NC O 22 | kommen kommen VVFIN I-VC O 23 | 24 | Einwanderungsfragen Einwanderungsfrage|Einwanderungsfragen NN I-NC O 25 | haben haben VAFIN I-VC O 26 | in in APPR I-PC O 27 | Deutschland Deutschland NE I-NC I-LOC 28 | in in APPR I-PC O 29 | den d ART I-NC O 30 | letzten letzt ADJA I-NC O 31 | Monaten Monat NN I-NC O 32 | Politik Politik NN B-NC O 33 | und und KON O O 34 | Medien Medium NN I-NC O 35 | beherrscht beherrschen VVPP I-VC O 36 | . . $. O O 37 | 38 | in in APPR I-PC O 39 | Westdeutschland Westdeutschland NE I-NC I-LOC 40 | von von APPR I-PC O 41 | 1945 1945 CARD I-NC O 42 | bis bis APPR I-PC O 43 | 1990 1990 CARD I-NC O 44 | Aufmerksamkeit Aufmerksamkeit NN I-NC O 45 | zu zu PTKZU I-VC O 46 | erregen erregen VVINF I-VC O 47 | . . $. 
O O 48 | 49 | -------------------------------------------------------------------------------- /test/samples/qa/answer-offset-wrong.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "title": "Test", 5 | "paragraphs": [ 6 | { 7 | "context": "Berlin has 10 inhabitants.", 8 | "qas": [ 9 | { 10 | "question": "How many people live in Berlin?", 11 | "id": "5ad3d560604f3c001a3ff2c8", 12 | "answers": [{"text": "10", "answer_start": 0}], 13 | "is_impossible": false 14 | } 15 | ] 16 | } 17 | ] 18 | } 19 | ], 20 | "version": "v2.0" 21 | } -------------------------------------------------------------------------------- /test/samples/qa/answer-wrong.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "title": "Test", 5 | "paragraphs": [ 6 | { 7 | "context": "Berlin has 10 inhabitants.", 8 | "qas": [ 9 | { 10 | "question": "How many people live in Berlin?", 11 | "id": "5ad3d560604f3c001a3ff2c8", 12 | "answers": [{"text": "11", "answer_start": 11}], 13 | "is_impossible": false 14 | } 15 | ] 16 | } 17 | ] 18 | } 19 | ], 20 | "version": "v2.0" 21 | } -------------------------------------------------------------------------------- /test/samples/qa/dev-sample.json: -------------------------------------------------------------------------------- 1 | {"data": [{"paragraphs": [{"qas": [{"question": "In what country is Normandy located?", "id": "56ddde6b9a695914005b9628", "answers": [{"text": "France", "answer_start": 53}], "is_impossible": false}], "context": "The Normans gave their name to Normandy, a region in France."}]}]} -------------------------------------------------------------------------------- /test/samples/qa/eval-sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "title": "Test", 5 | "paragraphs": [ 6 | { 7 | "context": "Berlin has 10 inhabitants.", 8 | "qas": [ 9 | { 10 | "question": "How many people live in Paris?", 11 | "id": "5ad3d560604f3c001a3ff2c6", 12 | "answers": [], 13 | "is_impossible": true 14 | } 15 | ] 16 | } 17 | ] 18 | }, 19 | { 20 | "title": "Test2", 21 | "paragraphs": [ 22 | { 23 | "context": "Berlin has 10 inhabitants.", 24 | "qas": [ 25 | { 26 | "question": "How many people live in Berlin?", 27 | "id": "5ad3d560604f3c001a3ff2c7", 28 | "answers": [{"text": "10", "answer_start": 11}, {"text": "10 inhabitants", "answer_start": 11}], 29 | "is_impossible": false 30 | }, 31 | { 32 | "question": "How many people live in Berlin?", 33 | "id": "5ad3d560604f3c001a3ff2c8", 34 | "answers": [{"text": "Berlin", "answer_start": 0}, {"text": "Berlin", "answer_start": 0}], 35 | "is_impossible": false 36 | } 37 | ] 38 | } 39 | ] 40 | } 41 | ], 42 | "version": "v2.0" 43 | } -------------------------------------------------------------------------------- /test/samples/qa/noanswer.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "title": "Test", 5 | "paragraphs": [ 6 | { 7 | "context": "Berlin has 10 inhabitants.", 8 | "qas": [ 9 | { 10 | "question": "How many people live in Paris?", 11 | "id": "5ad3d560604f3c001a3ff2c8", 12 | "answers": [], 13 | "is_impossible": true 14 | } 15 | ] 16 | } 17 | ] 18 | } 19 | ], 20 | "version": "v2.0" 21 | } -------------------------------------------------------------------------------- /test/samples/qa/train-sample.json: 
-------------------------------------------------------------------------------- 1 | {"data": [{"paragraphs": [{"qas": [{"question": "In what country is Normandy located?", "id": "56ddde6b9a695914005b9628", "answers": [{"text": "France", "answer_start": 159}], "is_impossible": false}], "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia."}]}]} -------------------------------------------------------------------------------- /test/samples/qa/vanilla.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "title": "Test", 5 | "paragraphs": [ 6 | { 7 | "context": "Berlin has 10 inhabitants.", 8 | "qas": [ 9 | { 10 | "question": "How many people live in Berlin?", 11 | "id": "5ad3d560604f3c001a3ff2c8", 12 | "answers": [{"text": "10", "answer_start": 11}, {"text": "10 inhabitants", "answer_start": 11}], 13 | "is_impossible": false 14 | } 15 | ] 16 | } 17 | ] 18 | } 19 | ], 20 | "version": "v2.0" 21 | } -------------------------------------------------------------------------------- /test/samples/s3e/fitted_s3e/language_model_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "embeddings_filename": "vectors.txt", 3 | "hidden_size": 300, 4 | "language": "German", 5 | "name": "WordEmbedding_LM", 6 | "vocab_filename": "vocab.txt", 7 | "vocab_size": 113 8 | } 9 | -------------------------------------------------------------------------------- /test/samples/s3e/fitted_s3e/processor_config.json: -------------------------------------------------------------------------------- 1 | {"baskets": [], "data_dir": null, "dev_filename": null, "dev_split": null, "max_seq_len": 128, "proxies": null, "tasks": {}, "test_filename": null, "train_filename": null, "tokenizer": "EmbeddingTokenizer", "processor": "InferenceProcessor"} -------------------------------------------------------------------------------- /test/samples/s3e/fitted_s3e/s3e_stats.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/test/samples/s3e/fitted_s3e/s3e_stats.pkl -------------------------------------------------------------------------------- /test/samples/s3e/fitted_s3e/vocab.txt: -------------------------------------------------------------------------------- 1 | [CLS] 2 | [SEP] 3 | [UNK] 4 | [PAD] 5 | [MASK] 6 | , 7 | the 8 | . 9 | and 10 | to 11 | of 12 | a 13 | in 14 | is 15 | for 16 | that 17 | it 18 | on 19 | with 20 | ) 21 | ( 22 | you 23 | was 24 | are 25 | this 26 | have 27 | ! 28 | but 29 | by 30 | ? 
31 | my 32 | one 33 | so 34 | has 35 | can 36 | more 37 | had 38 | what 39 | me 40 | would 41 | if 42 | other 43 | its 44 | said 45 | work 46 | how 47 | good 48 | after 49 | great 50 | go 51 | those 52 | love 53 | many 54 | i 55 | very 56 | than 57 | such 58 | got 59 | set 60 | well 61 | much 62 | play 63 | give 64 | everything 65 | does 66 | man 67 | person 68 | buy 69 | video 70 | looking 71 | sure 72 | price 73 | almost 74 | wrong 75 | woman 76 | front 77 | ways 78 | spent 79 | feature 80 | fast 81 | player 82 | far 83 | street 84 | files 85 | models 86 | button 87 | plays 88 | forward 89 | fill 90 | walking 91 | investment 92 | opinion 93 | panel 94 | layout 95 | im 96 | consistently 97 | practically 98 | com 99 | discovering 100 | formats 101 | alternate 102 | sleek 103 | happier 104 | smoothly 105 | reviewer 106 | dvd 107 | amazon 108 | apex 109 | nicest 110 | rewind 111 | mp3s 112 | cads 113 | cods 114 | -------------------------------------------------------------------------------- /test/samples/s3e/tiny_corpus.txt: -------------------------------------------------------------------------------- 1 | a man is walking on the street . 2 | a woman is walking on the street . 3 | im a more happier person after discovering the button ! . 4 | but , if you are looking for my opinion of the apex dvd player , i love it ! . 5 | it practically plays almost everything you give it . 6 | for the price it is a well spent investment ! . 7 | this is by far the nicest one , in so many ways . 8 | it is very sleek looking with a very good front panel button layout , and it has a great feature set . 9 | its fast forward and rewind work much more smoothly and consistently than those of other models i have had . 10 | it plays alternate video formats cads cods very well . 11 | and amazon . com has it for such a great price how can you go wrong ? . 12 | what got me to buy was the reviewer that said it would play dvd fill of files ( mp3s ) . 13 | it sure does ! . 14 | -------------------------------------------------------------------------------- /test/samples/s3e/tiny_fasttext_model/language_model_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "embeddings_filename": "vectors.txt", 3 | "hidden_size": 300, 4 | "language": "German", 5 | "name": "WordEmbedding_LM", 6 | "vocab_filename": "vocab.txt", 7 | "vocab_size": 4008 8 | } -------------------------------------------------------------------------------- /test/samples/s3e/tiny_fasttext_model/vocab.txt: -------------------------------------------------------------------------------- 1 | [CLS] 2 | [SEP] 3 | [UNK] 4 | [PAD] 5 | [MASK] 6 | , 7 | the 8 | . 9 | and 10 | to 11 | of 12 | a 13 | in 14 | is 15 | for 16 | that 17 | it 18 | on 19 | with 20 | ) 21 | ( 22 | you 23 | was 24 | are 25 | this 26 | have 27 | ! 28 | but 29 | by 30 | ? 
31 | my 32 | one 33 | so 34 | has 35 | can 36 | more 37 | had 38 | what 39 | me 40 | would 41 | if 42 | other 43 | its 44 | said 45 | work 46 | how 47 | good 48 | after 49 | great 50 | go 51 | those 52 | love 53 | many 54 | i 55 | very 56 | than 57 | such 58 | got 59 | set 60 | well 61 | much 62 | play 63 | give 64 | everything 65 | does 66 | man 67 | person 68 | buy 69 | video 70 | looking 71 | sure 72 | price 73 | almost 74 | wrong 75 | woman 76 | front 77 | ways 78 | spent 79 | feature 80 | fast 81 | player 82 | far 83 | street 84 | files 85 | models 86 | button 87 | plays 88 | forward 89 | fill 90 | walking 91 | investment 92 | opinion 93 | panel 94 | layout 95 | im 96 | consistently 97 | practically 98 | com 99 | discovering 100 | formats 101 | alternate 102 | sleek 103 | happier 104 | smoothly 105 | reviewer 106 | dvd 107 | amazon 108 | apex 109 | nicest 110 | rewind 111 | mp3s 112 | cads 113 | cods 114 | -------------------------------------------------------------------------------- /test/samples/text_pair/sample.tsv: -------------------------------------------------------------------------------- 1 | text text_b label 2 | how many times have real madrid won the champions league in a row They have also won the competition the most times in a row , winning it five times from 1956 to 1960 . 1 3 | when did new york stop using the electric chair Following the U.S. Supreme Court 's ruling declaring existing capital punishment statutes unconstitutional in Furman v. Georgia ( 1972 ) , New York was without a death penalty until 1995 , when then - Governor George Pataki signed a new statute into law , which provided for execution by lethal injection . 1 4 | songs on 4 your eyez only j cole `` Neighbors '' Cole 3 : 36 8 . 2 5 | how many seasons of the blacklist are there on netflix Retrieved March 27 , 2018 . 0 6 | how many books are in the one piece series The series spans over 800 chapters and more than 80 tankōbon volumes . 1 7 | central idea of poem lines from the deserted village It is a work of social commentary , and condemns rural depopulation and the pursuit of excessive wealth . 1 8 | who shot first in the shot heard around the world The North Bridge skirmish did see the first shots by Americans acting under orders , the first organized volley by Americans , the first British fatalities , and the first British retreat . 1 9 | who is beauty and the beast written by Beauty and the Beast ( French : La Belle et la Bête ) is a traditional fairy tale written by French novelist Gabrielle - Suzanne Barbot de Villeneuve and published in 1740 in La Jeune Américaine et les contes marins ( The Young American and Marine Tales ) . 1 10 | what episode does eleven come in season 1 Deep South Mag . 2 11 | love yourself by justin bieber is about who Rolling Stone . 1 12 | who starred in the movie natural born killers Scagnetti arrives and tells Mickey that unless he surrenders , he will cut off Mallory 's breasts . 0 13 | when does the new season on the 100 come out Monty accidentally fries all of the wristbands . 1 14 | where was the super bowl 52 played at Jump up ^ Chiari , Mike ( January 24 , 2018 ) . 0 15 | who won the academy award for the deer hunter Best Director , Michael Cimino 3 . 1 16 | how long do former presidents get secret service protection All living former presidents and their spouses are now entitled to receive lifetime Secret Service protection . 
1 17 | the man in the high castle episode 1 season 1 Jump up ^ `` FX 's Tyrant casts Annet Mahendru ; Sebastian Roché in Amazon 's Man in the High Castle '' . 0 18 | who has hosted the most fifa world cups On 1 June 2014 , The Sunday Times claimed to have obtained documents including e-mails , letters and bank transfers which allegedly proved that Bin Hammam had paid more than US $5 million to football officials to support the Qatar bid . 0 19 | what was the first form of manga in japan Yomiuri Shimbun . 0 -------------------------------------------------------------------------------- /test/samples/tokenizer/custom_vocab.txt: -------------------------------------------------------------------------------- 1 | neverseentokens -------------------------------------------------------------------------------- /test/test_doc_classification_distilbert.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import logging 3 | import numpy as np 4 | 5 | from farm.data_handler.data_silo import DataSilo 6 | from farm.data_handler.processor import TextClassificationProcessor 7 | from farm.modeling.optimization import initialize_optimizer 8 | from farm.infer import Inferencer 9 | from farm.modeling.adaptive_model import AdaptiveModel 10 | from farm.modeling.language_model import DistilBert 11 | from farm.modeling.prediction_head import TextClassificationHead 12 | from farm.modeling.tokenization import Tokenizer 13 | from farm.train import Trainer 14 | from farm.utils import set_all_seeds, initialize_device_settings 15 | 16 | 17 | def test_doc_classification(caplog): 18 | if caplog: 19 | caplog.set_level(logging.CRITICAL) 20 | 21 | set_all_seeds(seed=42) 22 | device, n_gpu = initialize_device_settings(use_cuda=False) 23 | n_epochs = 1 24 | batch_size = 1 25 | evaluate_every = 2 26 | lang_model = "distilbert-base-german-cased" 27 | 28 | tokenizer = Tokenizer.load( 29 | pretrained_model_name_or_path=lang_model, 30 | do_lower_case=False) 31 | 32 | processor = TextClassificationProcessor(tokenizer=tokenizer, 33 | max_seq_len=8, 34 | data_dir=Path("samples/doc_class"), 35 | train_filename=Path("train-sample.tsv"), 36 | label_list=["OTHER", "OFFENSE"], 37 | metric="f1_macro", 38 | dev_filename="test-sample.tsv", 39 | test_filename=None, 40 | dev_split=0.0, 41 | label_column_name="coarse_label") 42 | 43 | data_silo = DataSilo( 44 | processor=processor, 45 | batch_size=batch_size) 46 | 47 | language_model = DistilBert.load(lang_model) 48 | prediction_head = TextClassificationHead(num_labels=2) 49 | model = AdaptiveModel( 50 | language_model=language_model, 51 | prediction_heads=[prediction_head], 52 | embeds_dropout_prob=0.1, 53 | lm_output_types=["per_sequence"], 54 | device=device) 55 | 56 | model, optimizer, lr_schedule = initialize_optimizer( 57 | model=model, 58 | learning_rate=2e-5, 59 | n_batches=len(data_silo.loaders["train"]), 60 | n_epochs=1, 61 | device=device, 62 | schedule_opts=None) 63 | 64 | trainer = Trainer( 65 | model=model, 66 | optimizer=optimizer, 67 | data_silo=data_silo, 68 | epochs=n_epochs, 69 | n_gpu=n_gpu, 70 | lr_schedule=lr_schedule, 71 | evaluate_every=evaluate_every, 72 | device=device) 73 | 74 | trainer.train() 75 | 76 | save_dir = Path("testsave/doc_class") 77 | model.save(save_dir) 78 | processor.save(save_dir) 79 | 80 | del model 81 | del processor 82 | del optimizer 83 | del data_silo 84 | del trainer 85 | 86 | basic_texts = [ 87 | {"text": "Malte liebt Berlin."}, 88 | {"text": "Schartau sagte dem Tagesspiegel, 
dass Fischer ein Idiot sei."} 89 | ] 90 | 91 | inf = Inferencer.load(save_dir, batch_size=2, num_processes=0) 92 | result = inf.inference_from_dicts(dicts=basic_texts) 93 | assert isinstance(result[0]["predictions"][0]["probability"], np.float32) 94 | del inf 95 | 96 | if __name__ == "__main__": 97 | test_doc_classification(None) 98 | -------------------------------------------------------------------------------- /test/test_doc_regression.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | 4 | import numpy as np 5 | import pytest 6 | 7 | from farm.data_handler.data_silo import DataSilo 8 | from farm.data_handler.processor import RegressionProcessor 9 | from farm.modeling.optimization import initialize_optimizer 10 | from farm.infer import Inferencer 11 | from farm.modeling.adaptive_model import AdaptiveModel 12 | from farm.modeling.language_model import LanguageModel 13 | from farm.modeling.prediction_head import RegressionHead 14 | from farm.modeling.tokenization import Tokenizer 15 | from farm.train import Trainer 16 | from farm.utils import set_all_seeds, initialize_device_settings 17 | 18 | @pytest.mark.parametrize("data_dir_path,text_column_name", 19 | [("samples/doc_regr", None), 20 | ("samples/doc_regr_other_text_column_name", "text_other")]) 21 | def test_doc_regression(data_dir_path, text_column_name, caplog=None): 22 | if caplog: 23 | caplog.set_level(logging.CRITICAL) 24 | 25 | set_all_seeds(seed=42) 26 | device, n_gpu = initialize_device_settings(use_cuda=False) 27 | n_epochs = 1 28 | batch_size = 1 29 | evaluate_every = 2 30 | lang_model = "bert-base-cased" 31 | 32 | tokenizer = Tokenizer.load( 33 | pretrained_model_name_or_path=lang_model, 34 | do_lower_case=False) 35 | 36 | rp_params = dict(tokenizer=tokenizer, 37 | max_seq_len=8, 38 | data_dir=Path(data_dir_path), 39 | train_filename="train-sample.tsv", 40 | dev_filename="test-sample.tsv", 41 | test_filename=None, 42 | label_column_name="label") 43 | 44 | if text_column_name is not None: 45 | rp_params["text_column_name"] = text_column_name 46 | 47 | processor = RegressionProcessor(**rp_params) 48 | 49 | data_silo = DataSilo( 50 | processor=processor, 51 | batch_size=batch_size) 52 | 53 | language_model = LanguageModel.load(lang_model) 54 | prediction_head = RegressionHead() 55 | model = AdaptiveModel( 56 | language_model=language_model, 57 | prediction_heads=[prediction_head], 58 | embeds_dropout_prob=0.1, 59 | lm_output_types=["per_sequence_continuous"], 60 | device=device 61 | ) 62 | 63 | model, optimizer, lr_schedule = initialize_optimizer( 64 | model=model, 65 | learning_rate=2e-5, 66 | #optimizer_opts={'name': 'AdamW', 'lr': 2E-05}, 67 | n_batches=len(data_silo.loaders["train"]), 68 | n_epochs=1, 69 | device=device, 70 | schedule_opts={'name': 'CosineWarmup', 'warmup_proportion': 0.1} 71 | ) 72 | 73 | trainer = Trainer( 74 | model=model, 75 | optimizer=optimizer, 76 | data_silo=data_silo, 77 | epochs=n_epochs, 78 | n_gpu=n_gpu, 79 | lr_schedule=lr_schedule, 80 | evaluate_every=evaluate_every, 81 | device=device 82 | ) 83 | 84 | trainer.train() 85 | 86 | save_dir = Path("testsave/doc_regr") 87 | model.save(save_dir) 88 | processor.save(save_dir) 89 | 90 | del model 91 | del processor 92 | del optimizer 93 | del data_silo 94 | del trainer 95 | 96 | basic_texts = [ 97 | {"text": "The dress is just fabulous and it totally fits my size. The fabric is of great quality and the seams are really well hidden. 
I am super happy with this purchase and I am looking forward to trying some more from the same brand."}, 98 | {"text": "it just did not fit right. The top is very thin showing everything."}, 99 | ] 100 | 101 | model = Inferencer.load(save_dir, num_processes=0) 102 | result = model.inference_from_dicts(dicts=basic_texts) 103 | assert isinstance(result[0]["predictions"][0]["pred"], np.float32) 104 | del model 105 | -------------------------------------------------------------------------------- /test/test_evaluation_metrics.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import math 3 | import numpy as np 4 | 5 | from farm.evaluation.metrics import compute_metrics 6 | from farm.evaluation.semantic_answer_similarity_evaluation import semantic_answer_similarity 7 | 8 | def test_compute_metrics_basic(): 9 | # check we get some exception, may not always be the AssertionError we get now 10 | with pytest.raises(Exception): 11 | compute_metrics("acc", ["x"] * 10, [""] * 11) 12 | ret = compute_metrics("acc", [], []) 13 | assert isinstance(ret, dict) 14 | assert "acc" in ret 15 | assert math.isnan(ret["acc"]) 16 | with pytest.raises(Exception): 17 | compute_metrics("asdfasdf", ["a"], ["b"]) 18 | ls = (["a"] * 5) 19 | ls.extend(["b"] * 5) 20 | ps = ["a"] * 10 21 | ret = compute_metrics("acc", ps, ls) 22 | assert ret["acc"] == 0.5 23 | ret = compute_metrics("acc", ls, ps) 24 | assert ret["acc"] == 0.5 25 | ret = compute_metrics("f1_macro", ps, ls) 26 | assert ret["f1_macro"] == 1/3 27 | ret = compute_metrics("f1_macro", ls, ps) 28 | assert ret["f1_macro"] == 1 / 3 29 | ret = compute_metrics(["f1_macro", "acc"], ps, ls) 30 | assert isinstance(ret, dict) 31 | assert len(ret) == 2 32 | assert "acc" in ret 33 | assert "f1_macro" in ret 34 | assert ret["f1_macro"] == 1/3 35 | assert ret["acc"] == 0.5 36 | ret = compute_metrics(["f1_macro", "acc", "acc"], ps, ls) 37 | assert isinstance(ret, dict) 38 | assert len(ret) == 2 39 | assert "acc" in ret 40 | assert "f1_macro" in ret 41 | assert ret["f1_macro"] == 1/3 42 | assert ret["acc"] == 0.5 43 | ret = compute_metrics(["f1_macro", ["acc"]], ps, ls) 44 | assert isinstance(ret, dict) 45 | assert len(ret) == 2 46 | assert "acc" in ret 47 | assert "f1_macro" in ret 48 | assert ret["f1_macro"] == 1/3 49 | assert ret["acc"] == 0.5 50 | 51 | def test_semantic_answer_similarity(bert_base_squad2): 52 | bert_base_squad2.model.prediction_heads[0].n_best = 2 53 | result = bert_base_squad2.inference_from_file(file="samples/qa/eval-sample.json",return_json=False) 54 | 55 | top1_sim, topn_sim, r, d = semantic_answer_similarity(result=result, 56 | sts_model_path_or_string="paraphrase-MiniLM-L6-v2", 57 | debug=True) 58 | 59 | assert np.isclose(top1_sim, 0.7405298) 60 | assert np.isclose(topn_sim, 0.7405298) 61 | assert len(d) == 1 62 | assert "semantic_answer_score" in r[0].prediction[0].meta 63 | 64 | -------------------------------------------------------------------------------- /test/test_inference.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | 4 | from farm.infer import Inferencer 5 | from transformers import BertTokenizerFast 6 | 7 | 8 | @pytest.mark.parametrize("streaming", [True, False]) 9 | @pytest.mark.parametrize("multiprocessing_chunksize", [None, 2]) 10 | @pytest.mark.parametrize("num_processes", [2, 0, None], scope="module") 11 | def test_qa_format_and_results(adaptive_model_qa, streaming, multiprocessing_chunksize): 12 | 
qa_inputs_dicts = [ 13 | { 14 | "questions": ["In what country is Normandy"], 15 | "text": "The Normans are an ethnic group that arose in Normandy, a northern region " 16 | "of France, from contact between Viking settlers and indigenous Franks and Gallo-Romans", 17 | }, 18 | { 19 | "questions": ["Who counted the game among the best ever made?"], 20 | "text": "Twilight Princess was released to universal critical acclaim and commercial success. It received " 21 | "perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic " 22 | "Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings " 23 | "and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores " 24 | "of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the " 25 | "greatest games ever created.", 26 | }, 27 | ] 28 | ground_truths = ["France", "GameTrailers"] 29 | 30 | results = adaptive_model_qa.inference_from_dicts( 31 | dicts=qa_inputs_dicts, 32 | multiprocessing_chunksize=multiprocessing_chunksize, 33 | streaming=streaming, 34 | ) 35 | # sample results 36 | # [ 37 | # { 38 | # "task": "qa", 39 | # "predictions": [ 40 | # { 41 | # "question": "In what country is Normandy", 42 | # "question_id": "None", 43 | # "ground_truth": None, 44 | # "answers": [ 45 | # { 46 | # "score": 1.1272038221359253, 47 | # "probability": -1, 48 | # "answer": "France", 49 | # "offset_answer_start": 54, 50 | # "offset_answer_end": 60, 51 | # "context": "The Normans gave their name to Normandy, a region in France.", 52 | # "offset_context_start": 0, 53 | # "offset_context_end": 60, 54 | # "document_id": None, 55 | # } 56 | # ] 57 | # } 58 | # ], 59 | # } 60 | # ] 61 | predictions = list(results)[0]["predictions"] 62 | 63 | for prediction, ground_truth, qa_input_dict in zip( 64 | predictions, ground_truths, qa_inputs_dicts 65 | ): 66 | assert prediction["question"] == qa_input_dict["questions"][0] 67 | answer = prediction["answers"][0] 68 | assert answer["answer"] in answer["context"] 69 | assert answer["answer"] == ground_truth 70 | assert ( 71 | {"answer", "score", "probability", "offset_answer_start", "offset_answer_end", "context", 72 | "offset_context_start", "offset_context_end", "document_id"} 73 | == answer.keys() 74 | ) 75 | 76 | 77 | @pytest.mark.parametrize("num_processes", [0], scope="session") 78 | @pytest.mark.parametrize("use_fast", [True]) 79 | def test_embeddings_extraction(num_processes, use_fast): 80 | # Input 81 | basic_texts = [ 82 | {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot ist"}, 83 | {"text": "Martin Müller spielt Fussball"}, 84 | ] 85 | 86 | # Load model, tokenizer and processor directly into Inferencer 87 | model = Inferencer.load( 88 | model_name_or_path="bert-base-german-cased", 89 | task_type="embeddings", 90 | gpu=False, 91 | batch_size=5, 92 | extraction_strategy="reduce_mean", 93 | extraction_layer=-2, 94 | use_fast=use_fast, 95 | num_processes=num_processes, 96 | ) 97 | 98 | # Get embeddings for input text (you can vary the strategy and layer) 99 | result = model.inference_from_dicts(dicts=basic_texts) 100 | assert result[0]["context"] == basic_texts[0]["text"] 101 | assert result[0]["vec"].shape == (768,) 102 | assert np.isclose(result[0]["vec"][0], 0.01501756374325071, atol=0.00001) 103 | 104 | 105 | def test_inferencer_with_fast_bert_tokenizer(): 106 | model = Inferencer.load("bert-base-german-cased", task_type='text_classification', 107 | use_fast=True, 
num_processes=0) 108 | tokenizer = model.processor.tokenizer 109 | assert type(tokenizer) is BertTokenizerFast 110 | 111 | 112 | if __name__ == "__main__": 113 | test_embeddings_extraction(num_processes=0, use_fast=True) 114 | -------------------------------------------------------------------------------- /test/test_model_versioning.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from farm.infer import Inferencer 4 | 5 | def test_wrong_revision(caplog=None): 6 | # We want this load attempt to fail because we specify an invalid revision 7 | failed_load = None 8 | try: 9 | failed_load = Inferencer.load("deepset/roberta-base-squad2", revision="xxx", task_type="question_answering") 10 | except: 11 | pass 12 | assert not failed_load 13 | 14 | def test_revision_v1(caplog=None): 15 | model = Inferencer.load("deepset/roberta-base-squad2", revision="v1.0", task_type="question_answering") 16 | assert torch.isclose(torch.sum(model.model.language_model.model.encoder.layer[0].intermediate.dense.weight), 17 | torch.sum(torch.tensor([-21394.6055]))) 18 | del model 19 | 20 | def test_revision_v2(caplog=None): 21 | model = Inferencer.load("deepset/roberta-base-squad2", revision="v2.0", task_type="question_answering") 22 | assert torch.isclose(torch.sum(model.model.language_model.model.encoder.layer[0].intermediate.dense.weight), 23 | torch.sum(torch.tensor([-21411.4414]))) 24 | del model 25 | 26 | def test_revision_default(caplog=None): 27 | # default model should be the same as v2 28 | model = Inferencer.load("deepset/roberta-base-squad2", task_type="question_answering") 29 | assert torch.isclose( 30 | torch.sum(model.model.language_model.model.encoder.layer[0].intermediate.dense.weight), 31 | torch.sum(torch.tensor([-21411.4414]))) 32 | del model 33 | -------------------------------------------------------------------------------- /test/test_natural_questions.py: -------------------------------------------------------------------------------- 1 | # TODO enable NQ tests again 2 | 3 | # import logging 4 | # from pathlib import Path 5 | # import numpy as np 6 | # import pytest 7 | # 8 | # from farm.data_handler.data_silo import DataSilo 9 | # from farm.data_handler.processor import NaturalQuestionsProcessor 10 | # from farm.modeling.adaptive_model import AdaptiveModel 11 | # from farm.modeling.language_model import LanguageModel 12 | # from farm.modeling.optimization import initialize_optimizer 13 | # from farm.modeling.prediction_head import QuestionAnsweringHead, TextClassificationHead 14 | # from farm.modeling.tokenization import Tokenizer 15 | # from farm.train import Trainer 16 | # from farm.utils import set_all_seeds, initialize_device_settings 17 | # from farm.infer import Inferencer, QAInferencer 18 | # 19 | # @pytest.fixture() 20 | # def distilbert_nq(caplog=None): 21 | # if caplog: 22 | # caplog.set_level(logging.CRITICAL) 23 | # 24 | # 25 | # set_all_seeds(seed=42) 26 | # device, n_gpu = initialize_device_settings(use_cuda=False) 27 | # batch_size = 2 28 | # n_epochs = 1 29 | # evaluate_every = 4 30 | # base_LM_model = "distilbert-base-uncased" 31 | # 32 | # tokenizer = Tokenizer.load( 33 | # pretrained_model_name_or_path=base_LM_model, do_lower_case=True 34 | # ) 35 | # processor = NaturalQuestionsProcessor( 36 | # tokenizer=tokenizer, 37 | # max_seq_len=20, 38 | # doc_stride=10, 39 | # max_query_length=6, 40 | # train_filename="train_sample.jsonl", 41 | # dev_filename="dev_sample.jsonl", 42 | # data_dir=Path("samples/nq") 43 | # ) 44 | # 45 | # 
data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1) 46 | # language_model = LanguageModel.load(base_LM_model) 47 | # qa_head = QuestionAnsweringHead() 48 | # classification_head = TextClassificationHead(num_labels=len(processor.answer_type_list)) 49 | # 50 | # model = AdaptiveModel( 51 | # language_model=language_model, 52 | # prediction_heads=[qa_head, classification_head], 53 | # embeds_dropout_prob=0.1, 54 | # lm_output_types=["per_token", "per_sequence"], 55 | # device=device, 56 | # ) 57 | # 58 | # model, optimizer, lr_schedule = initialize_optimizer( 59 | # model=model, 60 | # learning_rate=2e-5, 61 | # #optimizer_opts={'name': 'AdamW', 'lr': 2E-05}, 62 | # n_batches=len(data_silo.loaders["train"]), 63 | # n_epochs=n_epochs, 64 | # device=device 65 | # ) 66 | # trainer = Trainer( 67 | # model=model, 68 | # optimizer=optimizer, 69 | # data_silo=data_silo, 70 | # epochs=n_epochs, 71 | # n_gpu=n_gpu, 72 | # lr_schedule=lr_schedule, 73 | # evaluate_every=evaluate_every, 74 | # device=device 75 | # ) 76 | # trainer.train() 77 | # return model, processor 78 | # 79 | # 80 | # def test_training(distilbert_nq): 81 | # model, processor = distilbert_nq 82 | # assert type(model) == AdaptiveModel 83 | # assert type(processor) == NaturalQuestionsProcessor 84 | # 85 | # 86 | # def test_inference(distilbert_nq, caplog=None): 87 | # if caplog: 88 | # caplog.set_level(logging.CRITICAL) 89 | # model, processor = distilbert_nq 90 | # 91 | # save_dir = Path("testsave/qa_nq") 92 | # model.save(save_dir) 93 | # processor.save(save_dir) 94 | # 95 | # inferencer = QAInferencer.load(save_dir, batch_size=2, gpu=False, num_processes=0) 96 | # assert inferencer is not None 97 | # 98 | # qa_format_1 = [ 99 | # { 100 | # "questions": ["Who counted the game among the best ever made?"], 101 | # "text": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created." 102 | # } 103 | # ] 104 | # qa_format_2 = [ 105 | # { 106 | # "qas":["Who counted the game among the best ever made?"], 107 | # "context": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. 
GameTrailers in their review called it one of the greatest games ever created.", 108 | # } 109 | # ] 110 | # 111 | # result1 = inferencer.inference_from_dicts(dicts=qa_format_1) 112 | # result2 = inferencer.inference_from_dicts(dicts=qa_format_2) 113 | # assert result1 == result2 114 | # 115 | # if __name__ == "__main__": 116 | # test_training() 117 | # test_inference() -------------------------------------------------------------------------------- /test/test_ner.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pytest 3 | 4 | import numpy as np 5 | 6 | from farm.data_handler.data_silo import DataSilo 7 | from farm.data_handler.processor import NERProcessor 8 | from farm.modeling.optimization import initialize_optimizer 9 | from farm.infer import Inferencer 10 | from farm.modeling.adaptive_model import AdaptiveModel 11 | from farm.modeling.language_model import LanguageModel 12 | from farm.modeling.prediction_head import TokenClassificationHead 13 | from farm.modeling.tokenization import Tokenizer 14 | from farm.train import Trainer 15 | from farm.utils import set_all_seeds, initialize_device_settings 16 | 17 | import logging 18 | 19 | # TODO: Test slow tokenizers when reimplemented 20 | @pytest.mark.parametrize("use_fast", [True]) 21 | def test_ner(caplog, use_fast): 22 | if caplog: 23 | caplog.set_level(logging.CRITICAL) 24 | 25 | set_all_seeds(seed=42) 26 | device, n_gpu = initialize_device_settings(use_cuda=False) 27 | n_epochs = 3 28 | batch_size = 2 29 | evaluate_every = 1 30 | lang_model = "distilbert-base-german-cased" 31 | 32 | tokenizer = Tokenizer.load( 33 | pretrained_model_name_or_path=lang_model, do_lower_case=False, 34 | use_fast=use_fast, 35 | ) 36 | 37 | ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", 38 | "I-OTH"] 39 | 40 | processor = NERProcessor( 41 | tokenizer=tokenizer, 42 | max_seq_len=8, 43 | data_dir=Path("samples/ner"), 44 | train_filename="train-sample.txt", 45 | dev_filename="dev-sample.txt", 46 | test_filename=None, 47 | delimiter=" ", 48 | label_list=ner_labels, 49 | metric="seq_f1", 50 | multithreading_rust=False 51 | ) 52 | 53 | data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1) 54 | language_model = LanguageModel.load(lang_model) 55 | prediction_head = TokenClassificationHead(num_labels=13) 56 | 57 | model = AdaptiveModel( 58 | language_model=language_model, 59 | prediction_heads=[prediction_head], 60 | embeds_dropout_prob=0.1, 61 | lm_output_types=["per_token"], 62 | device=device, 63 | ) 64 | 65 | model, optimizer, lr_schedule = initialize_optimizer( 66 | model=model, 67 | learning_rate=2e-5, 68 | #optimizer_opts={'name': 'AdamW', 'lr': 2E-05}, 69 | n_batches=len(data_silo.loaders["train"]), 70 | n_epochs=1, 71 | device=device, 72 | schedule_opts={'name': 'LinearWarmup', 'warmup_proportion': 0.1} 73 | ) 74 | trainer = Trainer( 75 | model=model, 76 | optimizer=optimizer, 77 | data_silo=data_silo, 78 | epochs=n_epochs, 79 | n_gpu=n_gpu, 80 | lr_schedule=lr_schedule, 81 | evaluate_every=evaluate_every, 82 | device=device, 83 | ) 84 | 85 | save_dir = Path("testsave/ner") 86 | model = trainer.train() 87 | model.save(save_dir) 88 | processor.save(save_dir) 89 | 90 | del model 91 | del processor 92 | del optimizer 93 | del data_silo 94 | del trainer 95 | 96 | basic_texts = [ 97 | {"text": "Paris is a town in France."}, 98 | ] 99 | model = 
Inferencer.load(model_name_or_path="dbmdz/bert-base-cased-finetuned-conll03-english", num_processes=0, task_type="ner", use_fast=use_fast) 100 | # labels arent correctly inserted from transformers 101 | # They are converted to LABEL_1 ... LABEL_N 102 | # For the inference result to contain predictions we need them in IOB NER format 103 | model.processor.tasks["ner"]["label_list"][-1] = "B-LOC" 104 | result = model.inference_from_dicts(dicts=basic_texts) 105 | 106 | assert result[0]["predictions"][0][0]["context"] == "Paris" 107 | assert isinstance(result[0]["predictions"][0][0]["probability"], np.float32) 108 | assert result[0]["predictions"][0][0]["probability"] > 0.99 109 | assert result[0]["predictions"][0][0]["label"] == "LOC" 110 | 111 | 112 | if __name__ == "__main__": 113 | test_ner(None, True) 114 | -------------------------------------------------------------------------------- /test/test_ner_amp.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import numpy as np 3 | 4 | from farm.data_handler.data_silo import DataSilo 5 | from farm.data_handler.processor import NERProcessor 6 | from farm.modeling.optimization import initialize_optimizer, AMP_AVAILABLE 7 | from farm.infer import Inferencer 8 | from farm.modeling.adaptive_model import AdaptiveModel 9 | from farm.modeling.language_model import LanguageModel 10 | from farm.modeling.prediction_head import TokenClassificationHead 11 | from farm.modeling.tokenization import Tokenizer 12 | from farm.train import Trainer 13 | from farm.utils import set_all_seeds, initialize_device_settings 14 | 15 | import logging 16 | 17 | 18 | def test_ner_amp(caplog): 19 | if caplog: 20 | caplog.set_level(logging.CRITICAL) 21 | 22 | set_all_seeds(seed=42) 23 | device, n_gpu = initialize_device_settings(use_cuda=True) 24 | n_epochs = 1 25 | batch_size = 2 26 | evaluate_every = 1 27 | lang_model = "bert-base-german-cased" 28 | if AMP_AVAILABLE: 29 | use_amp = 'O1' 30 | else: 31 | use_amp = None 32 | 33 | tokenizer = Tokenizer.load( 34 | pretrained_model_name_or_path=lang_model, do_lower_case=False 35 | ) 36 | 37 | ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", 38 | "I-OTH"] 39 | 40 | processor = NERProcessor( 41 | tokenizer=tokenizer, 42 | max_seq_len=8, 43 | data_dir=Path("samples/ner"), 44 | train_filename=Path("train-sample.txt"), 45 | dev_filename=Path("dev-sample.txt"), 46 | test_filename=None, 47 | delimiter=" ", 48 | label_list=ner_labels, 49 | metric="seq_f1" 50 | ) 51 | 52 | data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1) 53 | language_model = LanguageModel.load(lang_model) 54 | prediction_head = TokenClassificationHead(num_labels=13) 55 | 56 | model = AdaptiveModel( 57 | language_model=language_model, 58 | prediction_heads=[prediction_head], 59 | embeds_dropout_prob=0.1, 60 | lm_output_types=["per_token"], 61 | device=device 62 | ) 63 | 64 | model, optimizer, lr_schedule = initialize_optimizer( 65 | model=model, 66 | learning_rate=2e-05, 67 | schedule_opts=None, 68 | n_batches=len(data_silo.loaders["train"]), 69 | n_epochs=n_epochs, 70 | device=device, 71 | use_amp=use_amp) 72 | 73 | trainer = Trainer( 74 | model=model, 75 | optimizer=optimizer, 76 | data_silo=data_silo, 77 | epochs=n_epochs, 78 | n_gpu=n_gpu, 79 | lr_schedule=lr_schedule, 80 | evaluate_every=evaluate_every, 81 | device=device, 82 | ) 83 | 84 | save_dir = Path("testsave/ner") 85 | trainer.train() 86 | 
model.save(save_dir) 87 | processor.save(save_dir) 88 | 89 | basic_texts = [ 90 | {"text": "1980 kam der Crown von Toyota"}, 91 | ] 92 | model = Inferencer.load(save_dir, num_processes=0) 93 | result = model.inference_from_dicts(dicts=basic_texts) 94 | 95 | assert result[0]["predictions"][0][0]["context"] == "1980" 96 | assert isinstance(result[0]["predictions"][0][0]["probability"], np.float32) 97 | assert np.isclose(result[0]["predictions"][0][0]["probability"], 0.161, rtol=0.05) 98 | assert result[0]["predictions"][0][0]["label"] == "LOC" 99 | 100 | 101 | if __name__ == "__main__": 102 | test_ner_amp(None) 103 | -------------------------------------------------------------------------------- /test/test_onnx_conversion.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from farm.infer import Inferencer 5 | from farm.modeling.adaptive_model import AdaptiveModel 6 | 7 | 8 | @pytest.mark.parametrize("model_name", ["deepset/bert-base-cased-squad2", "deepset/roberta-base-squad2"]) 9 | def test_onnx_conversion_and_inference(tmp_path, model_name): 10 | AdaptiveModel.convert_to_onnx( 11 | model_name=model_name, output_path=tmp_path / "test-onnx", task_type="question_answering" 12 | ) 13 | onnx_inferencer = Inferencer.load(tmp_path / "test-onnx", task_type="question_answering", num_processes=0) 14 | qa_input = [ 15 | { 16 | "questions": ["What is the population of Berlin?"], 17 | "text": "Berlin is the capital and largest city of Germany by both area and population. Its 3,769,495 " 18 | "inhabitants as of December 31, 2019 make it the most populous city of the European Union, " 19 | "according to population within city limits.The city is also one of Germany's 16 federal states.", 20 | } 21 | ] 22 | result_onnx = onnx_inferencer.inference_from_dicts(qa_input)[0] 23 | assert result_onnx["predictions"][0]["answers"][0]["answer"] == "3,769,495" 24 | 25 | pytorch_inferencer = Inferencer.load(model_name, task_type="question_answering", num_processes=0) 26 | result_pytorch = pytorch_inferencer.inference_from_dicts(qa_input)[0] 27 | 28 | for (onnx, pytorch) in zip( 29 | result_onnx["predictions"][0]["answers"][0].items(), result_pytorch["predictions"][0]["answers"][0].items() 30 | ): 31 | # keys 32 | assert onnx[0] == pytorch[0] 33 | # values 34 | if type(onnx[1]) == float: 35 | np.testing.assert_almost_equal(onnx[1], pytorch[1], decimal=4) # score 36 | else: 37 | assert onnx[1] == pytorch[1] 38 | -------------------------------------------------------------------------------- /test/test_prediction_head.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | from pathlib import Path 4 | import pytest 5 | 6 | from farm.data_handler.data_silo import DataSilo 7 | from farm.data_handler.processor import TextClassificationProcessor 8 | from farm.modeling.adaptive_model import AdaptiveModel 9 | from farm.modeling.language_model import LanguageModel 10 | from farm.modeling.prediction_head import TextClassificationHead 11 | from farm.modeling.tokenization import Tokenizer 12 | from farm.utils import set_all_seeds, initialize_device_settings 13 | 14 | 15 | def test_prediction_head_load_save_class_weights(tmp_path, caplog=None): 16 | """This is a regression test for #428 and #422.""" 17 | if caplog: 18 | caplog.set_level(logging.CRITICAL) 19 | 20 | set_all_seeds(seed=42) 21 | device, n_gpu = initialize_device_settings(use_cuda=False) 22 | batch_size = 1 23 | 
lang_model = "bert-base-german-cased" 24 | data_dir_path = "samples/doc_class" 25 | 26 | tokenizer = Tokenizer.load( 27 | pretrained_model_name_or_path=lang_model, 28 | do_lower_case=False) 29 | 30 | tcp_params = dict(tokenizer=tokenizer, 31 | max_seq_len=8, 32 | data_dir=Path(data_dir_path), 33 | train_filename="train-sample.tsv", 34 | label_list=["OTHER", "OFFENSE"], 35 | metric="f1_macro", 36 | dev_filename="test-sample.tsv", 37 | test_filename=None, 38 | dev_split=0.0, 39 | label_column_name="coarse_label") 40 | 41 | processor = TextClassificationProcessor(**tcp_params) 42 | 43 | data_silo = DataSilo( 44 | processor=processor, 45 | batch_size=batch_size) 46 | 47 | language_model = LanguageModel.load(lang_model) 48 | prediction_head = TextClassificationHead( 49 | num_labels=2, 50 | class_weights=data_silo.calculate_class_weights(task_name="text_classification")) 51 | 52 | model = AdaptiveModel( 53 | language_model=language_model, 54 | prediction_heads=[prediction_head], 55 | embeds_dropout_prob=0.1, 56 | lm_output_types=["per_sequence"], 57 | device=device) 58 | 59 | model.save(tmp_path) 60 | model_loaded = AdaptiveModel.load(tmp_path, device='cpu') 61 | assert model_loaded is not None 62 | 63 | def test_TextClassificationHead_class_weights_dimensions(): 64 | with pytest.raises(ValueError): 65 | class_wights = np.asarray([[0.4, 0.6], [0.8, 0.2]]) 66 | TextClassificationHead( 67 | num_labels=2, 68 | class_weights=class_wights) 69 | -------------------------------------------------------------------------------- /test/test_processor_saving_loading.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | 4 | from farm.data_handler.processor import TextClassificationProcessor 5 | from farm.modeling.tokenization import Tokenizer 6 | from farm.utils import set_all_seeds 7 | import torch 8 | 9 | def test_processor_saving_loading(caplog): 10 | if caplog is not None: 11 | caplog.set_level(logging.CRITICAL) 12 | 13 | set_all_seeds(seed=42) 14 | lang_model = "bert-base-cased" 15 | 16 | tokenizer = Tokenizer.load( 17 | pretrained_model_name_or_path=lang_model, do_lower_case=False 18 | ) 19 | 20 | processor = TextClassificationProcessor(tokenizer=tokenizer, 21 | max_seq_len=128, 22 | data_dir=Path("samples/doc_class"), 23 | train_filename="train-sample.tsv", 24 | dev_filename=None, 25 | test_filename=None, 26 | label_column_name="coarse_label", 27 | dev_split=0.1, 28 | label_list=["OTHER", "OFFENSE"], 29 | metric=["f1_macro"] 30 | ) 31 | dicts = processor.file_to_dicts(file=Path("samples/doc_class/train-sample.tsv")) 32 | data, tensor_names, _ = processor.dataset_from_dicts(dicts) 33 | 34 | save_dir = Path("testsave/processor") 35 | processor.save(save_dir) 36 | 37 | processor = processor.load_from_dir(save_dir) 38 | dicts = processor.file_to_dicts(file=Path("samples/doc_class/train-sample.tsv")) 39 | data_loaded, tensor_names_loaded, _ = processor.dataset_from_dicts(dicts) 40 | 41 | assert tensor_names == tensor_names_loaded 42 | for i in range(len(data.tensors)): 43 | assert torch.all(torch.eq(data.tensors[i], data_loaded.tensors[i])) 44 | 45 | if __name__ == "__main__": 46 | test_processor_saving_loading(None) 47 | -------------------------------------------------------------------------------- /test/test_s3e_pooling.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pickle 3 | from pathlib import Path 4 | 5 | from farm.data_handler.processor import 
InferenceProcessor 6 | from farm.infer import Inferencer 7 | from farm.modeling.adaptive_model import AdaptiveModel 8 | from farm.modeling.language_model import LanguageModel 9 | from farm.modeling.tokenization import Tokenizer 10 | from farm.utils import set_all_seeds, initialize_device_settings 11 | from farm.modeling.wordembedding_utils import fit_s3e_on_corpus 12 | 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def test_s3e_fit(): 18 | # small test data 19 | language_model = Path("samples/s3e/tiny_fasttext_model") 20 | corpus_path = Path("samples/s3e/tiny_corpus.txt") 21 | save_dir = Path("testsave/fitted_s3e/") 22 | do_lower_case = False 23 | batch_size = 2 24 | use_gpu = False 25 | 26 | # Fit S3E on a corpus 27 | set_all_seeds(seed=42) 28 | device, n_gpu = initialize_device_settings(use_cuda=use_gpu, use_amp=False) 29 | 30 | # Create a InferenceProcessor 31 | tokenizer = Tokenizer.load(pretrained_model_name_or_path=language_model, do_lower_case=do_lower_case) 32 | processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128) 33 | 34 | # Create an AdaptiveModel 35 | language_model = LanguageModel.load(language_model) 36 | 37 | model = AdaptiveModel( 38 | language_model=language_model, 39 | prediction_heads=[], 40 | embeds_dropout_prob=0.1, 41 | lm_output_types=[], 42 | device=device) 43 | 44 | model, processor, s3e_stats = fit_s3e_on_corpus(processor=processor, 45 | model=model, 46 | corpus=corpus_path, 47 | n_clusters=3, 48 | pca_n_components=30, 49 | svd_postprocessing=True, 50 | min_token_occurrences=1) 51 | 52 | # save everything to allow inference without fitting everything again 53 | model.save(save_dir) 54 | processor.save(save_dir) 55 | with open(save_dir / "s3e_stats.pkl", "wb") as f: 56 | pickle.dump(s3e_stats, f) 57 | 58 | # Load model, tokenizer and processor directly into Inferencer 59 | inferencer = Inferencer(model=model, processor=processor, task_type="embeddings", gpu=use_gpu, 60 | batch_size=batch_size, extraction_strategy="s3e", extraction_layer=-1, 61 | s3e_stats=s3e_stats, num_processes=0) 62 | 63 | # Input 64 | basic_texts = [ 65 | {"text": "a man is walking on the street."}, 66 | {"text": "a woman is walking on the street."}, 67 | ] 68 | 69 | # Get embeddings for input text (you can vary the strategy and layer) 70 | result = inferencer.inference_from_dicts(dicts=basic_texts) 71 | assert result[0]["context"] == basic_texts[0]["text"] 72 | assert result[0]["vec"][0] - 0.00527727306941057 < 1e-6 73 | assert result[0]["vec"][-2] - 0.06285100416478565 < 1e-6 74 | 75 | 76 | def test_load_extract_s3e_embeddings(): 77 | load_dir = Path("samples/s3e/fitted_s3e") 78 | use_gpu = False 79 | batch_size = 2 80 | 81 | with open(load_dir / "s3e_stats.pkl", "rb") as f: 82 | s3e_stats = pickle.load(f) 83 | 84 | # Init inferencer 85 | inferencer = Inferencer.load(model_name_or_path=load_dir, task_type="embeddings", gpu=use_gpu, 86 | batch_size=batch_size, extraction_strategy="s3e", extraction_layer=-1, 87 | s3e_stats=s3e_stats, num_processes=0) 88 | 89 | # Input 90 | basic_texts = [ 91 | {"text": "a man is walking on the street."}, 92 | {"text": "a woman is walking on the street."}, 93 | ] 94 | 95 | # Get embeddings for input text 96 | result = inferencer.inference_from_dicts(dicts=basic_texts) 97 | assert result[0]["context"] == basic_texts[0]["text"] 98 | assert result[0]["vec"][0] - 0.00527727306941057 < 1e-6 99 | assert result[0]["vec"][-2] + 0.06285100416478565 < 1e-6 100 | 101 | if __name__ == "__main__": 102 | test_s3e_fit() 103 | 
test_load_extract_s3e_embeddings() -------------------------------------------------------------------------------- /tutorials/sagemaker/source/doc_classification.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | from farm.data_handler.data_silo import DataSilo 6 | from farm.data_handler.processor import TextClassificationProcessor 7 | from farm.modeling.adaptive_model import AdaptiveModel 8 | from farm.modeling.language_model import LanguageModel 9 | from farm.modeling.optimization import initialize_optimizer 10 | from farm.modeling.prediction_head import TextClassificationHead 11 | from farm.modeling.tokenization import Tokenizer 12 | from farm.train import Trainer 13 | from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings 14 | 15 | 16 | def doc_classification(args): 17 | logging.basicConfig( 18 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO 19 | ) 20 | 21 | ml_logger = MLFlowLogger(tracking_uri="") 22 | ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification") 23 | 24 | set_all_seeds(seed=42) 25 | save_dir = Path("/opt/ml/model") 26 | use_amp = None 27 | 28 | device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp) 29 | 30 | # 1.Create a tokenizer 31 | tokenizer = Tokenizer.load(pretrained_model_name_or_path=args.base_lm_model, do_lower_case=False) 32 | 33 | # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset 34 | # Here we load GermEval 2018 Data. 35 | label_list = ["OTHER", "OFFENSE"] 36 | metric = "f1_macro" 37 | 38 | processor = TextClassificationProcessor( 39 | tokenizer=tokenizer, 40 | max_seq_len=args.max_seq_len, 41 | data_dir=Path("../data/germeval18"), 42 | label_list=label_list, 43 | metric=metric, 44 | label_column_name="coarse_label", 45 | ) 46 | 47 | # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a 48 | # few descriptive statistics of our datasets 49 | data_silo = DataSilo(processor=processor, batch_size=args.batch_size) 50 | 51 | # 4. Create an AdaptiveModel 52 | # a) which consists of a pretrained language model as a basis 53 | language_model = LanguageModel.load(args.base_lm_model) 54 | # b) and a prediction head on top that is suited for our task => Text classification 55 | prediction_head = TextClassificationHead( 56 | class_weights=data_silo.calculate_class_weights(task_name="text_classification"), num_labels=len(label_list) 57 | ) 58 | 59 | model = AdaptiveModel( 60 | language_model=language_model, 61 | prediction_heads=[prediction_head], 62 | embeds_dropout_prob=0.1, 63 | lm_output_types=["per_sequence"], 64 | device=device, 65 | ) 66 | 67 | # 5. Create an optimizer 68 | model, optimizer, lr_schedule = initialize_optimizer( 69 | model=model, 70 | learning_rate=3e-5, 71 | device=device, 72 | n_batches=len(data_silo.loaders["train"]), 73 | n_epochs=args.n_epochs, 74 | use_amp=use_amp, 75 | ) 76 | 77 | # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time 78 | trainer = Trainer( 79 | model=model, 80 | optimizer=optimizer, 81 | data_silo=data_silo, 82 | epochs=args.n_epochs, 83 | n_gpu=n_gpu, 84 | lr_schedule=lr_schedule, 85 | evaluate_every=args.evaluate_every, 86 | device=device, 87 | ) 88 | 89 | # 7. 
Let it grow 90 | trainer.train() 91 | 92 | # 8. Hooray! You have a model. Store it: 93 | model.save(save_dir) 94 | processor.save(save_dir) 95 | 96 | 97 | if __name__ == "__main__": 98 | parser = argparse.ArgumentParser() 99 | 100 | parser.add_argument("--n_epochs", type=int, default=2, help="number of epochs (default: 2)") 101 | parser.add_argument("--batch_size", type=int, default=4, help="batch size (default: 4)") 102 | parser.add_argument("--max_seq_len", type=int, default=64, help="maximum sequence length (default: 64)") 103 | parser.add_argument( 104 | "--base_lm_model", 105 | type=str, 106 | default="bert-base-uncased", 107 | help="base language model to use (default: bert-base-uncased)", 108 | ) 109 | parser.add_argument( 110 | "--evaluate_every", type=int, default=100, help="perform evaluation every n steps (default: 100)" 111 | ) 112 | doc_classification(parser.parse_args()) 113 | -------------------------------------------------------------------------------- /tutorials/sagemaker/source/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/deepset-ai/farm.git@c2e86cdd52242d27702f5f383883b8e3421489ee#egg=farm --------------------------------------------------------------------------------