├── .dockerignore ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ └── question.md ├── stale.yml └── workflows │ ├── ci.yml │ └── cml.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .travis.yml ├── Dockerfile ├── Dockerfile-GPU ├── Dockerfile-SageMaker ├── Dockerfile-onnxruntime ├── LICENSE ├── MANIFEST.in ├── docker-compose.yml ├── docs ├── Makefile ├── _static │ └── custom.css ├── api │ ├── data_handling.rst │ ├── modeling.rst │ └── running.rst ├── basic_usage.rst ├── conf.py ├── data_handling.rst ├── examples.rst ├── img │ ├── adaptive_model_no_bg.jpg │ ├── adaptive_model_no_bg_small.jpg │ ├── code_snippet_building_blocks.png │ ├── code_snippet_experiment.png │ ├── code_snippet_inference.png │ ├── data_silo_no_bg.jpg │ ├── data_silo_no_bg_small.jpg │ ├── deepset_logo.png │ ├── farm_logo_text_right_wide.png │ ├── inference-api-screen.png │ ├── logo.png │ └── sample_basket_no_bg.jpg ├── index.rst ├── installation.rst ├── make.bat ├── modeling.rst └── qa_formats.py ├── examples ├── conversion_huggingface_models.py ├── conversion_huggingface_models_classification.py ├── doc_classification.py ├── doc_classification_cola.py ├── doc_classification_crossvalidation.py ├── doc_classification_custom_optimizer.py ├── doc_classification_fasttext_LM.py ├── doc_classification_holdout.py ├── doc_classification_multilabel.py ├── doc_classification_multilabel_roberta.py ├── doc_classification_with_earlystopping.py ├── doc_classification_word_embedding_LM.py ├── doc_regression.py ├── dpr_encoder.py ├── embeddings_extraction.py ├── embeddings_extraction_s3e_pooling.py ├── evaluation.py ├── lm_finetuning.py ├── mtl01_tclass_tclass.py ├── natural_questions.py ├── ner.py ├── onnx_question_answering.py ├── passage_ranking.py ├── question_answering.py ├── question_answering_confidence.py ├── question_answering_crossvalidation.py ├── streaming_inference.py ├── text_pair_classification.py ├── train_from_scratch.py ├── train_from_scratch_with_sagemaker.py └── wordembedding_inference.py ├── experiments ├── electra_eval │ └── conll2003_en_config.json ├── german-bert2.0-eval │ ├── germEval14_config.json │ ├── germEval18Coarse_config.json │ └── germEval18Fine_config.json ├── lm_finetuning │ └── finetune_sample_config.json ├── ner │ ├── conll2003_de_config.json │ ├── conll2003_en_config.json │ └── germEval14_config.json ├── qa │ └── squad20_config.json ├── text_classification │ ├── cola_config.json │ ├── germEval18Coarse_config.json │ ├── germEval18Fine_config.json │ └── gnad_config.json ├── text_pair_classification │ └── asnq_binary_config.json └── xlm_roberta_eval │ ├── conll2003_de_config.json │ ├── germEval14_config.json │ └── germEval18Coarse_config.json ├── farm ├── __init__.py ├── _version.py ├── conversion │ ├── __init__.py │ ├── convert_tf_checkpoint_to_pytorch.py │ └── transformers.py ├── data_handler │ ├── __init__.py │ ├── data_silo.py │ ├── dataloader.py │ ├── dataset.py │ ├── input_features.py │ ├── inputs.py │ ├── nq_utils.py │ ├── processor.py │ ├── samples.py │ └── utils.py ├── eval.py ├── evaluation │ ├── __init__.py │ ├── metrics.py │ ├── msmarco_passage_farm.py │ ├── msmarco_passage_official.py │ ├── semantic_answer_similarity_evaluation.py │ └── squad_evaluation.py ├── experiment.py ├── file_utils.py ├── infer.py ├── inference_rest_api.py ├── modeling │ ├── __init__.py │ ├── adaptive_model.py │ ├── biadaptive_model.py │ ├── language_model.py │ ├── optimization.py │ ├── prediction_head.py │ ├── predictions.py │ ├── tokenization.py │ └── wordembedding_utils.py ├── 
train.py ├── utils.py └── visual │ ├── __init__.py │ └── ascii │ ├── __init__.py │ ├── images.py │ └── text.py ├── readme.rst ├── requirements.txt ├── run_all_experiments.py ├── setup.cfg ├── setup.py ├── test ├── benchmarks │ ├── README.md │ ├── conftest.py │ ├── convert_result_to_csv.py │ ├── question_answering.py │ ├── question_answering_accuracy.py │ ├── question_answering_components.html │ ├── question_answering_components.py │ ├── sample_file.txt │ └── samples │ │ ├── question_answering_questions.txt │ │ └── question_answering_sample.txt ├── conftest.py ├── create_testdata.py ├── modeling │ └── test_optimization.py ├── samples │ ├── doc_class │ │ ├── test-sample.tsv │ │ └── train-sample.tsv │ ├── doc_class_other_text_column_name │ │ ├── test-sample.tsv │ │ └── train-sample.tsv │ ├── doc_regr │ │ ├── test-sample.tsv │ │ └── train-sample.tsv │ ├── doc_regr_other_text_column_name │ │ ├── test-sample.tsv │ │ └── train-sample.tsv │ ├── dpr │ │ └── sample.json │ ├── lm_finetuning │ │ ├── test-sample.txt │ │ └── train-sample.txt │ ├── ner │ │ ├── dev-sample.txt │ │ └── train-sample.txt │ ├── nq │ │ ├── dev_sample.jsonl │ │ └── train_sample.jsonl │ ├── qa │ │ ├── answer-offset-wrong.json │ │ ├── answer-wrong.json │ │ ├── dev-sample.json │ │ ├── eval-sample.json │ │ ├── noanswer.json │ │ ├── train-sample.json │ │ └── vanilla.json │ ├── s3e │ │ ├── fitted_s3e │ │ │ ├── language_model_config.json │ │ │ ├── processor_config.json │ │ │ ├── s3e_stats.pkl │ │ │ ├── vectors.txt │ │ │ └── vocab.txt │ │ ├── tiny_corpus.txt │ │ └── tiny_fasttext_model │ │ │ ├── language_model_config.json │ │ │ ├── vectors.txt │ │ │ └── vocab.txt │ ├── text_pair │ │ └── sample.tsv │ └── tokenizer │ │ ├── bert-base-cased-vocab.txt │ │ └── custom_vocab.txt ├── test_conversion.py ├── test_data_silo.py ├── test_doc_classification_distilbert.py ├── test_doc_regression.py ├── test_dpr.py ├── test_evaluation_metrics.py ├── test_inference.py ├── test_lm_finetuning.py ├── test_model_versioning.py ├── test_natural_questions.py ├── test_ner.py ├── test_ner_amp.py ├── test_onnx_conversion.py ├── test_prediction_head.py ├── test_processor_qa.py ├── test_processor_saving_loading.py ├── test_question_answering.py ├── test_s3e_pooling.py ├── test_text_pair.py └── test_tokenization.py └── tutorials ├── 1_farm_building_blocks.ipynb ├── 2_Build_a_processor_for_your_own_dataset.ipynb └── sagemaker ├── 3_train_with_sagemaker.ipynb └── source ├── doc_classification.py └── requirements.txt /.dockerignore: -------------------------------------------------------------------------------- 1 | saved_models/** 2 | data/** -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: We love animals, but bugs need to be reported. 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **Error message** 14 | Error that was thrown (if available) 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Additional context** 20 | Add any other context about the problem here, like type of downstream task, part of etc.. 
21 | 22 | **To Reproduce** 23 | Steps to reproduce the behavior 24 | 25 | **System:** 26 | - OS: 27 | - GPU/CPU: 28 | - FARM version: 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: 'Got an idea for improving FARM? ' 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem or particular use case?** 11 | A clear and concise description of what the problem or use case is. 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Additional context** 17 | Add any other context or screenshots about the feature request here. 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Not sure how to use a component? Just ask :) 4 | title: '' 5 | labels: question 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Question** 11 | Put your question here 12 | 13 | **Additional context** 14 | Add any other context or screenshots about the question (optional). 15 | -------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 120 3 | # Number of days of inactivity before a stale issue is closed 4 | daysUntilClose: 21 5 | # Issues with these labels will never be considered stale 6 | exemptLabels: 7 | - pinned 8 | - security 9 | # Label to use when marking an issue as stale 10 | staleLabel: stale 11 | # Comment to post when marking an issue as stale. Set to `false` to disable 12 | markComment: > 13 | This issue has been automatically marked as stale because it has not had 14 | recent activity. It will be closed in 21 days if no further activity occurs. 15 | 16 | # Comment to post when closing a stale issue. Set to `false` to disable 17 | closeComment: false 18 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-20.04 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | 17 | - name: Set up Python 3.8 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: 3.8 21 | 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | pip install pytest 26 | pip install -r requirements.txt 27 | pip install onnxruntime 28 | pip install -e . 
29 | 30 | - name: Run pytest - only "conversion" marker 31 | run: cd test && pytest -m "conversion" 32 | 33 | - name: Run Pytest - all except conversion marker 34 | run: cd test && pytest -m "not conversion" -------------------------------------------------------------------------------- /.github/workflows/cml.yaml: -------------------------------------------------------------------------------- 1 | name: benchmarks 2 | 3 | on: 4 | workflow_dispatch: 5 | pull_request: 6 | types: [labeled] 7 | jobs: 8 | deploy-cloud-runner: 9 | if: ${{ (github.event.action == 'labeled' && github.event.label.name == 'benchmark') || github.event.action == 'workflow_dispatch' }} 10 | runs-on: [ubuntu-latest] 11 | container: docker://dvcorg/cml 12 | steps: 13 | - name: deploy 14 | env: 15 | repo_token: ${{ secrets.HAYSTACK_BOT_TOKEN }} 16 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_CI_ACCESS_KEY }} 17 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_CI_SECRET_ACCESS_KEY }} 18 | VPC: ${{ secrets.AWS_CI_VPC }} 19 | run: | 20 | echo "Deploying..." 21 | RUNNER_LABELS="cml,aws" 22 | RUNNER_REPO="https://github.com/${GITHUB_REPOSITORY}" 23 | MACHINE="cml$(date +%s)" 24 | docker-machine create \ 25 | --driver amazonec2 \ 26 | --amazonec2-instance-type p3.8xlarge \ 27 | --amazonec2-vpc-id $VPC \ 28 | --amazonec2-region us-east-1 \ 29 | --amazonec2-zone c \ 30 | --amazonec2-ssh-user ubuntu \ 31 | --amazonec2-ami ami-06a25ee8966373068 \ 32 | --amazonec2-root-size 150 \ 33 | $MACHINE 34 | eval "$(docker-machine env --shell sh $MACHINE)" 35 | 36 | ( 37 | docker-machine ssh $MACHINE "sudo mkdir -p \ 38 | /docker_machine && \ 39 | sudo chmod 777 /docker_machine" && \ 40 | docker-machine scp -r -q ~/.docker/machine/ \ 41 | $MACHINE:/docker_machine && \ 42 | docker run --name runner -d \ 43 | --gpus all \ 44 | -v /docker_machine/machine:/root/.docker/machine \ 45 | --net host \ 46 | --ipc host \ 47 | -e DOCKER_MACHINE=$MACHINE \ 48 | -e repo_token=$repo_token \ 49 | -e RUNNER_LABELS=$RUNNER_LABELS \ 50 | -e RUNNER_REPO=$RUNNER_REPO \ 51 | -e RUNNER_IDLE_TIMEOUT=120 \ 52 | dvcorg/cml-py3:latest && \ 53 | sleep 20 && echo "Deployed $MACHINE" 54 | ) || (echo "Shut down machine" && docker-machine rm -y -f $MACHINE && exit 1) 55 | run-benchmark: 56 | if: ${{ (github.event.action == 'labeled' && github.event.label.name == 'benchmark') || github.event.action == 'workflow_dispatch' }} 57 | needs: deploy-cloud-runner 58 | runs-on: [self-hosted,cml] 59 | steps: 60 | - uses: actions/checkout@v2 61 | - name: cml_run 62 | env: 63 | repo_token: ${{ secrets.HAYSTACK_BOT_TOKEN }} 64 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_CI_ACCESS_KEY }} 65 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_CI_SECRET_ACCESS_KEY }} 66 | run: | 67 | apt-get update -y 68 | apt-get install python3-dev -y 69 | pip install -r requirements.txt 70 | pip install . 
71 | cd test/benchmarks && python question_answering_accuracy.py 72 | echo -en "## Benchmarks: QA Accuracy\n" >> accuracy_report.md 73 | cat results_accuracy.md >> accuracy_report.md 74 | cml-send-comment accuracy_report.md 75 | python question_answering_components.py 76 | echo -en "## Benchmarks: QA per component\n" >> components_report.md 77 | cat results_per_component.md >> components_report.md 78 | cml-send-comment components_report.md 79 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Initially taken from Github's Python gitignore file 2 | 3 | apex 4 | 5 | Pipfile* 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # celery beat schedule file 90 | celerybeat-schedule 91 | 92 | # SageMath parsed files 93 | *.sage.py 94 | 95 | # Environments 96 | .env 97 | .venv 98 | env/ 99 | venv/ 100 | ENV/ 101 | env.bak/ 102 | venv.bak/ 103 | 104 | # Spyder project settings 105 | .spyderproject 106 | .spyproject 107 | 108 | # Rope project settings 109 | .ropeproject 110 | 111 | # mkdocs documentation 112 | /site 113 | 114 | # mypy 115 | .mypy_cache/ 116 | .dmypy.json 117 | dmypy.json 118 | 119 | # Pyre type checker 120 | .pyre/ 121 | 122 | # vscode 123 | .vscode 124 | 125 | # pycharm 126 | .idea/ 127 | 128 | # TF code 129 | tensorflow_code 130 | 131 | # training data 132 | data/ 133 | 134 | # models 135 | models/ 136 | save/ 137 | testsave/ 138 | saved_models/ 139 | 140 | # mlruns 141 | mlruns/ 142 | .DS_Store 143 | 144 | # cache 145 | *cache* 146 | 147 | sandbox/ 148 | 149 | 150 | # files created by example scripts 151 | examples/doc_classification_holdout.results.json 152 | examples/doc_classification_xval.results.json 153 | 154 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/python/black 3 | rev: stable 4 | hooks: 5 | - id: black 6 | language_version: python3 7 | -------------------------------------------------------------------------------- /.travis.yml: 
-------------------------------------------------------------------------------- 1 | language: python 2 | sudo: false 3 | cache: pip 4 | python: 5 | - "3.7" 6 | install: 7 | - "pip install -e ." 8 | - "pip install sphinx==2.1.2" 9 | - "pip install sphinx-rtd-theme==0.4.3" 10 | script: 11 | - "cd test && pytest" 12 | - "cd ../docs && sphinx-build -W -b html -d _build/doctrees . _build/html" 13 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7.4-stretch 2 | 3 | WORKDIR /home/user 4 | 5 | COPY setup.py requirements.txt readme.rst /home/user/ 6 | RUN pip install -r requirements.txt 7 | RUN pip install -e . 8 | 9 | COPY farm /home/user/farm 10 | # optionally: copy some base models into the image to allow simple demos / comparisons 11 | #COPY saved_models /home/user/base_models 12 | 13 | CMD FLASK_APP=farm.inference_rest_api flask run --host 0.0.0.0 14 | -------------------------------------------------------------------------------- /Dockerfile-GPU: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.5-cuda10.1-cudnn7-devel 2 | 3 | RUN apt-get update && apt-get install -y git 4 | 5 | # Setup locales 6 | RUN apt-get update \ 7 | && apt-get install -y --no-install-recommends \ 8 | locales 9 | RUN locale-gen en_US.UTF-8 10 | ENV LANG en_US.UTF-8 11 | ENV LANGUAGE en_US:en 12 | ENV LC_ALL en_US.UTF-8 13 | 14 | WORKDIR /home/user 15 | 16 | # Install apex 17 | RUN git clone https://github.com/NVIDIA/apex \ 18 | && cd apex \ 19 | && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ 20 | 21 | 22 | 23 | # Install FARM 24 | COPY setup.py requirements.txt readme.rst /home/user/ 25 | RUN pip install -r requirements.txt 26 | COPY farm farm 27 | RUN pip install -e . 
28 | 29 | 30 | # Copy Training Scripts 31 | COPY examples examples 32 | 33 | CMD FLASK_APP=farm.inference_rest_api flask run --host 0.0.0.0 34 | -------------------------------------------------------------------------------- /Dockerfile-SageMaker: -------------------------------------------------------------------------------- 1 | FROM deepset/farm-gpu:latest 2 | COPY examples examples 3 | #COPY data/test data/test 4 | 5 | # ENV SAGEMAKER_PROGRAM train.py 6 | ENTRYPOINT ["python3","-m", "torch.distributed.launch", "--nproc_per_node=4", "examples/train_from_scratch_with_sagemaker.py"] 7 | -------------------------------------------------------------------------------- /Dockerfile-onnxruntime: -------------------------------------------------------------------------------- 1 | # Adapted from ONNXRuntime CUDA Dockerfile at https://github.com/microsoft/onnxruntime/blob/master/dockerfiles/Dockerfile.cuda 2 | 3 | FROM nvidia/cuda:10.1-cudnn7-devel 4 | 5 | ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime 6 | ARG ONNXRUNTIME_BRANCH=master 7 | 8 | RUN apt-get update &&\ 9 | apt-get install -y sudo git bash 10 | 11 | WORKDIR /code 12 | ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/code/cmake-3.14.3-Linux-x86_64/bin:/opt/miniconda/bin:${PATH} 13 | ENV LD_LIBRARY_PATH /opt/miniconda/lib:$LD_LIBRARY_PATH 14 | 15 | # Prepare onnxruntime repository & build onnxruntime with CUDA 16 | RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\ 17 | /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh &&\ 18 | cp onnxruntime/docs/Privacy.md /code/Privacy.md &&\ 19 | cp onnxruntime/ThirdPartyNotices.txt /code/ThirdPartyNotices.txt &&\ 20 | cp onnxruntime/dockerfiles/LICENSE-IMAGE.txt /code/LICENSE-IMAGE.txt &&\ 21 | cd onnxruntime &&\ 22 | /bin/sh ./build.sh --cuda_home /usr/local/cuda --cudnn_home /usr/lib/x86_64-linux-gnu/ --use_cuda --config Release --build_wheel --update --build --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) &&\ 23 | pip install /code/onnxruntime/build/Linux/Release/dist/*.whl &&\ 24 | cd .. &&\ 25 | rm -rf onnxruntime cmake-3.14.3-Linux-x86_64 26 | 27 | # Clone FARM repositry and install the requirements 28 | RUN git clone --depth 1 --branch 0.4.3 https://github.com/deepset-ai/farm.git 29 | RUN pip install -e FARM 30 | RUN pip install -r FARM/test/requirements.txt -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include requirements.txt 3 | include readme.rst 4 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | inference-api: 4 | # this Docker image comes with preloaded models. 
5 | image: "deepset/farm-inference-api:base-models-0.4.2" 6 | ports: 7 | - "5000:5000" 8 | # (optional) mount your own models 9 | volumes: 10 | - "./saved_models:/home/user/saved_models" 11 | inference-ui: 12 | image: "deepset/farm-inference-ui:latest" 13 | ports: 14 | - "3000:80" 15 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/custom.css: -------------------------------------------------------------------------------- 1 | .wy-side-nav-search{ background-color: #FFEFDB } 2 | a.icon-home { color: #18A063 } 3 | .icon-home:before{ display:none } -------------------------------------------------------------------------------- /docs/api/data_handling.rst: -------------------------------------------------------------------------------- 1 | Data Handling 2 | ============= 3 | 4 | 5 | Processor 6 | --------- 7 | 8 | .. automodule:: farm.data_handler.processor 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | :exclude-members: subclasses 13 | 14 | Data Silo 15 | ---------- 16 | 17 | .. automodule:: farm.data_handler.data_silo 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | Dataset 23 | ------- 24 | 25 | .. automodule:: farm.data_handler.dataset 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | DataLoader 31 | ---------- 32 | 33 | .. automodule:: farm.data_handler.dataloader 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | Samples 39 | ------- 40 | 41 | .. automodule:: farm.data_handler.samples 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | Input Features 47 | --------------- 48 | 49 | .. automodule:: farm.data_handler.input_features 50 | :members: 51 | :undoc-members: 52 | :show-inheritance: 53 | -------------------------------------------------------------------------------- /docs/api/modeling.rst: -------------------------------------------------------------------------------- 1 | Modeling 2 | ======== 3 | 4 | Adaptive Model 5 | ------------------ 6 | 7 | .. automodule:: farm.modeling.adaptive_model 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | BiAdaptive Model 13 | ------------------ 14 | 15 | .. automodule:: farm.modeling.biadaptive_model 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | Language Model 21 | -------------- 22 | 23 | .. automodule:: farm.modeling.language_model 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | :exclude-members: 28 | 29 | Prediction Head 30 | --------------- 31 | 32 | .. 
automodule:: farm.modeling.prediction_head 33 | :members: 34 | :undoc-members: 35 | :show-inheritance: 36 | :exclude-members: subclasses 37 | 38 | 39 | Optimization 40 | ------------ 41 | 42 | .. automodule:: farm.modeling.optimization 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | Tokenization 48 | ------------ 49 | 50 | .. automodule:: farm.modeling.tokenization 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /docs/api/running.rst: -------------------------------------------------------------------------------- 1 | Running 2 | ======= 3 | 4 | 5 | Train 6 | ----- 7 | 8 | .. automodule:: farm.train 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | 13 | Eval 14 | ---- 15 | 16 | .. automodule:: farm.eval 17 | :members: 18 | :undoc-members: 19 | :show-inheritance: 20 | 21 | Infer 22 | ----- 23 | 24 | .. automodule:: farm.infer 25 | :members: 26 | :undoc-members: 27 | :show-inheritance: 28 | 29 | Experiment 30 | ---------- 31 | 32 | .. automodule:: farm.experiment 33 | :members: 34 | :undoc-members: 35 | :show-inheritance: 36 | 37 | Metrics 38 | ------- 39 | 40 | .. automodule:: farm.evaluation.metrics 41 | :members: 42 | :undoc-members: 43 | :show-inheritance: 44 | 45 | File utils 46 | ----------- 47 | 48 | .. automodule:: farm.file_utils 49 | :members: 50 | :undoc-members: 51 | :show-inheritance: 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /docs/basic_usage.rst: -------------------------------------------------------------------------------- 1 | Basic Usage 2 | ############ 3 | 4 | 1. Train a downstream model 5 | **************************** 6 | FARM offers two modes for model training: 7 | 8 | **Option 1: Run experiment(s) from config**:: 9 | 10 | from farm.experiment import run_experiment, load_experiments 11 | experiments = load_experiments(Path("experiments/ner/conll2003_de_config.json")) 12 | run_experiment(experiments[0]) 13 | 14 | *Use cases:* Training your first model, hyperparameter optimization, evaluating a language model on multiple down-stream tasks. 15 | 16 | **Option 2: Stick together your own building blocks**:: 17 | 18 | # Choose a language model (e.g. from transformers' model hub: https://huggingface.co/models) 19 | language_model = "bert-base-german-cased" 20 | 21 | # Basic building blocks for data handling 22 | tokenizer = Tokenizer.load(pretrained_model_name_or_path=language_model) 23 | processor = NERProcessor(tokenizer=tokenizer, data_dir=Path("../data/conll03-de"), max_seq_len=128) 24 | ... 25 | 26 | # AdaptiveModel = LanguageModel + PredictionHead(s) 27 | language_model = LanguageModel.load(language_model) 28 | prediction_head = TokenClassificationHead(num_labels=13) 29 | model = AdaptiveModel(language_model=language_model, prediction_heads=[prediction_head], ...) 30 | ... 31 | 32 | # Feed it to a Trainer, which takes care of growing our model 33 | trainer = Trainer( 34 | model=model, 35 | optimizer=optimizer, 36 | data_silo=data_silo, 37 | epochs=n_epochs, 38 | lr_schedule=lr_schedule, 39 | evaluate_every=evaluate_every, 40 | n_gpu=n_gpu, 41 | device=device) 42 | 43 | # 7. Let it grow 44 | model = trainer.train() 45 | 46 | See this `tutorial `_ for details 47 | 48 | *Use cases:* Custom datasets, language models, prediction heads ... 49 | 50 | 2. 
Run Inference 51 | ***************** 52 | Use a `public model `__ or your own to get predictions:: 53 | 54 | # Load model, tokenizer & processor (local or any from https://huggingface.co/models) 55 | nlp = Inferencer.load("deepset/bert-large-uncased-whole-word-masking-squad2", task_type="question_answering") 56 | 57 | # Run predictions 58 | QA_input = [{"questions": ["Why is model conversion important?"], 59 | "text": "Model conversion lets people easily switch between frameworks."}] 60 | result = nlp.inference_from_dicts(dicts=QA_input) 61 | 62 | 3. Showcase your model (API + UI) 63 | ********************************** 64 | 65 | Quick start 66 | =============== 67 | 68 | * Run :code:`docker-compose up` 69 | * Open http://localhost:3000 in your browser 70 | 71 | .. image:: img/inference-api-screen.png 72 | :alt: FARM Inferennce UI 73 | 74 | One docker container exposes a REST API (localhost:5000) and another one runs a simple demo UI (localhost:3000). 75 | You can use both of them individually and mount your own models. 76 | 77 | API Docker 78 | ============== 79 | *(deepset/farm-inference-api)* 80 | 81 | The API container includes FARM and is made for running trained (multiple) down-stream models in inference mode. It exposes a REST API on port 5000. 82 | 83 | You can either start the docker via docker-compose (recommended) or manually via: 84 | :code:`docker run -d -p 5000:5000 deepset/farm-inference-api:base-models` 85 | 86 | **What models are loaded?** 87 | 88 | The container is loading all models located in the docker's directory :code:`/home/user/saved_models`. 89 | We have one image version with some exemplary models stored in this directory: :code:`farm-inference-api:base-models`. 90 | This might be helpful if you just want to try the API/UI or compare your own model to some other baselines. 91 | If you only want to run your own models, you can also use the smaller image with tag :code:`farm-inference-api:lastest` 92 | 93 | **How can I add my own models?** 94 | 95 | Just mount them from your disk into the docker directory :code:`/home/user/saved_models`. 96 | The easiest way of doing this is to edit the :code:`docker-compose.yml`. Just put your own path with the model folder(s):: 97 | 98 | volumes: 99 | - "./your_path/some_folder:/home/user/saved_models" 100 | 101 | If you don't run via docker-compose you can also supply the mounted volume to :code:`docker run`:: 102 | 103 | docker run -d \ 104 | -p 5000:5000 \ 105 | -v /your_path/some_folder:/home/user/saved_models \ 106 | deepset/farm-inference-api:base-models 107 | 108 | UI Docker 109 | ============= 110 | *(deepset/farm-inference-ui)* 111 | 112 | The UI container can be launched in addition to provide a frontend that queries the API exposed on port 5000 by the other container. 113 | Start the container via docker-compose or individually via 114 | 115 | :code:`docker run -d -p 3000:80 deepset/farm-inference-ui` 116 | 117 | Open localhost:3000 in your browser. Then simply select the tab with your task on the left (e.g. QA), one of the models 118 | exposed by the API and enter some text that you want to feed to the model. 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
For a full 4 | # list see the documentation: 5 | # http://www.sphinx-doc.org/en/master/config 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | 16 | sys.path.insert(0, os.path.abspath("..")) 17 | 18 | 19 | # -- Project information ----------------------------------------------------- 20 | 21 | project = "FARM" 22 | copyright = "2019, deepset" 23 | author = "deepset" 24 | 25 | 26 | # -- General configuration --------------------------------------------------- 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.viewcode"] 32 | 33 | # Add any paths that contain templates here, relative to this directory. 34 | templates_path = ["_templates"] 35 | 36 | # List of patterns, relative to source directory, that match files and 37 | # directories to ignore when looking for source files. 38 | # This pattern also affects html_static_path and html_extra_path. 39 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 40 | 41 | autodoc_member_order = "bysource" 42 | 43 | # -- Options for HTML output ------------------------------------------------- 44 | 45 | # The theme to use for HTML and HTML Help pages. See the documentation for 46 | # a list of builtin themes. 47 | # 48 | html_theme = "sphinx_rtd_theme" 49 | 50 | # Add any paths that contain custom static files (such as style sheets) here, 51 | # relative to this directory. They are copied after the builtin static files, 52 | # so a file named "default.css" will overwrite the builtin "default.css". 53 | html_static_path = ["_static"] 54 | 55 | 56 | html_logo = "img/logo.png" 57 | 58 | html_context = {"css_files": ["_static/custom.css"]} 59 | 60 | # -- Add autodocs for __init__() methods ------------------------------------- 61 | 62 | 63 | def skip(app, what, name, obj, would_skip, options): 64 | if name == "__init__": 65 | return False 66 | return would_skip 67 | 68 | 69 | def setup(app): 70 | app.connect("autodoc-skip-member", skip) 71 | -------------------------------------------------------------------------------- /docs/data_handling.rst: -------------------------------------------------------------------------------- 1 | Data Handling 2 | ================================ 3 | 4 | 5 | Design Philosophy 6 | ################## 7 | In many cases adapting a language model to your own NLP problem requires heavy lifting on the preprocessing side. 8 | To lessen this burden, we have designed the data handling with a few goals in mind. 
We want: 9 | 10 | * Customization of preprocessing components to be easy 11 | * Inspection of the inputs and outputs of different preprocessing stages to be possible 12 | * A structure that is general enough to handle the requirements of different NLP tasks 13 | 14 | As such, you will find the following features in our code: 15 | 16 | * The functions that we expect the user to customize are grouped together 17 | * Many of the generic pipeline components are easily reusable 18 | * There is a clear separation of generic and dataset/task/model specific components in the pipeline 19 | * Processing goes stage by stage rather than sample by sample so that you are able to inspect the full dataset at any point in the processing 20 | * Powerful debugging that allows inspecting a sample in different phases of the pipeline (raw, tokenized, featurized, tensors ...) 21 | 22 | Building Blocks 23 | ################# 24 | 25 | .. image:: img/data_silo_no_bg.jpg 26 | :alt: FARM Data Silo 27 | 28 | In FARM the **Processor** contains the functions which handle the **conversion from file or request to PyTorch Datasets**. 29 | In essence, it prepares data to be consumed by the modelling components. 30 | This is done in stages to allow for easier debugging. 31 | It should be able to handle file input or requests. 32 | This class contains everything that needs to be customized when adapting a new dataset. 33 | Custom datasets can be handled by extending the Processor (e.g. see CONLLProcessor). 34 | 35 | The **DataSilo** is a generic class that stores the train, dev and test data sets. 36 | It calls upon the methods from the Processor to do the loading and then exposes a DataLoader for each set. 37 | In cases where there is not a separate dev file, it will create one by slicing the train set. 38 | 39 | .. image:: img/sample_basket_no_bg.jpg 40 | :alt: FARM Sample Basket 41 | 42 | The **Sample** and **SampleBasket** objects allow powerful debugging and logging capabilities as they store different views on the same sample (raw, tokenized, featurized ...) 43 | The **SampleBasket** stores one string sample as well as the one or more **Samples** that that string sample might generate. 44 | These data structures are design like this since a single document only generates one sample when performing document classification but can generate multiple samples for question answering. 45 | 46 | 47 | -------------------------------------------------------------------------------- /docs/examples.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | ================================ 3 | 4 | You can find exemplary scripts for the major down-stream tasks in :code:`examples/` 5 | 6 | Document Classification 7 | ########################## 8 | (see :code:`examples/doc_classification.py` for full script) 9 | 10 | 1.Create a tokenizer:: 11 | 12 | tokenizer = Tokenizer.load( 13 | pretrained_model_name_or_path=lang_model, 14 | do_lower_case=False) 15 | 16 | 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset:: 17 | 18 | processor = GermEval18CoarseProcessor(tokenizer=tokenizer, 19 | max_seq_len=128, 20 | data_dir="../data/germeval18") 21 | 22 | 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets:: 23 | 24 | data_silo = DataSilo( 25 | processor=processor, 26 | batch_size=batch_size) 27 | 28 | 4. 
Create an AdaptiveModel 29 | a) which consists of a pretrained language model as a basis:: 30 | 31 | language_model = LanguageModel.load(lang_model) 32 | 33 | b) and a prediction head on top that is suited for our task => Text classification:: 34 | 35 | prediction_head = TextClassificationHead(layer_dims=[768, len(processor.label_list)]) 36 | 37 | model = AdaptiveModel( 38 | language_model=language_model, 39 | prediction_heads=[prediction_head], 40 | embeds_dropout_prob=0.1, 41 | lm_output_types=["per_sequence"], 42 | device=device) 43 | 44 | 5. Create an optimizer and optionally optimize model and optimizer with AMP:: 45 | 46 | model, optimizer, warmup_linear = initialize_optimizer( 47 | model=model, 48 | learning_rate=2e-5, 49 | warmup_proportion=0.1, 50 | n_examples=data_silo.n_samples("train"), 51 | batch_size=batch_size, 52 | n_epochs=1) 53 | 54 | 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time:: 55 | 56 | trainer = Trainer( 57 | optimizer=optimizer, 58 | data_silo=data_silo, 59 | epochs=n_epochs, 60 | n_gpu=1, 61 | warmup_linear=warmup_linear, 62 | evaluate_every=evaluate_every, 63 | device=device) 64 | 65 | 7. Let it grow:: 66 | 67 | model = trainer.train(model) 68 | 69 | 8. Hooray! You have a model. Store it:: 70 | 71 | save_dir = "save/bert-german-GNAD-tutorial" 72 | model.save(save_dir) 73 | processor.save(save_dir) 74 | 75 | 9. Load it & harvest your fruits (Inference):: 76 | 77 | basic_texts = [ 78 | {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot ist"}, 79 | {"text": "Martin Müller spielt Fussball"}, 80 | ] 81 | model = Inferencer(save_dir) 82 | result = model.inference_from_dicts(dicts=basic_texts) 83 | print(result) 84 | -------------------------------------------------------------------------------- /docs/img/adaptive_model_no_bg.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/docs/img/adaptive_model_no_bg.jpg -------------------------------------------------------------------------------- /docs/img/adaptive_model_no_bg_small.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/docs/img/adaptive_model_no_bg_small.jpg -------------------------------------------------------------------------------- /docs/img/code_snippet_building_blocks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/docs/img/code_snippet_building_blocks.png -------------------------------------------------------------------------------- /docs/img/code_snippet_experiment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/docs/img/code_snippet_experiment.png -------------------------------------------------------------------------------- /docs/img/code_snippet_inference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/docs/img/code_snippet_inference.png -------------------------------------------------------------------------------- /docs/img/data_silo_no_bg.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/docs/img/data_silo_no_bg.jpg -------------------------------------------------------------------------------- /docs/img/data_silo_no_bg_small.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/docs/img/data_silo_no_bg_small.jpg -------------------------------------------------------------------------------- /docs/img/deepset_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/docs/img/deepset_logo.png -------------------------------------------------------------------------------- /docs/img/farm_logo_text_right_wide.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/docs/img/farm_logo_text_right_wide.png -------------------------------------------------------------------------------- /docs/img/inference-api-screen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/docs/img/inference-api-screen.png -------------------------------------------------------------------------------- /docs/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/docs/img/logo.png -------------------------------------------------------------------------------- /docs/img/sample_basket_no_bg.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/docs/img/sample_basket_no_bg.jpg -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ############# 3 | Recommended (because of highly active development):: 4 | 5 | git clone https://github.com/deepset-ai/FARM.git 6 | cd FARM 7 | pip install -r requirements.txt 8 | pip install --editable . 9 | 10 | If problems occur, please do a git pull. the --editable flag will update changes immediately. 11 | 12 | With pip:: 13 | 14 | pip install farm 15 | 16 | We recommend using Python 3.7. 17 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/modeling.rst: -------------------------------------------------------------------------------- 1 | Modeling 2 | ================================ 3 | 4 | 5 | Design Philosophy 6 | ################## 7 | We live in exciting times for NLP and see new publications on language models, adaptation strategies and down-stream applications on a weekly basis. 8 | However, keeping up with recent developments is not easy. Switching between pretrained models or adaptation strategies is also hard in practice, since most researchers publish their models in individual repos and do not always implement the desired down-stream tasks. 9 | 10 | FARM offers a more flexible and general approach to transfer learning by abstracting away the underlying pretrained language models and their prediction heads. 11 | With FARM you can stick together any pretrained language model (BERT, XLNet or whatever comes next) with one or multiple prediction heads (NER, Doc classification ...) to form an AdaptiveModel. 12 | This allows fast and easy comparison between different language models and simplifies changes in your production system if you want to migrate to a new model. 13 | 14 | Building Blocks 15 | ################# 16 | 17 | .. image:: img/adaptive_model_no_bg.jpg 18 | :alt: FARM Adaptive Model 19 | 20 | 1. Language Model 21 | ******************** 22 | * Standardized parent class for all language models out there (BERT, XLNet ...). 23 | * A pretrained language model converts tokens to vector representations 24 | 25 | 2. Prediction Head 26 | ******************** 27 | * Standardized parent class for all types of down-stream tasks (NER, Text classification, QA ...). 28 | * A prediction head retrieves vector representations from the language model and converts them into down-stream predictions (e.g. class probabilities) 29 | 30 | 31 | 3. AdaptiveModel 32 | ******************** 33 | * Standardized parent class for end-to-end transfer learning models 34 | * Combines the language model with one or multiple prediction heads. 35 | * An AdaptiveModel propagates the input to the language model, feeds its output to the prediction head(s) and then consolidates the loss(es) / predictions. During training the loss is backpropagated through the entire neural network (incl. the language model). We will soon provide further adaptation strategies here, such as Adapter Modules or Discriminative Finetuning. 36 | 37 | All three classes provide standardized interfaces for all kinds of model functions like retrieving logits, loss or formatted predictions.
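For illustration, here is a minimal sketch of how the three building blocks fit together in code. It mirrors the pattern used in ``examples/doc_classification_cola.py``; the model name and the number of labels are placeholder assumptions::

    from farm.modeling.adaptive_model import AdaptiveModel
    from farm.modeling.language_model import LanguageModel
    from farm.modeling.prediction_head import TextClassificationHead
    from farm.utils import initialize_device_settings

    device, n_gpu = initialize_device_settings(use_cuda=True)

    # 1. Language Model: converts tokens into vector representations
    language_model = LanguageModel.load("bert-base-cased")

    # 2. Prediction Head: turns those vectors into task-specific predictions
    prediction_head = TextClassificationHead(num_labels=2)

    # 3. AdaptiveModel: combines the language model with the prediction head(s)
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)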
-------------------------------------------------------------------------------- /docs/qa_formats.py: -------------------------------------------------------------------------------- 1 | #################################### 2 | ###### JSON (REST API) FORMAT ###### 3 | #################################### 4 | 5 | # INPUT 6 | 7 | input = [{"questions": ["What is X?"], "text": "Some context containing the answer"}] 8 | 9 | # OUTPUT 10 | 11 | output= { 12 | "task": "qa", 13 | "predictions": [ 14 | { 15 | "question": question, 16 | "question_id": id, 17 | "ground_truth": None, 18 | "answers": answers, 19 | "no_ans_gap": no_ans_gap # Add no_ans_gap to current no_ans_boost for switching top prediction 20 | } 21 | ], 22 | } 23 | 24 | answer = {"score": score, 25 | "probability": -1, 26 | "answer": string, 27 | "offset_answer_start": ans_start_ch, 28 | "offset_answer_end": ans_end_ch, 29 | "context": context_string, 30 | "offset_context_start": context_start_ch, 31 | "offset_context_end": context_end_ch, 32 | "document_id": document_id} 33 | 34 | 35 | ############################### 36 | ###### SQUAD EVAL FORMAT ###### 37 | ############################### 38 | 39 | # INPUT 40 | 41 | input = [{"qas": ["What is X?"], "context": "Some context containing the answer"}] 42 | 43 | # OUTPUT 44 | 45 | output = {"id": basket_id, 46 | "preds": [[pred_str, start_t, end_t, score, sample_idx], ...]} 47 | -------------------------------------------------------------------------------- /examples/conversion_huggingface_models.py: -------------------------------------------------------------------------------- 1 | from farm.modeling.adaptive_model import AdaptiveModel 2 | from farm.modeling.tokenization import Tokenizer 3 | from farm.conversion.transformers import Converter 4 | from farm.infer import Inferencer 5 | import pprint 6 | from transformers.pipelines import pipeline 7 | import os 8 | from pathlib import Path 9 | 10 | ############################################## 11 | ### From Transformers -> FARM 12 | ############################################## 13 | def convert_from_transformers(): 14 | # CASE 1: MODEL 15 | # Load model from transformers model hub (-> continue training / compare models / ...) 16 | model = Converter.convert_from_transformers("deepset/bert-large-uncased-whole-word-masking-squad2", device="cpu") 17 | #Alternative way to load from transformers model hub: 18 | #model = AdaptiveModel.convert_from_transformers("deepset/bert-large-uncased-whole-word-masking-squad2", device="cpu", task_type="question_answering") 19 | # ... continue as in the other examples e.g. to fine-tune this QA model on your own data 20 | 21 | # CASE 2: INFERENCER 22 | # Load Inferencer from transformers, incl. 
model & tokenizer (-> just get predictions) 23 | nlp = Inferencer.load("deepset/bert-large-uncased-whole-word-masking-squad2", task_type="question_answering") 24 | 25 | # run predictions 26 | QA_input = [{"questions": ["Why is model conversion important?"], 27 | "text": "The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks."}] 28 | result = nlp.inference_from_dicts(dicts=QA_input) 29 | pprint.pprint(result) 30 | nlp.close_multiprocessing_pool() 31 | 32 | # save it 33 | farm_model_dir = Path("../saved_models/bert-english-qa-large") 34 | nlp.save(farm_model_dir) 35 | 36 | ############################################## 37 | ### From FARM -> Transformers 38 | ############################################## 39 | def convert_to_transformers(): 40 | farm_model_dir = Path("../saved_models/bert-english-qa-large") 41 | 42 | # load from FARM format 43 | model = AdaptiveModel.load(farm_model_dir, device="cpu") 44 | tokenizer = Tokenizer.load(farm_model_dir) 45 | 46 | # convert to transformers 47 | transformer_model = Converter.convert_to_transformers(model)[0] 48 | #Alternative way to convert to transformers: 49 | #transformer_model = model.convert_to_transformers()[0] 50 | 51 | # save it (Note: transformers uses strings rather than Path objects) 52 | model_dir = "../saved_models/bert-large-uncased-whole-word-masking-squad2" 53 | os.makedirs(model_dir, exist_ok=True) 54 | transformer_model.save_pretrained(model_dir) 55 | tokenizer.save_pretrained(model_dir) 56 | 57 | # run predictions (using transformers) 58 | nlp = pipeline('question-answering', model=model_dir, tokenizer=model_dir) 59 | res = nlp({ 60 | 'question': 'Why is model conversion important?', 61 | 'context': 'The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.' 62 | }) 63 | pprint.pprint(res) 64 | 65 | # To upload to transformer's model hub run this in bash: 66 | # transformers-cli upload ../saved_models/bert-large-uncased-whole-word-masking-squad2 67 | 68 | 69 | if __name__ == "__main__": 70 | convert_from_transformers() 71 | convert_to_transformers() -------------------------------------------------------------------------------- /examples/conversion_huggingface_models_classification.py: -------------------------------------------------------------------------------- 1 | from farm.modeling.adaptive_model import AdaptiveModel 2 | from farm.conversion.transformers import Converter 3 | from farm.data_handler.processor import Processor 4 | 5 | from farm.infer import Inferencer 6 | import pprint 7 | from transformers.pipelines import pipeline 8 | from pathlib import Path 9 | 10 | ############################################## 11 | ### From Transformers -> FARM 12 | ############################################## 13 | def convert_from_transformers(): 14 | transformers_input_name = "deepset/bert-base-german-cased-hatespeech-GermEval18Coarse" 15 | farm_output_dir = Path("../saved_models/farm-bert-base-german-cased-hatespeech-GermEval18Coarse") 16 | 17 | # # CASE 1: MODEL 18 | # # Load model from transformers model hub (-> continue training / compare models / ...) 19 | model = Converter.convert_from_transformers(transformers_input_name, device="cpu") 20 | 21 | # # Alternative way to load from transformers model hub: 22 | #model = AdaptiveModel.convert_from_transformers(transformers_input_name, device="cpu", task_type="text_classification") 23 | # # ... 
continue as in the other examples e.g. to fine-tune this QA model on your own data 24 | # 25 | # # CASE 2: INFERENCER 26 | # # Load Inferencer from transformers, incl. model & tokenizer (-> just get predictions) 27 | nlp = Inferencer.load(transformers_input_name, task_type="text_classification") 28 | # 29 | # # run predictions 30 | result = nlp.inference_from_dicts(dicts=[{"text": "Was ein scheiß Nazi!"}]) 31 | pprint.pprint(result) 32 | nlp.close_multiprocessing_pool() 33 | 34 | # save it 35 | nlp.save(farm_output_dir) 36 | 37 | # ############################################## 38 | # ### From FARM -> Transformers 39 | # ############################################## 40 | def convert_to_transformers(): 41 | farm_input_dir = Path("../saved_models/farm-bert-base-german-cased-hatespeech-GermEval18Coarse") 42 | transformers_output_dir = "../saved_models/bert-base-german-cased-hatespeech-GermEval18Coarse" 43 | # 44 | # # # load from FARM format 45 | model = AdaptiveModel.load(farm_input_dir, device="cpu") 46 | processor = Processor.load_from_dir(farm_input_dir) 47 | model.connect_heads_with_processor(processor.tasks) 48 | 49 | # convert to transformers 50 | transformer_model = Converter.convert_to_transformers(model)[0] 51 | # # Alternative way to convert to transformers: 52 | #transformer_model = model.convert_to_transformers()[0] 53 | 54 | # save it (note: transformers use str instead of Path objects) 55 | Path(transformers_output_dir).mkdir(parents=True, exist_ok=True) 56 | transformer_model.save_pretrained(transformers_output_dir) 57 | processor.tokenizer.save_pretrained(transformers_output_dir) 58 | 59 | # run predictions (using transformers) 60 | nlp = pipeline('sentiment-analysis', model=str(transformers_output_dir), tokenizer=str(transformers_output_dir)) 61 | res = nlp("Was ein scheiß Nazi!") 62 | pprint.pprint(res) 63 | 64 | # # To upload to transformer's model hub run this in bash: 65 | # # transformers-cli upload ../saved_models/bert-large-uncased-whole-word-masking-squad2 66 | 67 | if __name__ == "__main__": 68 | convert_from_transformers() 69 | convert_to_transformers() -------------------------------------------------------------------------------- /examples/doc_classification_cola.py: -------------------------------------------------------------------------------- 1 | # fmt: off 2 | import logging 3 | from pathlib import Path 4 | 5 | from farm.data_handler.data_silo import DataSilo 6 | from farm.data_handler.processor import TextClassificationProcessor 7 | from farm.modeling.optimization import initialize_optimizer 8 | from farm.infer import Inferencer 9 | from farm.modeling.adaptive_model import AdaptiveModel 10 | from farm.modeling.language_model import LanguageModel 11 | from farm.modeling.prediction_head import TextClassificationHead 12 | from farm.modeling.tokenization import Tokenizer 13 | from farm.train import Trainer 14 | from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings 15 | 16 | def doc_classification_cola(): 17 | logging.basicConfig( 18 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 19 | datefmt="%m/%d/%Y %H:%M:%S", 20 | level=logging.INFO) 21 | 22 | ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") 23 | ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_cola") 24 | 25 | ########################## 26 | ########## Settings 27 | ########################## 28 | set_all_seeds(seed=42) 29 | device, n_gpu = initialize_device_settings(use_cuda=True) 30 | n_epochs = 5 31 | batch_size 
= 100 32 | evaluate_every = 20 33 | lang_model = "bert-base-cased" 34 | do_lower_case = False 35 | 36 | # 1. Create a tokenizer 37 | tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case) 38 | 39 | # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset 40 | # Here we load the CoLA dataset automatically if it is not available. 41 | # CoLA comes with train.tsv and dev.tsv, so we evaluate on dev.tsv during training and do not use a test set here 42 | 43 | label_list = ["0", "1"] 44 | metric = "mcc" 45 | 46 | processor = TextClassificationProcessor(tokenizer=tokenizer, 47 | max_seq_len=64, 48 | data_dir=Path("../data/cola"), 49 | dev_filename=Path("dev.tsv"), 50 | dev_split=None, 51 | test_filename=None, 52 | label_list=label_list, 53 | metric=metric, 54 | label_column_name="label" 55 | ) 56 | 57 | # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets 58 | data_silo = DataSilo( 59 | processor=processor, 60 | batch_size=batch_size) 61 | 62 | # 4. Create an AdaptiveModel 63 | # a) which consists of a pretrained language model as a basis 64 | language_model = LanguageModel.load(lang_model) 65 | 66 | # language_model = Roberta.load(lang_model) 67 | # b) and a prediction head on top that is suited for our task => Text classification 68 | prediction_head = TextClassificationHead( 69 | num_labels=len(label_list), 70 | class_weights=data_silo.calculate_class_weights(task_name="text_classification")) 71 | 72 | model = AdaptiveModel( 73 | language_model=language_model, 74 | prediction_heads=[prediction_head], 75 | embeds_dropout_prob=0.1, 76 | lm_output_types=["per_sequence"], 77 | device=device) 78 | 79 | # 5. Create an optimizer 80 | model, optimizer, lr_schedule = initialize_optimizer( 81 | model=model, 82 | learning_rate=2e-5, 83 | device=device, 84 | n_batches=len(data_silo.loaders["train"]), 85 | n_epochs=n_epochs) 86 | 87 | # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time 88 | trainer = Trainer( 89 | model=model, 90 | optimizer=optimizer, 91 | data_silo=data_silo, 92 | epochs=n_epochs, 93 | n_gpu=n_gpu, 94 | lr_schedule=lr_schedule, 95 | evaluate_every=evaluate_every, 96 | device=device) 97 | 98 | # 7. Let it grow 99 | trainer.train() 100 | 101 | # 8. Hooray! You have a model. Store it: 102 | save_dir = Path("saved_models/bert-doc-tutorial") 103 | model.save(save_dir) 104 | processor.save(save_dir) 105 | 106 | # 9. 
Load it & harvest your fruits (Inference) 107 | basic_texts = [ 108 | {"text": "The box contained the ball from the tree."}, 109 | {"text": "I'll fix you a drink."}, 110 | ] 111 | model = Inferencer.load(save_dir) 112 | result = model.inference_from_dicts(dicts=basic_texts) 113 | print(result) 114 | model.close_multiprocessing_pool() 115 | 116 | if __name__ == "__main__": 117 | doc_classification_cola() 118 | 119 | # fmt: on 120 | -------------------------------------------------------------------------------- /examples/doc_classification_multilabel.py: -------------------------------------------------------------------------------- 1 | # fmt: off 2 | import logging 3 | from pathlib import Path 4 | 5 | from farm.data_handler.data_silo import DataSilo 6 | from farm.data_handler.processor import TextClassificationProcessor 7 | from farm.modeling.optimization import initialize_optimizer 8 | from farm.infer import Inferencer 9 | from farm.modeling.adaptive_model import AdaptiveModel 10 | from farm.modeling.language_model import LanguageModel 11 | from farm.modeling.prediction_head import MultiLabelTextClassificationHead 12 | from farm.modeling.tokenization import Tokenizer 13 | from farm.train import Trainer 14 | from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings 15 | 16 | def doc_classification_multilabel(): 17 | logging.basicConfig( 18 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 19 | datefmt="%m/%d/%Y %H:%M:%S", 20 | level=logging.INFO) 21 | 22 | ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") 23 | ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification") 24 | 25 | ########################## 26 | ########## Settings 27 | ########################## 28 | set_all_seeds(seed=42) 29 | device, n_gpu = initialize_device_settings(use_cuda=True) 30 | n_epochs = 1 31 | batch_size = 32 32 | 33 | evaluate_every = 500 34 | lang_model = "bert-base-uncased" 35 | do_lower_case = True 36 | 37 | # 1.Create a tokenizer 38 | tokenizer = Tokenizer.load( 39 | pretrained_model_name_or_path=lang_model, 40 | do_lower_case=do_lower_case) 41 | 42 | # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset 43 | # Here we load Toxic Comments Data automaticaly if it is not available. 44 | 45 | label_list = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"] 46 | metric = "acc" 47 | 48 | processor = TextClassificationProcessor(tokenizer=tokenizer, 49 | max_seq_len=128, 50 | data_dir=Path("../data/toxic-comments"), 51 | label_list=label_list, 52 | label_column_name="label", 53 | metric=metric, 54 | quote_char='"', 55 | multilabel=True, 56 | train_filename="train.tsv", 57 | dev_filename="val.tsv", 58 | test_filename=None, 59 | dev_split=0, 60 | ) 61 | 62 | # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets 63 | data_silo = DataSilo( 64 | processor=processor, 65 | batch_size=batch_size) 66 | 67 | # 4. 
Create an AdaptiveModel 68 | # a) which consists of a pretrained language model as a basis 69 | language_model = LanguageModel.load(lang_model) 70 | # b) and a prediction head on top that is suited for our task => Text classification 71 | prediction_head = MultiLabelTextClassificationHead(num_labels=len(label_list)) 72 | 73 | model = AdaptiveModel( 74 | language_model=language_model, 75 | prediction_heads=[prediction_head], 76 | embeds_dropout_prob=0.1, 77 | lm_output_types=["per_sequence"], 78 | device=device) 79 | 80 | # 5. Create an optimizer 81 | model, optimizer, lr_schedule = initialize_optimizer( 82 | model=model, 83 | learning_rate=3e-5, 84 | device=device, 85 | n_batches=len(data_silo.loaders["train"]), 86 | n_epochs=n_epochs) 87 | 88 | # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time 89 | trainer = Trainer( 90 | model=model, 91 | optimizer=optimizer, 92 | data_silo=data_silo, 93 | epochs=n_epochs, 94 | n_gpu=n_gpu, 95 | lr_schedule=lr_schedule, 96 | evaluate_every=evaluate_every, 97 | device=device) 98 | 99 | # 7. Let it grow 100 | trainer.train() 101 | 102 | # 8. Hooray! You have a model. Store it: 103 | save_dir = Path("../saved_models/bert-german-multi-doc-tutorial") 104 | model.save(save_dir) 105 | processor.save(save_dir) 106 | 107 | # 9. Load it & harvest your fruits (Inference) 108 | basic_texts = [ 109 | {"text": "You fucking bastards"}, 110 | {"text": "What a lovely world"}, 111 | ] 112 | model = Inferencer.load(save_dir) 113 | result = model.inference_from_dicts(dicts=basic_texts) 114 | print(result) 115 | model.close_multiprocessing_pool() 116 | 117 | 118 | if __name__ == "__main__": 119 | doc_classification_multilabel() 120 | 121 | # fmt: on 122 | -------------------------------------------------------------------------------- /examples/doc_classification_multilabel_roberta.py: -------------------------------------------------------------------------------- 1 | # fmt: off 2 | import logging 3 | from pathlib import Path 4 | 5 | from farm.data_handler.data_silo import DataSilo 6 | from farm.data_handler.processor import TextClassificationProcessor 7 | from farm.modeling.optimization import initialize_optimizer 8 | from farm.infer import Inferencer 9 | from farm.modeling.adaptive_model import AdaptiveModel 10 | from farm.modeling.language_model import Roberta 11 | from farm.modeling.prediction_head import MultiLabelTextClassificationHead 12 | from farm.modeling.tokenization import Tokenizer 13 | from farm.train import Trainer 14 | from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings 15 | 16 | 17 | def doc_classification_multilabel_roberta(): 18 | logging.basicConfig( 19 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 20 | datefmt="%m/%d/%Y %H:%M:%S", 21 | level=logging.INFO) 22 | 23 | ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") 24 | ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification") 25 | 26 | ########################## 27 | ########## Settings 28 | ########################## 29 | set_all_seeds(seed=42) 30 | device, n_gpu = initialize_device_settings(use_cuda=False) 31 | n_epochs = 1 32 | batch_size = 32 33 | 34 | evaluate_every = 500 35 | lang_model = "roberta-base" 36 | do_lower_case = False # roberta is a cased model 37 | 38 | # 1.Create a tokenizer 39 | tokenizer = Tokenizer.load( 40 | pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case 41 | ) 42 | 43 | # 2. 
Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset 44 | # Here we load Toxic Comments Data automaticaly if it is not available. 45 | 46 | label_list = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"] 47 | metric = "acc" 48 | 49 | processor = TextClassificationProcessor(tokenizer=tokenizer, 50 | max_seq_len=128, 51 | data_dir=Path("../data/toxic-comments"), 52 | label_list=label_list, 53 | label_column_name="label", 54 | metric=metric, 55 | quote_char='"', 56 | multilabel=True, 57 | train_filename=Path("train.tsv"), 58 | dev_filename=Path("val.tsv"), 59 | test_filename=None, 60 | dev_split=0, 61 | max_samples=1000 62 | ) 63 | 64 | # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets 65 | data_silo = DataSilo( 66 | processor=processor, 67 | batch_size=batch_size) 68 | 69 | # 4. Create an AdaptiveModel 70 | # a) which consists of a pretrained language model as a basis 71 | language_model = Roberta.load(lang_model) 72 | # b) and a prediction head on top that is suited for our task => Text classification 73 | prediction_head = MultiLabelTextClassificationHead(num_labels=len(label_list)) 74 | 75 | model = AdaptiveModel( 76 | language_model=language_model, 77 | prediction_heads=[prediction_head], 78 | embeds_dropout_prob=0.1, 79 | lm_output_types=["per_sequence"], 80 | device=device) 81 | 82 | # 5. Create an optimizer 83 | model, optimizer, lr_schedule = initialize_optimizer( 84 | model=model, 85 | learning_rate=3e-5, 86 | device=device, 87 | n_batches=len(data_silo.loaders["train"]), 88 | n_epochs=n_epochs) 89 | 90 | # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time 91 | trainer = Trainer( 92 | model=model, 93 | optimizer=optimizer, 94 | data_silo=data_silo, 95 | epochs=n_epochs, 96 | n_gpu=n_gpu, 97 | lr_schedule=lr_schedule, 98 | evaluate_every=evaluate_every, 99 | device=device) 100 | 101 | # 7. Let it grow 102 | trainer.train() 103 | 104 | # 8. Hooray! You have a model. Store it: 105 | save_dir = Path("saved_models/bert-multi-doc-roberta") 106 | model.save(save_dir) 107 | processor.save(save_dir) 108 | 109 | # 9. 
Load it & harvest your fruits (Inference) 110 | basic_texts = [ 111 | {"text": "You fucking bastards"}, 112 | {"text": "What a lovely world"}, 113 | ] 114 | model = Inferencer.load(save_dir) 115 | result = model.run_inference(dicts=basic_texts) 116 | print(result) 117 | model.close_multiprocessing_pool() 118 | 119 | 120 | if __name__ == "__main__": 121 | doc_classification_multilabel_roberta() 122 | 123 | # fmt: on 124 | -------------------------------------------------------------------------------- /examples/doc_classification_word_embedding_LM.py: -------------------------------------------------------------------------------- 1 | # fmt: off 2 | import logging 3 | from pathlib import Path 4 | import time 5 | 6 | from farm.data_handler.data_silo import DataSilo, StreamingDataSilo 7 | from farm.data_handler.processor import TextClassificationProcessor 8 | from farm.modeling.optimization import initialize_optimizer 9 | from farm.infer import Inferencer 10 | from farm.modeling.adaptive_model import AdaptiveModel 11 | from farm.modeling.language_model import LanguageModel 12 | from farm.modeling.prediction_head import TextClassificationHead 13 | from farm.modeling.tokenization import Tokenizer 14 | from farm.train import Trainer 15 | from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings 16 | 17 | def doc_classifcation(): 18 | logging.basicConfig( 19 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 20 | datefmt="%m/%d/%Y %H:%M:%S", 21 | level=logging.INFO) 22 | 23 | ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") 24 | ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification_glove") 25 | 26 | ########################## 27 | ########## Settings 28 | ########################## 29 | set_all_seeds(seed=42) 30 | n_epochs = 3 31 | batch_size = 32 32 | evaluate_every = 100 33 | # load from a local path: 34 | lang_model = Path("../saved_models/glove-german-uncased") 35 | # or through s3 36 | #lang_model = "glove-german-uncased" 37 | do_lower_case = True 38 | 39 | device, n_gpu = initialize_device_settings(use_cuda=True) 40 | 41 | # 1.Create a tokenizer 42 | tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case) 43 | 44 | # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset 45 | # Here we load GermEval 2018 Data automaticaly if it is not available. 46 | # GermEval 2018 only has train.tsv and test.tsv dataset - no dev.tsv 47 | label_list = ["OTHER", "OFFENSE"] 48 | metric = "f1_macro" 49 | 50 | processor = TextClassificationProcessor( 51 | tokenizer=tokenizer, 52 | max_seq_len=128, 53 | data_dir=Path("../data/germeval18"), 54 | label_list=label_list, 55 | dev_split=0, 56 | test_filename="test.tsv", 57 | train_filename="train.tsv", 58 | metric=metric, 59 | label_column_name="coarse_label") 60 | 61 | 62 | # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a 63 | data_silo = DataSilo( 64 | processor=processor, 65 | batch_size=batch_size, 66 | max_processes=1) 67 | 68 | # 4. Create an AdaptiveModel 69 | # a) which consists of an embedding model as a basis. 70 | # Word embedding models only converts words it has seen during training to embedding vectors. 
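# A minimal, illustrative sketch (not part of the original script): tokens missing from the embedding
# vocabulary are typically mapped to a single unknown-token vector, so it can be worth estimating the
# out-of-vocabulary share of your corpus before training. The "[UNK]" symbol below is an assumption;
# the actual token depends on the embedding vocabulary that is loaded.
# sample_tokens = tokenizer.tokenize("Willst du einen Kaffee trinken?")
# unk_share = sample_tokens.count("[UNK]") / max(len(sample_tokens), 1)
# print(f"Estimated out-of-vocabulary share: {unk_share:.1%}")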
71 | language_model = LanguageModel.load(lang_model) 72 | # b) and a prediction head on top that is suited for our task => Text classification 73 | prediction_head = TextClassificationHead( 74 | layer_dims=[300,600,len(label_list)], 75 | class_weights=data_silo.calculate_class_weights(task_name="text_classification"), 76 | num_labels=len(label_list)) 77 | 78 | model = AdaptiveModel( 79 | language_model=language_model, 80 | prediction_heads=[prediction_head], 81 | embeds_dropout_prob=0.1, 82 | lm_output_types=["per_sequence"], 83 | device=device) 84 | 85 | # 5. Create an optimizer 86 | model, optimizer, lr_schedule = initialize_optimizer( 87 | model=model, 88 | learning_rate=3e-5, 89 | device=device, 90 | n_batches=len(data_silo.loaders["train"]), 91 | n_epochs=n_epochs) 92 | 93 | # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time 94 | trainer = Trainer( 95 | model=model, 96 | optimizer=optimizer, 97 | data_silo=data_silo, 98 | epochs=n_epochs, 99 | n_gpu=n_gpu, 100 | lr_schedule=lr_schedule, 101 | evaluate_every=evaluate_every, 102 | device=device) 103 | 104 | # 7. Let it grow 105 | trainer.train() 106 | 107 | 108 | if __name__ == "__main__": 109 | doc_classifcation() 110 | 111 | # fmt: on 112 | -------------------------------------------------------------------------------- /examples/doc_regression.py: -------------------------------------------------------------------------------- 1 | # fmt: off 2 | import logging 3 | from pathlib import Path 4 | 5 | from farm.data_handler.data_silo import DataSilo 6 | from farm.data_handler.processor import RegressionProcessor 7 | from farm.experiment import initialize_optimizer 8 | from farm.infer import Inferencer 9 | from farm.modeling.adaptive_model import AdaptiveModel 10 | from farm.modeling.language_model import LanguageModel 11 | from farm.modeling.prediction_head import RegressionHead 12 | from farm.modeling.tokenization import Tokenizer 13 | from farm.train import Trainer 14 | from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings 15 | 16 | 17 | def doc_regression(): 18 | logging.basicConfig( 19 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 20 | datefmt="%m/%d/%Y %H:%M:%S", 21 | level=logging.INFO) 22 | 23 | ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") 24 | ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_regression") 25 | 26 | ########################## 27 | ########## Settings 28 | ########################## 29 | set_all_seeds(seed=42) 30 | device, n_gpu = initialize_device_settings(use_cuda=True) 31 | n_epochs = 5 32 | batch_size = 32 33 | evaluate_every = 30 34 | lang_model = "bert-base-cased" 35 | do_lower_case = False 36 | 37 | # 1.Create a tokenizer 38 | tokenizer = Tokenizer.load( 39 | pretrained_model_name_or_path=lang_model, 40 | do_lower_case=do_lower_case) 41 | 42 | # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset 43 | # We do not have a sample dataset for regression yet, add your own dataset to run the example 44 | processor = RegressionProcessor(tokenizer=tokenizer, 45 | max_seq_len=128, 46 | data_dir=Path("../data/"), 47 | label_column_name="label" 48 | ) 49 | 50 | # 3. 
Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets 51 | data_silo = DataSilo( 52 | processor=processor, 53 | batch_size=batch_size) 54 | 55 | # 4. Create an AdaptiveModel 56 | # a) which consists of a pretrained language model as a basis 57 | language_model = LanguageModel.load(lang_model) 58 | # b) and a prediction head on top that is suited for our task => Text regression 59 | prediction_head = RegressionHead() 60 | 61 | model = AdaptiveModel( 62 | language_model=language_model, 63 | prediction_heads=[prediction_head], 64 | embeds_dropout_prob=0.1, 65 | lm_output_types=["per_sequence_continuous"], 66 | device=device) 67 | 68 | # 5. Create an optimizer 69 | model, optimizer, lr_schedule = initialize_optimizer( 70 | model=model, 71 | learning_rate=2e-5, 72 | device=device, 73 | n_batches=len(data_silo.loaders["train"]), 74 | n_epochs=n_epochs) 75 | 76 | # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time 77 | trainer = Trainer( 78 | model=model, 79 | optimizer=optimizer, 80 | data_silo=data_silo, 81 | epochs=n_epochs, 82 | n_gpu=n_gpu, 83 | lr_schedule=lr_schedule, 84 | evaluate_every=evaluate_every, 85 | device=device) 86 | 87 | # 7. Let it grow 88 | trainer.train() 89 | 90 | # 8. Hooray! You have a model. Store it: 91 | save_dir = Path("saved_models/bert-doc-regression-tutorial") 92 | model.save(save_dir) 93 | processor.save(save_dir) 94 | 95 | # 9. Load it & harvest your fruits (Inference) 96 | # Add your own text adapted to the dataset you provide 97 | basic_texts = [ 98 | {"text": ""}, 99 | {"text": ""}, 100 | ] 101 | model = Inferencer.load(save_dir) 102 | result = model.inference_from_dicts(dicts=basic_texts) 103 | 104 | print(result) 105 | model.close_multiprocessing_pool() 106 | 107 | 108 | if __name__ == "__main__": 109 | doc_regression() 110 | 111 | # fmt: on 112 | -------------------------------------------------------------------------------- /examples/embeddings_extraction.py: -------------------------------------------------------------------------------- 1 | from farm.infer import Inferencer 2 | from farm.utils import set_all_seeds 3 | from pathlib import Path 4 | 5 | def embeddings_extraction(): 6 | set_all_seeds(seed=42) 7 | batch_size = 32 8 | use_gpu = False 9 | lang_model = "bert-base-german-cased" 10 | # or local path: 11 | # lang_model = Path("../saved_models/farm-bert-base-cased-squad2") 12 | 13 | # Input 14 | basic_texts = [ 15 | {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot ist"}, 16 | {"text": "Martin Müller spielt Fussball"}, 17 | ] 18 | 19 | # Load model, tokenizer and processor directly into Inferencer 20 | model = Inferencer.load(lang_model, task_type="embeddings", gpu=use_gpu, batch_size=batch_size, 21 | extraction_strategy="reduce_mean", extraction_layer=-2, num_processes=0) 22 | 23 | # Get embeddings for input text (you can vary the strategy and layer) 24 | result = model.inference_from_dicts(dicts=basic_texts) 25 | print(result) 26 | model.close_multiprocessing_pool() 27 | 28 | 29 | if __name__ == "__main__": 30 | embeddings_extraction() 31 | -------------------------------------------------------------------------------- /examples/embeddings_extraction_s3e_pooling.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pickle 3 | from pathlib import Path 4 | 5 | from farm.data_handler.processor 
import InferenceProcessor 6 | from farm.infer import Inferencer 7 | from farm.modeling.adaptive_model import AdaptiveModel 8 | from farm.modeling.language_model import LanguageModel 9 | from farm.modeling.tokenization import Tokenizer 10 | from farm.utils import set_all_seeds, initialize_device_settings 11 | from farm.modeling.wordembedding_utils import fit_s3e_on_corpus 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | """ 16 | Example for generating sentence embeddings via the S3E pooling approach as described by Wang et al in the paper 17 | "Efficient Sentence Embedding via Semantic Subspace Analysis" 18 | (https://arxiv.org/abs/2002.09620) 19 | 20 | You can use classical models like fasttext, glove or word2vec and apply S3E on top. 21 | This can be a powerful benchmark for plain transformer-based embeddings. 22 | 23 | First, we fit the required stats on a custom corpus. This includes the derivation of token_weights depending on 24 | token occurences in the corpus, creation of the semantic clusters via k-means and a couple of 25 | pre-/post-processing steps to normalize the embeddings. 26 | 27 | Second, we feed the resulting objects into our Inferencer to extract the actual sentence embeddings for our sentences. 28 | """ 29 | 30 | def fit(language_model, corpus_path, save_dir, do_lower_case, batch_size=4, use_gpu=False): 31 | # Fit S3E on a corpus 32 | set_all_seeds(seed=42) 33 | device, n_gpu = initialize_device_settings(use_cuda=use_gpu, use_amp=False) 34 | 35 | # Create a InferenceProcessor 36 | tokenizer = Tokenizer.load(pretrained_model_name_or_path=language_model, do_lower_case=do_lower_case) 37 | processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128) 38 | 39 | # Create an AdaptiveModel 40 | language_model = LanguageModel.load(language_model) 41 | 42 | model = AdaptiveModel( 43 | language_model=language_model, 44 | prediction_heads=[], 45 | embeds_dropout_prob=0.1, 46 | lm_output_types=["per_sequence"], 47 | device=device) 48 | 49 | model, processor, s3e_stats = fit_s3e_on_corpus(processor=processor, 50 | model=model, 51 | corpus=corpus_path, 52 | n_clusters=10, 53 | pca_n_components=300, 54 | svd_postprocessing=True, 55 | min_token_occurrences=1) 56 | 57 | # save everything to allow inference without fitting everything again 58 | model.save(save_dir) 59 | processor.save(save_dir) 60 | with open(save_dir / "s3e_stats.pkl", "wb") as f: 61 | pickle.dump(s3e_stats, f) 62 | 63 | # Load model, tokenizer and processor directly into Inferencer 64 | inferencer = Inferencer(model=model, processor=processor, task_type="embeddings", gpu=use_gpu, 65 | batch_size=batch_size, extraction_strategy="s3e", extraction_layer=-1, 66 | s3e_stats=s3e_stats) 67 | 68 | # Input 69 | basic_texts = [ 70 | {"text": "a man is walking on the street."}, 71 | {"text": "a woman is walking on the street."}, 72 | ] 73 | 74 | # Get embeddings for input text (you can vary the strategy and layer) 75 | result = inferencer.inference_from_dicts(dicts=basic_texts) 76 | print(result) 77 | inferencer.close_multiprocessing_pool() 78 | 79 | 80 | def extract_embeddings(load_dir, use_gpu, batch_size): 81 | with open(load_dir / "s3e_stats.pkl", "rb") as f: 82 | s3e_stats = pickle.load(f) 83 | 84 | # Init inferencer 85 | inferencer = Inferencer.load(model_name_or_path=load_dir, task_type="embeddings", gpu=use_gpu, 86 | batch_size=batch_size, extraction_strategy="s3e", extraction_layer=-1, 87 | s3e_stats=s3e_stats) 88 | 89 | # Input 90 | basic_texts = [ 91 | {"text": "a man is walking on the street."}, 92 | 
{"text": "a woman is walking on the street."}, 93 | ] 94 | 95 | # Get embeddings for input text 96 | result = inferencer.inference_from_dicts(dicts=basic_texts) 97 | print(result) 98 | inferencer.close_multiprocessing_pool() 99 | 100 | 101 | if __name__ == "__main__": 102 | lang_model = "glove-english-uncased-6B" 103 | do_lower_case = True 104 | 105 | # You can download this from: 106 | # "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/lm_finetune_nips.tar.gz" 107 | corpus_path = Path("../data/lm_finetune_nips/train.txt") 108 | 109 | s3e_dir = Path("../saved_models/fitted_s3e/") 110 | 111 | fit(language_model=lang_model, 112 | do_lower_case=do_lower_case, 113 | corpus_path=corpus_path, 114 | save_dir=s3e_dir 115 | ) 116 | 117 | extract_embeddings(load_dir=s3e_dir, use_gpu=False, batch_size=10) -------------------------------------------------------------------------------- /examples/evaluation.py: -------------------------------------------------------------------------------- 1 | from farm.utils import initialize_device_settings 2 | from farm.modeling.tokenization import Tokenizer 3 | from farm.data_handler.processor import TextClassificationProcessor, SquadProcessor 4 | from farm.data_handler.data_silo import DataSilo 5 | from farm.eval import Evaluator 6 | from farm.modeling.adaptive_model import AdaptiveModel 7 | from pathlib import Path 8 | 9 | def evaluate_classification(): 10 | ########################## 11 | ########## Settings 12 | ########################## 13 | device, n_gpu = initialize_device_settings(use_cuda=True) 14 | lang_model = "deepset/bert-base-german-cased-sentiment-Germeval17" 15 | do_lower_case = False 16 | batch_size = 100 17 | 18 | data_dir = Path("../data/germeval17") 19 | evaluation_filename = "test_TIMESTAMP1.tsv" 20 | label_list = ["negative", "neutral", "positive"] 21 | metric = "f1_macro" 22 | 23 | # 1.Create a tokenizer 24 | tokenizer = Tokenizer.load( 25 | pretrained_model_name_or_path=lang_model, 26 | do_lower_case=do_lower_case) 27 | 28 | # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset 29 | # Here we load GermEval 2017 Data automaticaly if it is not available. 30 | 31 | processor = TextClassificationProcessor( 32 | tokenizer=tokenizer, 33 | max_seq_len=384, 34 | label_list=label_list, 35 | metric=metric, 36 | train_filename=None, 37 | dev_filename=None, 38 | dev_split=0, 39 | test_filename=evaluation_filename, 40 | data_dir=data_dir, 41 | ) 42 | 43 | # 3. Create a DataSilo that loads dataset, provides DataLoaders for them and calculates a few descriptive statistics of our datasets 44 | data_silo = DataSilo( 45 | processor=processor, 46 | batch_size=batch_size) 47 | 48 | # 4. Create an Evaluator 49 | evaluator = Evaluator( 50 | data_loader=data_silo.get_data_loader("test"), 51 | tasks=data_silo.processor.tasks, 52 | device=device 53 | ) 54 | 55 | # 5. Load model 56 | model = AdaptiveModel.convert_from_transformers(lang_model, device=device, task_type="text_classification") 57 | # use "load" if you want to use a local model that was trained with FARM 58 | # model = AdaptiveModel.load(lang_model, device=device) 59 | model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True) 60 | 61 | # 6. 
Run the Evaluator 62 | results = evaluator.eval(model) 63 | f1_score = results[0]["f1_macro"] 64 | print("Macro-averaged F1-Score:", f1_score) 65 | 66 | 67 | def evaluate_question_answering(): 68 | ########################## 69 | ########## Settings 70 | ########################## 71 | device, n_gpu = initialize_device_settings(use_cuda=True) 72 | lang_model = "deepset/roberta-base-squad2" 73 | do_lower_case = True 74 | 75 | data_dir = Path("../data/squad20") 76 | evaluation_filename = "dev-v2.0.json" 77 | 78 | batch_size = 50 79 | no_ans_boost = 0 80 | accuracy_at = 3 # accuracy at n is useful for answers inside long documents 81 | 82 | # 1.Create a tokenizer 83 | tokenizer = Tokenizer.load( 84 | pretrained_model_name_or_path=lang_model, 85 | do_lower_case=do_lower_case) 86 | 87 | # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset 88 | processor = SquadProcessor( 89 | tokenizer=tokenizer, 90 | max_seq_len=256, 91 | label_list= ["start_token", "end_token"], 92 | metric="squad", 93 | train_filename=None, 94 | dev_filename=None, 95 | dev_split=0, 96 | test_filename=evaluation_filename, 97 | data_dir=data_dir, 98 | doc_stride=128, 99 | ) 100 | 101 | # 3. Create a DataSilo that loads dataset, provides DataLoaders for them and calculates a few descriptive statistics of our datasets 102 | data_silo = DataSilo( 103 | processor=processor, 104 | batch_size=batch_size) 105 | 106 | # 4. Create an Evaluator 107 | evaluator = Evaluator( 108 | data_loader=data_silo.get_data_loader("test"), 109 | tasks=data_silo.processor.tasks, 110 | device=device 111 | ) 112 | 113 | # 5. Load model 114 | model = AdaptiveModel.convert_from_transformers(lang_model, device=device, task_type="question_answering") 115 | # use "load" if you want to use a local model that was trained with FARM 116 | #model = AdaptiveModel.load(lang_model, device=device) 117 | model.prediction_heads[0].no_ans_boost = no_ans_boost 118 | model.prediction_heads[0].n_best = accuracy_at 119 | model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True) 120 | 121 | # 6. 
Run the Evaluator 122 | results = evaluator.eval(model) 123 | f1_score = results[0]["f1"] 124 | em_score = results[0]["EM"] 125 | tnacc = results[0]["top_n_accuracy"] 126 | print("F1-Score:", f1_score) 127 | print("Exact Match Score:", em_score) 128 | print(f"top_{accuracy_at}_accuracy:", tnacc) 129 | 130 | 131 | if __name__ == "__main__": 132 | #evaluate_classification() 133 | evaluate_question_answering() 134 | -------------------------------------------------------------------------------- /examples/lm_finetuning.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | 4 | from farm.data_handler.data_silo import DataSilo 5 | from farm.data_handler.processor import BertStyleLMProcessor 6 | from farm.modeling.adaptive_model import AdaptiveModel 7 | from farm.modeling.language_model import LanguageModel 8 | from farm.modeling.prediction_head import BertLMHead, NextSentenceHead 9 | from farm.modeling.tokenization import Tokenizer 10 | from farm.train import Trainer 11 | from farm.modeling.optimization import initialize_optimizer 12 | 13 | from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings 14 | 15 | 16 | def lm_finetuning(): 17 | logging.basicConfig( 18 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 19 | datefmt="%m/%d/%Y %H:%M:%S", 20 | level=logging.INFO, 21 | ) 22 | next_sent_pred_style = "bert-style" 23 | next_sent_pred=True 24 | set_all_seeds(seed=42) 25 | ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") 26 | ml_logger.init_experiment( 27 | experiment_name="LM_refactoring", run_name=f"new, nsp: {next_sent_pred}, {next_sent_pred_style}" 28 | ) 29 | ########################## 30 | ########## Settings 31 | ########################## 32 | device, n_gpu = initialize_device_settings(use_cuda=True) 33 | n_epochs = 1 34 | batch_size = 32 35 | evaluate_every = 1000 36 | lang_model = "bert-base-cased" 37 | do_lower_case = False 38 | 39 | # 1.Create a tokenizer 40 | tokenizer = Tokenizer.load( 41 | pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case 42 | ) 43 | 44 | # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset 45 | processor = BertStyleLMProcessor( 46 | data_dir=Path("../data/lm_finetune_nips"), 47 | tokenizer=tokenizer, 48 | max_seq_len=128, 49 | max_docs=None, # You can have set max_docs here to limit the number of docs in the dataset and speed up this example 50 | next_sent_pred_style=next_sent_pred_style 51 | ) 52 | 53 | # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets 54 | data_silo = DataSilo(processor=processor, batch_size=batch_size, max_multiprocessing_chunksize=20) 55 | 56 | # 4. Create an AdaptiveModel 57 | # a) which consists of a pretrained language model as a basis 58 | language_model = LanguageModel.load(lang_model) 59 | # b) and *two* prediction heads on top that are suited for our task => Language Model finetuning 60 | lm_prediction_head = BertLMHead.load(lang_model) 61 | next_sentence_head = NextSentenceHead.load(lang_model) 62 | 63 | model = AdaptiveModel( 64 | language_model=language_model, 65 | prediction_heads=[lm_prediction_head, next_sentence_head], 66 | embeds_dropout_prob=0.1, 67 | lm_output_types=["per_token", "per_sequence"], 68 | device=device, 69 | ) 70 | 71 | # 5. 
Create an optimizer 72 | model, optimizer, lr_schedule = initialize_optimizer( 73 | model=model, 74 | learning_rate=2e-5, 75 | device=device, 76 | n_batches=len(data_silo.loaders["train"]), 77 | n_epochs=n_epochs 78 | ) 79 | 80 | # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time 81 | trainer = Trainer( 82 | model=model, 83 | optimizer=optimizer, 84 | data_silo=data_silo, 85 | epochs=n_epochs, 86 | n_gpu=n_gpu, 87 | lr_schedule=lr_schedule, 88 | evaluate_every=evaluate_every, 89 | device=device, 90 | eval_report=False 91 | ) 92 | 93 | # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai 94 | trainer.train() 95 | 96 | # 8. Hooray! You have a model. Store it: 97 | save_dir = Path("saved_models/bert-english-lm-tutorial") 98 | model.save(save_dir) 99 | processor.save(save_dir) 100 | 101 | 102 | if __name__ == "__main__": 103 | lm_finetuning() 104 | -------------------------------------------------------------------------------- /examples/ner.py: -------------------------------------------------------------------------------- 1 | # fmt: off 2 | import logging 3 | from pathlib import Path 4 | 5 | from farm.data_handler.data_silo import DataSilo 6 | from farm.data_handler.processor import NERProcessor 7 | from farm.modeling.optimization import initialize_optimizer 8 | from farm.infer import Inferencer 9 | from farm.modeling.adaptive_model import AdaptiveModel 10 | from farm.modeling.language_model import LanguageModel 11 | from farm.modeling.prediction_head import TokenClassificationHead 12 | from farm.modeling.tokenization import Tokenizer 13 | from farm.train import Trainer 14 | from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings 15 | 16 | def ner(): 17 | logging.basicConfig( 18 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 19 | datefmt="%m/%d/%Y %H:%M:%S", 20 | level=logging.INFO, 21 | ) 22 | 23 | ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") 24 | ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_ner") 25 | 26 | ########################## 27 | ########## Settings 28 | ########################## 29 | set_all_seeds(seed=42) 30 | device, n_gpu = initialize_device_settings(use_cuda=True) 31 | n_epochs = 4 32 | batch_size = 32 33 | evaluate_every = 400 34 | lang_model = "bert-base-german-cased" 35 | do_lower_case = False 36 | 37 | # 1.Create a tokenizer 38 | tokenizer = Tokenizer.load( 39 | pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case 40 | ) 41 | 42 | # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset 43 | # See test/sample/ner/train-sample.txt for an example of the data format that is expected by the Processor 44 | ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"] 45 | 46 | processor = NERProcessor( 47 | tokenizer=tokenizer, max_seq_len=128, data_dir=Path("../data/conll03-de"), delimiter=" ", metric="seq_f1", label_list=ner_labels 48 | ) 49 | 50 | # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets 51 | data_silo = DataSilo(processor=processor, batch_size=batch_size) 52 | 53 | # 4. 
Create an AdaptiveModel 54 | # a) which consists of a pretrained language model as a basis 55 | language_model = LanguageModel.load(lang_model) 56 | # b) and a prediction head on top that is suited for our task => NER 57 | prediction_head = TokenClassificationHead(num_labels=len(ner_labels)) 58 | 59 | model = AdaptiveModel( 60 | language_model=language_model, 61 | prediction_heads=[prediction_head], 62 | embeds_dropout_prob=0.1, 63 | lm_output_types=["per_token"], 64 | device=device, 65 | ) 66 | 67 | # 5. Create an optimizer 68 | model, optimizer, lr_schedule = initialize_optimizer( 69 | model=model, 70 | learning_rate=1e-5, 71 | n_batches=len(data_silo.loaders["train"]), 72 | n_epochs=n_epochs, 73 | device=device, 74 | ) 75 | 76 | # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time 77 | trainer = Trainer( 78 | model=model, 79 | optimizer=optimizer, 80 | data_silo=data_silo, 81 | epochs=n_epochs, 82 | n_gpu=n_gpu, 83 | lr_schedule=lr_schedule, 84 | evaluate_every=evaluate_every, 85 | device=device, 86 | ) 87 | 88 | # 7. Let it grow 89 | trainer.train() 90 | 91 | # 8. Hooray! You have a model. Store it: 92 | save_dir = "saved_models/bert-german-ner-tutorial" 93 | model.save(save_dir) 94 | processor.save(save_dir) 95 | 96 | 97 | # 9. Load it & harvest your fruits (Inference) 98 | basic_texts = [ 99 | {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"}, 100 | {"text": "Martin Müller spielt Handball in Berlin"}, 101 | ] 102 | model = Inferencer.load(save_dir) 103 | result = model.inference_from_dicts(dicts=basic_texts) 104 | print(result) 105 | 106 | model.close_multiprocessing_pool() 107 | 108 | 109 | if __name__ == "__main__": 110 | ner() 111 | -------------------------------------------------------------------------------- /examples/onnx_question_answering.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from farm.infer import Inferencer 4 | from farm.modeling.adaptive_model import AdaptiveModel 5 | 6 | 7 | def onnx_runtime_example(): 8 | """ 9 | This example shows conversion of a transformers model from the Model Hub to 10 | ONNX format & inference using ONNXRuntime. 11 | """ 12 | 13 | model_name_or_path = "deepset/roberta-base-squad2" 14 | onnx_model_export_path = Path("./roberta-onnx") 15 | 16 | AdaptiveModel.convert_to_onnx(model_name_or_path, onnx_model_export_path, task_type="question_answering") 17 | 18 | # for ONNX models, the Inferencer uses ONNXRuntime under-the-hood 19 | inferencer = Inferencer.load(model_name_or_path=onnx_model_export_path) 20 | 21 | qa_input = [ 22 | { 23 | "questions": ["Who counted the game among the best ever made?"], 24 | "text": "Twilight Princess was released to universal critical acclaim and commercial success. " 25 | "It received perfect scores from major publications such as 1UP.com, Computer and Video Games, " 26 | "Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators " 27 | "GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii " 28 | "version and scores of 95% and 96 for the GameCube version. 
GameTrailers in their review called " 29 | "it one of the greatest games ever created.", 30 | } 31 | ] 32 | 33 | results = inferencer.inference_from_dicts(qa_input) 34 | print(results) 35 | inferencer.close_multiprocessing_pool() 36 | 37 | 38 | if __name__ == "__main__": 39 | onnx_runtime_example() 40 | -------------------------------------------------------------------------------- /examples/streaming_inference.py: -------------------------------------------------------------------------------- 1 | from farm.infer import Inferencer 2 | 3 | 4 | def streaming_inference_example(): 5 | """ 6 | The FARM Inferencer has a high performance non-blocking streaming mode for large scale inference use cases. With 7 | this mode, the dicts parameter can optionally be a Python generator object that yield dicts, thus avoiding loading 8 | dicts in memory. The inference_from_dicts() method returns a generator that yield predictions. To use streaming, 9 | set the streaming param to True and determine optimal multiprocessing_chunksize by performing speed benchmarks. 10 | """ 11 | 12 | model_name_or_path = "deepset/bert-base-cased-squad2" 13 | inferencer = Inferencer.load(model_name_or_path=model_name_or_path, task_type="question_answering", num_processes=8) 14 | 15 | dicts = sample_dicts_generator() # it can be a list of dicts or a generator object 16 | results = inferencer.inference_from_dicts(dicts, streaming=True, multiprocessing_chunksize=20) 17 | 18 | for prediction in results: # results is a generator object that yields predictions 19 | print(prediction) 20 | 21 | inferencer.close_multiprocessing_pool() 22 | 23 | 24 | def sample_dicts_generator(): 25 | """ 26 | This is a sample dicts generator. Some exemplary use cases: 27 | 28 | * read chunks of text from large files iteratively and generate inference predictions 29 | * connect with external datasources, eg, a Elasticsearch Scroll API that reads all documents from a given index 30 | * building a streaming microservice that reads from Kafka 31 | 32 | :return: a generator that yield dicts 33 | :rtype: iter 34 | """ 35 | qa_input = { 36 | "questions": ["Who counted the game among the best ever made?"], 37 | "text": "Twilight Princess was released to universal critical acclaim and commercial success. " 38 | "It received perfect scores from major publications such as 1UP.com, Computer and Video Games, " 39 | "Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators " 40 | "GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii " 41 | "version and scores of 95% and 96 for the GameCube version. 
GameTrailers in their review called " 42 | "it one of the greatest games ever created.", 43 | } 44 | 45 | for i in range(100000): 46 | yield qa_input 47 | 48 | 49 | if __name__ == "__main__": 50 | streaming_inference_example() 51 | -------------------------------------------------------------------------------- /examples/wordembedding_inference.py: -------------------------------------------------------------------------------- 1 | # fmt: off 2 | import logging 3 | from pathlib import Path 4 | 5 | 6 | from farm.data_handler.processor import InferenceProcessor 7 | from farm.infer import Inferencer 8 | from farm.modeling.adaptive_model import AdaptiveModel 9 | from farm.modeling.language_model import LanguageModel 10 | from farm.modeling.tokenization import Tokenizer 11 | from farm.utils import set_all_seeds, initialize_device_settings 12 | 13 | def embedding_extraction(): 14 | logging.basicConfig( 15 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 16 | datefmt="%m/%d/%Y %H:%M:%S", 17 | level=logging.INFO) 18 | 19 | ########################## 20 | ########## Settings 21 | ########################## 22 | set_all_seeds(seed=42) 23 | # load from a local path: 24 | #lang_model = Path("../saved_models/glove-german-uncased") 25 | # or through s3 26 | lang_model = "glove-german-uncased" #only glove or word2vec or converted fasttext (fixed vocab) embeddings supported 27 | do_lower_case = True 28 | use_amp = None 29 | device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp) 30 | 31 | # Create a InferenceProcessor 32 | tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case) 33 | processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128) 34 | 35 | # Create an AdaptiveModel 36 | language_model = LanguageModel.load(lang_model) 37 | model = AdaptiveModel( 38 | language_model=language_model, 39 | prediction_heads=[], 40 | embeds_dropout_prob=0.1, 41 | lm_output_types=["per_sequence"], 42 | device=device) 43 | 44 | 45 | # Create Inferencer for embedding extraction 46 | inferencer = Inferencer( 47 | model=model, 48 | processor=processor, 49 | task_type="embeddings" 50 | ) 51 | 52 | 53 | # Extract vectors 54 | basic_texts = [ 55 | {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"}, 56 | {"text": "Martin Müller spielt Handball in Berlin"}, 57 | ] 58 | 59 | result = inferencer.extract_vectors( 60 | dicts=basic_texts, 61 | extraction_strategy="cls_token", 62 | extraction_layer=-1 63 | ) 64 | print(result) 65 | inferencer.close_multiprocessing_pool() 66 | 67 | 68 | if __name__ == "__main__": 69 | embedding_extraction() 70 | 71 | # fmt: on 72 | -------------------------------------------------------------------------------- /experiments/lm_finetuning/finetune_sample_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "general": { 3 | "cache_dir": {"value": null, "default": "", "desc": "Path for storing pre-trained models downloaded from s3."}, 4 | "data_dir": {"value": null, "default": "data/lm_finetune_nips", "desc": "Input directory for downstream task. 
Should contain train + test (+ dev) files."}, 5 | "output_dir": {"value": null, "default": "saved_models", "desc": "Output directory where model predictions and checkpoints will be saved."}, 6 | 7 | "cuda": {"value": false, "default": true, "desc": "CUDA flag, uses CUDA if available."}, 8 | "local_rank": {"value": null, "default": -1, "desc": "If local_rank == -1 -> multiGPU mode on one machine, other values signal distributed computation across several nodes (apex install required)."}, 9 | "use_amp": {"value": null, "default": null, "desc": "Automatic mixed precision with APEX. Must be set to null to disable or to any optimisation level (see apex documentation). 'O1' is recommended."}, 10 | "seed": {"value": null, "default": 42, "desc": "Random seed for initializations."} 11 | }, 12 | 13 | "task": { 14 | "name": {"value": null, "default": "test_lm_finetuning", "desc": "Description."}, 15 | "output_mode": {"value": null, "default": "lm", "desc": "Used for data loading and evaluation. Choices: classification, ner, lm TBD"}, 16 | "prediction_head": {"value": null, "default": "lm", "desc": "Prediction head on top of vanilla LM Model, must correspond to task and data."}, 17 | 18 | "do_eval": {"value": null, "default": false, "desc": "Whether to run eval on the dev set."}, 19 | "do_train": {"value": null, "default": true, "desc": "Whether to run training. Can be used to only evaluate on an already trained model."}, 20 | 21 | "processor_name": {"value": null, "default": "BertStyleLMProcessor", "desc": "Class name of DataProcessor."}, 22 | "dev_split": {"value": null, "default": 0.1, "desc": "Split a dev set from the training set using dev_split as proportion."}, 23 | "train_filename": {"value": null, "default": "train.txt", "desc": "Filename for training."}, 24 | "dev_filename": {"value": null, "default": "dev.txt", "desc": "Filename for development."}, 25 | "test_filename": {"value": null, "default": "test.txt", "desc": "Filename for testing."} 26 | }, 27 | 28 | "parameter": { 29 | "model": {"value": "bert-base-cased", "default": null, "desc": "Bert pre-trained model selected in the list: bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese."}, 30 | "lower_case": {"value": null, "default": false, "desc": "Set to true if you are using an uncased model."}, 31 | "max_seq_len": {"value": null, "default": 128, "desc": "The maximum total input sequence length after WordPiece tokenization. 128 was too short for some texts"}, 32 | "balance_classes": {"value": null, "default": true, "desc": "Balance classes using weighted CrossEntropyLoss. 
Original train set from GermEval18 is skewed and the final evaluation is macro averaged, so we need to balance for optimal performance.."}, 33 | 34 | "num_train_epochs": {"value": null, "default": 1.0, "desc": "Total number of training epochs to perform."}, 35 | "batch_size": {"value": null, "default": 64, "desc": ""}, 36 | "gradient_accumulation_steps": {"value": null, "default": 1, "desc": "Number of updates steps (batches) to accumulate before performing a backward/update pass."} 37 | }, 38 | "optimizer": { 39 | "learning_rate": {"value": null, "default": 2e-5, "desc": "The learning rate for the optimizer."}, 40 | "optimizer_opts": {"value": null, "default": null, "desc": "Additional optimizer config."}, 41 | "schedule_opts": {"value": null, "default": {"name": "LinearWarmup", "warmup_proportion": 0.1}, "desc": "opts for lr schedule"} 42 | }, 43 | "logging": { 44 | "eval_every": {"value": null, "default": 30, "desc": "Steps per training loop (batches) required for evaluation on dev set. Set to 0 when you do not want to do evaluation on dev set during training."}, 45 | "mlflow_url": {"value": "https://public-mlflow.deepset.ai/", "default": null, "desc": "Mlflow server for tracking experiments (e.g. http://80.123.45.167:5000/)"}, 46 | "mlflow_nested": {"value": null, "default": true, "desc": "Nesting mlflow experiments. For doing multiple runs across a set of hyperparameters."}, 47 | 48 | "mlflow_experiment": {"value": "debug_lm_finetuning", "default": null, "desc": "Experiment name used for mlflow"}, 49 | "mlflow_run_name": {"value": "lm finetuning example", "default": null, "desc": "Name of the particular run for mlflow"} 50 | } 51 | } 52 | 53 | 54 | -------------------------------------------------------------------------------- /farm/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import torch.multiprocessing as mp 4 | from farm._version import __version__ 5 | 6 | logging.basicConfig( 7 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 8 | datefmt="%m/%d/%Y %H:%M:%S", 9 | level=logging.INFO, 10 | ) 11 | 12 | # reduce verbosity from transformers library 13 | logging.getLogger('transformers.configuration_utils').setLevel(logging.WARNING) 14 | 15 | # https://pytorch.org/docs/stable/multiprocessing.html#sharing-strategies 16 | if "file_descriptor" in mp.get_all_sharing_strategies(): 17 | import resource 18 | 19 | mp.set_sharing_strategy("file_descriptor") 20 | 21 | rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) 22 | # seting soft limit to hard limit (=rlimit[1]) minus a small amount to be safe 23 | resource.setrlimit(resource.RLIMIT_NOFILE, (rlimit[1]-512, rlimit[1])) 24 | -------------------------------------------------------------------------------- /farm/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.8.1-snapshot" 2 | -------------------------------------------------------------------------------- /farm/conversion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/farm/conversion/__init__.py -------------------------------------------------------------------------------- /farm/conversion/convert_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import argparse 22 | 23 | import torch 24 | from transformers.modeling_bert import ( 25 | BertConfig, 26 | BertForPreTraining, 27 | load_tf_weights_in_bert, 28 | ) 29 | 30 | 31 | def convert_tf_checkpoint_to_pytorch( 32 | tf_checkpoint_path, bert_config_file, pytorch_dump_path 33 | ): 34 | # Initialise PyTorch model 35 | config = BertConfig.from_json_file(bert_config_file) 36 | print("Building PyTorch model from configuration: {}".format(str(config))) 37 | model = BertForPreTraining(config) 38 | 39 | # Load weights from tf checkpoint 40 | load_tf_weights_in_bert(model, config, tf_checkpoint_path) 41 | 42 | # Save pytorch-model 43 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 44 | torch.save(model.state_dict(), pytorch_dump_path) 45 | 46 | 47 | if __name__ == "__main__": 48 | parser = argparse.ArgumentParser() 49 | ## Required parameters 50 | parser.add_argument( 51 | "--tf_checkpoint_path", 52 | default=None, 53 | type=str, 54 | required=True, 55 | help="Path the TensorFlow checkpoint path.", 56 | ) 57 | parser.add_argument( 58 | "--bert_config_file", 59 | default=None, 60 | type=str, 61 | required=True, 62 | help="The config json file corresponding to the pre-trained BERT model. \n" 63 | "This specifies the model architecture.", 64 | ) 65 | parser.add_argument( 66 | "--pytorch_dump_path", 67 | default=None, 68 | type=str, 69 | required=True, 70 | help="Path to the output PyTorch model.", 71 | ) 72 | args = parser.parse_args() 73 | convert_tf_checkpoint_to_pytorch( 74 | args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path 75 | ) 76 | -------------------------------------------------------------------------------- /farm/data_handler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/farm/data_handler/__init__.py -------------------------------------------------------------------------------- /farm/data_handler/dataloader.py: -------------------------------------------------------------------------------- 1 | from math import ceil 2 | 3 | from torch.utils.data import DataLoader, Dataset, Sampler 4 | import torch 5 | 6 | 7 | class NamedDataLoader(DataLoader): 8 | """ 9 | A modified version of the PyTorch DataLoader that returns a dictionary where the key is 10 | the name of the tensor and the value is the tensor itself. 
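    A minimal usage sketch (illustrative only; the tensor names below are assumptions and must
    match the order of the tensors in the wrapped dataset):

        loader = NamedDataLoader(dataset=dataset, batch_size=32,
                                 tensor_names=["input_ids", "padding_mask", "segment_ids"])
        batch = next(iter(loader))  # dict of stacked tensors, e.g. batch["input_ids"]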
11 | """ 12 | 13 | def __init__(self, dataset, batch_size, sampler=None, tensor_names=None, num_workers=0, pin_memory=False): 14 | """ 15 | :param dataset: The dataset that will be wrapped by this NamedDataLoader 16 | :type dataset: Dataset 17 | :param sampler: The sampler used by the NamedDataLoader to choose which samples to include in the batch 18 | :type sampler: Sampler 19 | :param batch_size: The size of the batch to be returned by the NamedDataLoader 20 | :type batch_size: int 21 | :param tensor_names: The names of the tensor, in the order that the dataset returns them in. 22 | :type tensor_names: list 23 | :param num_workers: number of workers to use for the DataLoader 24 | :type num_workers: int 25 | :param pin_memory: argument for Data Loader to use page-locked memory for faster transfer of data to GPU 26 | :type pin_memory: bool 27 | """ 28 | 29 | def collate_fn(batch): 30 | """ 31 | A custom collate function that formats the batch as a dictionary where the key is 32 | the name of the tensor and the value is the tensor itself 33 | """ 34 | 35 | if type(dataset).__name__ == "_StreamingDataSet": 36 | _tensor_names = dataset.tensor_names 37 | else: 38 | _tensor_names = tensor_names 39 | 40 | if type(batch[0]) == list: 41 | batch = batch[0] 42 | 43 | assert len(batch[0]) == len( 44 | _tensor_names 45 | ), "Dataset contains {} tensors while there are {} tensor names supplied: {}".format( 46 | len(batch[0]), len(_tensor_names), _tensor_names 47 | ) 48 | lists_temp = [[] for _ in range(len(_tensor_names))] 49 | ret = dict(zip(_tensor_names, lists_temp)) 50 | 51 | for example in batch: 52 | for name, tensor in zip(_tensor_names, example): 53 | ret[name].append(tensor) 54 | 55 | for key in ret: 56 | ret[key] = torch.stack(ret[key]) 57 | 58 | return ret 59 | 60 | super(NamedDataLoader, self).__init__( 61 | dataset=dataset, 62 | sampler=sampler, 63 | batch_size=batch_size, 64 | collate_fn=collate_fn, 65 | pin_memory=pin_memory, 66 | num_workers=num_workers, 67 | ) 68 | 69 | def __len__(self): 70 | if type(self.dataset).__name__ == "_StreamingDataSet": 71 | num_samples = len(self.dataset) 72 | num_batches = ceil(num_samples / self.dataset.batch_size) 73 | return num_batches 74 | else: 75 | return super().__len__() 76 | 77 | 78 | def covert_dataset_to_dataloader(dataset, sampler, batch_size): 79 | """ 80 | Wraps a PyTorch Dataset with a DataLoader. 81 | 82 | :param dataset: Dataset to be wrapped. 83 | :type dataset: Dataset 84 | :param sampler: PyTorch sampler used to pick samples in a batch. 85 | :type sampler: Sampler 86 | :param batch_size: Number of samples in the batch. 87 | :return: A DataLoader that wraps the input Dataset. 88 | """ 89 | sampler_initialized = sampler(dataset) 90 | data_loader = DataLoader( 91 | dataset, sampler=sampler_initialized, batch_size=batch_size 92 | ) 93 | return data_loader 94 | -------------------------------------------------------------------------------- /farm/data_handler/dataset.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable 2 | import numpy as np 3 | import numbers 4 | import logging 5 | import torch 6 | from torch.utils.data import Dataset, ConcatDataset, TensorDataset 7 | from farm.utils import flatten_list 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def convert_features_to_dataset(features): 13 | """ 14 | Converts a list of feature dictionaries (one for each sample) into a PyTorch Dataset. 15 | 16 | :param features: A list of dictionaries. 
Each dictionary corresponds to one sample. Its keys are the 17 | names of the type of feature and the keys are the features themselves. 18 | :Return: a Pytorch dataset and a list of tensor names. 19 | """ 20 | # features can be an empty list in cases where down sampling occurs (e.g. Natural Questions 21 | # downsamples instances of is_impossible) 22 | if len(features) == 0: 23 | return None, None 24 | tensor_names = list(features[0].keys()) 25 | all_tensors = [] 26 | for t_name in tensor_names: 27 | # Conversion of floats 28 | if t_name == 'regression_label_ids': 29 | cur_tensor = torch.tensor([sample[t_name] for sample in features], dtype=torch.float32) 30 | else: 31 | try: 32 | # Checking weather a non-integer will be silently converted to torch.long 33 | check = features[0][t_name] 34 | if isinstance(check, numbers.Number): 35 | base = check 36 | # extract a base variable from a nested lists or tuples 37 | elif isinstance(check, list): 38 | base = list(flatten_list(check))[0] 39 | # extract a base variable from numpy arrays 40 | else: 41 | base = check.ravel()[0] 42 | if not np.issubdtype(type(base), np.integer): 43 | logger.warning(f"Problem during conversion to torch tensors:\n" 44 | f"A non-integer value for feature '{t_name}' with a value of: " 45 | f"'{base}' will be converted to a torch tensor of dtype long.") 46 | except: 47 | logger.warning(f"Could not determine type for feature '{t_name}'. " 48 | "Converting now to a tensor of default type long.") 49 | 50 | # Convert all remaining python objects to torch long tensors 51 | cur_tensor = torch.tensor([sample[t_name] for sample in features], dtype=torch.long) 52 | 53 | all_tensors.append(cur_tensor) 54 | 55 | dataset = TensorDataset(*all_tensors) 56 | return dataset, tensor_names 57 | 58 | 59 | class ConcatTensorDataset(ConcatDataset): 60 | r"""ConcatDataset of only TensorDatasets which supports getting slices. 61 | 62 | This dataset allows the use of slices, e.g. ds[2:4] if all concatenated 63 | datasets are either TensorDatasets or Subset or other ConcatTensorDataset instances 64 | which eventually contain only TensorDataset instances. If no slicing is needed, 65 | this class works exactly like torch.utils.data.ConcatDataset and can concatenate arbitrary 66 | (not just TensorDataset) datasets. 
67 | 68 | Args: 69 | datasets (sequence): List of datasets to be concatenated 70 | """ 71 | def __init__(self, datasets: Iterable[Dataset]) -> None: 72 | super(ConcatTensorDataset, self).__init__(datasets) 73 | 74 | def __getitem__(self, idx): 75 | if isinstance(idx, slice): 76 | rows = [super(ConcatTensorDataset, self).__getitem__(i) for i in range(self.__len__())[idx]] 77 | return tuple(map(torch.stack, zip(*rows))) 78 | elif isinstance(idx, (list, np.ndarray)): 79 | rows = [super(ConcatTensorDataset, self).__getitem__(i) for i in idx] 80 | return tuple(map(torch.stack, zip(*rows))) 81 | else: 82 | return super(ConcatTensorDataset, self).__getitem__(idx) 83 | -------------------------------------------------------------------------------- /farm/data_handler/inputs.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | 3 | 4 | class Question: 5 | def __init__(self, text: str, uid: str=None): 6 | self.text = text 7 | self.uid = uid 8 | 9 | def to_dict(self): 10 | ret = {"question": self.text, 11 | "id": self.uid, 12 | "answers": []} 13 | return ret 14 | 15 | 16 | class QAInput: 17 | def __init__(self, doc_text: str, questions: Union[List[Question], Question]): 18 | self.doc_text = doc_text 19 | if type(questions) == Question: 20 | self.questions = [questions] 21 | else: 22 | self.questions = questions 23 | 24 | def to_dict(self): 25 | questions = [q.to_dict() for q in self.questions] 26 | ret = {"qas": questions, 27 | "context": self.doc_text} 28 | return ret 29 | 30 | -------------------------------------------------------------------------------- /farm/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/farm/evaluation/__init__.py -------------------------------------------------------------------------------- /farm/evaluation/msmarco_passage_farm.py: -------------------------------------------------------------------------------- 1 | from farm.evaluation.msmarco_passage_official import compute_metrics_from_files 2 | import os 3 | import pandas as pd 4 | 5 | 6 | def msmarco_evaluation(preds_file, dev_file, qrels_file, output_file): 7 | """ 8 | Performs official msmarco passage ranking evaluation (https://github.com/microsoft/MSMARCO-Passage-Ranking) 9 | on a file containing the is_relevent prediction scores. 
It will convert the input file (qid, pid, score) 10 | into the format expected by the official eval function (compute_metrics_from_files) 11 | 12 | :param predictions_filename: File where each line is the is_relevant prediction score 13 | :param dev_filename: File in format qid, query, pid, passage, label 14 | :param qrels_filename: File in the format qid, pid when is_relevant=1 15 | :param output_file: File to write to in format qid, pid, rank 16 | 17 | :return: 18 | """ 19 | 20 | # Initialize files 21 | preds_scores = [float(l) for l in open(preds_file)] 22 | dev_lines = [l for i,l in enumerate(open(dev_file)) if i != 0] 23 | output = open(output_file, "w") 24 | 25 | # Populate a dict with all qid/pid/score triples 26 | results = dict() 27 | for i, (score, line) in enumerate(zip(preds_scores, dev_lines)): 28 | if i == 0: 29 | continue 30 | qid, _, pid, _, _ = line.split("\t") 31 | if qid not in results: 32 | results[qid] = [] 33 | results[qid].append((pid, score)) 34 | 35 | # ########## 36 | # ### NOTE: This block is to generate a view that is interpretable when debugging 37 | # ########## 38 | # interpretable = dict() 39 | # for i, (score, line) in enumerate(zip(preds_scores, dev_lines)): 40 | # if i == 0: 41 | # continue 42 | # _, query, _, passage, label = line.split("\t") 43 | # if query not in interpretable: 44 | # interpretable[query] = [] 45 | # interpretable[query].append((passage, score, label[:-1])) 46 | # for query in interpretable: 47 | # sorted_scores = sorted(interpretable[query], key= lambda x: x[1], reverse=True)[:10] 48 | # results[query] = sorted_scores 49 | # relevant = [] 50 | # for query in interpretable: 51 | # for (passage, score, label) in interpretable[query]: 52 | # if label == "1": 53 | # relevant.append((passage, score)) 54 | # rel_scores = [x[1] for x in relevant] 55 | # irrelevant = [] 56 | # for query in interpretable: 57 | # for (passage, score, label) in interpretable[query]: 58 | # if label == "0": 59 | # irrelevant.append((passage, score)) 60 | # irrel_scores = [x[1] for x in irrelevant] 61 | # print() 62 | 63 | # Sort by scores and take top 10 64 | for qid in list(results): 65 | sorted_scores = sorted(results[qid], key= lambda x: x[1], reverse=True)[:10] 66 | results[qid] = [(pid, i+1) for i, (pid, _) in enumerate(sorted_scores)] 67 | 68 | # Write to file 69 | for qid in list(results): 70 | for (pid, rank) in results[qid]: 71 | output.write(f"{qid}\t{pid}\t{rank}\n") 72 | output.close() 73 | 74 | curr_qids = list(results) 75 | df = pd.read_csv(qrels_file, sep="\t", header=None) 76 | df = df.loc[df[0].isin(curr_qids)] 77 | df.to_csv("tmp", sep="\t", header=None, index=None) 78 | 79 | path_to_reference = "tmp" 80 | path_to_candidate = output_file 81 | metrics = compute_metrics_from_files(path_to_reference, path_to_candidate) 82 | print('#####################') 83 | for metric in sorted(metrics): 84 | print('{}: {}'.format(metric, metrics[metric])) 85 | print('#####################') 86 | os.remove(path_to_reference) 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /farm/inference_rest_api.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | from flask import Flask, request, make_response 7 | from flask_cors import CORS 8 | from flask_restplus import Api, Resource 9 | 10 | from farm.infer import Inferencer 11 | 12 | logger = logging.getLogger(__name__) 13 | logging.basicConfig( 
14 | format="%(asctime)s %(levelname)-8s %(message)s", 15 | level="INFO", 16 | datefmt="%Y-%m-%d %H:%M:%S", 17 | ) 18 | 19 | MODELS_DIRS = ["saved_models", "base_models"] 20 | 21 | model_paths = [] 22 | for model_dir in MODELS_DIRS: 23 | path = Path(model_dir) 24 | if path.is_dir(): 25 | models = [f for f in path.iterdir() if f.is_dir()] 26 | model_paths.extend(models) 27 | 28 | INFERENCERS = {} 29 | for idx, model_dir in enumerate(model_paths): 30 | # refer to examples/inferencer_multiprocessing.py for using multiprocessing in the Inferencers. 31 | INFERENCERS[idx + 1] = Inferencer.load(str(model_dir), num_processes=0) 32 | 33 | app = Flask(__name__) 34 | CORS(app) 35 | api = Api(app, debug=True, validate=True, version="1.0", title="FARM NLP APIs") 36 | app.config["JSON_SORT_KEYS"] = True 37 | app.config["RESTPLUS_VALIDATE"] = True 38 | 39 | 40 | @api.route("/models") 41 | class ModelListEndpoint(Resource): 42 | def get(self): 43 | resp = [] 44 | 45 | for idx, model in INFERENCERS.items(): 46 | 47 | #TODO UI still relies on the old prediction_type attribute, but we should switch this to inferencer.task_type 48 | prediction_type = model.model.prediction_heads[0].model_type 49 | 50 | _res = { 51 | "id": idx, 52 | "name": model.name, 53 | "prediction_type": prediction_type, 54 | "language": model.language, 55 | } 56 | resp.append(_res) 57 | 58 | return resp 59 | 60 | 61 | class NumpyEncoder(json.JSONEncoder): 62 | def default(self, obj): 63 | if isinstance(obj, np.ndarray): 64 | return obj.tolist() 65 | if isinstance(obj, np.float32): 66 | return str(obj) 67 | return json.JSONEncoder.default(self, obj) 68 | 69 | 70 | @api.representation("application/json") 71 | def resp_json(data, code, headers=None): 72 | resp = make_response(json.dumps(data, cls=NumpyEncoder), code) 73 | resp.headers.extend(headers or {}) 74 | return resp 75 | 76 | 77 | @api.route("/models//inference") 78 | class InferenceEndpoint(Resource): 79 | def post(self, model_id): 80 | model = INFERENCERS.get(model_id, None) 81 | if not model: 82 | return "Model not found", 404 83 | 84 | dicts = request.get_json().get("input", None) 85 | if not dicts: 86 | return {} 87 | results = model.inference_from_dicts(dicts=dicts) 88 | return results[0] 89 | 90 | 91 | if __name__ == "__main__": 92 | app.run(host="0.0.0.0") 93 | -------------------------------------------------------------------------------- /farm/modeling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/farm/modeling/__init__.py -------------------------------------------------------------------------------- /farm/visual/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/farm/visual/__init__.py -------------------------------------------------------------------------------- /farm/visual/ascii/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/farm/visual/ascii/__init__.py -------------------------------------------------------------------------------- /farm/visual/ascii/text.py: -------------------------------------------------------------------------------- 1 | 2 | FARM_BLOCKS = """ 3 | .----------------. .----------------. .----------------. .----------------. 
4 | | .--------------. || .--------------. || .--------------. || .--------------. | 5 | | | _________ | || | __ | || | _______ | || | ____ ____ | | 6 | | | |_ ___ | | || | / \ | || | |_ __ \ | || ||_ \ / _|| | 7 | | | | |_ \_| | || | / /\ \ | || | | |__) | | || | | \/ | | | 8 | | | | _| | || | / ____ \ | || | | __ / | || | | |\ /| | | | 9 | | | _| |_ | || | _/ / \ \_ | || | _| | \ \_ | || | _| |_\/_| |_ | | 10 | | | |_____| | || ||____| |____|| || | |____| |___| | || ||_____||_____|| | 11 | | | | || | | || | | || | | | 12 | | '--------------' || '--------------' || '--------------' || '--------------' | 13 | '----------------' '----------------' '----------------' '----------------' 14 | """ 15 | 16 | FARM_DOOM = """ 17 | __ 18 | / _| 19 | | |_ __ _ _ __ _ __ ___ 20 | | _/ _` | '__| '_ ` _ \ 21 | | || (_| | | | | | | | | 22 | |_| \__,_|_| |_| |_| |_| 23 | """ 24 | 25 | FARM_MODULAR = """ 26 | _______ _______ ______ __ __ 27 | | || _ || _ | | |_| | 28 | | ___|| |_| || | || | | 29 | | |___ | || |_||_ | | 30 | | ___|| || __ || | 31 | | | | _ || | | || ||_|| | 32 | |___| |__| |__||___| |_||_| |_| 33 | """ 34 | 35 | FARM_COLOSSAL = """ 36 | .d888 37 | d88P" 38 | 888 39 | 888888 8888b. 888d888 88888b.d88b. 40 | 888 "88b 888P" 888 "888 "88b 41 | 888 .d888888 888 888 888 888 42 | 888 888 888 888 888 888 888 43 | 888 "Y888888 888 888 888 888 44 | """ 45 | 46 | FARM_DIET_COLA = """ 47 | .-._.---' 48 | (_) / 49 | /--..-. ).--.. .-. .-. 50 | / ( | / )/ ) ) 51 | .-/ `-'-'/ '/ / ( 52 | (_/ `-' 53 | """ 54 | 55 | WELCOME = """ 56 | . 57 | / / 58 | `) ( .-. / .-. .-._.. .-. .-. .-. ---/---.-._. 59 | / . )./.-'_ / ( ( ) )/ ) )./.-'_ / ( ) 60 | (_.' `-' (__.'_/_.-`---'`-' '/ / ( (__.' / `-' 61 | `-' """ 62 | 63 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # basics 2 | setuptools 3 | wheel 4 | # PyTorch 5 | # Temp. disabled the next line as it gets currently resolved to https://download.pytorch.org/whl/rocm3.8/torch-1.7.1%2Brocm3.8-cp38-cp38-linux_x86_64.whl 6 | #--find-links=https://download.pytorch.org/whl/torch_stable.html 7 | torch>1.5,<1.10 8 | # progress bars in model download and training scripts 9 | tqdm 10 | # Accessing files from S3 directly. 11 | boto3 12 | # Used for downloading models over HTTP 13 | requests 14 | # Scipy & sklearn for stats in run_classifier 15 | scipy>=1.3.2 16 | sklearn 17 | # Metrics or logging related 18 | seqeval 19 | mlflow<=1.13.1 20 | # huggingface repository 21 | transformers==4.7.0 22 | #sentence transformers 23 | sentence-transformers 24 | # accessing dictionary elements with dot notation 25 | dotmap 26 | # for inference-rest-apis 27 | Werkzeug==0.16.1 28 | flask 29 | flask-restplus 30 | flask-cors 31 | dill # pickle extension for (de-)serialization 32 | # optional for inference 33 | #fasttext==0.9.1 34 | # Inference with ONNX models. Install onnxruntime-gpu for Inference on GPUs 35 | #onnxruntime 36 | #onnxruntime_tools 37 | psutil 38 | sentencepiece 39 | -------------------------------------------------------------------------------- /run_all_experiments.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Downstream runner for all experiments in specified config files.""" 15 | 16 | from pathlib import Path 17 | from farm.experiment import run_experiment, load_experiments 18 | 19 | 20 | def main(): 21 | config_files = [ 22 | Path("experiments/ner/conll2003_de_config.json"), 23 | Path("experiments/ner/conll2003_en_config.json"), 24 | Path("experiments/ner/germEval14_config.json"), 25 | Path("experiments/text_classification/germEval18Fine_config.json"), 26 | Path("experiments/text_classification/germEval18Coarse_config.json"), 27 | Path("experiments/text_classification/gnad_config.json"), 28 | Path("experiments/text_classification/cola_config.json"), 29 | Path("experiments/qa/squad20_config.json"), 30 | ] 31 | 32 | for conf_file in config_files: 33 | experiments = load_experiments(conf_file) 34 | for experiment in experiments: 35 | run_experiment(experiment) 36 | 37 | if __name__ == "__main__": 38 | main() 39 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = readme.rst -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import re 4 | from io import open 5 | 6 | from setuptools import find_packages, setup 7 | 8 | 9 | def parse_requirements(filename): 10 | """ 11 | Parse a requirements pip file returning the list of required packages. It exclude commented lines and --find-links directives. 12 | 13 | Args: 14 | filename: pip requirements requirements 15 | 16 | Returns: 17 | list of required package with versions constraints 18 | 19 | """ 20 | with open(filename) as file: 21 | parsed_requirements = file.read().splitlines() 22 | parsed_requirements = [line.strip() 23 | for line in parsed_requirements 24 | if not ((line.strip()[0] == "#") or line.strip().startswith('--find-links'))] 25 | return parsed_requirements 26 | 27 | 28 | def get_dependency_links(filename): 29 | """ 30 | Parse a requirements pip file looking for the --find-links directive. 
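For illustration (an assumed example, not taken from the original docs): a requirements line such as
``--find-links=https://download.pytorch.org/whl/torch_stable.html`` would be returned as
``["https://download.pytorch.org/whl/torch_stable.html"]``.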
31 | Args: 32 | filename: pip requirements requirements 33 | 34 | Returns: 35 | list of find-links's url 36 | """ 37 | with open(filename) as file: 38 | parsed_requirements = file.read().splitlines() 39 | dependency_links = list() 40 | for line in parsed_requirements: 41 | line = line.strip() 42 | if line.startswith('--find-links'): 43 | dependency_links.append(line.split('=')[1]) 44 | return dependency_links 45 | 46 | 47 | dependency_links = get_dependency_links('requirements.txt') 48 | parsed_requirements = parse_requirements('requirements.txt') 49 | 50 | 51 | def versionfromfile(*filepath): 52 | infile = os.path.join(*filepath) 53 | with open(infile) as fp: 54 | version_match = re.search( 55 | r"^__version__\s*=\s*['\"]([^'\"]*)['\"]", fp.read(), re.M 56 | ) 57 | if version_match: 58 | return version_match.group(1) 59 | raise RuntimeError("Unable to find version string in {}.".format(infile)) 60 | 61 | 62 | here = os.path.abspath(os.path.dirname(__file__)) 63 | 64 | 65 | setup( 66 | name="farm", 67 | version=versionfromfile(here, "farm", "_version.py"), 68 | author="Timo Moeller, Malte Pietsch, Branden Chan, Tanay Soni, Bogdan Kostic, Julian Risch", 69 | author_email="timo.moeller@deepset.ai", 70 | description="Framework for finetuning and evaluating transformer based language models", 71 | long_description=open("readme.rst", "r", encoding="utf-8").read(), 72 | long_description_content_type="text/x-rst", 73 | keywords="BERT NLP deep-learning language-model transformer qa question-answering transfer-learning", 74 | license="Apache", 75 | url="https://github.com/deepset-ai/FARM", 76 | download_url="https://github.com/deepset-ai/FARM/archive/v"+versionfromfile(here, "farm", "_version.py")+".tar.gz", 77 | packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), 78 | dependency_links=dependency_links, 79 | install_requires=parsed_requirements, 80 | python_requires=">=3.6.0", 81 | extras_require={ 82 | "fasttext": ["fasttext==0.9.1"], 83 | "onnx": ["onnxruntime"], 84 | }, 85 | tests_require=["pytest"], 86 | classifiers=[ 87 | "Intended Audience :: Science/Research", 88 | "License :: OSI Approved :: Apache Software License", 89 | "Programming Language :: Python :: 3", 90 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 91 | ], 92 | ) 93 | -------------------------------------------------------------------------------- /test/benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Inference Speed Benchmarks 2 | 3 | FARM provides an automated speed benchmarking pipeline with options to parameterize the benchmarks with batch_size, 4 | max sequence length, document size, and so on. 5 | 6 | The pipeline is implemented using [pytest-benchmark](https://github.com/ionelmc/pytest-benchmark). The warmup/iterations for each benchmark are configurable and the 7 | results can be exported to a JSON file. 8 | 9 | 10 | 11 | ## Question Answering 12 | 13 | The `benchmarks/question_answering.py` file contains tests for inference with PyTorch(`test_question_answering_pytorch`) 14 | and ONNXRuntime(`test_question_answering_onnx`). 15 | 16 | The benchmarks are available [here](https://docs.google.com/spreadsheets/d/1ak9Cxj1zcNBDtjf7qn2j_ydKDDzpBgWiyJ7cO-7BPvA/edit?usp=sharing). 17 | 18 | ### Running Benchmark with Docker 19 | 20 | #### GPU 21 | For running benchmark on a GPU, bash into the Docker Image using ```docker run -it --gpus all deepset/farm-onnxruntime-gpu:0.4.3 bash```. 
22 | Once inside the container, execute ```cd FARM/test && pytest benchmarks/question_answering.py -k test_question_answering_pytorch --use_gpu --benchmark-json result.json```. 23 | 24 | #### CPU 25 | Bash into the Docker container with ```docker run -it deepset/farm-inference-api:0.4.3 bash``` and then execute 26 | ```cd test && pytest benchmarks/question_answering.py -k test_question_answering_pytorch --benchmark-json result.json```. 27 | 28 | ### Exporting results in CSV format 29 | 30 | The results of benchmarks are exported to a `result.json` file in the `test` folder. To convert results to csv format, 31 | execute `python benchmarks/convert_result_to_csv.py`. -------------------------------------------------------------------------------- /test/benchmarks/conftest.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from farm.infer import Inferencer 6 | from farm.modeling.adaptive_model import AdaptiveModel 7 | 8 | 9 | @pytest.fixture(scope="session") 10 | def onnx_adaptive_model_qa(use_gpu, num_processes, model_name_or_path="deepset/bert-base-cased-squad2"): 11 | if (Path(model_name_or_path) / "model.onnx").is_file(): # load model directly if in ONNX format 12 | onnx_model_path = model_name_or_path 13 | else: # convert to ONNX format 14 | onnx_model_path = Path("benchmarks/onnx-export") 15 | model = AdaptiveModel.convert_from_transformers( 16 | model_name_or_path, device="cpu", task_type="question_answering" 17 | ) 18 | model.convert_to_onnx(onnx_model_path) 19 | 20 | try: 21 | model = Inferencer.load( 22 | onnx_model_path, task_type="question_answering", batch_size=1, num_processes=num_processes, gpu=use_gpu 23 | ) 24 | yield model 25 | finally: 26 | if num_processes != 0: 27 | model.close_multiprocessing_pool() 28 | 29 | -------------------------------------------------------------------------------- /test/benchmarks/convert_result_to_csv.py: -------------------------------------------------------------------------------- 1 | import json 2 | import csv 3 | 4 | with open("result.json") as f: 5 | results = json.load(f) 6 | 7 | with open("result.csv", "w") as f: 8 | fieldnames = list(results["benchmarks"][0]["params"].keys()) 9 | fieldnames.append("time") 10 | writer = csv.DictWriter(f, fieldnames=fieldnames) 11 | writer.writeheader() 12 | 13 | for benchmark in results["benchmarks"]: 14 | writer.writerow({"time": benchmark["stats"]["total"], **benchmark["params"]}) 15 | -------------------------------------------------------------------------------- /test/benchmarks/question_answering.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pytest 4 | import torch 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | @pytest.mark.parametrize("max_seq_len", [128, 256, 384]) 10 | @pytest.mark.parametrize("batch_size", [1, 4, 16, 64]) 11 | @pytest.mark.parametrize("document_size", [10_000, 100_000]) 12 | @pytest.mark.parametrize("num_processes", [0], scope="session") 13 | def test_question_answering_pytorch(adaptive_model_qa, benchmark, max_seq_len, batch_size, use_gpu, document_size): 14 | if use_gpu and not torch.cuda.is_available(): 15 | pytest.skip("Skipping benchmarking on GPU as it not available.") 16 | 17 | if not use_gpu and document_size > 10_000: 18 | pytest.skip("Document size is large for CPU") 19 | 20 | with open("benchmarks/sample_file.txt") as f: 21 | context = f.read()[:document_size] 22 | QA_input = [{"qas": ["When 
were the first traces of Human life found in France?"], "context": context}] 23 | 24 | adaptive_model_qa.batch_size = batch_size 25 | adaptive_model_qa.max_seq_len = max_seq_len 26 | benchmark.pedantic( 27 | target=adaptive_model_qa.inference_from_dicts, args=(QA_input,), warmup_rounds=1, iterations=3, 28 | ) 29 | 30 | 31 | @pytest.mark.parametrize("max_seq_len", [128, 256, 384]) 32 | @pytest.mark.parametrize("batch_size", [1, 4, 16, 64]) 33 | @pytest.mark.parametrize("document_size", [10_000, 100_000]) 34 | @pytest.mark.parametrize("num_processes", [0], scope="session") 35 | def test_question_answering_onnx(onnx_adaptive_model_qa, benchmark, max_seq_len, batch_size, use_gpu, document_size): 36 | if use_gpu and not torch.cuda.is_available(): 37 | pytest.skip("Skipping benchmarking on GPU as it not available.") 38 | 39 | if not use_gpu and document_size > 10_000: 40 | pytest.skip("Document size is large for CPU") 41 | 42 | with open("benchmarks/sample_file.txt") as f: 43 | context = f.read()[:document_size] 44 | QA_input = [{"qas": ["When were the first traces of Human life found in France?"], "context": context}] 45 | 46 | onnx_adaptive_model_qa.batch_size = batch_size 47 | onnx_adaptive_model_qa.max_seq_len = max_seq_len 48 | benchmark.pedantic( 49 | target=onnx_adaptive_model_qa.inference_from_dicts, args=(QA_input,), warmup_rounds=1, iterations=3 50 | ) 51 | -------------------------------------------------------------------------------- /test/benchmarks/question_answering_components.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 34 | 35 | 36 |
37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /test/benchmarks/samples/question_answering_questions.txt: -------------------------------------------------------------------------------- 1 | When were the first traces of Human life found in France? 2 | How many pretrained models are available in Transformers? 3 | What does Transformers provide? 4 | Transformers provides interoperability between which frameworks? -------------------------------------------------------------------------------- /test/conftest.py: -------------------------------------------------------------------------------- 1 | import psutil 2 | import pytest 3 | from pathlib import Path 4 | 5 | from farm.data_handler.data_silo import DataSilo 6 | from farm.data_handler.processor import SquadProcessor 7 | from farm.modeling.adaptive_model import AdaptiveModel 8 | from farm.modeling.language_model import LanguageModel 9 | from farm.modeling.optimization import initialize_optimizer 10 | from farm.modeling.prediction_head import QuestionAnsweringHead 11 | from farm.modeling.tokenization import Tokenizer 12 | from farm.train import Trainer 13 | from farm.utils import set_all_seeds, initialize_device_settings 14 | from farm.infer import Inferencer, QAInferencer 15 | 16 | 17 | def pytest_addoption(parser): 18 | """ 19 | Hook to pass pytest-fixture arguments to tests from the command line. 20 | """ 21 | parser.addoption("--use_gpu", action="store_true", default=False) 22 | 23 | 24 | def pytest_generate_tests(metafunc): 25 | """ 26 | This method gets called for all test cases. Here, we set the arguments supplied in pytest_addoption(). 27 | """ 28 | option_value = metafunc.config.option.use_gpu 29 | if 'use_gpu' in metafunc.fixturenames: 30 | if option_value: 31 | metafunc.parametrize("use_gpu", [True], scope="session") 32 | else: 33 | metafunc.parametrize("use_gpu", [False], scope="session") 34 | 35 | 36 | def pytest_collection_modifyitems(items): 37 | for item in items: 38 | if "conversion" in item.nodeid: 39 | item.add_marker(pytest.mark.conversion) 40 | 41 | 42 | @pytest.fixture(scope="module") 43 | def adaptive_model_qa(use_gpu, num_processes): 44 | """ 45 | PyTest Fixture for a Question Answering Inferencer based on PyTorch. 
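A hypothetical way a test could consume this fixture (sketch only, not an existing test in the
suite; the question and context strings are invented for the example)::

    def test_qa_smoke(adaptive_model_qa):
        qa_input = [{"qas": ["Who lives in Berlin?"],
                     "context": "My name is Carla and I live in Berlin."}]
        # inference_from_dicts is the same entry point used by the benchmarks and REST API
        result = adaptive_model_qa.inference_from_dicts(dicts=qa_input)
        assert len(result) > 0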
46 | """ 47 | try: 48 | model = Inferencer.load( 49 | "deepset/bert-base-cased-squad2", 50 | task_type="question_answering", 51 | batch_size=16, 52 | num_processes=num_processes, 53 | gpu=use_gpu, 54 | ) 55 | yield model 56 | finally: 57 | if num_processes != 0: 58 | # close the pool 59 | # we pass join=True to wait for all sub processes to close 60 | # this is because below we want to test if all sub-processes 61 | # have exited 62 | model.close_multiprocessing_pool(join=True) 63 | 64 | # check if all workers (sub processes) are closed 65 | current_process = psutil.Process() 66 | children = current_process.children() 67 | assert len(children) == 0 68 | 69 | 70 | @pytest.fixture(scope="module") 71 | def bert_base_squad2(request): 72 | model = QAInferencer.load( 73 | "deepset/minilm-uncased-squad2", 74 | task_type="question_answering", 75 | batch_size=4, 76 | num_processes=0, 77 | multithreading_rust=False, 78 | use_fast=True # TODO parametrize this to test slow as well 79 | ) 80 | return model 81 | 82 | # TODO add other model types (roberta, xlm-r, albert) here as well 83 | 84 | @pytest.fixture(scope="module") 85 | def distilbert_squad(request): 86 | set_all_seeds(seed=42) 87 | device, n_gpu = initialize_device_settings(use_cuda=False) 88 | batch_size = 2 89 | n_epochs = 1 90 | evaluate_every = 4 91 | base_LM_model = "distilbert-base-uncased" 92 | 93 | tokenizer = Tokenizer.load( 94 | pretrained_model_name_or_path=base_LM_model, 95 | do_lower_case=True, 96 | use_fast=True # TODO parametrize this to test slow as well 97 | ) 98 | label_list = ["start_token", "end_token"] 99 | processor = SquadProcessor( 100 | tokenizer=tokenizer, 101 | max_seq_len=20, 102 | doc_stride=10, 103 | max_query_length=6, 104 | train_filename="train-sample.json", 105 | dev_filename="dev-sample.json", 106 | test_filename=None, 107 | data_dir=Path("samples/qa"), 108 | label_list=label_list, 109 | metric="squad" 110 | ) 111 | 112 | data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1) 113 | language_model = LanguageModel.load(base_LM_model) 114 | prediction_head = QuestionAnsweringHead() 115 | model = AdaptiveModel( 116 | language_model=language_model, 117 | prediction_heads=[prediction_head], 118 | embeds_dropout_prob=0.1, 119 | lm_output_types=["per_token"], 120 | device=device, 121 | ) 122 | 123 | model, optimizer, lr_schedule = initialize_optimizer( 124 | model=model, 125 | learning_rate=2e-5, 126 | #optimizer_opts={'name': 'AdamW', 'lr': 2E-05}, 127 | n_batches=len(data_silo.loaders["train"]), 128 | n_epochs=n_epochs, 129 | device=device 130 | ) 131 | trainer = Trainer( 132 | model=model, 133 | optimizer=optimizer, 134 | data_silo=data_silo, 135 | epochs=n_epochs, 136 | n_gpu=n_gpu, 137 | lr_schedule=lr_schedule, 138 | evaluate_every=evaluate_every, 139 | device=device 140 | ) 141 | trainer.train() 142 | 143 | return model, processor 144 | 145 | -------------------------------------------------------------------------------- /test/create_testdata.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import os 5 | import pprint 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | def squad_subsample(): 10 | if not os.path.exists("samples/qa"): 11 | os.makedirs("samples/qa") 12 | 13 | with open('../data/squad20/dev-v2.0.json') as json_file: 14 | data = json.load(json_file) 15 | 16 | ss = data["data"][0]["paragraphs"][:1] 17 | sample = {} 18 | sample["data"] = [{"paragraphs": ss}] 19 | # just creating 
same train and dev files 20 | with open('samples/qa/dev-sample.json', 'w') as outfile: 21 | json.dump(sample, outfile) 22 | with open('samples/qa/train-sample.json', 'w') as outfile: 23 | json.dump(sample, outfile) 24 | 25 | def germeval14_subsample(): 26 | if not os.path.exists("samples/ner"): 27 | os.makedirs("samples/ner") 28 | 29 | with open('../data/germeval14/dev.txt') as file: 30 | data = file.readlines() 31 | 32 | ss = "".join(data[:200]) 33 | with open('samples/ner/train-sample.txt', 'w') as outfile: 34 | outfile.write(ss) 35 | with open('samples/ner/dev-sample.txt', 'w') as outfile: 36 | outfile.write(ss) 37 | 38 | def germeval18_subsample(): 39 | if not os.path.exists("samples/doc_class"): 40 | os.makedirs("samples/doc_class") 41 | with open('../data/germeval18/test.tsv') as file: 42 | data = file.readlines() 43 | 44 | ss = "".join(data[:50]) 45 | with open('samples/doc_class/train-sample.tsv', 'w') as outfile: 46 | outfile.write(ss) 47 | with open('samples/doc_class/test-sample.tsv', 'w') as outfile: 48 | outfile.write(ss) 49 | 50 | if __name__=="__main__": 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument('--task', type=str, default='', help="Which task to create testdata for: qa, ner, doc_class") 53 | args = parser.parse_args() 54 | if(args.task == "qa"): 55 | logger.info("Creating test data for Question Answering, please make sure the original data is already downloaded and in data/squad20") 56 | squad_subsample() 57 | elif(args.task == "ner"): 58 | logger.info( 59 | "Creating test data for NER, please make sure the original data is already downloaded and in data/germeval14") 60 | germeval14_subsample() 61 | elif(args.task == "doc_class"): 62 | logger.info( 63 | "Creating test data for Document Classification, please make sure the original data is already downloaded and in data/germeval18") 64 | germeval18_subsample() -------------------------------------------------------------------------------- /test/modeling/test_optimization.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from farm.modeling.optimization import initialize_optimizer 4 | 5 | 6 | def test_initialize_optimizer_param_schedule_opts(): 7 | with pytest.raises(TypeError): 8 | initialize_optimizer(None, 1, 1, 'cpu', 0.4e-5, schedule_opts=[]) 9 | -------------------------------------------------------------------------------- /test/samples/doc_class/test-sample.tsv: -------------------------------------------------------------------------------- 1 | text coarse_label fine_label 2 | Meine Mutter hat mir erzählt, dass mein Vater einen Wahlkreiskandidaten nicht gewählt hat das Dreckschwein, weil der gegen die Homo-Ehe ist ☺ OFFENSE OTHER 3 | @Tom174_ @davidbest95 Meine Reaktion; |LBR| Nicht jeder Moslem ist ein Terrorist. Aber jeder Moslem glaubt an Überlieferungen, die Gewalt und Terror begünstigen. OTHER OTHER -------------------------------------------------------------------------------- /test/samples/doc_class/train-sample.tsv: -------------------------------------------------------------------------------- 1 | text coarse_label fine_label 2 | Meine Mutter hat mir erzählt, dass mein Vater einen Wahlkreiskandidaten nicht gewählt hat das Dreckschwein, weil der gegen die Homo-Ehe ist ☺ OFFENSE OTHER 3 | @Tom174_ @davidbest95 Meine Reaktion; |LBR| Nicht jeder Moslem ist ein Terrorist. Aber jeder Moslem glaubt an Überlieferungen, die Gewalt und Terror begünstigen. 
OTHER OTHER 4 | #Merkel rollt dem Emir von #Katar, der islamistischen Terror unterstützt, den roten Teppich aus.Wir brauchen einen sofortigen #Waffenstopp! OTHER OTHER 5 | „Merle ist kein junges unschuldiges Mädchen“ Kch....... 😱 #tatort OTHER OTHER -------------------------------------------------------------------------------- /test/samples/doc_class_other_text_column_name/test-sample.tsv: -------------------------------------------------------------------------------- 1 | text_other coarse_label fine_label 2 | Meine Mutter hat mir erzählt, dass mein Vater einen Wahlkreiskandidaten nicht gewählt hat das Dreckschwein, weil der gegen die Homo-Ehe ist ☺ OFFENSE OTHER 3 | @Tom174_ @davidbest95 Meine Reaktion; |LBR| Nicht jeder Moslem ist ein Terrorist. Aber jeder Moslem glaubt an Überlieferungen, die Gewalt und Terror begünstigen. OTHER OTHER 4 | -------------------------------------------------------------------------------- /test/samples/doc_class_other_text_column_name/train-sample.tsv: -------------------------------------------------------------------------------- 1 | text_other coarse_label fine_label 2 | Meine Mutter hat mir erzählt, dass mein Vater einen Wahlkreiskandidaten nicht gewählt hat das Dreckschwein, weil der gegen die Homo-Ehe ist ☺ OFFENSE OTHER 3 | @Tom174_ @davidbest95 Meine Reaktion; |LBR| Nicht jeder Moslem ist ein Terrorist. Aber jeder Moslem glaubt an Überlieferungen, die Gewalt und Terror begünstigen. OTHER OTHER 4 | #Merkel rollt dem Emir von #Katar, der islamistischen Terror unterstützt, den roten Teppich aus.Wir brauchen einen sofortigen #Waffenstopp! OTHER OTHER 5 | „Merle ist kein junges unschuldiges Mädchen“ Kch....... 😱 #tatort OTHER OTHER 6 | -------------------------------------------------------------------------------- /test/samples/doc_regr/test-sample.tsv: -------------------------------------------------------------------------------- 1 | text label 2 | I love, love this dress except for the armpits. if they had just made the armpits a normal round shape with normal openings, the dress would have been perfection. so audrey hepburn!! but i had to say no. i really wish they would redo this dress with normal arm openings. i think it would sell like crazy. 4 3 | I wanted this sweater to work but sadly it failed. first, the pink was way to sheer for my liking. the sheerness caused a weird color overlap on the stomach area. then the band at the bottom was too tight causing a weird ballooning affect. a shirt underneath could work but it takes away from the beauty of the knit. the soft pink is gorgeous but not good for medium to light skinned folks. 2 4 | Oh my! i love this tee. it is super soft. i love how it doesn't look like a sack with no shape. i can't wait to get more colors. i am tall plus have a long torso and it still is long enough for me so this is definitely a win! 5 5 | I love the style of this swimsuit on the model. when i purchased is i didn't realize that there was no support (wire/ padding/ lining) in the chest. the rest of the swimsuit was great but i did not like the look in the chest - it provided no support. i ended up returning it. 3 -------------------------------------------------------------------------------- /test/samples/doc_regr/train-sample.tsv: -------------------------------------------------------------------------------- 1 | text label 2 | The embroidery around the chest/collar is lovely. but the lower half of the shirt didn't fit my post-pregnancy bod. it's going back. 4 3 | "I am so pleased with this top! 
it is slightly fitted - i am 5'3"", 110 lbs, - and have trouble finding tops that are flattering but not too form fitting. also it is 100% cotton, which is a definite plus. as of now it is my go-to top - looks great with jeans or leggings." 5 4 | I honestly don't understand whey this top isn't sold out. i have it in both colors and love it! it's a cool, gauzy woven fabric, super soft and perfect for warm weather. the white fabric is doubled so it's not see-through, the pink (more of a pale terracotta) is doubled halfway up, so it's slightly sheer on top but your pants/skirt waistband will not show through. it is a loose-fitting top, so you may be able to size down. i usually wear size 4p, but it was sold out so i got regular size 2 and i 5 5 | How can you go wrong with soft cotton top that is neither too snug nor too loose? expect will wear these as layers under when really cold and by them selves in early spring and in the fall. great colors and love the multiple textures. 5 -------------------------------------------------------------------------------- /test/samples/doc_regr_other_text_column_name/test-sample.tsv: -------------------------------------------------------------------------------- 1 | text_other label 2 | I love, love this dress except for the armpits. if they had just made the armpits a normal round shape with normal openings, the dress would have been perfection. so audrey hepburn!! but i had to say no. i really wish they would redo this dress with normal arm openings. i think it would sell like crazy. 4 3 | I wanted this sweater to work but sadly it failed. first, the pink was way to sheer for my liking. the sheerness caused a weird color overlap on the stomach area. then the band at the bottom was too tight causing a weird ballooning affect. a shirt underneath could work but it takes away from the beauty of the knit. the soft pink is gorgeous but not good for medium to light skinned folks. 2 4 | Oh my! i love this tee. it is super soft. i love how it doesn't look like a sack with no shape. i can't wait to get more colors. i am tall plus have a long torso and it still is long enough for me so this is definitely a win! 5 5 | I love the style of this swimsuit on the model. when i purchased is i didn't realize that there was no support (wire/ padding/ lining) in the chest. the rest of the swimsuit was great but i did not like the look in the chest - it provided no support. i ended up returning it. 3 6 | -------------------------------------------------------------------------------- /test/samples/doc_regr_other_text_column_name/train-sample.tsv: -------------------------------------------------------------------------------- 1 | text_other label 2 | The embroidery around the chest/collar is lovely. but the lower half of the shirt didn't fit my post-pregnancy bod. it's going back. 4 3 | "I am so pleased with this top! it is slightly fitted - i am 5'3"", 110 lbs, - and have trouble finding tops that are flattering but not too form fitting. also it is 100% cotton, which is a definite plus. as of now it is my go-to top - looks great with jeans or leggings." 5 4 | I honestly don't understand whey this top isn't sold out. i have it in both colors and love it! it's a cool, gauzy woven fabric, super soft and perfect for warm weather. the white fabric is doubled so it's not see-through, the pink (more of a pale terracotta) is doubled halfway up, so it's slightly sheer on top but your pants/skirt waistband will not show through. 
it is a loose-fitting top, so you may be able to size down. i usually wear size 4p, but it was sold out so i got regular size 2 and i 5 5 | How can you go wrong with soft cotton top that is neither too snug nor too loose? expect will wear these as layers under when really cold and by them selves in early spring and in the fall. great colors and love the multiple textures. 5 6 | -------------------------------------------------------------------------------- /test/samples/lm_finetuning/test-sample.txt: -------------------------------------------------------------------------------- 1 | Text should be one-sentence-per-line, with empty lines between documents. 2 | A Seentence to teest whoole woord maasking, muust includio multiplee woords wiith subwoord tookens. 3 | This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত 4 | 5 | The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors. 6 | Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity. 7 | Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them. -------------------------------------------------------------------------------- /test/samples/lm_finetuning/train-sample.txt: -------------------------------------------------------------------------------- 1 | Text should be one-sentence-per-line, with empty lines between documents. 2 | A Seentence to teest whoole woord maasking, muust includio multiplee woords wiith subwoord tookens. 3 | This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত 4 | 5 | aaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbb ccccccccccccccccccccccc 6 | aaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbb cccccccccccccccccccccccccccccc 7 | aaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbb ccccccccccccccccccccccc 8 | 9 | The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors. 10 | Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity. 11 | Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them. 12 | -------------------------------------------------------------------------------- /test/samples/ner/dev-sample.txt: -------------------------------------------------------------------------------- 1 | -DOCSTART- -X- -X- -X- O 2 | 3 | Ereignis Ereignis NN I-NC O 4 | und und KON O O 5 | Erzählung Erzählung NN I-NC O 6 | oder oder KON I-NC O 7 | : : $. 
O O 8 | 9 | Albrecht Albrecht NE B-NC I-PER 10 | Lehmann Lehmann NE I-NC I-PER 11 | läßt lassen VVFIN I-VC O 12 | Flüchtlinge Flüchtling NN I-NC O 13 | und und KON O O 14 | Vertriebene Vertriebene NN I-NC O 15 | in in APPR I-PC O 16 | Westdeutschland Westdeutschland NE I-NC I-LOC 17 | , , $, I-NC O 18 | 1945-1990 @card@ CARD I-NC O 19 | , , $, O O 20 | zu zu APPR I-PC O 21 | Wort Wort NN I-NC O 22 | kommen kommen VVFIN I-VC O 23 | 24 | Einwanderungsfragen Einwanderungsfrage|Einwanderungsfragen NN I-NC O 25 | haben haben VAFIN I-VC O 26 | in in APPR I-PC O 27 | Deutschland Deutschland NE I-NC I-LOC 28 | in in APPR I-PC O 29 | den d ART I-NC O 30 | letzten letzt ADJA I-NC O 31 | Monaten Monat NN I-NC O 32 | Politik Politik NN B-NC O 33 | und und KON O O 34 | Medien Medium NN I-NC O 35 | beherrscht beherrschen VVPP I-VC O 36 | . . $. O O 37 | 38 | in in APPR I-PC O 39 | Westdeutschland Westdeutschland NE I-NC I-LOC 40 | von von APPR I-PC O 41 | 1945 1945 CARD I-NC O 42 | bis bis APPR I-PC O 43 | 1990 1990 CARD I-NC O 44 | Aufmerksamkeit Aufmerksamkeit NN I-NC O 45 | zu zu PTKZU I-VC O 46 | erregen erregen VVINF I-VC O 47 | . . $. O O 48 | -------------------------------------------------------------------------------- /test/samples/ner/train-sample.txt: -------------------------------------------------------------------------------- 1 | -DOCSTART- -X- -X- -X- O 2 | 3 | Ereignis Ereignis NN I-NC O 4 | und und KON O O 5 | Erzählung Erzählung NN I-NC O 6 | oder oder KON I-NC O 7 | : : $. O O 8 | 9 | Albrecht Albrecht NE B-NC I-PER 10 | Lehmann Lehmann NE I-NC I-PER 11 | läßt lassen VVFIN I-VC O 12 | Flüchtlinge Flüchtling NN I-NC O 13 | und und KON O O 14 | Vertriebene Vertriebene NN I-NC O 15 | in in APPR I-PC O 16 | Westdeutschland Westdeutschland NE I-NC I-LOC 17 | , , $, I-NC O 18 | 1945-1990 @card@ CARD I-NC O 19 | , , $, O O 20 | zu zu APPR I-PC O 21 | Wort Wort NN I-NC O 22 | kommen kommen VVFIN I-VC O 23 | 24 | Einwanderungsfragen Einwanderungsfrage|Einwanderungsfragen NN I-NC O 25 | haben haben VAFIN I-VC O 26 | in in APPR I-PC O 27 | Deutschland Deutschland NE I-NC I-LOC 28 | in in APPR I-PC O 29 | den d ART I-NC O 30 | letzten letzt ADJA I-NC O 31 | Monaten Monat NN I-NC O 32 | Politik Politik NN B-NC O 33 | und und KON O O 34 | Medien Medium NN I-NC O 35 | beherrscht beherrschen VVPP I-VC O 36 | . . $. O O 37 | 38 | in in APPR I-PC O 39 | Westdeutschland Westdeutschland NE I-NC I-LOC 40 | von von APPR I-PC O 41 | 1945 1945 CARD I-NC O 42 | bis bis APPR I-PC O 43 | 1990 1990 CARD I-NC O 44 | Aufmerksamkeit Aufmerksamkeit NN I-NC O 45 | zu zu PTKZU I-VC O 46 | erregen erregen VVINF I-VC O 47 | . . $. 
O O 48 | 49 | -------------------------------------------------------------------------------- /test/samples/qa/answer-offset-wrong.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "title": "Test", 5 | "paragraphs": [ 6 | { 7 | "context": "Berlin has 10 inhabitants.", 8 | "qas": [ 9 | { 10 | "question": "How many people live in Berlin?", 11 | "id": "5ad3d560604f3c001a3ff2c8", 12 | "answers": [{"text": "10", "answer_start": 0}], 13 | "is_impossible": false 14 | } 15 | ] 16 | } 17 | ] 18 | } 19 | ], 20 | "version": "v2.0" 21 | } -------------------------------------------------------------------------------- /test/samples/qa/answer-wrong.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "title": "Test", 5 | "paragraphs": [ 6 | { 7 | "context": "Berlin has 10 inhabitants.", 8 | "qas": [ 9 | { 10 | "question": "How many people live in Berlin?", 11 | "id": "5ad3d560604f3c001a3ff2c8", 12 | "answers": [{"text": "11", "answer_start": 11}], 13 | "is_impossible": false 14 | } 15 | ] 16 | } 17 | ] 18 | } 19 | ], 20 | "version": "v2.0" 21 | } -------------------------------------------------------------------------------- /test/samples/qa/dev-sample.json: -------------------------------------------------------------------------------- 1 | {"data": [{"paragraphs": [{"qas": [{"question": "In what country is Normandy located?", "id": "56ddde6b9a695914005b9628", "answers": [{"text": "France", "answer_start": 53}], "is_impossible": false}], "context": "The Normans gave their name to Normandy, a region in France."}]}]} -------------------------------------------------------------------------------- /test/samples/qa/eval-sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "title": "Test", 5 | "paragraphs": [ 6 | { 7 | "context": "Berlin has 10 inhabitants.", 8 | "qas": [ 9 | { 10 | "question": "How many people live in Paris?", 11 | "id": "5ad3d560604f3c001a3ff2c6", 12 | "answers": [], 13 | "is_impossible": true 14 | } 15 | ] 16 | } 17 | ] 18 | }, 19 | { 20 | "title": "Test2", 21 | "paragraphs": [ 22 | { 23 | "context": "Berlin has 10 inhabitants.", 24 | "qas": [ 25 | { 26 | "question": "How many people live in Berlin?", 27 | "id": "5ad3d560604f3c001a3ff2c7", 28 | "answers": [{"text": "10", "answer_start": 11}, {"text": "10 inhabitants", "answer_start": 11}], 29 | "is_impossible": false 30 | }, 31 | { 32 | "question": "How many people live in Berlin?", 33 | "id": "5ad3d560604f3c001a3ff2c8", 34 | "answers": [{"text": "Berlin", "answer_start": 0}, {"text": "Berlin", "answer_start": 0}], 35 | "is_impossible": false 36 | } 37 | ] 38 | } 39 | ] 40 | } 41 | ], 42 | "version": "v2.0" 43 | } -------------------------------------------------------------------------------- /test/samples/qa/noanswer.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "title": "Test", 5 | "paragraphs": [ 6 | { 7 | "context": "Berlin has 10 inhabitants.", 8 | "qas": [ 9 | { 10 | "question": "How many people live in Paris?", 11 | "id": "5ad3d560604f3c001a3ff2c8", 12 | "answers": [], 13 | "is_impossible": true 14 | } 15 | ] 16 | } 17 | ] 18 | } 19 | ], 20 | "version": "v2.0" 21 | } -------------------------------------------------------------------------------- /test/samples/qa/train-sample.json: 
-------------------------------------------------------------------------------- 1 | {"data": [{"paragraphs": [{"qas": [{"question": "In what country is Normandy located?", "id": "56ddde6b9a695914005b9628", "answers": [{"text": "France", "answer_start": 159}], "is_impossible": false}], "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia."}]}]} -------------------------------------------------------------------------------- /test/samples/qa/vanilla.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "title": "Test", 5 | "paragraphs": [ 6 | { 7 | "context": "Berlin has 10 inhabitants.", 8 | "qas": [ 9 | { 10 | "question": "How many people live in Berlin?", 11 | "id": "5ad3d560604f3c001a3ff2c8", 12 | "answers": [{"text": "10", "answer_start": 11}, {"text": "10 inhabitants", "answer_start": 11}], 13 | "is_impossible": false 14 | } 15 | ] 16 | } 17 | ] 18 | } 19 | ], 20 | "version": "v2.0" 21 | } -------------------------------------------------------------------------------- /test/samples/s3e/fitted_s3e/language_model_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "embeddings_filename": "vectors.txt", 3 | "hidden_size": 300, 4 | "language": "German", 5 | "name": "WordEmbedding_LM", 6 | "vocab_filename": "vocab.txt", 7 | "vocab_size": 113 8 | } 9 | -------------------------------------------------------------------------------- /test/samples/s3e/fitted_s3e/processor_config.json: -------------------------------------------------------------------------------- 1 | {"baskets": [], "data_dir": null, "dev_filename": null, "dev_split": null, "max_seq_len": 128, "proxies": null, "tasks": {}, "test_filename": null, "train_filename": null, "tokenizer": "EmbeddingTokenizer", "processor": "InferenceProcessor"} -------------------------------------------------------------------------------- /test/samples/s3e/fitted_s3e/s3e_stats.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepset-ai/FARM/5919538f721c7974ea951b322d30a3c0e84a1bc2/test/samples/s3e/fitted_s3e/s3e_stats.pkl -------------------------------------------------------------------------------- /test/samples/s3e/fitted_s3e/vocab.txt: -------------------------------------------------------------------------------- 1 | [CLS] 2 | [SEP] 3 | [UNK] 4 | [PAD] 5 | [MASK] 6 | , 7 | the 8 | . 9 | and 10 | to 11 | of 12 | a 13 | in 14 | is 15 | for 16 | that 17 | it 18 | on 19 | with 20 | ) 21 | ( 22 | you 23 | was 24 | are 25 | this 26 | have 27 | ! 28 | but 29 | by 30 | ? 
31 | my 32 | one 33 | so 34 | has 35 | can 36 | more 37 | had 38 | what 39 | me 40 | would 41 | if 42 | other 43 | its 44 | said 45 | work 46 | how 47 | good 48 | after 49 | great 50 | go 51 | those 52 | love 53 | many 54 | i 55 | very 56 | than 57 | such 58 | got 59 | set 60 | well 61 | much 62 | play 63 | give 64 | everything 65 | does 66 | man 67 | person 68 | buy 69 | video 70 | looking 71 | sure 72 | price 73 | almost 74 | wrong 75 | woman 76 | front 77 | ways 78 | spent 79 | feature 80 | fast 81 | player 82 | far 83 | street 84 | files 85 | models 86 | button 87 | plays 88 | forward 89 | fill 90 | walking 91 | investment 92 | opinion 93 | panel 94 | layout 95 | im 96 | consistently 97 | practically 98 | com 99 | discovering 100 | formats 101 | alternate 102 | sleek 103 | happier 104 | smoothly 105 | reviewer 106 | dvd 107 | amazon 108 | apex 109 | nicest 110 | rewind 111 | mp3s 112 | cads 113 | cods 114 | -------------------------------------------------------------------------------- /test/samples/s3e/tiny_corpus.txt: -------------------------------------------------------------------------------- 1 | a man is walking on the street . 2 | a woman is walking on the street . 3 | im a more happier person after discovering the button ! . 4 | but , if you are looking for my opinion of the apex dvd player , i love it ! . 5 | it practically plays almost everything you give it . 6 | for the price it is a well spent investment ! . 7 | this is by far the nicest one , in so many ways . 8 | it is very sleek looking with a very good front panel button layout , and it has a great feature set . 9 | its fast forward and rewind work much more smoothly and consistently than those of other models i have had . 10 | it plays alternate video formats cads cods very well . 11 | and amazon . com has it for such a great price how can you go wrong ? . 12 | what got me to buy was the reviewer that said it would play dvd fill of files ( mp3s ) . 13 | it sure does ! . 14 | -------------------------------------------------------------------------------- /test/samples/s3e/tiny_fasttext_model/language_model_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "embeddings_filename": "vectors.txt", 3 | "hidden_size": 300, 4 | "language": "German", 5 | "name": "WordEmbedding_LM", 6 | "vocab_filename": "vocab.txt", 7 | "vocab_size": 4008 8 | } -------------------------------------------------------------------------------- /test/samples/s3e/tiny_fasttext_model/vocab.txt: -------------------------------------------------------------------------------- 1 | [CLS] 2 | [SEP] 3 | [UNK] 4 | [PAD] 5 | [MASK] 6 | , 7 | the 8 | . 9 | and 10 | to 11 | of 12 | a 13 | in 14 | is 15 | for 16 | that 17 | it 18 | on 19 | with 20 | ) 21 | ( 22 | you 23 | was 24 | are 25 | this 26 | have 27 | ! 28 | but 29 | by 30 | ? 
31 | my 32 | one 33 | so 34 | has 35 | can 36 | more 37 | had 38 | what 39 | me 40 | would 41 | if 42 | other 43 | its 44 | said 45 | work 46 | how 47 | good 48 | after 49 | great 50 | go 51 | those 52 | love 53 | many 54 | i 55 | very 56 | than 57 | such 58 | got 59 | set 60 | well 61 | much 62 | play 63 | give 64 | everything 65 | does 66 | man 67 | person 68 | buy 69 | video 70 | looking 71 | sure 72 | price 73 | almost 74 | wrong 75 | woman 76 | front 77 | ways 78 | spent 79 | feature 80 | fast 81 | player 82 | far 83 | street 84 | files 85 | models 86 | button 87 | plays 88 | forward 89 | fill 90 | walking 91 | investment 92 | opinion 93 | panel 94 | layout 95 | im 96 | consistently 97 | practically 98 | com 99 | discovering 100 | formats 101 | alternate 102 | sleek 103 | happier 104 | smoothly 105 | reviewer 106 | dvd 107 | amazon 108 | apex 109 | nicest 110 | rewind 111 | mp3s 112 | cads 113 | cods 114 | -------------------------------------------------------------------------------- /test/samples/text_pair/sample.tsv: -------------------------------------------------------------------------------- 1 | text text_b label 2 | how many times have real madrid won the champions league in a row They have also won the competition the most times in a row , winning it five times from 1956 to 1960 . 1 3 | when did new york stop using the electric chair Following the U.S. Supreme Court 's ruling declaring existing capital punishment statutes unconstitutional in Furman v. Georgia ( 1972 ) , New York was without a death penalty until 1995 , when then - Governor George Pataki signed a new statute into law , which provided for execution by lethal injection . 1 4 | songs on 4 your eyez only j cole `` Neighbors '' Cole 3 : 36 8 . 2 5 | how many seasons of the blacklist are there on netflix Retrieved March 27 , 2018 . 0 6 | how many books are in the one piece series The series spans over 800 chapters and more than 80 tankōbon volumes . 1 7 | central idea of poem lines from the deserted village It is a work of social commentary , and condemns rural depopulation and the pursuit of excessive wealth . 1 8 | who shot first in the shot heard around the world The North Bridge skirmish did see the first shots by Americans acting under orders , the first organized volley by Americans , the first British fatalities , and the first British retreat . 1 9 | who is beauty and the beast written by Beauty and the Beast ( French : La Belle et la Bête ) is a traditional fairy tale written by French novelist Gabrielle - Suzanne Barbot de Villeneuve and published in 1740 in La Jeune Américaine et les contes marins ( The Young American and Marine Tales ) . 1 10 | what episode does eleven come in season 1 Deep South Mag . 2 11 | love yourself by justin bieber is about who Rolling Stone . 1 12 | who starred in the movie natural born killers Scagnetti arrives and tells Mickey that unless he surrenders , he will cut off Mallory 's breasts . 0 13 | when does the new season on the 100 come out Monty accidentally fries all of the wristbands . 1 14 | where was the super bowl 52 played at Jump up ^ Chiari , Mike ( January 24 , 2018 ) . 0 15 | who won the academy award for the deer hunter Best Director , Michael Cimino 3 . 1 16 | how long do former presidents get secret service protection All living former presidents and their spouses are now entitled to receive lifetime Secret Service protection . 
1 17 | the man in the high castle episode 1 season 1 Jump up ^ `` FX 's Tyrant casts Annet Mahendru ; Sebastian Roché in Amazon 's Man in the High Castle '' . 0 18 | who has hosted the most fifa world cups On 1 June 2014 , The Sunday Times claimed to have obtained documents including e-mails , letters and bank transfers which allegedly proved that Bin Hammam had paid more than US $5 million to football officials to support the Qatar bid . 0 19 | what was the first form of manga in japan Yomiuri Shimbun . 0 -------------------------------------------------------------------------------- /test/samples/tokenizer/custom_vocab.txt: -------------------------------------------------------------------------------- 1 | neverseentokens -------------------------------------------------------------------------------- /test/test_doc_classification_distilbert.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import logging 3 | import numpy as np 4 | 5 | from farm.data_handler.data_silo import DataSilo 6 | from farm.data_handler.processor import TextClassificationProcessor 7 | from farm.modeling.optimization import initialize_optimizer 8 | from farm.infer import Inferencer 9 | from farm.modeling.adaptive_model import AdaptiveModel 10 | from farm.modeling.language_model import DistilBert 11 | from farm.modeling.prediction_head import TextClassificationHead 12 | from farm.modeling.tokenization import Tokenizer 13 | from farm.train import Trainer 14 | from farm.utils import set_all_seeds, initialize_device_settings 15 | 16 | 17 | def test_doc_classification(caplog): 18 | if caplog: 19 | caplog.set_level(logging.CRITICAL) 20 | 21 | set_all_seeds(seed=42) 22 | device, n_gpu = initialize_device_settings(use_cuda=False) 23 | n_epochs = 1 24 | batch_size = 1 25 | evaluate_every = 2 26 | lang_model = "distilbert-base-german-cased" 27 | 28 | tokenizer = Tokenizer.load( 29 | pretrained_model_name_or_path=lang_model, 30 | do_lower_case=False) 31 | 32 | processor = TextClassificationProcessor(tokenizer=tokenizer, 33 | max_seq_len=8, 34 | data_dir=Path("samples/doc_class"), 35 | train_filename=Path("train-sample.tsv"), 36 | label_list=["OTHER", "OFFENSE"], 37 | metric="f1_macro", 38 | dev_filename="test-sample.tsv", 39 | test_filename=None, 40 | dev_split=0.0, 41 | label_column_name="coarse_label") 42 | 43 | data_silo = DataSilo( 44 | processor=processor, 45 | batch_size=batch_size) 46 | 47 | language_model = DistilBert.load(lang_model) 48 | prediction_head = TextClassificationHead(num_labels=2) 49 | model = AdaptiveModel( 50 | language_model=language_model, 51 | prediction_heads=[prediction_head], 52 | embeds_dropout_prob=0.1, 53 | lm_output_types=["per_sequence"], 54 | device=device) 55 | 56 | model, optimizer, lr_schedule = initialize_optimizer( 57 | model=model, 58 | learning_rate=2e-5, 59 | n_batches=len(data_silo.loaders["train"]), 60 | n_epochs=1, 61 | device=device, 62 | schedule_opts=None) 63 | 64 | trainer = Trainer( 65 | model=model, 66 | optimizer=optimizer, 67 | data_silo=data_silo, 68 | epochs=n_epochs, 69 | n_gpu=n_gpu, 70 | lr_schedule=lr_schedule, 71 | evaluate_every=evaluate_every, 72 | device=device) 73 | 74 | trainer.train() 75 | 76 | save_dir = Path("testsave/doc_class") 77 | model.save(save_dir) 78 | processor.save(save_dir) 79 | 80 | del model 81 | del processor 82 | del optimizer 83 | del data_silo 84 | del trainer 85 | 86 | basic_texts = [ 87 | {"text": "Malte liebt Berlin."}, 88 | {"text": "Schartau sagte dem Tagesspiegel, 
dass Fischer ein Idiot sei."} 89 | ] 90 | 91 | inf = Inferencer.load(save_dir, batch_size=2, num_processes=0) 92 | result = inf.inference_from_dicts(dicts=basic_texts) 93 | assert isinstance(result[0]["predictions"][0]["probability"], np.float32) 94 | del inf 95 | 96 | if __name__ == "__main__": 97 | test_doc_classification(None) 98 | -------------------------------------------------------------------------------- /test/test_doc_regression.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | 4 | import numpy as np 5 | import pytest 6 | 7 | from farm.data_handler.data_silo import DataSilo 8 | from farm.data_handler.processor import RegressionProcessor 9 | from farm.modeling.optimization import initialize_optimizer 10 | from farm.infer import Inferencer 11 | from farm.modeling.adaptive_model import AdaptiveModel 12 | from farm.modeling.language_model import LanguageModel 13 | from farm.modeling.prediction_head import RegressionHead 14 | from farm.modeling.tokenization import Tokenizer 15 | from farm.train import Trainer 16 | from farm.utils import set_all_seeds, initialize_device_settings 17 | 18 | @pytest.mark.parametrize("data_dir_path,text_column_name", 19 | [("samples/doc_regr", None), 20 | ("samples/doc_regr_other_text_column_name", "text_other")]) 21 | def test_doc_regression(data_dir_path, text_column_name, caplog=None): 22 | if caplog: 23 | caplog.set_level(logging.CRITICAL) 24 | 25 | set_all_seeds(seed=42) 26 | device, n_gpu = initialize_device_settings(use_cuda=False) 27 | n_epochs = 1 28 | batch_size = 1 29 | evaluate_every = 2 30 | lang_model = "bert-base-cased" 31 | 32 | tokenizer = Tokenizer.load( 33 | pretrained_model_name_or_path=lang_model, 34 | do_lower_case=False) 35 | 36 | rp_params = dict(tokenizer=tokenizer, 37 | max_seq_len=8, 38 | data_dir=Path(data_dir_path), 39 | train_filename="train-sample.tsv", 40 | dev_filename="test-sample.tsv", 41 | test_filename=None, 42 | label_column_name="label") 43 | 44 | if text_column_name is not None: 45 | rp_params["text_column_name"] = text_column_name 46 | 47 | processor = RegressionProcessor(**rp_params) 48 | 49 | data_silo = DataSilo( 50 | processor=processor, 51 | batch_size=batch_size) 52 | 53 | language_model = LanguageModel.load(lang_model) 54 | prediction_head = RegressionHead() 55 | model = AdaptiveModel( 56 | language_model=language_model, 57 | prediction_heads=[prediction_head], 58 | embeds_dropout_prob=0.1, 59 | lm_output_types=["per_sequence_continuous"], 60 | device=device 61 | ) 62 | 63 | model, optimizer, lr_schedule = initialize_optimizer( 64 | model=model, 65 | learning_rate=2e-5, 66 | #optimizer_opts={'name': 'AdamW', 'lr': 2E-05}, 67 | n_batches=len(data_silo.loaders["train"]), 68 | n_epochs=1, 69 | device=device, 70 | schedule_opts={'name': 'CosineWarmup', 'warmup_proportion': 0.1} 71 | ) 72 | 73 | trainer = Trainer( 74 | model=model, 75 | optimizer=optimizer, 76 | data_silo=data_silo, 77 | epochs=n_epochs, 78 | n_gpu=n_gpu, 79 | lr_schedule=lr_schedule, 80 | evaluate_every=evaluate_every, 81 | device=device 82 | ) 83 | 84 | trainer.train() 85 | 86 | save_dir = Path("testsave/doc_regr") 87 | model.save(save_dir) 88 | processor.save(save_dir) 89 | 90 | del model 91 | del processor 92 | del optimizer 93 | del data_silo 94 | del trainer 95 | 96 | basic_texts = [ 97 | {"text": "The dress is just fabulous and it totally fits my size. The fabric is of great quality and the seams are really well hidden. 
I am super happy with this purchase and I am looking forward to trying some more from the same brand."}, 98 | {"text": "it just did not fit right. The top is very thin showing everything."}, 99 | ] 100 | 101 | model = Inferencer.load(save_dir, num_processes=0) 102 | result = model.inference_from_dicts(dicts=basic_texts) 103 | assert isinstance(result[0]["predictions"][0]["pred"], np.float32) 104 | del model 105 | -------------------------------------------------------------------------------- /test/test_evaluation_metrics.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import math 3 | import numpy as np 4 | 5 | from farm.evaluation.metrics import compute_metrics 6 | from farm.evaluation.semantic_answer_similarity_evaluation import semantic_answer_similarity 7 | 8 | def test_compute_metrics_basic(): 9 | # check we get some exception, may not always be the AssertionError we get now 10 | with pytest.raises(Exception): 11 | compute_metrics("acc", ["x"] * 10, [""] * 11) 12 | ret = compute_metrics("acc", [], []) 13 | assert isinstance(ret, dict) 14 | assert "acc" in ret 15 | assert math.isnan(ret["acc"]) 16 | with pytest.raises(Exception): 17 | compute_metrics("asdfasdf", ["a"], ["b"]) 18 | ls = (["a"] * 5) 19 | ls.extend(["b"] * 5) 20 | ps = ["a"] * 10 21 | ret = compute_metrics("acc", ps, ls) 22 | assert ret["acc"] == 0.5 23 | ret = compute_metrics("acc", ls, ps) 24 | assert ret["acc"] == 0.5 25 | ret = compute_metrics("f1_macro", ps, ls) 26 | assert ret["f1_macro"] == 1/3 27 | ret = compute_metrics("f1_macro", ls, ps) 28 | assert ret["f1_macro"] == 1 / 3 29 | ret = compute_metrics(["f1_macro", "acc"], ps, ls) 30 | assert isinstance(ret, dict) 31 | assert len(ret) == 2 32 | assert "acc" in ret 33 | assert "f1_macro" in ret 34 | assert ret["f1_macro"] == 1/3 35 | assert ret["acc"] == 0.5 36 | ret = compute_metrics(["f1_macro", "acc", "acc"], ps, ls) 37 | assert isinstance(ret, dict) 38 | assert len(ret) == 2 39 | assert "acc" in ret 40 | assert "f1_macro" in ret 41 | assert ret["f1_macro"] == 1/3 42 | assert ret["acc"] == 0.5 43 | ret = compute_metrics(["f1_macro", ["acc"]], ps, ls) 44 | assert isinstance(ret, dict) 45 | assert len(ret) == 2 46 | assert "acc" in ret 47 | assert "f1_macro" in ret 48 | assert ret["f1_macro"] == 1/3 49 | assert ret["acc"] == 0.5 50 | 51 | def test_semantic_answer_similarity(bert_base_squad2): 52 | bert_base_squad2.model.prediction_heads[0].n_best = 2 53 | result = bert_base_squad2.inference_from_file(file="samples/qa/eval-sample.json",return_json=False) 54 | 55 | top1_sim, topn_sim, r, d = semantic_answer_similarity(result=result, 56 | sts_model_path_or_string="paraphrase-MiniLM-L6-v2", 57 | debug=True) 58 | 59 | assert np.isclose(top1_sim, 0.7405298) 60 | assert np.isclose(topn_sim, 0.7405298) 61 | assert len(d) == 1 62 | assert "semantic_answer_score" in r[0].prediction[0].meta 63 | 64 | -------------------------------------------------------------------------------- /test/test_inference.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | 4 | from farm.infer import Inferencer 5 | from transformers import BertTokenizerFast 6 | 7 | 8 | @pytest.mark.parametrize("streaming", [True, False]) 9 | @pytest.mark.parametrize("multiprocessing_chunksize", [None, 2]) 10 | @pytest.mark.parametrize("num_processes", [2, 0, None], scope="module") 11 | def test_qa_format_and_results(adaptive_model_qa, streaming, multiprocessing_chunksize): 12 | 
qa_inputs_dicts = [ 13 | { 14 | "questions": ["In what country is Normandy"], 15 | "text": "The Normans are an ethnic group that arose in Normandy, a northern region " 16 | "of France, from contact between Viking settlers and indigenous Franks and Gallo-Romans", 17 | }, 18 | { 19 | "questions": ["Who counted the game among the best ever made?"], 20 | "text": "Twilight Princess was released to universal critical acclaim and commercial success. It received " 21 | "perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic " 22 | "Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings " 23 | "and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores " 24 | "of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the " 25 | "greatest games ever created.", 26 | }, 27 | ] 28 | ground_truths = ["France", "GameTrailers"] 29 | 30 | results = adaptive_model_qa.inference_from_dicts( 31 | dicts=qa_inputs_dicts, 32 | multiprocessing_chunksize=multiprocessing_chunksize, 33 | streaming=streaming, 34 | ) 35 | # sample results 36 | # [ 37 | # { 38 | # "task": "qa", 39 | # "predictions": [ 40 | # { 41 | # "question": "In what country is Normandy", 42 | # "question_id": "None", 43 | # "ground_truth": None, 44 | # "answers": [ 45 | # { 46 | # "score": 1.1272038221359253, 47 | # "probability": -1, 48 | # "answer": "France", 49 | # "offset_answer_start": 54, 50 | # "offset_answer_end": 60, 51 | # "context": "The Normans gave their name to Normandy, a region in France.", 52 | # "offset_context_start": 0, 53 | # "offset_context_end": 60, 54 | # "document_id": None, 55 | # } 56 | # ] 57 | # } 58 | # ], 59 | # } 60 | # ] 61 | predictions = list(results)[0]["predictions"] 62 | 63 | for prediction, ground_truth, qa_input_dict in zip( 64 | predictions, ground_truths, qa_inputs_dicts 65 | ): 66 | assert prediction["question"] == qa_input_dict["questions"][0] 67 | answer = prediction["answers"][0] 68 | assert answer["answer"] in answer["context"] 69 | assert answer["answer"] == ground_truth 70 | assert ( 71 | {"answer", "score", "probability", "offset_answer_start", "offset_answer_end", "context", 72 | "offset_context_start", "offset_context_end", "document_id"} 73 | == answer.keys() 74 | ) 75 | 76 | 77 | @pytest.mark.parametrize("num_processes", [0], scope="session") 78 | @pytest.mark.parametrize("use_fast", [True]) 79 | def test_embeddings_extraction(num_processes, use_fast): 80 | # Input 81 | basic_texts = [ 82 | {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot ist"}, 83 | {"text": "Martin Müller spielt Fussball"}, 84 | ] 85 | 86 | # Load model, tokenizer and processor directly into Inferencer 87 | model = Inferencer.load( 88 | model_name_or_path="bert-base-german-cased", 89 | task_type="embeddings", 90 | gpu=False, 91 | batch_size=5, 92 | extraction_strategy="reduce_mean", 93 | extraction_layer=-2, 94 | use_fast=use_fast, 95 | num_processes=num_processes, 96 | ) 97 | 98 | # Get embeddings for input text (you can vary the strategy and layer) 99 | result = model.inference_from_dicts(dicts=basic_texts) 100 | assert result[0]["context"] == basic_texts[0]["text"] 101 | assert result[0]["vec"].shape == (768,) 102 | assert np.isclose(result[0]["vec"][0], 0.01501756374325071, atol=0.00001) 103 | 104 | 105 | def test_inferencer_with_fast_bert_tokenizer(): 106 | model = Inferencer.load("bert-base-german-cased", task_type='text_classification', 107 | use_fast=True, 
num_processes=0) 108 | tokenizer = model.processor.tokenizer 109 | assert type(tokenizer) is BertTokenizerFast 110 | 111 | 112 | if __name__ == "__main__": 113 | test_embeddings_extraction(num_processes=0, use_fast=True) 114 | -------------------------------------------------------------------------------- /test/test_model_versioning.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from farm.infer import Inferencer 4 | 5 | def test_wrong_revision(caplog=None): 6 | # We want this load attempt to fail because we specify an invalid revision 7 | failed_load = None 8 | try: 9 | failed_load = Inferencer.load("deepset/roberta-base-squad2", revision="xxx", task_type="question_answering") 10 | except: 11 | pass 12 | assert not failed_load 13 | 14 | def test_revision_v1(caplog=None): 15 | model = Inferencer.load("deepset/roberta-base-squad2", revision="v1.0", task_type="question_answering") 16 | assert torch.isclose(torch.sum(model.model.language_model.model.encoder.layer[0].intermediate.dense.weight), 17 | torch.sum(torch.tensor([-21394.6055]))) 18 | del model 19 | 20 | def test_revision_v2(caplog=None): 21 | model = Inferencer.load("deepset/roberta-base-squad2", revision="v2.0", task_type="question_answering") 22 | assert torch.isclose(torch.sum(model.model.language_model.model.encoder.layer[0].intermediate.dense.weight), 23 | torch.sum(torch.tensor([-21411.4414]))) 24 | del model 25 | 26 | def test_revision_default(caplog=None): 27 | # default model should be the same as v2 28 | model = Inferencer.load("deepset/roberta-base-squad2", task_type="question_answering") 29 | assert torch.isclose( 30 | torch.sum(model.model.language_model.model.encoder.layer[0].intermediate.dense.weight), 31 | torch.sum(torch.tensor([-21411.4414]))) 32 | del model 33 | -------------------------------------------------------------------------------- /test/test_natural_questions.py: -------------------------------------------------------------------------------- 1 | # TODO enable NQ tests again 2 | 3 | # import logging 4 | # from pathlib import Path 5 | # import numpy as np 6 | # import pytest 7 | # 8 | # from farm.data_handler.data_silo import DataSilo 9 | # from farm.data_handler.processor import NaturalQuestionsProcessor 10 | # from farm.modeling.adaptive_model import AdaptiveModel 11 | # from farm.modeling.language_model import LanguageModel 12 | # from farm.modeling.optimization import initialize_optimizer 13 | # from farm.modeling.prediction_head import QuestionAnsweringHead, TextClassificationHead 14 | # from farm.modeling.tokenization import Tokenizer 15 | # from farm.train import Trainer 16 | # from farm.utils import set_all_seeds, initialize_device_settings 17 | # from farm.infer import Inferencer, QAInferencer 18 | # 19 | # @pytest.fixture() 20 | # def distilbert_nq(caplog=None): 21 | # if caplog: 22 | # caplog.set_level(logging.CRITICAL) 23 | # 24 | # 25 | # set_all_seeds(seed=42) 26 | # device, n_gpu = initialize_device_settings(use_cuda=False) 27 | # batch_size = 2 28 | # n_epochs = 1 29 | # evaluate_every = 4 30 | # base_LM_model = "distilbert-base-uncased" 31 | # 32 | # tokenizer = Tokenizer.load( 33 | # pretrained_model_name_or_path=base_LM_model, do_lower_case=True 34 | # ) 35 | # processor = NaturalQuestionsProcessor( 36 | # tokenizer=tokenizer, 37 | # max_seq_len=20, 38 | # doc_stride=10, 39 | # max_query_length=6, 40 | # train_filename="train_sample.jsonl", 41 | # dev_filename="dev_sample.jsonl", 42 | # data_dir=Path("samples/nq") 43 | # ) 44 | # 45 | # 
data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1) 46 | # language_model = LanguageModel.load(base_LM_model) 47 | # qa_head = QuestionAnsweringHead() 48 | # classification_head = TextClassificationHead(num_labels=len(processor.answer_type_list)) 49 | # 50 | # model = AdaptiveModel( 51 | # language_model=language_model, 52 | # prediction_heads=[qa_head, classification_head], 53 | # embeds_dropout_prob=0.1, 54 | # lm_output_types=["per_token", "per_sequence"], 55 | # device=device, 56 | # ) 57 | # 58 | # model, optimizer, lr_schedule = initialize_optimizer( 59 | # model=model, 60 | # learning_rate=2e-5, 61 | # #optimizer_opts={'name': 'AdamW', 'lr': 2E-05}, 62 | # n_batches=len(data_silo.loaders["train"]), 63 | # n_epochs=n_epochs, 64 | # device=device 65 | # ) 66 | # trainer = Trainer( 67 | # model=model, 68 | # optimizer=optimizer, 69 | # data_silo=data_silo, 70 | # epochs=n_epochs, 71 | # n_gpu=n_gpu, 72 | # lr_schedule=lr_schedule, 73 | # evaluate_every=evaluate_every, 74 | # device=device 75 | # ) 76 | # trainer.train() 77 | # return model, processor 78 | # 79 | # 80 | # def test_training(distilbert_nq): 81 | # model, processor = distilbert_nq 82 | # assert type(model) == AdaptiveModel 83 | # assert type(processor) == NaturalQuestionsProcessor 84 | # 85 | # 86 | # def test_inference(distilbert_nq, caplog=None): 87 | # if caplog: 88 | # caplog.set_level(logging.CRITICAL) 89 | # model, processor = distilbert_nq 90 | # 91 | # save_dir = Path("testsave/qa_nq") 92 | # model.save(save_dir) 93 | # processor.save(save_dir) 94 | # 95 | # inferencer = QAInferencer.load(save_dir, batch_size=2, gpu=False, num_processes=0) 96 | # assert inferencer is not None 97 | # 98 | # qa_format_1 = [ 99 | # { 100 | # "questions": ["Who counted the game among the best ever made?"], 101 | # "text": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created." 102 | # } 103 | # ] 104 | # qa_format_2 = [ 105 | # { 106 | # "qas":["Who counted the game among the best ever made?"], 107 | # "context": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. 
GameTrailers in their review called it one of the greatest games ever created.", 108 | # } 109 | # ] 110 | # 111 | # result1 = inferencer.inference_from_dicts(dicts=qa_format_1) 112 | # result2 = inferencer.inference_from_dicts(dicts=qa_format_2) 113 | # assert result1 == result2 114 | # 115 | # if __name__ == "__main__": 116 | # test_training() 117 | # test_inference() -------------------------------------------------------------------------------- /test/test_ner.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pytest 3 | 4 | import numpy as np 5 | 6 | from farm.data_handler.data_silo import DataSilo 7 | from farm.data_handler.processor import NERProcessor 8 | from farm.modeling.optimization import initialize_optimizer 9 | from farm.infer import Inferencer 10 | from farm.modeling.adaptive_model import AdaptiveModel 11 | from farm.modeling.language_model import LanguageModel 12 | from farm.modeling.prediction_head import TokenClassificationHead 13 | from farm.modeling.tokenization import Tokenizer 14 | from farm.train import Trainer 15 | from farm.utils import set_all_seeds, initialize_device_settings 16 | 17 | import logging 18 | 19 | # TODO: Test slow tokenizers when reimplemented 20 | @pytest.mark.parametrize("use_fast", [True]) 21 | def test_ner(caplog, use_fast): 22 | if caplog: 23 | caplog.set_level(logging.CRITICAL) 24 | 25 | set_all_seeds(seed=42) 26 | device, n_gpu = initialize_device_settings(use_cuda=False) 27 | n_epochs = 3 28 | batch_size = 2 29 | evaluate_every = 1 30 | lang_model = "distilbert-base-german-cased" 31 | 32 | tokenizer = Tokenizer.load( 33 | pretrained_model_name_or_path=lang_model, do_lower_case=False, 34 | use_fast=use_fast, 35 | ) 36 | 37 | ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", 38 | "I-OTH"] 39 | 40 | processor = NERProcessor( 41 | tokenizer=tokenizer, 42 | max_seq_len=8, 43 | data_dir=Path("samples/ner"), 44 | train_filename="train-sample.txt", 45 | dev_filename="dev-sample.txt", 46 | test_filename=None, 47 | delimiter=" ", 48 | label_list=ner_labels, 49 | metric="seq_f1", 50 | multithreading_rust=False 51 | ) 52 | 53 | data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1) 54 | language_model = LanguageModel.load(lang_model) 55 | prediction_head = TokenClassificationHead(num_labels=13) 56 | 57 | model = AdaptiveModel( 58 | language_model=language_model, 59 | prediction_heads=[prediction_head], 60 | embeds_dropout_prob=0.1, 61 | lm_output_types=["per_token"], 62 | device=device, 63 | ) 64 | 65 | model, optimizer, lr_schedule = initialize_optimizer( 66 | model=model, 67 | learning_rate=2e-5, 68 | #optimizer_opts={'name': 'AdamW', 'lr': 2E-05}, 69 | n_batches=len(data_silo.loaders["train"]), 70 | n_epochs=1, 71 | device=device, 72 | schedule_opts={'name': 'LinearWarmup', 'warmup_proportion': 0.1} 73 | ) 74 | trainer = Trainer( 75 | model=model, 76 | optimizer=optimizer, 77 | data_silo=data_silo, 78 | epochs=n_epochs, 79 | n_gpu=n_gpu, 80 | lr_schedule=lr_schedule, 81 | evaluate_every=evaluate_every, 82 | device=device, 83 | ) 84 | 85 | save_dir = Path("testsave/ner") 86 | model = trainer.train() 87 | model.save(save_dir) 88 | processor.save(save_dir) 89 | 90 | del model 91 | del processor 92 | del optimizer 93 | del data_silo 94 | del trainer 95 | 96 | basic_texts = [ 97 | {"text": "Paris is a town in France."}, 98 | ] 99 | model = 
Inferencer.load(model_name_or_path="dbmdz/bert-base-cased-finetuned-conll03-english", num_processes=0, task_type="ner", use_fast=use_fast) 100 | # labels arent correctly inserted from transformers 101 | # They are converted to LABEL_1 ... LABEL_N 102 | # For the inference result to contain predictions we need them in IOB NER format 103 | model.processor.tasks["ner"]["label_list"][-1] = "B-LOC" 104 | result = model.inference_from_dicts(dicts=basic_texts) 105 | 106 | assert result[0]["predictions"][0][0]["context"] == "Paris" 107 | assert isinstance(result[0]["predictions"][0][0]["probability"], np.float32) 108 | assert result[0]["predictions"][0][0]["probability"] > 0.99 109 | assert result[0]["predictions"][0][0]["label"] == "LOC" 110 | 111 | 112 | if __name__ == "__main__": 113 | test_ner(None, True) 114 | -------------------------------------------------------------------------------- /test/test_ner_amp.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import numpy as np 3 | 4 | from farm.data_handler.data_silo import DataSilo 5 | from farm.data_handler.processor import NERProcessor 6 | from farm.modeling.optimization import initialize_optimizer, AMP_AVAILABLE 7 | from farm.infer import Inferencer 8 | from farm.modeling.adaptive_model import AdaptiveModel 9 | from farm.modeling.language_model import LanguageModel 10 | from farm.modeling.prediction_head import TokenClassificationHead 11 | from farm.modeling.tokenization import Tokenizer 12 | from farm.train import Trainer 13 | from farm.utils import set_all_seeds, initialize_device_settings 14 | 15 | import logging 16 | 17 | 18 | def test_ner_amp(caplog): 19 | if caplog: 20 | caplog.set_level(logging.CRITICAL) 21 | 22 | set_all_seeds(seed=42) 23 | device, n_gpu = initialize_device_settings(use_cuda=True) 24 | n_epochs = 1 25 | batch_size = 2 26 | evaluate_every = 1 27 | lang_model = "bert-base-german-cased" 28 | if AMP_AVAILABLE: 29 | use_amp = 'O1' 30 | else: 31 | use_amp = None 32 | 33 | tokenizer = Tokenizer.load( 34 | pretrained_model_name_or_path=lang_model, do_lower_case=False 35 | ) 36 | 37 | ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", 38 | "I-OTH"] 39 | 40 | processor = NERProcessor( 41 | tokenizer=tokenizer, 42 | max_seq_len=8, 43 | data_dir=Path("samples/ner"), 44 | train_filename=Path("train-sample.txt"), 45 | dev_filename=Path("dev-sample.txt"), 46 | test_filename=None, 47 | delimiter=" ", 48 | label_list=ner_labels, 49 | metric="seq_f1" 50 | ) 51 | 52 | data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1) 53 | language_model = LanguageModel.load(lang_model) 54 | prediction_head = TokenClassificationHead(num_labels=13) 55 | 56 | model = AdaptiveModel( 57 | language_model=language_model, 58 | prediction_heads=[prediction_head], 59 | embeds_dropout_prob=0.1, 60 | lm_output_types=["per_token"], 61 | device=device 62 | ) 63 | 64 | model, optimizer, lr_schedule = initialize_optimizer( 65 | model=model, 66 | learning_rate=2e-05, 67 | schedule_opts=None, 68 | n_batches=len(data_silo.loaders["train"]), 69 | n_epochs=n_epochs, 70 | device=device, 71 | use_amp=use_amp) 72 | 73 | trainer = Trainer( 74 | model=model, 75 | optimizer=optimizer, 76 | data_silo=data_silo, 77 | epochs=n_epochs, 78 | n_gpu=n_gpu, 79 | lr_schedule=lr_schedule, 80 | evaluate_every=evaluate_every, 81 | device=device, 82 | ) 83 | 84 | save_dir = Path("testsave/ner") 85 | trainer.train() 86 | 
model.save(save_dir) 87 | processor.save(save_dir) 88 | 89 | basic_texts = [ 90 | {"text": "1980 kam der Crown von Toyota"}, 91 | ] 92 | model = Inferencer.load(save_dir, num_processes=0) 93 | result = model.inference_from_dicts(dicts=basic_texts) 94 | 95 | assert result[0]["predictions"][0][0]["context"] == "1980" 96 | assert isinstance(result[0]["predictions"][0][0]["probability"], np.float32) 97 | assert np.isclose(result[0]["predictions"][0][0]["probability"], 0.161, rtol=0.05) 98 | assert result[0]["predictions"][0][0]["label"] == "LOC" 99 | 100 | 101 | if __name__ == "__main__": 102 | test_ner_amp(None) 103 | -------------------------------------------------------------------------------- /test/test_onnx_conversion.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from farm.infer import Inferencer 5 | from farm.modeling.adaptive_model import AdaptiveModel 6 | 7 | 8 | @pytest.mark.parametrize("model_name", ["deepset/bert-base-cased-squad2", "deepset/roberta-base-squad2"]) 9 | def test_onnx_conversion_and_inference(tmp_path, model_name): 10 | AdaptiveModel.convert_to_onnx( 11 | model_name=model_name, output_path=tmp_path / "test-onnx", task_type="question_answering" 12 | ) 13 | onnx_inferencer = Inferencer.load(tmp_path / "test-onnx", task_type="question_answering", num_processes=0) 14 | qa_input = [ 15 | { 16 | "questions": ["What is the population of Berlin?"], 17 | "text": "Berlin is the capital and largest city of Germany by both area and population. Its 3,769,495 " 18 | "inhabitants as of December 31, 2019 make it the most populous city of the European Union, " 19 | "according to population within city limits.The city is also one of Germany's 16 federal states.", 20 | } 21 | ] 22 | result_onnx = onnx_inferencer.inference_from_dicts(qa_input)[0] 23 | assert result_onnx["predictions"][0]["answers"][0]["answer"] == "3,769,495" 24 | 25 | pytorch_inferencer = Inferencer.load(model_name, task_type="question_answering", num_processes=0) 26 | result_pytorch = pytorch_inferencer.inference_from_dicts(qa_input)[0] 27 | 28 | for (onnx, pytorch) in zip( 29 | result_onnx["predictions"][0]["answers"][0].items(), result_pytorch["predictions"][0]["answers"][0].items() 30 | ): 31 | # keys 32 | assert onnx[0] == pytorch[0] 33 | # values 34 | if type(onnx[1]) == float: 35 | np.testing.assert_almost_equal(onnx[1], pytorch[1], decimal=4) # score 36 | else: 37 | assert onnx[1] == pytorch[1] 38 | -------------------------------------------------------------------------------- /test/test_prediction_head.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | from pathlib import Path 4 | import pytest 5 | 6 | from farm.data_handler.data_silo import DataSilo 7 | from farm.data_handler.processor import TextClassificationProcessor 8 | from farm.modeling.adaptive_model import AdaptiveModel 9 | from farm.modeling.language_model import LanguageModel 10 | from farm.modeling.prediction_head import TextClassificationHead 11 | from farm.modeling.tokenization import Tokenizer 12 | from farm.utils import set_all_seeds, initialize_device_settings 13 | 14 | 15 | def test_prediction_head_load_save_class_weights(tmp_path, caplog=None): 16 | """This is a regression test for #428 and #422.""" 17 | if caplog: 18 | caplog.set_level(logging.CRITICAL) 19 | 20 | set_all_seeds(seed=42) 21 | device, n_gpu = initialize_device_settings(use_cuda=False) 22 | batch_size = 1 23 | 
lang_model = "bert-base-german-cased" 24 | data_dir_path = "samples/doc_class" 25 | 26 | tokenizer = Tokenizer.load( 27 | pretrained_model_name_or_path=lang_model, 28 | do_lower_case=False) 29 | 30 | tcp_params = dict(tokenizer=tokenizer, 31 | max_seq_len=8, 32 | data_dir=Path(data_dir_path), 33 | train_filename="train-sample.tsv", 34 | label_list=["OTHER", "OFFENSE"], 35 | metric="f1_macro", 36 | dev_filename="test-sample.tsv", 37 | test_filename=None, 38 | dev_split=0.0, 39 | label_column_name="coarse_label") 40 | 41 | processor = TextClassificationProcessor(**tcp_params) 42 | 43 | data_silo = DataSilo( 44 | processor=processor, 45 | batch_size=batch_size) 46 | 47 | language_model = LanguageModel.load(lang_model) 48 | prediction_head = TextClassificationHead( 49 | num_labels=2, 50 | class_weights=data_silo.calculate_class_weights(task_name="text_classification")) 51 | 52 | model = AdaptiveModel( 53 | language_model=language_model, 54 | prediction_heads=[prediction_head], 55 | embeds_dropout_prob=0.1, 56 | lm_output_types=["per_sequence"], 57 | device=device) 58 | 59 | model.save(tmp_path) 60 | model_loaded = AdaptiveModel.load(tmp_path, device='cpu') 61 | assert model_loaded is not None 62 | 63 | def test_TextClassificationHead_class_weights_dimensions(): 64 | with pytest.raises(ValueError): 65 | class_wights = np.asarray([[0.4, 0.6], [0.8, 0.2]]) 66 | TextClassificationHead( 67 | num_labels=2, 68 | class_weights=class_wights) 69 | -------------------------------------------------------------------------------- /test/test_processor_saving_loading.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | 4 | from farm.data_handler.processor import TextClassificationProcessor 5 | from farm.modeling.tokenization import Tokenizer 6 | from farm.utils import set_all_seeds 7 | import torch 8 | 9 | def test_processor_saving_loading(caplog): 10 | if caplog is not None: 11 | caplog.set_level(logging.CRITICAL) 12 | 13 | set_all_seeds(seed=42) 14 | lang_model = "bert-base-cased" 15 | 16 | tokenizer = Tokenizer.load( 17 | pretrained_model_name_or_path=lang_model, do_lower_case=False 18 | ) 19 | 20 | processor = TextClassificationProcessor(tokenizer=tokenizer, 21 | max_seq_len=128, 22 | data_dir=Path("samples/doc_class"), 23 | train_filename="train-sample.tsv", 24 | dev_filename=None, 25 | test_filename=None, 26 | label_column_name="coarse_label", 27 | dev_split=0.1, 28 | label_list=["OTHER", "OFFENSE"], 29 | metric=["f1_macro"] 30 | ) 31 | dicts = processor.file_to_dicts(file=Path("samples/doc_class/train-sample.tsv")) 32 | data, tensor_names, _ = processor.dataset_from_dicts(dicts) 33 | 34 | save_dir = Path("testsave/processor") 35 | processor.save(save_dir) 36 | 37 | processor = processor.load_from_dir(save_dir) 38 | dicts = processor.file_to_dicts(file=Path("samples/doc_class/train-sample.tsv")) 39 | data_loaded, tensor_names_loaded, _ = processor.dataset_from_dicts(dicts) 40 | 41 | assert tensor_names == tensor_names_loaded 42 | for i in range(len(data.tensors)): 43 | assert torch.all(torch.eq(data.tensors[i], data_loaded.tensors[i])) 44 | 45 | if __name__ == "__main__": 46 | test_processor_saving_loading(None) 47 | -------------------------------------------------------------------------------- /test/test_s3e_pooling.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pickle 3 | from pathlib import Path 4 | 5 | from farm.data_handler.processor import 
InferenceProcessor 6 | from farm.infer import Inferencer 7 | from farm.modeling.adaptive_model import AdaptiveModel 8 | from farm.modeling.language_model import LanguageModel 9 | from farm.modeling.tokenization import Tokenizer 10 | from farm.utils import set_all_seeds, initialize_device_settings 11 | from farm.modeling.wordembedding_utils import fit_s3e_on_corpus 12 | 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def test_s3e_fit(): 18 | # small test data 19 | language_model = Path("samples/s3e/tiny_fasttext_model") 20 | corpus_path = Path("samples/s3e/tiny_corpus.txt") 21 | save_dir = Path("testsave/fitted_s3e/") 22 | do_lower_case = False 23 | batch_size = 2 24 | use_gpu = False 25 | 26 | # Fit S3E on a corpus 27 | set_all_seeds(seed=42) 28 | device, n_gpu = initialize_device_settings(use_cuda=use_gpu, use_amp=False) 29 | 30 | # Create a InferenceProcessor 31 | tokenizer = Tokenizer.load(pretrained_model_name_or_path=language_model, do_lower_case=do_lower_case) 32 | processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128) 33 | 34 | # Create an AdaptiveModel 35 | language_model = LanguageModel.load(language_model) 36 | 37 | model = AdaptiveModel( 38 | language_model=language_model, 39 | prediction_heads=[], 40 | embeds_dropout_prob=0.1, 41 | lm_output_types=[], 42 | device=device) 43 | 44 | model, processor, s3e_stats = fit_s3e_on_corpus(processor=processor, 45 | model=model, 46 | corpus=corpus_path, 47 | n_clusters=3, 48 | pca_n_components=30, 49 | svd_postprocessing=True, 50 | min_token_occurrences=1) 51 | 52 | # save everything to allow inference without fitting everything again 53 | model.save(save_dir) 54 | processor.save(save_dir) 55 | with open(save_dir / "s3e_stats.pkl", "wb") as f: 56 | pickle.dump(s3e_stats, f) 57 | 58 | # Load model, tokenizer and processor directly into Inferencer 59 | inferencer = Inferencer(model=model, processor=processor, task_type="embeddings", gpu=use_gpu, 60 | batch_size=batch_size, extraction_strategy="s3e", extraction_layer=-1, 61 | s3e_stats=s3e_stats, num_processes=0) 62 | 63 | # Input 64 | basic_texts = [ 65 | {"text": "a man is walking on the street."}, 66 | {"text": "a woman is walking on the street."}, 67 | ] 68 | 69 | # Get embeddings for input text (you can vary the strategy and layer) 70 | result = inferencer.inference_from_dicts(dicts=basic_texts) 71 | assert result[0]["context"] == basic_texts[0]["text"] 72 | assert result[0]["vec"][0] - 0.00527727306941057 < 1e-6 73 | assert result[0]["vec"][-2] - 0.06285100416478565 < 1e-6 74 | 75 | 76 | def test_load_extract_s3e_embeddings(): 77 | load_dir = Path("samples/s3e/fitted_s3e") 78 | use_gpu = False 79 | batch_size = 2 80 | 81 | with open(load_dir / "s3e_stats.pkl", "rb") as f: 82 | s3e_stats = pickle.load(f) 83 | 84 | # Init inferencer 85 | inferencer = Inferencer.load(model_name_or_path=load_dir, task_type="embeddings", gpu=use_gpu, 86 | batch_size=batch_size, extraction_strategy="s3e", extraction_layer=-1, 87 | s3e_stats=s3e_stats, num_processes=0) 88 | 89 | # Input 90 | basic_texts = [ 91 | {"text": "a man is walking on the street."}, 92 | {"text": "a woman is walking on the street."}, 93 | ] 94 | 95 | # Get embeddings for input text 96 | result = inferencer.inference_from_dicts(dicts=basic_texts) 97 | assert result[0]["context"] == basic_texts[0]["text"] 98 | assert result[0]["vec"][0] - 0.00527727306941057 < 1e-6 99 | assert result[0]["vec"][-2] + 0.06285100416478565 < 1e-6 100 | 101 | if __name__ == "__main__": 102 | test_s3e_fit() 103 | 
test_load_extract_s3e_embeddings() -------------------------------------------------------------------------------- /tutorials/sagemaker/source/doc_classification.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | from farm.data_handler.data_silo import DataSilo 6 | from farm.data_handler.processor import TextClassificationProcessor 7 | from farm.modeling.adaptive_model import AdaptiveModel 8 | from farm.modeling.language_model import LanguageModel 9 | from farm.modeling.optimization import initialize_optimizer 10 | from farm.modeling.prediction_head import TextClassificationHead 11 | from farm.modeling.tokenization import Tokenizer 12 | from farm.train import Trainer 13 | from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings 14 | 15 | 16 | def doc_classification(args): 17 | logging.basicConfig( 18 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO 19 | ) 20 | 21 | ml_logger = MLFlowLogger(tracking_uri="") 22 | ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification") 23 | 24 | set_all_seeds(seed=42) 25 | save_dir = Path("/opt/ml/model") 26 | use_amp = None 27 | 28 | device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp) 29 | 30 | # 1.Create a tokenizer 31 | tokenizer = Tokenizer.load(pretrained_model_name_or_path=args.base_lm_model, do_lower_case=False) 32 | 33 | # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset 34 | # Here we load GermEval 2018 Data. 35 | label_list = ["OTHER", "OFFENSE"] 36 | metric = "f1_macro" 37 | 38 | processor = TextClassificationProcessor( 39 | tokenizer=tokenizer, 40 | max_seq_len=args.max_seq_len, 41 | data_dir=Path("../data/germeval18"), 42 | label_list=label_list, 43 | metric=metric, 44 | label_column_name="coarse_label", 45 | ) 46 | 47 | # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a 48 | # few descriptive statistics of our datasets 49 | data_silo = DataSilo(processor=processor, batch_size=args.batch_size) 50 | 51 | # 4. Create an AdaptiveModel 52 | # a) which consists of a pretrained language model as a basis 53 | language_model = LanguageModel.load(args.base_lm_model) 54 | # b) and a prediction head on top that is suited for our task => Text classification 55 | prediction_head = TextClassificationHead( 56 | class_weights=data_silo.calculate_class_weights(task_name="text_classification"), num_labels=len(label_list) 57 | ) 58 | 59 | model = AdaptiveModel( 60 | language_model=language_model, 61 | prediction_heads=[prediction_head], 62 | embeds_dropout_prob=0.1, 63 | lm_output_types=["per_sequence"], 64 | device=device, 65 | ) 66 | 67 | # 5. Create an optimizer 68 | model, optimizer, lr_schedule = initialize_optimizer( 69 | model=model, 70 | learning_rate=3e-5, 71 | device=device, 72 | n_batches=len(data_silo.loaders["train"]), 73 | n_epochs=args.n_epochs, 74 | use_amp=use_amp, 75 | ) 76 | 77 | # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time 78 | trainer = Trainer( 79 | model=model, 80 | optimizer=optimizer, 81 | data_silo=data_silo, 82 | epochs=args.n_epochs, 83 | n_gpu=n_gpu, 84 | lr_schedule=lr_schedule, 85 | evaluate_every=args.evaluate_every, 86 | device=device, 87 | ) 88 | 89 | # 7. 
Let it grow 90 | trainer.train() 91 | 92 | # 8. Hooray! You have a model. Store it: 93 | model.save(save_dir) 94 | processor.save(save_dir) 95 | 96 | 97 | if __name__ == "__main__": 98 | parser = argparse.ArgumentParser() 99 | 100 | parser.add_argument("--n_epochs", type=int, default=2, help="number of epochs (default: 2)") 101 | parser.add_argument("--batch_size", type=int, default=4, help="batch size (default: 4)") 102 | parser.add_argument("--max_seq_len", type=int, default=64, help="maximum sequence length (default: 64)") 103 | parser.add_argument( 104 | "--base_lm_model", 105 | type=str, 106 | default="bert-base-uncased", 107 | help="base language model to use (default: bert-base-uncased)", 108 | ) 109 | parser.add_argument( 110 | "--evaluate_every", type=int, default=100, help="perform evaluation every n steps (default: 100)" 111 | ) 112 | doc_classification(parser.parse_args()) 113 | -------------------------------------------------------------------------------- /tutorials/sagemaker/source/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/deepset-ai/farm.git@c2e86cdd52242d27702f5f383883b8e3421489ee#egg=farm --------------------------------------------------------------------------------