├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── README_transformers.md ├── deploy_multi_version_doc.sh ├── docker ├── transformers-cpu │ └── Dockerfile ├── transformers-gpu │ └── Dockerfile ├── transformers-pytorch-cpu │ └── Dockerfile ├── transformers-pytorch-gpu │ └── Dockerfile ├── transformers-tensorflow-cpu │ └── Dockerfile └── transformers-tensorflow-gpu │ └── Dockerfile ├── docs ├── Makefile ├── README.md └── source │ ├── _static │ ├── css │ │ ├── Calibre-Light.ttf │ │ ├── Calibre-Medium.otf │ │ ├── Calibre-Regular.otf │ │ ├── Calibre-Thin.otf │ │ ├── code-snippets.css │ │ └── huggingface.css │ └── js │ │ ├── custom.js │ │ └── huggingface_logo.svg │ ├── benchmarks.md │ ├── bertology.rst │ ├── conf.py │ ├── converting_tensorflow_models.rst │ ├── examples.md │ ├── favicon.ico │ ├── glossary.rst │ ├── imgs │ ├── transformers_logo_name.png │ ├── warmup_constant_schedule.png │ ├── warmup_cosine_hard_restarts_schedule.png │ ├── warmup_cosine_schedule.png │ ├── warmup_cosine_warm_restarts_schedule.png │ └── warmup_linear_schedule.png │ ├── index.rst │ ├── installation.md │ ├── main_classes │ ├── configuration.rst │ ├── model.rst │ ├── optimizer_schedules.rst │ ├── pipelines.rst │ ├── processors.rst │ └── tokenizer.rst │ ├── migration.md │ ├── model_doc │ ├── albert.rst │ ├── auto.rst │ ├── bart.rst │ ├── bert.rst │ ├── camembert.rst │ ├── ctrl.rst │ ├── distilbert.rst │ ├── flaubert.rst │ ├── gpt.rst │ ├── gpt2.rst │ ├── roberta.rst │ ├── transformerxl.rst │ ├── xlm.rst │ ├── xlmroberta.rst │ └── xlnet.rst │ ├── model_sharing.md │ ├── multilingual.rst │ ├── notebooks.rst │ ├── pretrained_models.rst │ ├── quickstart.md │ ├── serialization.rst │ ├── torchscript.rst │ └── usage.rst ├── examples ├── README.md ├── benchmarks.py ├── bert_stable_fine_tuning │ ├── Dockerfile │ ├── README.md │ ├── adamw.py │ ├── configs │ │ ├── cola-sampled │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct_full-steps.yaml │ │ │ └── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_full-steps.yaml │ │ ├── cola │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW_bias-correct_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW_bias-correct_early-stopping_20.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct_early-stopping_20.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct_early-stopping_normal-0.02.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_bias-correct_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_bias-correct_early-stopping_20.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_early-stopping.yaml │ │ │ ├── 
pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW_bias-correct_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW_bias-correct_early-stopping_20.yaml │ │ │ └── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW_early-stopping.yaml │ │ ├── mrpc-sampled │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct.yaml │ │ │ └── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_full-steps.yaml │ │ ├── mrpc │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW_bias-correct_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW_bias-correct_early-stopping_20.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct_early-stopping_20.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct_early-stopping_no-drop.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct_early-stopping_normal-0.02.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_bias-correct_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_bias-correct_early-stopping_20.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_bias-correct_early-stopping_no-drop.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW_bias-correct_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW_bias-correct_early-stopping_20.yaml │ │ │ └── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW_early-stopping.yaml │ │ ├── rte-sampled │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW.yaml │ │ │ └── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_full-steps.yaml │ │ └── rte │ │ │ ├── pooler-albert-large-v2_bsz_16_lr_1e-05_adamW.yaml │ │ │ ├── pooler-albert-large-v2_bsz_16_lr_1e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-albert-large-v2_bsz_16_lr_2e-05_adamW.yaml │ │ │ ├── pooler-albert-large-v2_bsz_16_lr_2e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-albert-large-v2_bsz_16_lr_3e-05_adamW.yaml │ │ │ ├── pooler-albert-large-v2_bsz_16_lr_3e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW_bias-correct_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW_bias-correct_early-stopping_20.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW_early-stopping.yaml │ │ │ ├── 
pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct_early-stopping_20.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_bias-correct_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_bias-correct_early-stopping_20.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW_bias-correct_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW_bias-correct_early-stopping_20.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW_early-stopping.yaml │ │ │ ├── pooler-roberta-large_bsz_16_lr_1e-05_adamW.yaml │ │ │ ├── pooler-roberta-large_bsz_16_lr_1e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-roberta-large_bsz_16_lr_2e-05_adamW.yaml │ │ │ ├── pooler-roberta-large_bsz_16_lr_2e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-roberta-large_bsz_16_lr_3e-05_adamW.yaml │ │ │ └── pooler-roberta-large_bsz_16_lr_3e-05_adamW_bias-correct.yaml │ ├── glue.py │ ├── glue_metrics.py │ ├── pooling_albert.py │ ├── pooling_bert.py │ ├── pooling_roberta.py │ ├── run_docker.txt │ ├── run_finetuning.py │ ├── scripts │ │ ├── run_scripts.txt │ │ ├── seeds.sh │ │ └── train.sh │ └── utils.py ├── contrib │ ├── README.md │ ├── run_camembert.py │ ├── run_openai_gpt.py │ ├── run_swag.py │ └── run_transfo_xl.py ├── distillation │ ├── README.md │ ├── distiller.py │ ├── grouped_batch_sampler.py │ ├── lm_seqs_dataset.py │ ├── requirements.txt │ ├── run_squad_w_distillation.py │ ├── scripts │ │ ├── binarized_data.py │ │ ├── extract.py │ │ ├── extract_distilbert.py │ │ └── token_counts.py │ ├── train.py │ ├── training_configs │ │ ├── distilbert-base-cased.json │ │ ├── distilbert-base-multilingual-cased.json │ │ ├── distilbert-base-uncased.json │ │ ├── distilgpt2.json │ │ └── distilroberta-base.json │ └── utils.py ├── hans │ ├── hans_processors.py │ ├── test_hans.py │ └── utils_hans.py ├── mm-imdb │ ├── run_mmimdb.py │ └── utils_mmimdb.py ├── ner │ ├── README.md │ ├── run.sh │ ├── run_ner.py │ ├── run_pl.sh │ ├── run_pl_ner.py │ ├── run_tf_ner.py │ ├── transformer_base.py │ └── utils_ner.py ├── pplm │ ├── README.md │ ├── imgs │ │ ├── headfigure.png │ │ └── wooly.png │ ├── pplm_classification_head.py │ ├── run_pplm.py │ └── run_pplm_discrim_train.py ├── requirements.txt ├── run_bertology.py ├── run_generation.py ├── run_glue.py ├── run_language_modeling.py ├── run_multiple_choice.py ├── run_squad.py ├── run_tf_glue.py ├── run_xnli.py ├── squad │ ├── configs │ │ ├── albert-base-v1.yaml │ │ ├── bert-base-cased.yaml │ │ └── roberta-base.yaml │ ├── run_finetuning.py │ ├── scripts │ │ ├── iterations_info.txt │ │ ├── run_scripts.txt │ │ ├── train.sh │ │ └── train_multi_gpu.sh │ └── utils.py ├── summarization │ ├── __init__.py │ ├── bart │ │ ├── README.md │ │ ├── __init__.py │ │ ├── evaluate_cnn.py │ │ └── test_bart_examples.py │ └── bertabs │ │ ├── README.md │ │ ├── __init__.py │ │ ├── 
configuration_bertabs.py │ │ ├── convert_bertabs_original_pytorch_checkpoint.py │ │ ├── modeling_bertabs.py │ │ ├── requirements.txt │ │ ├── run_summarization.py │ │ ├── test_utils_summarization.py │ │ └── utils_summarization.py ├── test_examples.py ├── tests_samples │ ├── .gitignore │ ├── MRPC │ │ ├── dev.tsv │ │ └── train.tsv │ └── SQUAD │ │ ├── dev-v2.0.json │ │ └── train-v2.0.json └── utils_multiple_choice.py ├── hubconf.py ├── images ├── fig1.png └── tab1.png ├── model_cards ├── DeepPavlov │ ├── bert-base-bg-cs-pl-ru-cased │ │ └── README.md │ ├── bert-base-cased-conversational │ │ └── README.md │ ├── bert-base-multilingual-cased-sentence │ │ └── README.md │ ├── rubert-base-cased-conversational │ │ └── README.md │ ├── rubert-base-cased-sentence │ │ └── README.md │ └── rubert-base-cased │ │ └── README.md ├── KB │ ├── albert-base-swedish-cased-alpha │ │ └── README.md │ ├── bert-base-swedish-cased-ner │ │ └── README.md │ └── bert-base-swedish-cased │ │ └── README.md ├── Musixmatch │ ├── umberto-commoncrawl-cased-v1 │ │ └── README.md │ └── umberto-wikipedia-uncased-v1 │ │ └── README.md ├── ahotrod │ ├── albert_xxlargev1_squad2_512 │ │ └── README.md │ └── xlnet_large_squad2_512 │ │ └── README.md ├── asafaya │ └── bert-base-arabic │ │ └── README.md ├── aubmindlab │ ├── bert-base-arabert │ │ └── README.md │ └── bert-base-arabertv01 │ │ └── README.md ├── bert-base-german-cased-README.md ├── binwang │ └── xlnet-base-cased │ │ └── README.md ├── camembert-base-README.md ├── canwenxu │ └── BERT-of-Theseus-MNLI │ │ └── README.md ├── dbmdz │ ├── bert-base-german-cased │ │ └── README.md │ ├── bert-base-german-europeana-cased │ │ └── README.md │ ├── bert-base-german-europeana-uncased │ │ └── README.md │ ├── bert-base-german-uncased │ │ └── README.md │ ├── bert-base-italian-cased │ │ └── README.md │ ├── bert-base-italian-uncased │ │ └── README.md │ ├── bert-base-italian-xxl-cased │ │ └── README.md │ ├── bert-base-italian-xxl-uncased │ │ └── README.md │ ├── bert-base-turkish-cased │ │ └── README.md │ └── distilbert-base-turkish-cased │ │ └── README.md ├── deepset │ └── roberta-base-squad2 │ │ └── README.md ├── djstrong │ └── bg_cs_pl_ru_cased_L-12_H-768_A-12 │ │ └── README.md ├── dkleczek │ └── bert-base-polish-uncased-v1 │ │ └── README.md ├── emilyalsentzer │ ├── Bio_ClinicalBERT │ │ └── README.md │ └── Bio_Discharge_Summary_BERT │ │ └── README.md ├── fmikaelian │ ├── camembert-base-fquad │ │ └── README.md │ ├── camembert-base-squad │ │ └── README.md │ └── flaubert-base-uncased-squad │ │ └── README.md ├── henryk │ └── bert-base-multilingual-cased-finetuned-dutch-squad2 │ │ └── README.md ├── huggingface │ ├── CodeBERTa-language-id │ │ └── README.md │ └── CodeBERTa-small-v1 │ │ └── README.md ├── jplu │ ├── tf-camembert-base │ │ └── README.md │ ├── tf-xlm-roberta-base │ │ └── README.md │ └── tf-xlm-roberta-large │ │ └── README.md ├── julien-c │ ├── EsperBERTo-small-pos │ │ └── README.md │ ├── EsperBERTo-small │ │ └── README.md │ ├── bert-xsmall-dummy │ │ └── README.md │ └── dummy-unknown │ │ └── README.md ├── lvwerra │ └── gpt2-medium-taboo │ │ └── README.md ├── lysandre │ ├── arxiv-nlp │ │ └── README.md │ └── arxiv │ │ └── README.md ├── microsoft │ ├── DialoGPT-large │ │ └── README.md │ ├── DialoGPT-medium │ │ └── README.md │ └── DialoGPT-small │ │ └── README.md ├── mrm8488 │ ├── bert-base-spanish-wwm-cased-finetuned-spa-squad2-es │ │ └── README.md │ ├── bert-multi-cased-finedtuned-xquad-tydiqa-goldp │ │ └── README.md │ ├── bert-multi-cased-finetuned-xquadv1 │ │ └── README.md │ ├── 
bert-multi-uncased-finetuned-xquadv1 │ │ └── README.md │ ├── bert-spanish-cased-finetuned-ner │ │ └── README.md │ ├── bert-spanish-cased-finetuned-pos │ │ └── README.md │ ├── bert-uncased-finetuned-qnli │ │ └── README.md │ ├── distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es │ │ └── README.md │ └── xlm-multi-finetuned-xquadv1 │ │ └── README.md ├── nlpaueb │ └── bert-base-greek-uncased-v1 │ │ └── README.md ├── nlptown │ └── bert-base-multilingual-uncased-sentiment │ │ └── README.md ├── severinsimmler │ └── literary-german-bert │ │ ├── README.md │ │ ├── kfold.png │ │ └── prosa-jahre.png ├── twmkn9 │ └── albert-base-v2-squad2 │ │ └── README.md └── voidful │ ├── albert_chinese_base │ └── README.md │ ├── albert_chinese_large │ └── README.md │ ├── albert_chinese_small │ └── README.md │ ├── albert_chinese_tiny │ └── README.md │ ├── albert_chinese_xlarge │ └── README.md │ └── albert_chinese_xxlarge │ └── README.md ├── notebooks ├── 01-training-tokenizers.ipynb ├── 02-transformers.ipynb ├── 03-pipelines.ipynb └── README.md ├── setup.cfg ├── setup.py ├── src └── transformers │ ├── __init__.py │ ├── activations.py │ ├── commands │ ├── __init__.py │ ├── convert.py │ ├── download.py │ ├── env.py │ ├── run.py │ ├── serving.py │ ├── train.py │ └── user.py │ ├── configuration_albert.py │ ├── configuration_auto.py │ ├── configuration_bart.py │ ├── configuration_bert.py │ ├── configuration_camembert.py │ ├── configuration_ctrl.py │ ├── configuration_distilbert.py │ ├── configuration_flaubert.py │ ├── configuration_gpt2.py │ ├── configuration_mmbt.py │ ├── configuration_openai.py │ ├── configuration_roberta.py │ ├── configuration_t5.py │ ├── configuration_transfo_xl.py │ ├── configuration_utils.py │ ├── configuration_xlm.py │ ├── configuration_xlm_roberta.py │ ├── configuration_xlnet.py │ ├── convert_albert_original_tf_checkpoint_to_pytorch.py │ ├── convert_bart_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_bert_original_tf_checkpoint_to_pytorch.py │ ├── convert_bert_pytorch_checkpoint_to_original_tf.py │ ├── convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py │ ├── convert_openai_original_tf_checkpoint_to_pytorch.py │ ├── convert_pytorch_checkpoint_to_tf2.py │ ├── convert_roberta_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_t5_original_tf_checkpoint_to_pytorch.py │ ├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py │ ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py │ ├── data │ ├── __init__.py │ ├── metrics │ │ ├── __init__.py │ │ └── squad_metrics.py │ └── processors │ │ ├── __init__.py │ │ ├── glue.py │ │ ├── squad.py │ │ ├── utils.py │ │ └── xnli.py │ ├── file_utils.py │ ├── hf_api.py │ ├── modelcard.py │ ├── modeling_albert.py │ ├── modeling_auto.py │ ├── modeling_bart.py │ ├── modeling_bert.py │ ├── modeling_camembert.py │ ├── modeling_ctrl.py │ ├── modeling_distilbert.py │ ├── modeling_encoder_decoder.py │ ├── modeling_flaubert.py │ ├── modeling_gpt2.py │ ├── modeling_mmbt.py │ ├── modeling_openai.py │ ├── modeling_roberta.py │ ├── modeling_t5.py │ ├── modeling_tf_albert.py │ ├── modeling_tf_auto.py │ ├── modeling_tf_bert.py │ ├── modeling_tf_camembert.py │ ├── modeling_tf_ctrl.py │ ├── modeling_tf_distilbert.py │ ├── modeling_tf_gpt2.py │ ├── modeling_tf_openai.py │ ├── modeling_tf_pytorch_utils.py │ ├── modeling_tf_roberta.py │ ├── modeling_tf_t5.py │ ├── modeling_tf_transfo_xl.py │ ├── modeling_tf_transfo_xl_utilities.py │ ├── 
modeling_tf_utils.py │ ├── modeling_tf_xlm.py │ ├── modeling_tf_xlm_roberta.py │ ├── modeling_tf_xlnet.py │ ├── modeling_transfo_xl.py │ ├── modeling_transfo_xl_utilities.py │ ├── modeling_utils.py │ ├── modeling_xlm.py │ ├── modeling_xlm_roberta.py │ ├── modeling_xlnet.py │ ├── optimization.py │ ├── optimization_tf.py │ ├── pipelines.py │ ├── tokenization_albert.py │ ├── tokenization_auto.py │ ├── tokenization_bart.py │ ├── tokenization_bert.py │ ├── tokenization_bert_japanese.py │ ├── tokenization_camembert.py │ ├── tokenization_ctrl.py │ ├── tokenization_distilbert.py │ ├── tokenization_flaubert.py │ ├── tokenization_gpt2.py │ ├── tokenization_openai.py │ ├── tokenization_roberta.py │ ├── tokenization_t5.py │ ├── tokenization_transfo_xl.py │ ├── tokenization_utils.py │ ├── tokenization_xlm.py │ ├── tokenization_xlm_roberta.py │ ├── tokenization_xlnet.py │ └── utils_encoder_decoder.py ├── templates ├── adding_a_new_example_script │ ├── README.md │ ├── run_xxx.py │ └── utils_xxx.py └── adding_a_new_model │ ├── README.md │ ├── configuration_xxx.py │ ├── convert_xxx_original_tf_checkpoint_to_pytorch.py │ ├── modeling_tf_xxx.py │ ├── modeling_xxx.py │ ├── tests │ ├── test_modeling_tf_xxx.py │ ├── test_modeling_xxx.py │ └── test_tokenization_xxx.py │ └── tokenization_xxx.py ├── tests ├── __init__.py ├── fixtures │ ├── dummy-config.json │ ├── empty.txt │ ├── input.txt │ ├── sample_text.txt │ ├── spiece.model │ └── test_sentencepiece.model ├── test_activations.py ├── test_configuration_auto.py ├── test_configuration_common.py ├── test_doc_samples.py ├── test_hf_api.py ├── test_model_card.py ├── test_modeling_albert.py ├── test_modeling_auto.py ├── test_modeling_bart.py ├── test_modeling_bert.py ├── test_modeling_common.py ├── test_modeling_ctrl.py ├── test_modeling_distilbert.py ├── test_modeling_flaubert.py ├── test_modeling_gpt2.py ├── test_modeling_openai.py ├── test_modeling_roberta.py ├── test_modeling_t5.py ├── test_modeling_tf_albert.py ├── test_modeling_tf_auto.py ├── test_modeling_tf_bert.py ├── test_modeling_tf_common.py ├── test_modeling_tf_ctrl.py ├── test_modeling_tf_distilbert.py ├── test_modeling_tf_gpt2.py ├── test_modeling_tf_openai_gpt.py ├── test_modeling_tf_roberta.py ├── test_modeling_tf_t5.py ├── test_modeling_tf_transfo_xl.py ├── test_modeling_tf_xlm.py ├── test_modeling_tf_xlnet.py ├── test_modeling_transfo_xl.py ├── test_modeling_xlm.py ├── test_modeling_xlm_roberta.py ├── test_modeling_xlnet.py ├── test_optimization.py ├── test_optimization_tf.py ├── test_pipelines.py ├── test_tokenization_albert.py ├── test_tokenization_auto.py ├── test_tokenization_bert.py ├── test_tokenization_bert_japanese.py ├── test_tokenization_common.py ├── test_tokenization_ctrl.py ├── test_tokenization_distilbert.py ├── test_tokenization_fast.py ├── test_tokenization_gpt2.py ├── test_tokenization_openai.py ├── test_tokenization_roberta.py ├── test_tokenization_t5.py ├── test_tokenization_transfo_xl.py ├── test_tokenization_utils.py ├── test_tokenization_xlm.py ├── test_tokenization_xlm_roberta.py ├── test_tokenization_xlnet.py └── utils.py ├── transformers-cli ├── utils ├── download_glue_data.py └── link_tester.py └── valohai.yaml /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: quality style test test-examples 2 | 3 | # Check that source 
code meets quality standards 4 | 5 | quality: 6 | black --check --line-length 119 --target-version py35 examples templates tests src utils 7 | isort --check-only --recursive examples templates tests src utils 8 | flake8 examples templates tests src utils 9 | 10 | # Format source code automatically 11 | 12 | style: 13 | black --line-length 119 --target-version py35 examples templates tests src utils 14 | isort --recursive examples templates tests src utils 15 | 16 | # Run tests for the library 17 | 18 | test: 19 | python -m pytest -n auto --dist=loadfile -s -v ./tests/ 20 | 21 | # Run tests for examples 22 | 23 | test-examples: 24 | python -m pytest -n auto --dist=loadfile -s -v ./examples/ 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # On the Stability of Fine-tuning BERT: Misconceptions, Explanations, and Strong Baselines 2 | 3 | ### Marius Mosbach, Maksym Andriushchenko, Dietrich Klakow 4 | ##### Saarland University and EPFL 5 | 6 | This repository contains code for the paper [On the Stability of Fine-tuning BERT: Misconceptions, Explanations, and Strong Baselines](https://arxiv.org/abs/2006.04884). It is a fork of the [Huggingface Transformers repository](https://github.com/huggingface/transformers) (v2.5.1). 7 | 8 | ## Abstract 9 | 10 | Fine-tuning pre-trained transformer-based language models such as BERT has become a common practice dominating leaderboards across various NLP benchmarks. Despite the strong empirical performance of fine-tuned models, fine-tuning is an unstable process: training the same model with multiple random seeds can result in a large variance of the task performance. Previous literature (Devlin et al., 2019; Lee et al., 2020; Dodge et al., 2020) identified two potential reasons for the observed instability: catastrophic forgetting and a small size of the fine-tuning datasets. In this paper, we show that both hypotheses fail to explain 11 | the fine-tuning instability. We analyze BERT, RoBERTa, and ALBERT, fine-tuned on three commonly used datasets from the GLUE benchmark and show that the observed instability is caused by optimization difficulties that lead to vanishing gradients. Additionally, we show that the remaining variance of the downstream task performance can be attributed to differences in generalization where fine-tuned models with the same training loss exhibit noticeably different test performance. Based on our analysis, we present a simple but strong baseline that makes fine-tuning BERT-based models significantly more stable than previously proposed approaches. 12 | 13 | ![](images/fig1.png) 14 | 15 | ![](images/tab1.png) 16 | 17 | ## Reproducing our results 18 | 19 | See `/examples/bert_stable_fine_tuning/README.md` for how to set up Docker and run our models. 20 | -------------------------------------------------------------------------------- /deploy_multi_version_doc.sh: -------------------------------------------------------------------------------- 1 | cd docs 2 | 3 | function deploy_doc(){ 4 | echo "Creating doc at commit $1 and pushing to folder $2" 5 | git checkout $1 6 | if [ ! 
-z "$2" ] 7 | then 8 | echo "Pushing version" $2 9 | make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2 10 | else 11 | echo "Pushing master" 12 | make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir 13 | fi 14 | } 15 | 16 | deploy_doc "master" 17 | deploy_doc "b33a385" v1.0.0 18 | deploy_doc "fe02e45" v1.1.0 19 | deploy_doc "89fd345" v1.2.0 20 | deploy_doc "fc9faa8" v2.0.0 21 | deploy_doc "3ddce1d" v2.1.1 22 | deploy_doc "f2f3294" v2.2.0 23 | deploy_doc "d0f8b9a" v2.3.0 24 | -------------------------------------------------------------------------------- /docker/transformers-cpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | jupyter \ 18 | tensorflow-cpu \ 19 | torch 20 | 21 | WORKDIR /workspace 22 | COPY . transformers/ 23 | RUN cd transformers/ && \ 24 | python3 -m pip install --no-cache-dir . 25 | 26 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /docker/transformers-gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | jupyter \ 18 | tensorflow \ 19 | torch 20 | 21 | WORKDIR /workspace 22 | COPY . transformers/ 23 | RUN cd transformers/ && \ 24 | python3 -m pip install --no-cache-dir . 25 | 26 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /docker/transformers-pytorch-cpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | jupyter \ 18 | torch 19 | 20 | WORKDIR /workspace 21 | COPY . transformers/ 22 | RUN cd transformers/ && \ 23 | python3 -m pip install --no-cache-dir . 
24 | 25 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /docker/transformers-pytorch-gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | mkl \ 18 | torch 19 | 20 | WORKDIR /workspace 21 | COPY . transformers/ 22 | RUN cd transformers/ && \ 23 | python3 -m pip install --no-cache-dir . 24 | 25 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /docker/transformers-tensorflow-cpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | mkl \ 18 | tensorflow-cpu 19 | 20 | WORKDIR /workspace 21 | COPY . transformers/ 22 | RUN cd transformers/ && \ 23 | python3 -m pip install --no-cache-dir . 24 | 25 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /docker/transformers-tensorflow-gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | mkl \ 18 | tensorflow 19 | 20 | WORKDIR /workspace 21 | COPY . transformers/ 22 | RUN cd transformers/ && \ 23 | python3 -m pip install --no-cache-dir . 24 | 25 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Generating the documentation 2 | 3 | To generate the documentation, you first have to build it. Several packages are necessary to build the doc, 4 | you can install them with the following command, at the root of the code repository: 5 | 6 | ```bash 7 | pip install -e ".[docs]" 8 | ``` 9 | 10 | ## Packages installed 11 | 12 | Here's an overview of all the packages installed. If you ran the previous command installing all packages from 13 | `requirements.txt`, you do not need to run the following commands. 14 | 15 | Building it requires the package `sphinx` that you can 16 | install using: 17 | 18 | ```bash 19 | pip install -U sphinx 20 | ``` 21 | 22 | You would also need the custom installed [theme](https://github.com/readthedocs/sphinx_rtd_theme) by 23 | [Read The Docs](https://readthedocs.org/). You can install it using the following command: 24 | 25 | ```bash 26 | pip install sphinx_rtd_theme 27 | ``` 28 | 29 | The third necessary package is the `recommonmark` package to accept Markdown as well as Restructured text: 30 | 31 | ```bash 32 | pip install recommonmark 33 | ``` 34 | 35 | ## Building the documentation 36 | 37 | Make sure that there is a symlink from the `example` file (in /examples) inside the source folder. Run the following 38 | command to generate it: 39 | 40 | ```bash 41 | ln -s ../../examples/README.md examples.md 42 | ``` 43 | 44 | Once you have setup `sphinx`, you can build the documentation by running the following command in the `/docs` folder: 45 | 46 | ```bash 47 | make html 48 | ``` 49 | 50 | --- 51 | **NOTE** 52 | 53 | If you are adding/removing elements from the toc-tree or from any structural item, it is recommended to clean the build 54 | directory before rebuilding. Run the following command to clean and build: 55 | 56 | ```bash 57 | make clean && make html 58 | ``` 59 | 60 | --- 61 | 62 | It should build the static app that will be available under `/docs/_build/html` 63 | 64 | ## Adding a new element to the tree (toc-tree) 65 | 66 | Accepted files are reStructuredText (.rst) and Markdown (.md). Create a file with its extension and put it 67 | in the source directory. You can then link it to the toc-tree by putting the filename without the extension. 
68 | -------------------------------------------------------------------------------- /docs/source/_static/css/Calibre-Light.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/docs/source/_static/css/Calibre-Light.ttf -------------------------------------------------------------------------------- /docs/source/_static/css/Calibre-Medium.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/docs/source/_static/css/Calibre-Medium.otf -------------------------------------------------------------------------------- /docs/source/_static/css/Calibre-Regular.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/docs/source/_static/css/Calibre-Regular.otf -------------------------------------------------------------------------------- /docs/source/_static/css/Calibre-Thin.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/docs/source/_static/css/Calibre-Thin.otf -------------------------------------------------------------------------------- /docs/source/_static/css/code-snippets.css: -------------------------------------------------------------------------------- 1 | 2 | .highlight .c1, .highlight .sd{ 3 | color: #999 4 | } 5 | 6 | .highlight .nn, .highlight .k, .highlight .s1, .highlight .nb, .highlight .bp, .highlight .kc { 7 | color: #FB8D68; 8 | } 9 | 10 | .highlight .kn, .highlight .nv, .highlight .s2, .highlight .ow { 11 | color: #6670FF; 12 | } -------------------------------------------------------------------------------- /docs/source/benchmarks.md: -------------------------------------------------------------------------------- 1 | # Benchmarks 2 | 3 | This section is dedicated to the Benchmarks done by the library, both by maintainers, contributors and users. These 4 | benchmark will help keep track of the preformance improvements that are brought to our models across versions. 5 | 6 | ## Benchmarking all models for inference 7 | 8 | As of version 2.1 we have benchmarked all models for inference, across many different settings: using PyTorch, with 9 | and without TorchScript, using TensorFlow, with and without XLA. All of those tests were done across CPUs (except for 10 | TensorFlow XLA) and GPUs. 11 | 12 | The approach is detailed in the [following blogpost](https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2) 13 | 14 | The results are available [here](https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing). 15 | 16 | ## TF2 with mixed precision, XLA, Distribution (@tlkh) 17 | 18 | This work was done by [Timothy Liu](https://github.com/tlkh). 
19 | 20 | There are very positive results to be gained from the various TensorFlow 2.0 features: 21 | 22 | - Automatic Mixed Precision (AMP) 23 | - XLA compiler 24 | - Distribution strategies (multi-GPU) 25 | 26 | The benefits are listed here (tested on CoLA, MRPC, SST-2): 27 | 28 | - AMP: Between 1.4x to 1.6x decrease in overall time without change in batch size 29 | - AMP+XLA: Up to 2.5x decrease in overall time on SST-2 (larger dataset) 30 | - Distribution: Between 1.4x to 3.4x decrease in overall time on 4xV100 31 | - Combined: Up to 5.7x decrease in overall training time, or 9.1x training throughput 32 | 33 | The model quality (measured by the validation accuracy) fluctuates slightly. Taking an average of 4 training runs 34 | on a single GPU gives the following results: 35 | 36 | - CoLA: AMP results in slightly lower acc (0.820 vs 0.824) 37 | - MRPC: AMP results in lower acc (0.823 vs 0.835) 38 | - SST-2: AMP results in slightly lower acc (0.918 vs 0.922) 39 | 40 | However, in a distributed setting with 4xV100 (4x batch size), AMP can yield better results: 41 | 42 | - CoLA: AMP results in higher acc (0.828 vs 0.812) 43 | - MRPC: AMP results in lower acc (0.817 vs 0.827) 44 | - SST-2: AMP results in slightly lower acc (0.926 vs 0.929) 45 | 46 | The benchmark script is available [here](https://github.com/NVAITC/benchmarking/blob/master/tf2/bert_dist.py). 47 | 48 | Note: on some tasks (e.g. MRPC), the dataset is too small. The overhead due to the model compilation with XLA as well 49 | as the distribution strategy setup does not speed things up. The XLA compile time is also the reason why, although throughput 50 | can increase a lot (e.g. 2.7x for a single GPU), the overall (end-to-end) training speed-up is not as large (as low as 1.4x). 51 | 52 | The benefits seen on SST-2 (the larger dataset) are much clearer. 53 | 54 | All results can be seen on this [Google Sheet](https://docs.google.com/spreadsheets/d/1538MN224EzjbRL239sqSiUy6YY-rAjHyXhTzz_Zptls/edit#gid=960868445). 55 | -------------------------------------------------------------------------------- /docs/source/bertology.rst: -------------------------------------------------------------------------------- 1 | BERTology 2 | --------- 3 | 4 | There is a growing field of study concerned with investigating the inner workings of large-scale transformers like BERT (that some call "BERTology"). Some good examples of this field are: 5 | 6 | 7 | * BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: https://arxiv.org/abs/1905.05950 8 | * Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650 9 | * What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning: https://arxiv.org/abs/1906.04341 10 | 11 | In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted from the great work of Paul Michel (https://arxiv.org/abs/1905.10650): 12 | 13 | 14 | * accessing all the hidden-states of BERT/GPT/GPT-2, 15 | * accessing all the attention weights for each head of BERT/GPT/GPT-2, 16 | * retrieving heads output values and gradients to be able to compute head importance scores and prune heads as explained in https://arxiv.org/abs/1905.10650. 
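For illustration, the snippet below is a minimal sketch (not the bundled bertology script, and not taken from this repository) of how these outputs can be accessed in this library version by enabling the corresponding config flags; the checkpoint name and input sentence are arbitrary examples.

```python
# Hedged sketch: expose hidden states and attention weights via config flags
# (transformers v2.5.x style). "bert-base-uncased" and the text are arbitrary choices.
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained(
    "bert-base-uncased",
    output_hidden_states=True,  # return the embedding output plus every layer's hidden states
    output_attentions=True,     # return per-layer, per-head attention weights
)
model.eval()

input_ids = torch.tensor([tokenizer.encode("BERTology is fun.", add_special_tokens=True)])
with torch.no_grad():
    last_hidden, pooled, hidden_states, attentions = model(input_ids)

print(len(hidden_states))   # 13 tensors for bert-base: embeddings + 12 layers
print(attentions[0].shape)  # (batch_size, num_heads, seq_len, seq_len)
```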
17 | 18 | To help you understand and use these features, we have added a specific example script: `bertology.py `_ while extract information and prune a model pre-trained on GLUE. 19 | -------------------------------------------------------------------------------- /docs/source/examples.md: -------------------------------------------------------------------------------- 1 | ../../examples/README.md -------------------------------------------------------------------------------- /docs/source/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/docs/source/favicon.ico -------------------------------------------------------------------------------- /docs/source/imgs/transformers_logo_name.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/docs/source/imgs/transformers_logo_name.png -------------------------------------------------------------------------------- /docs/source/imgs/warmup_constant_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/docs/source/imgs/warmup_constant_schedule.png -------------------------------------------------------------------------------- /docs/source/imgs/warmup_cosine_hard_restarts_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png -------------------------------------------------------------------------------- /docs/source/imgs/warmup_cosine_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/docs/source/imgs/warmup_cosine_schedule.png -------------------------------------------------------------------------------- /docs/source/imgs/warmup_cosine_warm_restarts_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png -------------------------------------------------------------------------------- /docs/source/imgs/warmup_linear_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/docs/source/imgs/warmup_linear_schedule.png -------------------------------------------------------------------------------- /docs/source/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | Transformers is tested on Python 3.5+ and PyTorch 1.1.0 4 | 5 | ## With pip 6 | 7 | PyTorch Transformers can be installed using pip as follows: 8 | 9 | ``` bash 10 | pip install transformers 11 | ``` 12 | 13 | ## From source 14 | 15 | To install from source, clone the repository and install with: 16 | 17 | ``` bash 18 | git clone https://github.com/huggingface/transformers.git 19 | cd transformers 
20 | pip install . 21 | ``` 22 | 23 | ## Tests 24 | 25 | An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples). 26 | 27 | Refer to the [contributing guide](https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md#tests) for details about running tests. 28 | 29 | ## OpenAI GPT original tokenization workflow 30 | 31 | If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` and `SpaCy`: 32 | 33 | ``` bash 34 | pip install spacy ftfy==4.4.3 35 | python -m spacy download en 36 | ``` 37 | 38 | If you don't install `ftfy` and `SpaCy`, the `OpenAI GPT` tokenizer will default to tokenize using BERT's `BasicTokenizer` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry). 39 | 40 | ## Note on model downloads (Continuous Integration or large-scale deployments) 41 | 42 | If you expect to be downloading large volumes of models (more than 1,000) from our hosted bucket (for instance through your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way faster, and cheaper. Feel free to contact us privately if you need any help. 43 | 44 | ## Do you want to run a Transformer model on a mobile device? 45 | 46 | You should check out our [swift-coreml-transformers](https://github.com/huggingface/swift-coreml-transformers) repo. 47 | 48 | It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`, `DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices. 49 | 50 | At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML, 51 | or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting! 52 | -------------------------------------------------------------------------------- /docs/source/main_classes/configuration.rst: -------------------------------------------------------------------------------- 1 | Configuration 2 | ---------------------------------------------------- 3 | 4 | The base class ``PretrainedConfig`` implements the common methods for loading/saving a configuration either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS S3 repository). 5 | 6 | ``PretrainedConfig`` 7 | ~~~~~~~~~~~~~~~~~~~~~ 8 | 9 | .. autoclass:: transformers.PretrainedConfig 10 | :members: 11 | -------------------------------------------------------------------------------- /docs/source/main_classes/model.rst: -------------------------------------------------------------------------------- 1 | Models 2 | ---------------------------------------------------- 3 | 4 | The base class ``PreTrainedModel`` implements the common methods for loading/saving a model either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS S3 repository). 
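As a quick, hedged illustration of that load/save round trip (the checkpoint name and the `./my-bert` directory below are placeholders, not paths defined by the library):

```python
# Minimal sketch: load from the hosted weights, save locally, reload from disk.
from transformers import BertConfig, BertModel, BertTokenizer

model = BertModel.from_pretrained("bert-base-uncased")        # downloads config + weights
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

model.save_pretrained("./my-bert")      # writes pytorch_model.bin + config.json
tokenizer.save_pretrained("./my-bert")  # writes vocab + tokenizer config

# Later, reload everything from the local directory instead of the remote bucket.
config = BertConfig.from_pretrained("./my-bert")
model = BertModel.from_pretrained("./my-bert")
```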
5 | 6 | ``PreTrainedModel`` also implements a few methods which are common among all the models to: 7 | 8 | - resize the input token embeddings when new tokens are added to the vocabulary 9 | - prune the attention heads of the model. 10 | 11 | ``PreTrainedModel`` 12 | ~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | .. autoclass:: transformers.PreTrainedModel 15 | :members: 16 | 17 | ``TFPreTrainedModel`` 18 | ~~~~~~~~~~~~~~~~~~~~~ 19 | 20 | .. autoclass:: transformers.TFPreTrainedModel 21 | :members: 22 | -------------------------------------------------------------------------------- /docs/source/main_classes/optimizer_schedules.rst: -------------------------------------------------------------------------------- 1 | Optimizer 2 | ---------------------------------------------------- 3 | 4 | The ``.optimization`` module provides: 5 | 6 | - an optimizer with weight decay fixed that can be used to fine-tuned models, and 7 | - several schedules in the form of schedule objects that inherit from ``_LRSchedule``: 8 | - a gradient accumulation class to accumulate the gradients of multiple batches 9 | 10 | ``AdamW`` 11 | ~~~~~~~~~~~~~~~~ 12 | 13 | .. autoclass:: transformers.AdamW 14 | :members: 15 | 16 | ``AdamWeightDecay`` 17 | ~~~~~~~~~~~~~~~~~~~ 18 | 19 | .. autoclass:: transformers.AdamWeightDecay 20 | :members: 21 | 22 | .. autofunction:: transformers.create_optimizer 23 | 24 | Schedules 25 | ---------------------------------------------------- 26 | 27 | Learning Rate Schedules 28 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 29 | .. autofunction:: transformers.get_constant_schedule 30 | 31 | 32 | .. autofunction:: transformers.get_constant_schedule_with_warmup 33 | 34 | .. image:: /imgs/warmup_constant_schedule.png 35 | :target: /imgs/warmup_constant_schedule.png 36 | :alt: 37 | 38 | 39 | .. autofunction:: transformers.get_cosine_schedule_with_warmup 40 | 41 | .. image:: /imgs/warmup_cosine_schedule.png 42 | :target: /imgs/warmup_cosine_schedule.png 43 | :alt: 44 | 45 | 46 | .. autofunction:: transformers.get_cosine_with_hard_restarts_schedule_with_warmup 47 | 48 | .. image:: /imgs/warmup_cosine_hard_restarts_schedule.png 49 | :target: /imgs/warmup_cosine_hard_restarts_schedule.png 50 | :alt: 51 | 52 | 53 | 54 | .. autofunction:: transformers.get_linear_schedule_with_warmup 55 | 56 | .. image:: /imgs/warmup_linear_schedule.png 57 | :target: /imgs/warmup_linear_schedule.png 58 | :alt: 59 | 60 | ``Warmup`` 61 | ~~~~~~~~~~~~~~~~ 62 | 63 | .. autoclass:: transformers.WarmUp 64 | :members: 65 | 66 | Gradient Strategies 67 | ---------------------------------------------------- 68 | 69 | ``GradientAccumulator`` 70 | ~~~~~~~~~~~~~~~~~~~~~~~ 71 | 72 | .. autoclass:: transformers.GradientAccumulator 73 | -------------------------------------------------------------------------------- /docs/source/main_classes/pipelines.rst: -------------------------------------------------------------------------------- 1 | Pipelines 2 | ---------------------------------------------------- 3 | 4 | The pipelines are a great and easy way to use models for inference. These pipelines are objects that abstract most 5 | of the complex code from the library, offering a simple API dedicated to several tasks, including Named Entity 6 | Recognition, Masked Language Modeling, Sentiment Analysis, Feature Extraction and Question Answering. 
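As a rough usage sketch (the tasks, default models and example texts below are illustrative assumptions, not part of this documentation page):

```python
# Minimal sketch of the high-level `pipeline` factory.
from transformers import pipeline

# Sentiment analysis with the task's default model and tokenizer.
classifier = pipeline("sentiment-analysis")
print(classifier("We are very happy to include pipelines in the library."))
# e.g. [{'label': 'POSITIVE', 'score': 0.99...}]

# Question answering follows the same pattern with task-specific inputs.
qa = pipeline("question-answering")
print(qa(question="What do pipelines abstract?",
         context="Pipelines abstract most of the complex code from the library."))
```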
7 | 8 | There are two categories of pipeline abstractions to be aware about: 9 | 10 | - The :class:`~transformers.pipeline` which is the most powerful object encapsulating all other pipelines 11 | - The other task-specific pipelines, such as :class:`~transformers.NerPipeline` 12 | or :class:`~transformers.QuestionAnsweringPipeline` 13 | 14 | The pipeline abstraction 15 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 16 | 17 | The `pipeline` abstraction is a wrapper around all the other available pipelines. It is instantiated as any 18 | other pipeline but requires an additional argument which is the `task`. 19 | 20 | .. autoclass:: transformers.pipeline 21 | :members: 22 | 23 | 24 | The task specific pipelines 25 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 26 | 27 | Parent class: Pipeline 28 | ========================================= 29 | 30 | .. autoclass:: transformers.Pipeline 31 | :members: predict, transform, save_pretrained 32 | 33 | NerPipeline 34 | ========================================== 35 | 36 | .. autoclass:: transformers.NerPipeline 37 | 38 | TokenClassificationPipeline 39 | ========================================== 40 | 41 | This class is an alias of the :class:`~transformers.NerPipeline` defined above. Please refer to that pipeline for 42 | documentation and usage examples. 43 | 44 | FillMaskPipeline 45 | ========================================== 46 | 47 | .. autoclass:: transformers.FillMaskPipeline 48 | 49 | FeatureExtractionPipeline 50 | ========================================== 51 | 52 | .. autoclass:: transformers.FeatureExtractionPipeline 53 | 54 | TextClassificationPipeline 55 | ========================================== 56 | 57 | .. autoclass:: transformers.TextClassificationPipeline 58 | 59 | QuestionAnsweringPipeline 60 | ========================================== 61 | 62 | .. autoclass:: transformers.QuestionAnsweringPipeline 63 | 64 | -------------------------------------------------------------------------------- /docs/source/main_classes/tokenizer.rst: -------------------------------------------------------------------------------- 1 | Tokenizer 2 | ---------------------------------------------------- 3 | 4 | The base class ``PreTrainedTokenizer`` implements the common methods for loading/saving a tokenizer either from a local file or directory, or from a pretrained tokenizer provided by the library (downloaded from HuggingFace's AWS S3 repository). 5 | 6 | ``PreTrainedTokenizer`` is the main entry point into tokenizers as it also implements the main methods for using all the tokenizers: 7 | 8 | - tokenizing, converting tokens to ids and back and encoding/decoding, 9 | - adding new tokens to the vocabulary in a way that is independant of the underlying structure (BPE, SentencePiece...), 10 | - managing special tokens (adding them, assigning them to roles, making sure they are not split during tokenization) 11 | 12 | ``PreTrainedTokenizer`` 13 | ~~~~~~~~~~~~~~~~~~~~~~~~ 14 | 15 | .. autoclass:: transformers.PreTrainedTokenizer 16 | :members: 17 | -------------------------------------------------------------------------------- /docs/source/model_doc/auto.rst: -------------------------------------------------------------------------------- 1 | AutoModels 2 | ----------- 3 | 4 | In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you are supplying to the ``from_pretrained`` method. 
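A small, hedged sketch tying the tokenizer methods listed above to the Auto classes introduced here (the checkpoint name and the added tokens are arbitrary examples):

```python
# Minimal sketch: infer the architecture from the checkpoint name, then use the
# shared tokenizer API (encode/decode, adding new tokens) described above.
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModel.from_pretrained("bert-base-cased")

ids = tokenizer.encode("Hello world!", add_special_tokens=True)
print(tokenizer.convert_ids_to_tokens(ids))
print(tokenizer.decode(ids))

# Add new tokens and resize the model's input embeddings to match.
tokenizer.add_tokens(["new_tok1", "new_tok2"])
model.resize_token_embeddings(len(tokenizer))
```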
5 | 6 | AutoClasses are here to do this job for you so that you automatically retrieve the relevant model given the name/path to the pretrained weights/config/vocabulary: 7 | 8 | Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will directly create a class of the relevant architecture (ex: ``model = AutoModel.from_pretrained('bert-base-cased')`` will create a instance of ``BertModel``). 9 | 10 | 11 | ``AutoConfig`` 12 | ~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | .. autoclass:: transformers.AutoConfig 15 | :members: 16 | 17 | 18 | ``AutoTokenizer`` 19 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 20 | 21 | .. autoclass:: transformers.AutoTokenizer 22 | :members: 23 | 24 | 25 | ``AutoModel`` 26 | ~~~~~~~~~~~~~~~~~~~~~ 27 | 28 | .. autoclass:: transformers.AutoModel 29 | :members: 30 | 31 | 32 | ``AutoModelForPreTraining`` 33 | ~~~~~~~~~~~~~~~~~~~~~ 34 | 35 | .. autoclass:: transformers.AutoModelForPreTraining 36 | :members: 37 | 38 | 39 | ``AutoModelWithLMHead`` 40 | ~~~~~~~~~~~~~~~~~~~~~ 41 | 42 | .. autoclass:: transformers.AutoModelWithLMHead 43 | :members: 44 | 45 | 46 | ``AutoModelForSequenceClassification`` 47 | ~~~~~~~~~~~~~~~~~~~~~ 48 | 49 | .. autoclass:: transformers.AutoModelForSequenceClassification 50 | :members: 51 | 52 | 53 | ``AutoModelForQuestionAnswering`` 54 | ~~~~~~~~~~~~~~~~~~~~~ 55 | 56 | .. autoclass:: transformers.AutoModelForQuestionAnswering 57 | :members: 58 | 59 | 60 | ``AutoModelForTokenClassification`` 61 | ~~~~~~~~~~~~~~~~~~~~~ 62 | 63 | .. autoclass:: transformers.AutoModelForTokenClassification 64 | :members: 65 | 66 | -------------------------------------------------------------------------------- /docs/source/model_doc/bart.rst: -------------------------------------------------------------------------------- 1 | Bart 2 | ---------------------------------------------------- 3 | **DISCLAIMER:** This model is still a work in progress, if you see something strange, 4 | file a `Github Issue `__ and assign 5 | @sshleifer 6 | 7 | Paper 8 | ~~~~~ 9 | The Bart model was `proposed `_ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer on 29 Oct, 2019. 10 | According to the abstract, 11 | 12 | - Bart uses a standard seq2seq/machine translation architecture with a bidirectional encoder (like BERT) and a left-to-right decoder (like GPT). 13 | - The pretraining task involves randomly shuffling the order of the original sentences and a novel in-filling scheme, where spans of text are replaced with a single mask token. 14 | - BART is particularly effective when fine tuned for text generation but also works well for comprehension tasks. It matches the performance of RoBERTa with comparable training resources on GLUE and SQuAD, achieves new state-of-the-art results on a range of abstractive dialogue, question answering, and summarization tasks, with gains of up to 6 ROUGE. 15 | 16 | The Authors' code can be found `here `_ 17 | 18 | 19 | Implementation Notes 20 | ~~~~~~~~~~~~~~~~~~~~ 21 | - Bart doesn't use :obj:`token_type_ids` for sequence classification. Use BartTokenizer.encode to get the proper splitting. 22 | - The forward pass of ``BartModel`` will create decoder inputs (using the helper function ``transformers.modeling_bart._prepare_bart_decoder_inputs``) if they are not passed. This is different than some other modeling APIs. 23 | - Model predictions are intended to be identical to the original implementation. 
This only works, however, if the string you pass to ``fairseq.encode`` starts with a space. 24 | - ``BartForConditionalGeneration.generate`` should be used for conditional generation tasks like summarization, see the example in that docstrings 25 | - Models that load the ``"bart-large-cnn"`` weights will not have a ``mask_token_id``, or be able to perform mask filling tasks. 26 | 27 | 28 | 29 | BartModel 30 | ~~~~~~~~~~~~~ 31 | 32 | .. autoclass:: transformers.BartModel 33 | :members: forward 34 | 35 | .. autofunction:: transformers.modeling_bart._prepare_bart_decoder_inputs 36 | 37 | 38 | BartForConditionalGeneration 39 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 40 | 41 | .. autoclass:: transformers.BartForConditionalGeneration 42 | :members: generate, forward 43 | 44 | 45 | BartForSequenceClassification 46 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 47 | 48 | .. autoclass:: transformers.BartForSequenceClassification 49 | :members: forward 50 | 51 | BartConfig 52 | ~~~~~~~~~~~~~~~~~~~~~ 53 | 54 | .. autoclass:: transformers.BartConfig 55 | :members: 56 | 57 | -------------------------------------------------------------------------------- /docs/source/model_doc/flaubert.rst: -------------------------------------------------------------------------------- 1 | FlauBERT 2 | ---------------------------------------------------- 3 | 4 | The FlauBERT model was proposed in the paper 5 | `FlauBERT: Unsupervised Language Model Pre-training for French `__ by Hang Le et al. 6 | It's a transformer pre-trained using a masked language modeling (MLM) objective (BERT-like). 7 | 8 | The abstract from the paper is the following: 9 | 10 | *Language models have become a key step to achieve state-of-the art results in many different Natural Language 11 | Processing (NLP) tasks. Leveraging the huge amount of unlabeled texts nowadays available, they provide an efficient 12 | way to pre-train continuous word representations that can be fine-tuned for a downstream task, along with their 13 | contextualization at the sentence level. This has been widely demonstrated for English using contextualized 14 | representations (Dai and Le, 2015; Peters et al., 2018; Howard and Ruder, 2018; Radford et al., 2018; Devlin et 15 | al., 2019; Yang et al., 2019b). In this paper, we introduce and share FlauBERT, a model learned on a very large 16 | and heterogeneous French corpus. Models of different sizes are trained using the new CNRS (French National Centre 17 | for Scientific Research) Jean Zay supercomputer. We apply our French language models to diverse NLP tasks (text 18 | classification, paraphrasing, natural language inference, parsing, word sense disambiguation) and show that most 19 | of the time they outperform other pre-training approaches. Different versions of FlauBERT as well as a unified 20 | evaluation protocol for the downstream tasks, called FLUE (French Language Understanding Evaluation), are shared 21 | to the research community for further reproducible experiments in French NLP.* 22 | 23 | 24 | FlaubertConfig 25 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 26 | 27 | .. autoclass:: transformers.FlaubertConfig 28 | :members: 29 | 30 | 31 | FlaubertTokenizer 32 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 33 | 34 | .. autoclass:: transformers.FlaubertTokenizer 35 | :members: 36 | 37 | 38 | FlaubertModel 39 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 40 | 41 | .. autoclass:: transformers.FlaubertModel 42 | :members: 43 | 44 | 45 | FlaubertWithLMHeadModel 46 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 47 | 48 | .. 
autoclass:: transformers.FlaubertWithLMHeadModel 49 | :members: 50 | 51 | 52 | FlaubertForSequenceClassification 53 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 54 | 55 | .. autoclass:: transformers.FlaubertForSequenceClassification 56 | :members: 57 | 58 | 59 | FlaubertForQuestionAnsweringSimple 60 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 61 | 62 | .. autoclass:: transformers.FlaubertForQuestionAnsweringSimple 63 | :members: 64 | 65 | 66 | FlaubertForQuestionAnswering 67 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 68 | 69 | .. autoclass:: transformers.FlaubertForQuestionAnswering 70 | :members: 71 | 72 | 73 | -------------------------------------------------------------------------------- /docs/source/model_doc/transformerxl.rst: -------------------------------------------------------------------------------- 1 | Transformer XL 2 | ---------------------------------------------------- 3 | 4 | Overview 5 | ~~~~~~~~~~~~~~~~~~~~~ 6 | 7 | The Transformer-XL model was proposed in 8 | `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context `__ 9 | by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 10 | It's a causal (uni-directional) transformer with relative positioning (sinusoïdal) embeddings which can reuse 11 | previously computed hidden-states to attend to longer context (memory). 12 | This model also uses adaptive softmax inputs and outputs (tied). 13 | 14 | The abstract from the paper is the following: 15 | 16 | *Transformers have a potential of learning longer-term dependency, but are limited by a fixed-length context in the 17 | setting of language modeling. We propose a novel neural architecture Transformer-XL that enables learning dependency 18 | beyond a fixed length without disrupting temporal coherence. It consists of a segment-level recurrence mechanism and 19 | a novel positional encoding scheme. Our method not only enables capturing longer-term dependency, but also resolves 20 | the context fragmentation problem. As a result, Transformer-XL learns dependency that is 80% longer than RNNs and 21 | 450% longer than vanilla Transformers, achieves better performance on both short and long sequences, and is up 22 | to 1,800+ times faster than vanilla Transformers during evaluation. Notably, we improve the state-of-the-art results 23 | of bpc/perplexity to 0.99 on enwiki8, 1.08 on text8, 18.3 on WikiText-103, 21.8 on One Billion Word, and 54.5 on 24 | Penn Treebank (without finetuning). When trained only on WikiText-103, Transformer-XL manages to generate reasonably 25 | coherent, novel text articles with thousands of tokens.* 26 | 27 | Tips: 28 | 29 | - Transformer-XL uses relative sinusoidal positional embeddings. Padding can be done on the left or on the right. 30 | The original implementation trains on SQuAD with padding on the left, therefore the padding defaults are set to left. 31 | - Transformer-XL is one of the few models that has no sequence length limit. 32 | 33 | 34 | TransfoXLConfig 35 | ~~~~~~~~~~~~~~~~~~~~~ 36 | 37 | .. autoclass:: transformers.TransfoXLConfig 38 | :members: 39 | 40 | 41 | TransfoXLTokenizer 42 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 43 | 44 | .. autoclass:: transformers.TransfoXLTokenizer 45 | :members: save_vocabulary 46 | 47 | 48 | TransfoXLModel 49 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 50 | 51 | .. autoclass:: transformers.TransfoXLModel 52 | :members: 53 | 54 | 55 | TransfoXLLMHeadModel 56 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 57 | 58 | .. 
autoclass:: transformers.TransfoXLLMHeadModel 59 | :members: 60 | 61 | 62 | TFTransfoXLModel 63 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 64 | 65 | .. autoclass:: transformers.TFTransfoXLModel 66 | :members: 67 | 68 | 69 | TFTransfoXLLMHeadModel 70 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 71 | 72 | .. autoclass:: transformers.TFTransfoXLLMHeadModel 73 | :members: 74 | -------------------------------------------------------------------------------- /docs/source/model_sharing.md: -------------------------------------------------------------------------------- 1 | # Model upload and sharing 2 | 3 | Starting with `v2.2.2`, you can now upload and share your fine-tuned models with the community, using the CLI that's built-in to the library. 4 | 5 | **First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Optionally, join an existing organization or create a new one. Then: 6 | 7 | ```shell 8 | transformers-cli login 9 | # log in using the same credentials as on huggingface.co 10 | ``` 11 | Upload your model: 12 | ```shell 13 | transformers-cli upload ./path/to/pretrained_model/ 14 | 15 | # ^^ Upload folder containing weights/tokenizer/config 16 | # saved via `.save_pretrained()` 17 | 18 | transformers-cli upload ./config.json [--filename folder/foobar.json] 19 | 20 | # ^^ Upload a single file 21 | # (you can optionally override its filename, which can be nested inside a folder) 22 | ``` 23 | 24 | If you want your model to be namespaced by your organization name rather than your username, add the following flag to any command: 25 | ```shell 26 | --organization organization_name 27 | ``` 28 | 29 | Your model will then be accessible through its identifier, a concatenation of your username (or organization name) and the folder name above: 30 | ```python 31 | "username/pretrained_model" 32 | # or if an org: 33 | "organization_name/pretrained_model" 34 | ``` 35 | 36 | **Please add a README.md model card** to the repo under `model_cards/` with: model description, training params (dataset, preprocessing, hardware used, hyperparameters), evaluation results, intended uses & limitations, etc. 37 | 38 | Your model now has a page on huggingface.co/models 🔥 39 | 40 | Anyone can load it from code: 41 | ```python 42 | tokenizer = AutoTokenizer.from_pretrained("namespace/pretrained_model") 43 | model = AutoModel.from_pretrained("namespace/pretrained_model") 44 | ``` 45 | 46 | List all your files on S3: 47 | ```shell 48 | transformers-cli s3 ls 49 | ``` 50 | 51 | You can also delete unneeded files: 52 | 53 | ```shell 54 | transformers-cli s3 rm … 55 | ``` 56 | -------------------------------------------------------------------------------- /docs/source/notebooks.rst: -------------------------------------------------------------------------------- 1 | Notebooks 2 | ================================================ 3 | 4 | We include `three Jupyter Notebooks `_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model. 5 | 6 | 7 | * 8 | The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb `_\ ) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models. 
9 | 10 | * 11 | The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb `_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models. 12 | 13 | * 14 | The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb `_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model. 15 | 16 | Please follow the instructions given in the notebooks to run and modify them. 17 | -------------------------------------------------------------------------------- /examples/bert_stable_fine_tuning/Dockerfile: -------------------------------------------------------------------------------- 1 | # Base image must at least have pytorch and CUDA installed. 2 | # We are using NVIDIA NGC's PyTorch image here, see: https://ngc.nvidia.com/catalog/containers/nvidia:pytorch for latest version 3 | ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:19.09-py3 4 | FROM $BASE_IMAGE 5 | ARG BASE_IMAGE 6 | LABEL repository="bert-stable-fine-tuning" 7 | 8 | # Set path to CUDA 9 | ENV CUDA_HOME=/usr/local/cuda 10 | 11 | # Install additional programs 12 | RUN apt update && \ 13 | apt install -y build-essential \ 14 | htop \ 15 | git \ 16 | curl \ 17 | ca-certificates \ 18 | vim \ 19 | tmux && \ 20 | rm -rf /var/lib/apt/lists 21 | 22 | # Update pip 23 | RUN SHA=ToUcHMe which python3 24 | RUN SHA=ToUcHMe python3 -m pip install --upgrade pip 25 | 26 | # See http://bugs.python.org/issue19846 27 | ENV LANG C.UTF-8 28 | 29 | # Install additional dependencies 30 | RUN python3 -m pip install wandb 31 | RUN python3 -m pip install autopep8 32 | RUN python3 -m pip install attrdict 33 | RUN conda install pylint 34 | 35 | # Specify a new user (USER_NAME and USER_UID are specified via --build-arg) 36 | ARG USER_UID 37 | ARG USER_NAME 38 | ENV USER_GID=$USER_UID 39 | ENV USER_GROUP="users" 40 | 41 | # Create the user 42 | RUN mkdir /home/$USER_NAME 43 | RUN useradd -l -d /home/$USER_NAME -u $USER_UID -g $USER_GROUP $USER_NAME 44 | 45 | # Setup VSCode stuff (comment when not using vscode) 46 | RUN mkdir /home/$USER_NAME/.vscode-server 47 | RUN mkdir /home/$USER_NAME/.vscode-server-insiders 48 | 49 | # Change owner of home dir 50 | RUN chown -R ${USER_UID}:${USER_GID} /home/$USER_NAME/ 51 | 52 | # Set workdir when starting container 53 | WORKDIR /transformers 54 | 55 | # Add workdir to PYTHONPATH 56 | ENV PYTHONPATH="$PYTHONPATH:/transformers" 57 | 58 | CMD ["/bin/bash"] 59 | -------------------------------------------------------------------------------- /examples/bert_stable_fine_tuning/README.md: -------------------------------------------------------------------------------- 1 | ## Installing & Getting Started 2 | 3 | 1. Clone the repository. 4 | 5 | ```` 6 | git clone git@github.com:uds-lsv/bert-stable-fine-tuning.git 7 | cd bert-stable-fine-tuning/examples/bert_stable_fine_tuning 8 | ```` 9 | 10 | 2. [Download datasets](https://github.com/nyu-mll/jiant/blob/master/scripts/download_glue_data.py) from the GLUE benchmark. 11 | 12 | 3. Setup a Docker image and start a container. 13 | 14 | ```` 15 | docker build -f ./Dockerfile --build-arg USER_UID=$UID --build-arg USER_NAME=$(id -un) -t bert-stable-fine-tuning:latest . 
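# The -v flags in the command below mount the repo, the downloaded pre-trained weights, the datasets,
# and the log/checkpoint directories into the container; replace every /path/to/... placeholder with
# your local paths before running.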
16 | 17 | docker run -it --rm --runtime=nvidia --pid=host --ipc=host \ 18 | -v /path/to/bert-stable-fine-tuning:/transformers \ 19 | -v /path/to/pre-trained-transformers:/pre-trained-transformers \ 20 | -v /path/to/datasets:/datasets \ 21 | -v /path/to/bert-stable-fine-tuning/logfiles:/logfiles \ 22 | -v /path/to/bert-stable-fine-tuning/checkpoints:/checkpoints \ 23 | -v /path/to/bert-stable-fine-tuning/tb-logs:/tb-logs \ 24 | -v /path/to/bert-stable-fine-tuning/wandb-logs:/wandb-logs \ 25 | bert-stable-fine-tuning:latest 26 | ```` 27 | 28 | Add `--user=` to the `docker run` command above to run the container as your user. Use `--gpus=all` instead of `--runtime=nvidia` for more recent Docker versions (starting from 19.03). More information on Docker can be found here: `/bert_stable_fine_tuning/run_docker.txt` 29 | 30 | 4. Install huggingface transformers in editable mode **inside** the container. 31 | 32 | ```` 33 | python3 -m pip install -e . --user 34 | ```` 35 | 36 | 5. Fine-tune BERT-large-uncased on RTE. (You might want to check `./bert_stable_fine_tuning/scripts/seeds.sh` first.) 37 | 38 | ```` 39 | bash /transformers/examples/bert_stable_fine_tuning/scripts/seeds.sh /transformers/examples/bert_stable_fine_tuning/configs/rte/pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct_early-stopping_20.yaml 1 1 0 40 | ```` 41 | 42 | 6. Additional config files for RTE, MRPC, and CoLA can be found here: `./bert_stable_fine_tuning/configs`. Bash commands for every config file can be found here: `./bert_stable_fine_tuning/scripts/run_scripts.sh` 43 | 44 | **Happy stable fine-tuning!** :rocket: :metal: 45 | 46 | -------------------------------------------------------------------------------- /examples/bert_stable_fine_tuning/run_docker.txt: -------------------------------------------------------------------------------- 1 | # Build docker image from docker file (using latest base PyTorch image from NVIDIA GPU Cloud) 2 | docker build -f ./Dockerfile --build-arg USER_UID=$UID --build-arg USER_NAME=$(id -un) -t bert-stable-fine-tuning:latest . 3 | 4 | 5 | # Launch docker container from docker image as root 6 | # add --user= to run as user 7 | # /path/to/pre-trained-transformers is a dir that holds the downloaded weights for the pre-trained models 8 | 9 | docker run -it --rm --runtime=nvidia --pid=host --ipc=host \ 10 | -v /path/to/bert-stable-fine-tuning:/transformers \ 11 | -v /path/to/pre-trained-transformers:/pre-trained-transformers \ 12 | -v /path/to/datasets:/datasets \ 13 | -v /path/to/bert-stable-fine-tuning/logfiles:/logfiles \ 14 | -v /path/to/bert-stable-fine-tuning/checkpoints:/checkpoints \ 15 | -v /path/to/bert-stable-fine-tuning/tb-logs:/tb-logs \ 16 | -v /path/to/bert-stable-fine-tuning/wandb-logs:/wandb-logs \ 17 | bert-stable-fine-tuning:latest 18 | 19 | 20 | # For latest docker version (starting from 19.03) use: 21 | # --gpus=all instead of --runtime=nvidia 22 | 23 | ###################################### 24 | # Make huggingface transformers editable inside container 25 | ###################################### 26 | 27 | export PYTHONPATH="$PYTHONPATH:/transformers" 28 | python3 -m pip install -e .
--user 29 | -------------------------------------------------------------------------------- /examples/bert_stable_fine_tuning/scripts/seeds.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="$PYTHONPATH:/transformers" 3 | 4 | # Passed as arguments 5 | CONFIG_FILE=$1 6 | FIRST_SEED=$2 7 | LAST_SEED=$3 8 | export CUDA_VISIBLE_DEVICES=$4 9 | 10 | # Setup weights & biases environment variables 11 | # Comment lines below if you don't want to use wandb 12 | export WANDB_API_KEY=your-key 13 | export WANDB_USERNAME="your-username" 14 | export WANDB_ENTITY="your-entity" 15 | 16 | # Train the same model on the same dataset with different random seeds 17 | for SEED in $(seq $FIRST_SEED $LAST_SEED); 18 | do 19 | python /transformers/examples/bert_stable_fine_tuning/run_finetuning.py \ 20 | --config ${CONFIG_FILE} \ 21 | --do_train \ 22 | --do_eval \ 23 | --seed ${SEED} 24 | done -------------------------------------------------------------------------------- /examples/bert_stable_fine_tuning/scripts/train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="$PYTHONPATH:/transformers" 3 | 4 | # Passed as arguments 5 | CONFIG_FILE=$1 6 | SEED=$2 7 | export CUDA_VISIBLE_DEVICES=$3 8 | 9 | # Setup weights & biases environment variables 10 | # Comment lines below if you don't want to use wandb 11 | export WANDB_API_KEY=your-key 12 | export WANDB_USERNAME="your-username" 13 | export WANDB_ENTITY="your-entity" 14 | 15 | python /transformers/examples/bert_stable_fine_tuning/run_finetuning.py \ 16 | --config ${CONFIG_FILE} \ 17 | --do_train \ 18 | --do_eval \ 19 | --seed ${SEED} -------------------------------------------------------------------------------- /examples/contrib/README.md: -------------------------------------------------------------------------------- 1 | # Community contributed examples 2 | 3 | This folder contains examples which are not actively maintained (mostly contributed by the community). 4 | 5 | Using these examples together with a recent version of the library usually requires making small (sometimes big) adaptations to get the scripts working.
6 | -------------------------------------------------------------------------------- /examples/contrib/run_camembert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from transformers.modeling_camembert import CamembertForMaskedLM 4 | from transformers.tokenization_camembert import CamembertTokenizer 5 | 6 | 7 | def fill_mask(masked_input, model, tokenizer, topk=5): 8 | # Adapted from https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py 9 | assert masked_input.count("<mask>") == 1 10 | input_ids = torch.tensor(tokenizer.encode(masked_input, add_special_tokens=True)).unsqueeze(0) # Batch size 1 11 | logits = model(input_ids)[0] # The prediction scores are the first element of the output tuple 12 | masked_index = (input_ids.squeeze() == tokenizer.mask_token_id).nonzero().item() 13 | logits = logits[0, masked_index, :] 14 | prob = logits.softmax(dim=0) 15 | values, indices = prob.topk(k=topk, dim=0) 16 | topk_predicted_token_bpe = " ".join( 17 | [tokenizer.convert_ids_to_tokens(indices[i].item()) for i in range(len(indices))] 18 | ) 19 | masked_token = tokenizer.mask_token 20 | topk_filled_outputs = [] 21 | for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(" ")): 22 | predicted_token = predicted_token_bpe.replace("\u2581", " ") 23 | if " {0}".format(masked_token) in masked_input: 24 | topk_filled_outputs.append( 25 | ( 26 | masked_input.replace(" {0}".format(masked_token), predicted_token), 27 | values[index].item(), 28 | predicted_token, 29 | ) 30 | ) 31 | else: 32 | topk_filled_outputs.append( 33 | (masked_input.replace(masked_token, predicted_token), values[index].item(), predicted_token,) 34 | ) 35 | return topk_filled_outputs 36 | 37 | 38 | tokenizer = CamembertTokenizer.from_pretrained("camembert-base") 39 | model = CamembertForMaskedLM.from_pretrained("camembert-base") 40 | model.eval() 41 | 42 | masked_input = "Le camembert est <mask> :)" 43 | print(fill_mask(masked_input, model, tokenizer, topk=3)) 44 | -------------------------------------------------------------------------------- /examples/distillation/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | 3 | gitpython==3.0.2 4 | tensorboard>=1.14.0 5 | tensorboardX==1.8 6 | psutil==5.6.6 7 | scipy==1.3.1 8 | -------------------------------------------------------------------------------- /examples/distillation/scripts/token_counts.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ 16 | Preprocessing script before training the distilled model.
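A typical invocation (the paths are simply the argparse defaults declared below; point them at your own binarized dump) might look like:

    python token_counts.py --data_file data/dump.bert-base-uncased.pickle --token_counts_dump data/token_counts.bert-base-uncased.pickle --vocab_size 30522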
17 | """ 18 | import argparse 19 | import logging 20 | import pickle 21 | from collections import Counter 22 | 23 | 24 | logging.basicConfig( 25 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO 26 | ) 27 | logger = logging.getLogger(__name__) 28 | 29 | if __name__ == "__main__": 30 | parser = argparse.ArgumentParser( 31 | description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)" 32 | ) 33 | parser.add_argument( 34 | "--data_file", type=str, default="data/dump.bert-base-uncased.pickle", help="The binarized dataset." 35 | ) 36 | parser.add_argument( 37 | "--token_counts_dump", type=str, default="data/token_counts.bert-base-uncased.pickle", help="The dump file." 38 | ) 39 | parser.add_argument("--vocab_size", default=30522, type=int) 40 | args = parser.parse_args() 41 | 42 | logger.info(f"Loading data from {args.data_file}") 43 | with open(args.data_file, "rb") as fp: 44 | data = pickle.load(fp) 45 | 46 | logger.info("Counting occurences for MLM.") 47 | counter = Counter() 48 | for tk_ids in data: 49 | counter.update(tk_ids) 50 | counts = [0] * args.vocab_size 51 | for k, v in counter.items(): 52 | counts[k] = v 53 | 54 | logger.info(f"Dump to {args.token_counts_dump}") 55 | with open(args.token_counts_dump, "wb") as handle: 56 | pickle.dump(counts, handle, protocol=pickle.HIGHEST_PROTOCOL) 57 | -------------------------------------------------------------------------------- /examples/distillation/training_configs/distilbert-base-cased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 | "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true, 13 | "vocab_size": 28996 14 | } 15 | -------------------------------------------------------------------------------- /examples/distillation/training_configs/distilbert-base-multilingual-cased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 | "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true, 13 | "vocab_size": 119547 14 | } 15 | -------------------------------------------------------------------------------- /examples/distillation/training_configs/distilbert-base-uncased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 | "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true, 13 | "vocab_size": 30522 14 | } 15 | -------------------------------------------------------------------------------- /examples/distillation/training_configs/distilgpt2.json: -------------------------------------------------------------------------------- 1 | { 2 | "initializer_range": 0.02, 3 | "layer_norm_epsilon": 0.00001, 4 | "n_ctx": 1024, 5 | "n_embd": 768, 6 | "n_head": 12, 7 | "n_layer": 6, 8 | "n_positions": 1024, 9 | "vocab_size": 50257 10 | } 
-------------------------------------------------------------------------------- /examples/distillation/training_configs/distilroberta-base.json: -------------------------------------------------------------------------------- 1 | { 2 | "vocab_size": 50265, 3 | "hidden_size": 768, 4 | "num_hidden_layers": 6, 5 | "num_attention_heads": 12, 6 | "intermediate_size": 3072, 7 | "hidden_act": "gelu", 8 | "hidden_dropout_prob": 0.1, 9 | "attention_probs_dropout_prob": 0.1, 10 | "max_position_embeddings": 514, 11 | "type_vocab_size": 1, 12 | "initializer_range": 0.02, 13 | "layer_norm_eps": 0.00001 14 | } -------------------------------------------------------------------------------- /examples/ner/run.sh: -------------------------------------------------------------------------------- 1 | curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-train.tsv?attredirects=0&d=1' \ 2 | | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp 3 | curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-dev.tsv?attredirects=0&d=1' \ 4 | | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp 5 | curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-test.tsv?attredirects=0&d=1' \ 6 | | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp 7 | wget "https://raw.githubusercontent.com/stefan-it/fine-tuned-berts-seq/master/scripts/preprocess.py" 8 | export MAX_LENGTH=128 9 | export BERT_MODEL=bert-base-multilingual-cased 10 | python3 preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt 11 | python3 preprocess.py dev.txt.tmp $BERT_MODEL $MAX_LENGTH > dev.txt 12 | python3 preprocess.py test.txt.tmp $BERT_MODEL $MAX_LENGTH > test.txt 13 | cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt 14 | export OUTPUT_DIR=germeval-model 15 | export BATCH_SIZE=32 16 | export NUM_EPOCHS=3 17 | export SAVE_STEPS=750 18 | export SEED=1 19 | 20 | python3 run_ner.py --data_dir ./ \ 21 | --model_type bert \ 22 | --labels ./labels.txt \ 23 | --model_name_or_path $BERT_MODEL \ 24 | --output_dir $OUTPUT_DIR \ 25 | --max_seq_length $MAX_LENGTH \ 26 | --num_train_epochs $NUM_EPOCHS \ 27 | --per_gpu_train_batch_size $BATCH_SIZE \ 28 | --save_steps $SAVE_STEPS \ 29 | --seed $SEED \ 30 | --do_train \ 31 | --do_eval \ 32 | --do_predict 33 | -------------------------------------------------------------------------------- /examples/ner/run_pl.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Install newest ptl. 
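# ("ptl" = pytorch-lightning; it is installed straight from the GitHub repository so the example runs against the newest Lightning API.)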
4 | pip install -U git+http://github.com/PyTorchLightning/pytorch-lightning/ 5 | # for seqeval metrics import 6 | pip install -r ../requirements.txt 7 | 8 | curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-train.tsv?attredirects=0&d=1' \ 9 | | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp 10 | curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-dev.tsv?attredirects=0&d=1' \ 11 | | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp 12 | curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-test.tsv?attredirects=0&d=1' \ 13 | | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp 14 | wget "https://raw.githubusercontent.com/stefan-it/fine-tuned-berts-seq/master/scripts/preprocess.py" 15 | export MAX_LENGTH=128 16 | export BERT_MODEL=bert-base-multilingual-cased 17 | python3 preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt 18 | python3 preprocess.py dev.txt.tmp $BERT_MODEL $MAX_LENGTH > dev.txt 19 | python3 preprocess.py test.txt.tmp $BERT_MODEL $MAX_LENGTH > test.txt 20 | cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt 21 | export BATCH_SIZE=32 22 | export NUM_EPOCHS=3 23 | export SEED=1 24 | 25 | export OUTPUT_DIR_NAME=germeval-model 26 | export CURRENT_DIR=${PWD} 27 | export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME} 28 | mkdir -p $OUTPUT_DIR 29 | 30 | python3 run_pl_ner.py --data_dir ./ \ 31 | --model_type bert \ 32 | --labels ./labels.txt \ 33 | --model_name_or_path $BERT_MODEL \ 34 | --output_dir $OUTPUT_DIR \ 35 | --max_seq_length $MAX_LENGTH \ 36 | --num_train_epochs $NUM_EPOCHS \ 37 | --train_batch_size 32 \ 38 | --seed $SEED \ 39 | --do_train \ 40 | --do_predict -------------------------------------------------------------------------------- /examples/pplm/README.md: -------------------------------------------------------------------------------- 1 | # Plug and Play Language Models: a Simple Approach to Controlled Text Generation 2 | 3 | Authors: [Sumanth Dathathri](https://dathath.github.io/), [Andrea Madotto](https://andreamad8.github.io/), Janice Lan, Jane Hung, Eric Frank, [Piero Molino](https://w4nderlu.st/), [Jason Yosinski](http://yosinski.com/), and [Rosanne Liu](http://www.rosanneliu.com/) 4 | 5 | This folder contains the original code used to run the Plug and Play Language Model (PPLM). 6 | 7 | Paper link: https://arxiv.org/abs/1912.02164 8 | 9 | Blog link: https://eng.uber.com/pplm 10 | 11 | Please check out the repo under uber-research for more information: https://github.com/uber-research/PPLM 12 | 13 | 14 | ## Setup 15 | 16 | ```bash 17 | git clone https://github.com/huggingface/transformers && cd transformers 18 | pip install . 19 | pip install nltk torchtext # additional requirements. 20 | cd examples/pplm 21 | ``` 22 | 23 | ## PPLM-BoW 24 | 25 | ### Example command for bag-of-words control 26 | 27 | ```bash 28 | python run_pplm.py -B military --cond_text "The potato" --length 50 --gamma 1.5 --num_iterations 3 --num_samples 10 --stepsize 0.03 --window_length 5 --kl_scale 0.01 --gm_scale 0.99 --colorama --sample 29 | ``` 30 | 31 | ### Tuning hyperparameters for bag-of-words control 32 | 33 | 1. Increase `--stepsize` to intensify topic control, and decrease its value to soften the control. `--stepsize 0` recovers the original uncontrolled GPT-2 model. 34 | 35 | 2. If the language being generated is repetitive (For e.g. "science science experiment experiment"), there are several options to consider:
36 | a) Reduce the `--stepsize`
37 | b) Increase `--kl_scale` (the KL-loss coefficient) or decrease `--gm_scale` (the gm-scaling term)
38 | c) Add `--grad-length xx` where xx is an integer (<= length), e.g. `--grad-length 30`. An example command combining a) and b) is sketched below.
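Putting a) and b) together, a softer-control variant of the bag-of-words command above might look like the following (the values are illustrative starting points, not tuned settings):

```bash
python run_pplm.py -B military --cond_text "The potato" --length 50 --gamma 1.5 --num_iterations 3 --num_samples 10 --stepsize 0.01 --window_length 5 --kl_scale 0.02 --gm_scale 0.95 --colorama --sample
```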
39 | 40 | 41 | ## PPLM-Discrim 42 | 43 | ### Example command for discriminator based sentiment control 44 | 45 | ```bash 46 | python run_pplm.py -D sentiment --class_label 2 --cond_text "My dog died" --length 50 --gamma 1.0 --num_iterations 10 --num_samples 10 --stepsize 0.04 --kl_scale 0.01 --gm_scale 0.95 --sample 47 | ``` 48 | 49 | ### Tuning hyperparameters for discriminator control 50 | 51 | 1. Increase `--stepsize` to intensify topic control, and decrease its value to soften the control. `--stepsize 0` recovers the original uncontrolled GPT-2 model. 52 | 53 | 2. Use `--class_label 3` for negative, and `--class_label 2` for positive 54 | 55 | -------------------------------------------------------------------------------- /examples/pplm/imgs/headfigure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/examples/pplm/imgs/headfigure.png -------------------------------------------------------------------------------- /examples/pplm/imgs/wooly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/examples/pplm/imgs/wooly.png -------------------------------------------------------------------------------- /examples/pplm/pplm_classification_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class ClassificationHead(torch.nn.Module): 5 | """Classification Head for transformer encoders""" 6 | 7 | def __init__(self, class_size, embed_size): 8 | super().__init__() 9 | self.class_size = class_size 10 | self.embed_size = embed_size 11 | # self.mlp1 = torch.nn.Linear(embed_size, embed_size) 12 | # self.mlp2 = (torch.nn.Linear(embed_size, class_size)) 13 | self.mlp = torch.nn.Linear(embed_size, class_size) 14 | 15 | def forward(self, hidden_state): 16 | # hidden_state = F.relu(self.mlp1(hidden_state)) 17 | # hidden_state = self.mlp2(hidden_state) 18 | logits = self.mlp(hidden_state) 19 | return logits 20 | -------------------------------------------------------------------------------- /examples/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboardX 2 | tensorboard 3 | scikit-learn 4 | seqeval 5 | -------------------------------------------------------------------------------- /examples/squad/configs/albert-base-v1.yaml: -------------------------------------------------------------------------------- 1 | input: 2 | data_dir: /datasets/squad 3 | train_file: train-v1.1.json 4 | predict_file: dev-v1.1.json 5 | version_2_with_negative: false 6 | overwrite_cache: false # Overwrite the cached training and evaluation sets 7 | lang_id: 0 # language id of input for language-specific xlm models 8 | doc_stride: 128 # When splitting up a long document into chunks, how much stride to take between chunks. 9 | threads: 1 # multiple threads for converting example to features 10 | 11 | output: 12 | log_dir: /logfiles/squad11 13 | checkpoint_dir: /checkpoints/squad11 14 | verbose_logging: false # If true, all of the warnings related to data processing will be printed. 15 | 16 | tensorboard: 17 | enable: false 18 | log_dir: /tb-logs/squad11 19 | log_histograms: false # Log histograms in tensorboard. They can be quite large (file size) and slow down training. 
So one might want to disable it 20 | 21 | wandb: 22 | enable: true 23 | project_name: squad-fine-tuning 24 | log_dir: /wandb-logs/squad11 25 | 26 | model: 27 | model_type: albert 28 | model_name_or_path: albert-base-v1 # Path to pre-trained model or shortcut name of huggingface transformer models 29 | config_name: albert-base-v1 # Pretrained config name or path if not the same as model_name 30 | tokenizer_name: albert-base-v1 # Pretrained tokenizer name or path if not the same as model_name 31 | cache_dir: /pre-trained-transformers 32 | do_lower_case: true # ALBERT vocab is uncased 33 | max_seq_length: 384 34 | max_query_length: 64 # The maximum number of tokens for the question 35 | max_answer_length: 30 # The maximum length of an answer that can be generated. 36 | null_score_diff_threshold: 0.0 37 | n_best_size: 5 # The total number of n-best predictions to generate in the nbest_predictions.json output file. 38 | 39 | training: 40 | num_train_epochs: 3 41 | max_steps: -1 # overrides num_train_epochs 42 | evaluate_during_training: true 43 | per_gpu_train_batch_size: 8 44 | gradient_accumulation_steps: 1 45 | train_logging_steps: 300 # adjust based on batch_size and number of GPUs used 46 | eval_logging_steps: 2777 # adjust based on batch_size and number of GPUs used 47 | save_steps: -1 48 | 49 | optimizer: 50 | learning_rate: 0.00003 # learning-rate should not be too large 51 | learning_rate_schedule: warmup-linear 52 | warmup_steps: 0 53 | weight_decay: 0.0 54 | adam_epsilon: 0.00000001 55 | max_grad_norm: 1.0 56 | fp16: false 57 | fp16_opt_level: "01" # If fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details at https://nvidia.github.io/apex/amp.html 58 | 59 | eval: 60 | eval_all_checkpoints: false 61 | per_gpu_eval_batch_size: 100 # eval will always run on a single GPU 62 | -------------------------------------------------------------------------------- /examples/squad/configs/bert-base-cased.yaml: -------------------------------------------------------------------------------- 1 | input: 2 | data_dir: /datasets/squad 3 | train_file: train-v1.1.json 4 | predict_file: dev-v1.1.json 5 | version_2_with_negative: false 6 | overwrite_cache: false # Overwrite the cached training and evaluation sets 7 | lang_id: 0 # language id of input for language-specific xlm models 8 | doc_stride: 128 # When splitting up a long document into chunks, how much stride to take between chunks. 9 | threads: 1 # multiple threads for converting example to features 10 | 11 | output: 12 | log_dir: /logfiles/squad11 13 | checkpoint_dir: /checkpoints/squad11 14 | verbose_logging: false # If true, all of the warnings related to data processing will be printed. 15 | 16 | tensorboard: 17 | enable: false 18 | log_dir: /tb-logs/squad11 19 | log_histograms: false # Log histograms in tensorboard. They can be quite large (file size) and slow down training. 
So one might want to disable it 20 | 21 | wandb: 22 | enable: true 23 | project_name: squad-fine-tuning 24 | log_dir: /wandb-logs/squad11 25 | 26 | model: 27 | model_type: bert 28 | model_name_or_path: bert-base-cased # Path to pre-trained model or shortcut name of huggingface transformer models 29 | config_name: bert-base-cased # Pretrained config name or path if not the same as model_name 30 | tokenizer_name: bert-base-cased # Pretrained tokenizer name or path if not the same as model_name 31 | cache_dir: /pre-trained-transformers 32 | do_lower_case: false 33 | max_seq_length: 384 34 | max_query_length: 64 # The maximum number of tokens for the question 35 | max_answer_length: 30 # The maximum length of an answer that can be generated. 36 | null_score_diff_threshold: 0.0 37 | n_best_size: 5 # The total number of n-best predictions to generate in the nbest_predictions.json output file. 38 | 39 | training: 40 | num_train_epochs: 3 41 | max_steps: -1 # overrides num_train_epochs 42 | evaluate_during_training: true 43 | per_gpu_train_batch_size: 8 44 | gradient_accumulation_steps: 1 45 | train_logging_steps: 300 # adjust based on batch_size and number of GPUs used 46 | eval_logging_steps: 2777 # adjust based on batch_size and number of GPUs used 47 | save_steps: -1 48 | 49 | optimizer: 50 | learning_rate: 0.00003 # learning-rate should not be too large 51 | learning_rate_schedule: warmup-linear 52 | warmup_steps: 0 53 | weight_decay: 0.0 54 | adam_epsilon: 0.00000001 55 | max_grad_norm: 1.0 56 | fp16: false 57 | fp16_opt_level: "01" # If fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details at https://nvidia.github.io/apex/amp.html 58 | 59 | eval: 60 | eval_all_checkpoints: false 61 | per_gpu_eval_batch_size: 100 # eval will always run on a single GPU 62 | -------------------------------------------------------------------------------- /examples/squad/configs/roberta-base.yaml: -------------------------------------------------------------------------------- 1 | input: 2 | data_dir: /datasets/squad 3 | train_file: train-v1.1.json 4 | predict_file: dev-v1.1.json 5 | version_2_with_negative: false 6 | overwrite_cache: false # Overwrite the cached training and evaluation sets 7 | lang_id: 0 # language id of input for language-specific xlm models 8 | doc_stride: 128 # When splitting up a long document into chunks, how much stride to take between chunks. 9 | threads: 1 # multiple threads for converting example to features 10 | 11 | output: 12 | log_dir: /logfiles/squad11 13 | checkpoint_dir: /checkpoints/squad11 14 | verbose_logging: false # If true, all of the warnings related to data processing will be printed. 15 | 16 | tensorboard: 17 | enable: false 18 | log_dir: /tb-logs/squad11 19 | log_histograms: false # Log histograms in tensorboard. They can be quite large (file size) and slow down training. 
So one might want to disable it 20 | 21 | wandb: 22 | enable: true 23 | project_name: squad-fine-tuning 24 | log_dir: /wandb-logs/squad11 25 | 26 | model: 27 | model_type: roberta 28 | model_name_or_path: roberta-base # Path to pre-trained model or shortcut name of huggingface transformer models 29 | config_name: roberta-base # Pretrained config name or path if not the same as model_name 30 | tokenizer_name: roberta-base # Pretrained tokenizer name or path if not the same as model_name 31 | cache_dir: /pre-trained-transformers 32 | do_lower_case: false 33 | max_seq_length: 384 34 | max_query_length: 64 # The maximum number of tokens for the question 35 | max_answer_length: 30 # The maximum length of an answer that can be generated. 36 | null_score_diff_threshold: 0.0 37 | n_best_size: 5 # The total number of n-best predictions to generate in the nbest_predictions.json output file. 38 | 39 | training: 40 | num_train_epochs: 3 41 | max_steps: -1 # overrides num_train_epochs 42 | evaluate_during_training: true 43 | per_gpu_train_batch_size: 8 44 | gradient_accumulation_steps: 1 45 | train_logging_steps: 300 # adjust based on batch_size and number of GPUs used 46 | eval_logging_steps: 2777 # adjust based on batch_size and number of GPUs used 47 | save_steps: -1 48 | 49 | optimizer: 50 | learning_rate: 0.00003 # learning-rate should not be too large 51 | learning_rate_schedule: warmup-linear 52 | warmup_steps: 0 53 | weight_decay: 0.0 54 | adam_epsilon: 0.00000001 55 | max_grad_norm: 1.0 56 | fp16: false 57 | fp16_opt_level: "01" # If fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details at https://nvidia.github.io/apex/amp.html 58 | 59 | eval: 60 | eval_all_checkpoints: false 61 | per_gpu_eval_batch_size: 100 # eval will always run on a single GPU 62 | -------------------------------------------------------------------------------- /examples/squad/scripts/iterations_info.txt: -------------------------------------------------------------------------------- 1 | # SQUAD v1.1 2 | 3 | ## Single GPU training 4 | 5 | batch_size iterations 6 | 2 44432 7 | 4 22216 8 | 6 14811 9 | 8 11108 10 | 16 5554 # this is the largest batch size that fits on a P100 with base models (of course it also depends on max_seq_len) 11 | 24 3703 12 | 32 2777 13 | 14 | ## Multi GPU training 15 | 16 | gpus batch_size_per_gpu total_batch_size iterations_per_gpu 17 | 2 8 16 5554 18 | 4 8 32 2777 19 | 20 | # SQUAD v2.0 21 | 22 | ## Single GPU training 23 | 24 | ## Multi GPU training 25 | -------------------------------------------------------------------------------- /examples/squad/scripts/run_scripts.txt: -------------------------------------------------------------------------------- 1 | # Single GPU taining 2 | # - run as follows (args: config file, seed, gpu) 3 | 4 | # For SQUAD v2.0 make sure to set 'version_2_with_negative' to true in config file and change dirs accordingly 5 | 6 | # SQUAD v1.1/v2.0 7 | bash /transformers/examples/squad/scripts/train.sh /transformers/examples/squad/configs/bert-base-cased.yaml 123 7 8 | bash /transformers/examples/squad/scripts/train.sh /transformers/examples/squad/configs/albert-base-v1.yaml 123 7 9 | bash /transformers/examples/squad/scripts/train.sh /transformers/examples/squad/configs/roberta-base.yaml 123 7 10 | 11 | 12 | ------------------------------------------------------------------------------------------------------------------------------------- 13 | 14 | # Multi GPU taining 15 | # - run as follows (args: config file, seed, n_gpus, 
list_of_gpus) 16 | # - make sure to adjust config files accordingly 17 | 18 | # SQUAD v1.1/v2-0 19 | bash /transformers/examples/squad/scripts/train_multi_gpu.sh /transformers/examples/squad/configs/bert-base-cased.yaml 123 4 4,5,6,7 20 | bash /transformers/examples/squad/scripts/train_multi_gpu.sh /transformers/examples/squad/configs/albert-base-v1.yaml 123 4 4,5,6,7 21 | bash /transformers/examples/squad/scripts/train_multi_gpu.sh /transformers/examples/squad/configs/roberta-base.yaml 123 4 4,5,6,7 -------------------------------------------------------------------------------- /examples/squad/scripts/train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="$PYTHONPATH:/transformers" 3 | 4 | # Passed as arguements 5 | CONFIG_FILE=$1 6 | SEED=$2 7 | export CUDA_VISIBLE_DEVICES=$3 8 | 9 | # Setup weights & biases environment variables 10 | # Comment lines below if you don't want to use wandb 11 | export WANDB_API_KEY=your-key 12 | export WANDB_USERNAME="your-username" 13 | export WANDB_ENTITY="your-entity" 14 | 15 | python /transformers/examples/squad/run_finetuning.py \ 16 | --config ${CONFIG_FILE} \ 17 | --do_train \ 18 | --do_eval \ 19 | --seed ${SEED} -------------------------------------------------------------------------------- /examples/squad/scripts/train_multi_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="$PYTHONPATH:/transformers" 3 | 4 | # Passed as arguements 5 | CONFIG_FILE=$1 6 | SEED=$2 7 | N_GPUS=$3 8 | export CUDA_VISIBLE_DEVICES=$4 9 | 10 | # Setup weights & biases environment variables 11 | # Comment lines below if you don't want to use wandb 12 | export WANDB_API_KEY=your-key 13 | export WANDB_USERNAME="your-username" 14 | export WANDB_ENTITY="your-entity" 15 | 16 | python -m torch.distributed.launch --nproc_per_node ${N_GPUS} /transformers/examples/squad/run_finetuning.py \ 17 | --config ${CONFIG_FILE} \ 18 | --do_train \ 19 | --do_eval \ 20 | --seed ${SEED} -------------------------------------------------------------------------------- /examples/squad/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import yaml 4 | 5 | import torch 6 | import numpy as np 7 | from attrdict import AttrDict 8 | 9 | 10 | def read_config(config_file): 11 | # Source: https://stackoverflow.com/questions/1773805/how-can-i-parse-a-yaml-file-in-python 12 | with open(config_file, 'r') as stream: 13 | try: 14 | config = yaml.safe_load(stream) 15 | except yaml.YAMLError as exc: 16 | print(f"Catched the following YAMLError:\n{exc}") 17 | 18 | # Convert to AttrDict to allow acessing by dot e.g. 
config.seed 19 | config = AttrDict(config) 20 | 21 | return config 22 | 23 | 24 | def save_config(config_file, output_file): 25 | config_file = dict(config_file) 26 | with open(output_file, 'w') as yaml_file: 27 | yaml.dump(config_file, yaml_file, default_flow_style=True) 28 | 29 | 30 | def create_unique_dir(path, config, timestamp): 31 | new_dir = os.path.join(path, timestamp) 32 | 33 | for name in [config.model.model_name_or_path, config.optimizer.learning_rate_schedule]: 34 | new_dir += f'_{name}' 35 | 36 | if config.optimizer.fp16: 37 | new_dir += f'_fp16_{config.optimizer.fp16_opt_level}' 38 | 39 | if not os.path.exists(new_dir): 40 | os.makedirs(new_dir) 41 | 42 | return new_dir 43 | 44 | 45 | def set_seed(args): 46 | random.seed(args.seed) 47 | np.random.seed(args.seed) 48 | torch.manual_seed(args.seed) 49 | if args.n_gpu > 0: 50 | torch.cuda.manual_seed_all(args.seed) 51 | 52 | 53 | def to_list(tensor): 54 | return tensor.detach().cpu().tolist() -------------------------------------------------------------------------------- /examples/summarization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/examples/summarization/__init__.py -------------------------------------------------------------------------------- /examples/summarization/bart/README.md: -------------------------------------------------------------------------------- 1 | ### Get the CNN Data 2 | To be able to reproduce the authors' results on the CNN/Daily Mail dataset, you first need to download both the CNN and Daily Mail datasets [from Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/) (the links next to "Stories") into the same folder. Then uncompress the archives by running: 3 | 4 | ```bash 5 | tar -xvf cnn_stories.tgz && tar -xvf dailymail_stories.tgz 6 | ``` 7 | This should make a directory called cnn_dm/ with files like `test.source`. 8 | To use your own data, copy that file format. Each article to be summarized is on its own line. 9 | 10 | ### Usage 11 | To create summaries for each article in the dataset, run: 12 | ```bash 13 | python evaluate_cnn.py cnn_dm/test.source cnn_test_summaries.txt 14 | ``` 15 | The default batch size, 8, fits in 16GB GPU memory, but it may need to be adjusted to fit your system. 16 | 17 | ### Where is the code? 18 | The core model is in `src/transformers/modeling_bart.py`. This directory only contains examples. 19 | 20 | ### (WIP) Rouge Scores 21 | 22 | ### Stanford CoreNLP Setup 23 | ``` 24 | ptb_tokenize () { 25 | cat $1 | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > $2 26 | } 27 | 28 | sudo apt install openjdk-8-jre-headless 29 | sudo apt-get install ant 30 | wget http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip 31 | unzip stanford-corenlp-full-2018-10-05.zip 32 | cd stanford-corenlp-full-2018-10-05 33 | export CLASSPATH=stanford-corenlp-3.9.2.jar:stanford-corenlp-3.9.2-models.jar 34 | ``` 35 | Then run `ptb_tokenize` on `test.target` and your generated hypotheses. 36 | ### Rouge Setup 37 | Install `files2rouge` following the instructions [here](https://github.com/pltrdy/files2rouge).
38 | I also needed to run `sudo apt-get install libxml-parser-perl` 39 | 40 | ```python 41 | from files2rouge import files2rouge 42 | from files2rouge import settings 43 | files2rouge.run(, 44 | , 45 | saveto='rouge_output.txt') 46 | ``` 47 | -------------------------------------------------------------------------------- /examples/summarization/bart/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/examples/summarization/bart/__init__.py -------------------------------------------------------------------------------- /examples/summarization/bart/evaluate_cnn.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | import torch 5 | from tqdm import tqdm 6 | 7 | from transformers import BartForConditionalGeneration, BartTokenizer 8 | 9 | 10 | DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu" 11 | 12 | 13 | def chunks(lst, n): 14 | """Yield successive n-sized chunks from lst.""" 15 | for i in range(0, len(lst), n): 16 | yield lst[i : i + n] 17 | 18 | 19 | def generate_summaries(lns, out_file, batch_size=8, device=DEFAULT_DEVICE): 20 | fout = Path(out_file).open("w") 21 | model = BartForConditionalGeneration.from_pretrained("bart-large-cnn", output_past=True,).to(device) 22 | tokenizer = BartTokenizer.from_pretrained("bart-large") 23 | 24 | max_length = 140 25 | min_length = 55 26 | 27 | for batch in tqdm(list(chunks(lns, batch_size))): 28 | dct = tokenizer.batch_encode_plus(batch, max_length=1024, return_tensors="pt", pad_to_max_length=True) 29 | summaries = model.generate( 30 | input_ids=dct["input_ids"].to(device), 31 | attention_mask=dct["attention_mask"].to(device), 32 | num_beams=4, 33 | length_penalty=2.0, 34 | max_length=max_length + 2, # +2 from original because we start at step=1 and stop before max_length 35 | min_length=min_length + 1, # +1 from original because we start at step=1 36 | no_repeat_ngram_size=3, 37 | early_stopping=True, 38 | do_sample=False, 39 | decoder_start_token_id=model.config.eos_token_ids[0], 40 | ) 41 | dec = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries] 42 | for hypothesis in dec: 43 | fout.write(hypothesis + "\n") 44 | fout.flush() 45 | 46 | 47 | def _run_generate(): 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument( 50 | "source_path", type=str, help="like cnn_dm/test.source", 51 | ) 52 | parser.add_argument( 53 | "output_path", type=str, help="where to save summaries", 54 | ) 55 | parser.add_argument( 56 | "--device", type=str, required=False, default=DEFAULT_DEVICE, help="cuda, cuda:1, cpu etc.", 57 | ) 58 | parser.add_argument( 59 | "--bs", type=int, default=8, required=False, help="batch size: how many to summarize at a time", 60 | ) 61 | args = parser.parse_args() 62 | lns = [" " + x.rstrip() for x in open(args.source_path).readlines()] 63 | generate_summaries(lns, args.output_path, batch_size=args.bs, device=args.device) 64 | 65 | 66 | if __name__ == "__main__": 67 | _run_generate() 68 | -------------------------------------------------------------------------------- /examples/summarization/bart/test_bart_examples.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | import tempfile 4 | import unittest 5 | from pathlib import Path 6 | from unittest.mock import patch 
7 | 8 | from .evaluate_cnn import _run_generate 9 | 10 | 11 | articles = [" New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County."] 12 | 13 | logging.basicConfig(level=logging.DEBUG) 14 | 15 | logger = logging.getLogger() 16 | 17 | 18 | class TestBartExamples(unittest.TestCase): 19 | def test_bart_cnn_cli(self): 20 | stream_handler = logging.StreamHandler(sys.stdout) 21 | logger.addHandler(stream_handler) 22 | tmp = Path(tempfile.gettempdir()) / "utest_generations.hypo" 23 | with tmp.open("w") as f: 24 | f.write("\n".join(articles)) 25 | testargs = ["evaluate_cnn.py", str(tmp), "output.txt"] 26 | with patch.object(sys, "argv", testargs): 27 | _run_generate() 28 | self.assertTrue(Path("output.txt").exists()) 29 | -------------------------------------------------------------------------------- /examples/summarization/bertabs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/examples/summarization/bertabs/__init__.py -------------------------------------------------------------------------------- /examples/summarization/bertabs/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | 3 | # For ROUGE 4 | nltk 5 | py-rouge 6 | -------------------------------------------------------------------------------- /examples/tests_samples/.gitignore: -------------------------------------------------------------------------------- 1 | *.* 2 | cache* 3 | temp* 4 | !*.tsv 5 | !*.json 6 | !.gitignore -------------------------------------------------------------------------------- /examples/tests_samples/MRPC/dev.tsv: -------------------------------------------------------------------------------- 1 | Quality #1 ID #2 ID #1 String #2 String 2 | 1 1355540 1355592 He said the foodservice pie business doesn 't fit the company 's long-term growth strategy . " The foodservice pie business does not fit our long-term growth strategy . 3 | 0 2029631 2029565 Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war . His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war . 4 | 0 487993 487952 The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat . The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent . 5 | 1 1989515 1989458 The AFL-CIO is waiting until October to decide if it will endorse a candidate . The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries . 6 | 0 1783137 1782659 No dates have been set for the civil or the criminal trial . No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty . 7 | 1 3039165 3039036 Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed . It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status . 
8 | -------------------------------------------------------------------------------- /examples/tests_samples/MRPC/train.tsv: -------------------------------------------------------------------------------- 1 | Quality #1 ID #2 ID #1 String #2 String 2 | 1 1355540 1355592 He said the foodservice pie business doesn 't fit the company 's long-term growth strategy . " The foodservice pie business does not fit our long-term growth strategy . 3 | 0 2029631 2029565 Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war . His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war . 4 | 0 487993 487952 The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat . The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent . 5 | 1 1989515 1989458 The AFL-CIO is waiting until October to decide if it will endorse a candidate . The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries . 6 | 0 1783137 1782659 No dates have been set for the civil or the criminal trial . No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty . 7 | 1 3039165 3039036 Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed . It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status . 8 | -------------------------------------------------------------------------------- /images/fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/images/fig1.png -------------------------------------------------------------------------------- /images/tab1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/images/tab1.png -------------------------------------------------------------------------------- /model_cards/DeepPavlov/bert-base-bg-cs-pl-ru-cased/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - bulgarian 4 | - czech 5 | - polish 6 | - russian 7 | --- 8 | 9 | # bert-base-bg-cs-pl-ru-cased 10 | 11 | SlavicBERT\[1\] \(Slavic \(bg, cs, pl, ru\), cased, 12‑layer, 768‑hidden, 12‑heads, 180M parameters\) was trained on Russian News and four Wikipedias: Bulgarian, Czech, Polish, and Russian. Subtoken vocabulary was built using this data. Multilingual BERT was used as an initialization for SlavicBERT. 12 | 13 | 14 | \[1\]: Arkhipov M., Trofimova M., Kuratov Y., Sorokin A. \(2019\). [Tuning Multilingual Transformers for Language-Specific Named Entity Recognition](https://www.aclweb.org/anthology/W19-3712/). ACL anthology W19-3712. 
15 | -------------------------------------------------------------------------------- /model_cards/DeepPavlov/bert-base-cased-conversational/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - english 4 | --- 5 | 6 | # bert-base-cased-conversational 7 | 8 | Conversational BERT \(English, cased, 12‑layer, 768‑hidden, 12‑heads, 110M parameters\) was trained on the English part of Twitter, Reddit, DailyDialogues\[1\], OpenSubtitles\[2\], Debates\[3\], Blogs\[4\], Facebook News Comments. We used this training data to build the vocabulary of English subtokens and took English cased version of BERT‑base as an initialization for English Conversational BERT. 9 | 10 | 11 | \[1\]: Yanran Li, Hui Su, Xiaoyu Shen, Wenjie Li, Ziqiang Cao, and Shuzi Niu. DailyDialog: A Manually Labelled Multi-turn Dialogue Dataset. IJCNLP 2017. 12 | 13 | \[2\]: P. Lison and J. Tiedemann, 2016, OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles. In Proceedings of the 10th International Conference on Language Resources and Evaluation \(LREC 2016\) 14 | 15 | \[3\]: Justine Zhang, Ravi Kumar, Sujith Ravi, Cristian Danescu-Niculescu-Mizil. Proceedings of NAACL, 2016. 16 | 17 | \[4\]: J. Schler, M. Koppel, S. Argamon and J. Pennebaker \(2006\). Effects of Age and Gender on Blogging in Proceedings of 2006 AAAI Spring Symposium on Computational Approaches for Analyzing Weblogs. 18 | -------------------------------------------------------------------------------- /model_cards/DeepPavlov/bert-base-multilingual-cased-sentence/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - multilingual 4 | --- 5 | 6 | # bert-base-multilingual-cased-sentence 7 | 8 | Sentence Multilingual BERT \(101 languages, cased, 12‑layer, 768‑hidden, 12‑heads, 180M parameters\) is a representation‑based sentence encoder for 101 languages of Multilingual BERT. It is initialized with Multilingual BERT and then fine‑tuned on english MultiNLI\[1\] and on dev set of multilingual XNLI\[2\]. Sentence representations are mean pooled token embeddings in the same manner as in Sentence‑BERT\[3\]. 9 | 10 | 11 | \[1\]: Williams A., Nangia N. & Bowman S. \(2017\) A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference. arXiv preprint [arXiv:1704.05426](https://arxiv.org/abs/1704.05426) 12 | 13 | \[2\]: Williams A., Bowman S. \(2018\) XNLI: Evaluating Cross-lingual Sentence Representations. arXiv preprint [arXiv:1809.05053](https://arxiv.org/abs/1809.05053) 14 | 15 | \[3\]: N. Reimers, I. Gurevych \(2019\) Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. arXiv preprint [arXiv:1908.10084](https://arxiv.org/abs/1908.10084) 16 | -------------------------------------------------------------------------------- /model_cards/DeepPavlov/rubert-base-cased-conversational/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - russian 4 | --- 5 | 6 | # rubert-base-cased-conversational 7 | 8 | Conversational RuBERT \(Russian, cased, 12‑layer, 768‑hidden, 12‑heads, 180M parameters\) was trained on OpenSubtitles\[1\], [Dirty](https://d3.ru/), [Pikabu](https://pikabu.ru/), and a Social Media segment of Taiga corpus\[2\]. We assembled a new vocabulary for Conversational RuBERT model on this data and initialized the model with [RuBERT](../rubert-base-cased). 9 | 10 | 11 | \[1\]: P. Lison and J. 
Tiedemann, 2016, OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles. In Proceedings of the 10th International Conference on Language Resources and Evaluation \(LREC 2016\) 12 | 13 | \[2\]: Shavrina T., Shapovalova O. \(2017\) TO THE METHODOLOGY OF CORPUS CONSTRUCTION FOR MACHINE LEARNING: «TAIGA» SYNTAX TREE CORPUS AND PARSER. in proc. of “CORPORA2017”, international conference , Saint-Petersbourg, 2017. 14 | -------------------------------------------------------------------------------- /model_cards/DeepPavlov/rubert-base-cased-sentence/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - russian 4 | --- 5 | 6 | # rubert-base-cased-sentence 7 | 8 | Sentence RuBERT \(Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters\) is a representation‑based sentence encoder for Russian. It is initialized with RuBERT and fine‑tuned on SNLI\[1\] google-translated to russian and on russian part of XNLI dev set\[2\]. Sentence representations are mean pooled token embeddings in the same manner as in Sentence‑BERT\[3\]. 9 | 10 | 11 | \[1\]: S. R. Bowman, G. Angeli, C. Potts, and C. D. Manning. \(2015\) A large annotated corpus for learning natural language inference. arXiv preprint [arXiv:1508.05326](https://arxiv.org/abs/1508.05326) 12 | 13 | \[2\]: Williams A., Bowman S. \(2018\) XNLI: Evaluating Cross-lingual Sentence Representations. arXiv preprint [arXiv:1809.05053](https://arxiv.org/abs/1809.05053) 14 | 15 | \[3\]: N. Reimers, I. Gurevych \(2019\) Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. arXiv preprint [arXiv:1908.10084](https://arxiv.org/abs/1908.10084) 16 | -------------------------------------------------------------------------------- /model_cards/DeepPavlov/rubert-base-cased/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - russian 4 | --- 5 | 6 | # rubert-base-cased 7 | 8 | RuBERT \(Russian, cased, 12‑layer, 768‑hidden, 12‑heads, 180M parameters\) was trained on the Russian part of Wikipedia and news data. We used this training data to build a vocabulary of Russian subtokens and took a multilingual version of BERT‑base as an initialization for RuBERT\[1\]. 9 | 10 | 11 | \[1\]: Kuratov, Y., Arkhipov, M. \(2019\). Adaptation of Deep Bidirectional Multilingual Transformers for Russian Language. arXiv preprint [arXiv:1905.07213](https://arxiv.org/abs/1905.07213). 
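No usage snippet is included in the card above. As a minimal sketch (assuming the checkpoint is hosted under `DeepPavlov/rubert-base-cased`, the identifier implied by this card's path), the model can be loaded and queried for hidden states like this:

```python
import torch
from transformers import AutoTokenizer, AutoModel

# Hypothetical hub identifier, inferred from the model card path
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased")

input_ids = tokenizer.encode("Привет, мир!", return_tensors="pt")
with torch.no_grad():
    # BertModel returns (sequence_output, pooled_output) in Transformers 2.x
    sequence_output, pooled_output = model(input_ids)
```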
12 | -------------------------------------------------------------------------------- /model_cards/ahotrod/albert_xxlargev1_squad2_512/README.md: -------------------------------------------------------------------------------- 1 | ## Albert xxlarge version 1 language model fine-tuned on SQuAD2.0 2 | 3 | ### with the following results: 4 | 5 | ``` 6 | exact: 85.65653162637918 7 | f1: 89.260458954177 8 | total': 11873 9 | HasAns_exact': 82.6417004048583 10 | HasAns_f1': 89.8598902096736 11 | HasAns_total': 5928 12 | NoAns_exact': 88.66274179983179 13 | NoAns_f1': 88.66274179983179 14 | NoAns_total': 5945 15 | best_exact': 85.65653162637918 16 | best_exact_thresh': 0.0 17 | best_f1': 89.2604589541768 18 | best_f1_thresh': 0.0 19 | ``` 20 | 21 | ### from script: 22 | 23 | ``` 24 | python -m torch.distributed.launch --nproc_per_node=2 ${RUN_SQUAD_DIR}/run_squad.py \ 25 | --model_type albert \ 26 | --model_name_or_path albert-xxlarge-v1 \ 27 | --do_train \ 28 | --train_file ${SQUAD_DIR}/train-v2.0.json \ 29 | --predict_file ${SQUAD_DIR}/dev-v2.0.json \ 30 | --version_2_with_negative \ 31 | --num_train_epochs 3 \ 32 | --max_steps 8144 \ 33 | --warmup_steps 814 \ 34 | --do_lower_case \ 35 | --learning_rate 3e-5 \ 36 | --max_seq_length 512 \ 37 | --doc_stride 128 \ 38 | --save_steps 2000 \ 39 | --per_gpu_train_batch_size 1 \ 40 | --gradient_accumulation_steps 24 \ 41 | --output_dir ${MODEL_PATH} 42 | 43 | CUDA_VISIBLE_DEVICES=0 python ${RUN_SQUAD_DIR}/run_squad.py \ 44 | --model_type albert \ 45 | --model_name_or_path ${MODEL_PATH} \ 46 | --do_eval \ 47 | --train_file ${SQUAD_DIR}/train-v2.0.json \ 48 | --predict_file ${SQUAD_DIR}/dev-v2.0.json \ 49 | --version_2_with_negative \ 50 | --do_lower_case \ 51 | --max_seq_length 512 \ 52 | --per_gpu_eval_batch_size 48 \ 53 | --output_dir ${MODEL_PATH} 54 | ``` 55 | 56 | ### using the following system & software: 57 | 58 | ``` 59 | OS/Platform: Linux-4.15.0-76-generic-x86_64-with-debian-buster-sid 60 | GPU/CPU: 2 x NVIDIA 1080Ti / Intel i7-8700 61 | Transformers: 2.3.0 62 | PyTorch: 1.4.0 63 | TensorFlow: 2.1.0 64 | Python: 3.7.6 65 | ``` 66 | 67 | ### Inferencing / prediction works with the current Transformers v2.4.1 68 | 69 | ### Access this albert_xxlargev1_sqd2_512 fine-tuned model with "tried & true" code: 70 | 71 | ```python 72 | config_class, model_class, tokenizer_class = \ 73 | AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer 74 | 75 | model_name_or_path = "ahotrod/albert_xxlargev1_squad2_512" 76 | config = config_class.from_pretrained(model_name_or_path) 77 | tokenizer = tokenizer_class.from_pretrained(model_name_or_path, do_lower_case=True) 78 | model = model_class.from_pretrained(model_name_or_path, config=config) 79 | ``` 80 | 81 | ### or the AutoModels (AutoConfig, AutoTokenizer & AutoModel) should also work, however I have yet to use them in my app & confirm: 82 | 83 | ```python 84 | from transformers import AutoConfig, AutoTokenizer, AutoModel 85 | 86 | model_name_or_path = "ahotrod/albert_xxlargev1_squad2_512" 87 | config = AutoConfig.from_pretrained(model_name_or_path) 88 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, do_lower_case=True) 89 | model = AutoModel.from_pretrained(model_name_or_path, config=config) 90 | ``` -------------------------------------------------------------------------------- /model_cards/ahotrod/xlnet_large_squad2_512/README.md: -------------------------------------------------------------------------------- 1 | ## XLNet large language model fine-tuned on SQuAD2.0 2 | 3 | ### with the 
following results: 4 | 5 | ``` 6 | "exact": 82.07698138633876, 7 | "f1": 85.898874470488, 8 | "total": 11873, 9 | "HasAns_exact": 79.60526315789474, 10 | "HasAns_f1": 87.26000954590184, 11 | "HasAns_total": 5928, 12 | "NoAns_exact": 84.54163162321278, 13 | "NoAns_f1": 84.54163162321278, 14 | "NoAns_total": 5945, 15 | "best_exact": 83.22243746315169, 16 | "best_exact_thresh": -11.112004280090332, 17 | "best_f1": 86.88541353813282, 18 | "best_f1_thresh": -11.112004280090332 19 | ``` 20 | ### from script: 21 | ``` 22 | python -m torch.distributed.launch --nproc_per_node=2 ${RUN_SQUAD_DIR}/run_squad.py \ 23 | --model_type xlnet \ 24 | --model_name_or_path xlnet-large-cased \ 25 | --do_train \ 26 | --train_file ${SQUAD_DIR}/train-v2.0.json \ 27 | --predict_file ${SQUAD_DIR}/dev-v2.0.json \ 28 | --version_2_with_negative \ 29 | --num_train_epochs 3 \ 30 | --learning_rate 3e-5 \ 31 | --adam_epsilon 1e-6 \ 32 | --max_seq_length 512 \ 33 | --doc_stride 128 \ 34 | --save_steps 2000 \ 35 | --per_gpu_train_batch_size 1 \ 36 | --gradient_accumulation_steps 24 \ 37 | --output_dir ${MODEL_PATH} 38 | 39 | CUDA_VISIBLE_DEVICES=0 python ${RUN_SQUAD_DIR}/run_squad_II.py \ 40 | --model_type xlnet \ 41 | --model_name_or_path ${MODEL_PATH} \ 42 | --do_eval \ 43 | --train_file ${SQUAD_DIR}/train-v2.0.json \ 44 | --predict_file ${SQUAD_DIR}/dev-v2.0.json \ 45 | --version_2_with_negative \ 46 | --max_seq_length 512 \ 47 | --per_gpu_eval_batch_size 48 \ 48 | --output_dir ${MODEL_PATH} 49 | ``` 50 | ### using the following system & software: 51 | ``` 52 | OS/Platform: Linux-4.15.0-76-generic-x86_64-with-debian-buster-sid 53 | GPU/CPU: 2 x NVIDIA 1080Ti / Intel i7-8700 54 | Transformers: 2.1.1 55 | PyTorch: 1.4.0 56 | TensorFlow: 2.1.0 57 | Python: 3.7.6 58 | ``` 59 | ### Inferencing / prediction works with Transformers v2.4.1, the latest version tested 60 | 61 | ### Utilize this xlnet_large_squad2_512 fine-tuned model with: 62 | ```python 63 | config_class, model_class, tokenizer_class = \ 64 | XLNetConfig, XLNetforQuestionAnswering, XLNetTokenizer 65 | model_name_or_path = "ahotrod/xlnet_large_squad2_512" 66 | config = config_class.from_pretrained(model_name_or_path) 67 | tokenizer = tokenizer_class.from_pretrained(model_name_or_path, do_lower_case=True) 68 | model = model_class.from_pretrained(model_name_or_path, config=config) 69 | ``` 70 | ### or the AutoModels (AutoConfig, AutoTokenizer & AutoModel) should also work, however I have yet to use them in my apps & confirm: 71 | ```python 72 | from transformers import AutoConfig, AutoTokenizer, AutoModel 73 | model_name_or_path = "ahotrod/xlnet_large_squad2_512" 74 | config = AutoConfig.from_pretrained(model_name_or_path) 75 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, do_lower_case=True) 76 | model = AutoModel.from_pretrained(model_name_or_path, config=config) 77 | ``` 78 | -------------------------------------------------------------------------------- /model_cards/asafaya/bert-base-arabic/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: arabic 3 | --- 4 | 5 | # Arabic BERT Model 6 | 7 | Pretrained BERT base language model for Arabic 8 | 9 | ## Pretraining Corpus 10 | 11 | `arabic-bert-base` model was pretrained on ~8.2 Billion words: 12 | 13 | - Arabic version of [OSCAR](https://traces1.inria.fr/oscar/) - filtered from [Common Crawl](http://commoncrawl.org/) 14 | - Recent dump of Arabic [Wikipedia](https://dumps.wikimedia.org/backup-index.html) 15 | 16 | and other Arabic resources 
which sum up to ~95GB of text. 17 | 18 | __Notes on training data:__ 19 | 20 | - Our final version of corpus contains some non-Arabic words inlines, which we did not remove from sentences since that would affect some tasks like NER. 21 | - Although non-Arabic characters were lowered as a preprocessing step, since Arabic characters does not have upper or lower case, there is no cased and uncased version of the model. 22 | - The corpus and vocabulary set are not restricted to Modern Standard Arabic, they contain some dialectical Arabic too. 23 | 24 | ## Pretraining details 25 | 26 | - This model was trained using Google BERT's github [repository](https://github.com/google-research/bert) on a single TPU v3-8 provided for free from [TFRC](https://www.tensorflow.org/tfrc). 27 | - Our pretraining procedure follows training settings of bert with some changes: trained for 3M training steps with batchsize of 128, instead of 1M with batchsize of 256. 28 | 29 | ## Load Pretrained Model 30 | 31 | You can use this model by installing `torch` or `tensorflow` and Huggingface library `transformers`. And you can use it directly by initializing it like this: 32 | 33 | ```python 34 | from transformers import AutoTokenizer, AutoModel 35 | 36 | tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic") 37 | model = AutoModel.from_pretrained("asafaya/bert-base-arabic") 38 | ``` 39 | 40 | ## Results 41 | 42 | For further details on the models performance or any other queries, please refer to [Arabic-BERT](https://github.com/alisafaya/Arabic-BERT) 43 | 44 | ## Acknowledgement 45 | 46 | Thanks to Google for providing free TPU for the training process and for Huggingface for hosting this model on their servers 😊 47 | 48 | 49 | -------------------------------------------------------------------------------- /model_cards/binwang/xlnet-base-cased/README.md: -------------------------------------------------------------------------------- 1 | This model is pre-trained **XLNET** with 12 layers. 2 | 3 | It comes with paper: SBERT-WK: A Sentence Embedding Method By Dissecting BERT-based Word Models 4 | 5 | Project Page: [SBERT-WK](https://github.com/BinWang28/SBERT-WK-Sentence-Embedding) 6 | -------------------------------------------------------------------------------- /model_cards/camembert-base-README.md: -------------------------------------------------------------------------------- 1 | # CamemBERT 2 | 3 | CamemBERT is a state-of-the-art language model for French based on the RoBERTa architecture pretrained on the French subcorpus of the newly available multilingual corpus OSCAR. 4 | 5 | CamemBERT was originally evaluated on four different downstream tasks for French: part-of-speech (POS) tagging, dependency parsing, named entity recognition (NER) and natural language inference (NLI); improving the state of the art for most tasks over previous monolingual and multilingual approaches, which confirms the effectiveness of large pretrained language models for French. 6 | 7 | CamemBERT was trained and evaluated by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suárez, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 
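A loading example is not part of the original card; the sketch below assumes the canonical `camembert-base` checkpoint name and the dedicated CamemBERT classes in `transformers`:

```python
from transformers import CamembertModel, CamembertTokenizer

# "camembert-base" is assumed to be the published checkpoint described by this card
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertModel.from_pretrained("camembert-base")
```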
8 | 9 | Preprint can be found [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 10 | -------------------------------------------------------------------------------- /model_cards/canwenxu/BERT-of-Theseus-MNLI/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | thumbnail: https://raw.githubusercontent.com/JetRunner/BERT-of-Theseus/master/bert-of-theseus.png 3 | --- 4 | 5 | # BERT-of-Theseus 6 | See our paper ["BERT-of-Theseus: Compressing BERT by Progressive Module Replacing"](http://arxiv.org/abs/2002.02925). 7 | 8 | BERT-of-Theseus is a new compressed BERT by progressively replacing the components of the original BERT. 9 | 10 | ![BERT of Theseus](https://github.com/JetRunner/BERT-of-Theseus/blob/master/bert-of-theseus.png?raw=true) 11 | 12 | ## Load Pretrained Model on MNLI 13 | 14 | We provide a 6-layer pretrained model on MNLI as a general-purpose model, which can transfer to other sentence classification tasks, outperforming DistillBERT (with the same 6-layer structure) on six tasks of GLUE (dev set). 15 | 16 | | Method | MNLI | MRPC | QNLI | QQP | RTE | SST-2 | STS-B | 17 | |-----------------|------|------|------|------|------|-------|-------| 18 | | BERT-base | 83.5 | 89.5 | 91.2 | 89.8 | 71.1 | 91.5 | 88.9 | 19 | | DistillBERT | 79.0 | 87.5 | 85.3 | 84.9 | 59.9 | 90.7 | 81.2 | 20 | | BERT-of-Theseus | 82.1 | 87.5 | 88.8 | 88.8 | 70.1 | 91.8 | 87.8 | 21 | -------------------------------------------------------------------------------- /model_cards/dbmdz/bert-base-german-europeana-cased/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: german 3 | tags: 4 | - "historic german" 5 | --- 6 | 7 | # 🤗 + 📚 dbmdz BERT models 8 | 9 | In this repository the MDZ Digital Library team (dbmdz) at the Bavarian State 10 | Library open sources German Europeana BERT models 🎉 11 | 12 | # German Europeana BERT 13 | 14 | We use the open source [Europeana newspapers](http://www.europeana-newspapers.eu/) 15 | that were provided by *The European Library*. The final 16 | training corpus has a size of 51GB and consists of 8,035,986,369 tokens. 17 | 18 | Detailed information about the data and pretraining steps can be found in 19 | [this repository](https://github.com/stefan-it/europeana-bert). 20 | 21 | ## Model weights 22 | 23 | Currently only PyTorch-[Transformers](https://github.com/huggingface/transformers) 24 | compatible weights are available. If you need access to TensorFlow checkpoints, 25 | please raise an issue! 26 | 27 | | Model | Downloads 28 | | ------------------------------------------ | --------------------------------------------------------------------------------------------------------------- 29 | | `dbmdz/bert-base-german-europeana-cased` | [`config.json`](https://cdn.huggingface.co/dbmdz/bert-base-german-europeana-cased/config.json) • [`pytorch_model.bin`](https://cdn.huggingface.co/dbmdz/bert-base-german-europeana-cased/pytorch_model.bin) • [`vocab.txt`](https://cdn.huggingface.co/dbmdz/bert-base-german-europeana-cased/vocab.txt) 30 | 31 | ## Results 32 | 33 | For results on Historic NER, please refer to [this repository](https://github.com/stefan-it/europeana-bert). 
34 | 35 | ## Usage 36 | 37 | With Transformers >= 2.3 our German Europeana BERT models can be loaded like: 38 | 39 | ```python 40 | from transformers import AutoModel, AutoTokenizer 41 | 42 | tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-europeana-cased") 43 | model = AutoModel.from_pretrained("dbmdz/bert-base-german-europeana-cased") 44 | ``` 45 | 46 | # Huggingface model hub 47 | 48 | All models are available on the [Huggingface model hub](https://huggingface.co/dbmdz). 49 | 50 | # Contact (Bugs, Feedback, Contribution and more) 51 | 52 | For questions about our BERT models just open an issue 53 | [here](https://github.com/dbmdz/berts/issues/new) 🤗 54 | 55 | # Acknowledgments 56 | 57 | Research supported with Cloud TPUs from Google's TensorFlow Research Cloud (TFRC). 58 | Thanks for providing access to the TFRC ❤️ 59 | 60 | Thanks to the generous support from the [Hugging Face](https://huggingface.co/) team, 61 | it is possible to download both cased and uncased models from their S3 storage 🤗 62 | -------------------------------------------------------------------------------- /model_cards/dbmdz/bert-base-german-europeana-uncased/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: german 3 | tags: 4 | - "historic german" 5 | --- 6 | 7 | # 🤗 + 📚 dbmdz BERT models 8 | 9 | In this repository the MDZ Digital Library team (dbmdz) at the Bavarian State 10 | Library open sources German Europeana BERT models 🎉 11 | 12 | # German Europeana BERT 13 | 14 | We use the open source [Europeana newspapers](http://www.europeana-newspapers.eu/) 15 | that were provided by *The European Library*. The final 16 | training corpus has a size of 51GB and consists of 8,035,986,369 tokens. 17 | 18 | Detailed information about the data and pretraining steps can be found in 19 | [this repository](https://github.com/stefan-it/europeana-bert). 20 | 21 | ## Model weights 22 | 23 | Currently only PyTorch-[Transformers](https://github.com/huggingface/transformers) 24 | compatible weights are available. If you need access to TensorFlow checkpoints, 25 | please raise an issue! 26 | 27 | | Model | Downloads 28 | | ------------------------------------------ | --------------------------------------------------------------------------------------------------------------- 29 | | `dbmdz/bert-base-german-europeana-uncased` | [`config.json`](https://cdn.huggingface.co/dbmdz/bert-base-german-europeana-uncased/config.json) • [`pytorch_model.bin`](https://cdn.huggingface.co/dbmdz/bert-base-german-europeana-uncased/pytorch_model.bin) • [`vocab.txt`](https://cdn.huggingface.co/dbmdz/bert-base-german-europeana-uncased/vocab.txt) 30 | 31 | ## Results 32 | 33 | For results on Historic NER, please refer to [this repository](https://github.com/stefan-it/europeana-bert). 34 | 35 | ## Usage 36 | 37 | With Transformers >= 2.3 our German Europeana BERT models can be loaded like: 38 | 39 | ```python 40 | from transformers import AutoModel, AutoTokenizer 41 | 42 | tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-europeana-uncased") 43 | model = AutoModel.from_pretrained("dbmdz/bert-base-german-europeana-uncased") 44 | ``` 45 | 46 | # Huggingface model hub 47 | 48 | All models are available on the [Huggingface model hub](https://huggingface.co/dbmdz). 
49 | 50 | # Contact (Bugs, Feedback, Contribution and more) 51 | 52 | For questions about our BERT models just open an issue 53 | [here](https://github.com/dbmdz/berts/issues/new) 🤗 54 | 55 | # Acknowledgments 56 | 57 | Research supported with Cloud TPUs from Google's TensorFlow Research Cloud (TFRC). 58 | Thanks for providing access to the TFRC ❤️ 59 | 60 | Thanks to the generous support from the [Hugging Face](https://huggingface.co/) team, 61 | it is possible to download both cased and uncased models from their S3 storage 🤗 62 | -------------------------------------------------------------------------------- /model_cards/djstrong/bg_cs_pl_ru_cased_L-12_H-768_A-12/README.md: -------------------------------------------------------------------------------- 1 | Slavic BERT from https://github.com/deepmipt/Slavic-BERT-NER http://files.deeppavlov.ai/deeppavlov_data/bg_cs_pl_ru_cased_L-12_H-768_A-12.tar.gz 2 | -------------------------------------------------------------------------------- /model_cards/emilyalsentzer/Bio_ClinicalBERT/README.md: -------------------------------------------------------------------------------- 1 | 2 | # ClinicalBERT - Bio + Clinical BERT Model 3 | 4 | The [Publicly Available Clinical BERT Embeddings](https://arxiv.org/abs/1904.03323) paper contains four unique clinicalBERT models: initialized with BERT-Base (`cased_L-12_H-768_A-12`) or BioBERT (`BioBERT-Base v1.0 + PubMed 200K + PMC 270K`) & trained on either all MIMIC notes or only discharge summaries. 5 | 6 | This model card describes the Bio+Clinical BERT model, which was initialized from [BioBERT](https://arxiv.org/abs/1901.08746) & trained on all MIMIC notes. 7 | 8 | ## Pretraining Data 9 | The `Bio_ClinicalBERT` model was trained on all notes from [MIMIC III](https://www.nature.com/articles/sdata201635), a database containing electronic health records from ICU patients at the Beth Israel Hospital in Boston, MA. For more details on MIMIC, see [here](https://mimic.physionet.org/). All notes from the `NOTEEVENTS` table were included (~880M words). 10 | 11 | ## Model Pretraining 12 | 13 | ### Note Preprocessing 14 | Each note in MIMIC was first split into sections using a rules-based section splitter (e.g. discharge summary notes were split into "History of Present Illness", "Family History", "Brief Hospital Course", etc. sections). Then each section was split into sentences using SciSpacy (`en core sci md` tokenizer). 15 | 16 | ### Pretraining Procedures 17 | The model was trained using code from [Google's BERT repository](https://github.com/google-research/bert) on a GeForce GTX TITAN X 12 GB GPU. Model parameters were initialized with BioBERT (`BioBERT-Base v1.0 + PubMed 200K + PMC 270K`). 18 | 19 | ### Pretraining Hyperparameters 20 | We used a batch size of 32, a maximum sequence length of 128, and a learning rate of 5 · 10−5 for pre-training our models. The models trained on all MIMIC notes were trained for 150,000 steps. The dup factor for duplicating input data with different masks was set to 5. All other default parameters were used (specifically, masked language model probability = 0.15 21 | and max predictions per sequence = 20). 
22 | 23 | ## How to use the model 24 | 25 | Load the model via the transformers library: 26 | ``` 27 | from transformers import AutoTokenizer, AutoModel 28 | tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT") 29 | model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT") 30 | ``` 31 | 32 | ## More Information 33 | 34 | Refer to the original paper, [Publicly Available Clinical BERT Embeddings](https://arxiv.org/abs/1904.03323) (NAACL Clinical NLP Workshop 2019) for additional details and performance on NLI and NER tasks. 35 | 36 | ## Questions? 37 | 38 | Post a Github issue on the [clinicalBERT repo](https://github.com/EmilyAlsentzer/clinicalBERT) or email emilya@mit.edu with any questions. 39 | 40 | -------------------------------------------------------------------------------- /model_cards/emilyalsentzer/Bio_Discharge_Summary_BERT/README.md: -------------------------------------------------------------------------------- 1 | 2 | # ClinicalBERT - Bio + Discharge Summary BERT Model 3 | 4 | The [Publicly Available Clinical BERT Embeddings](https://arxiv.org/abs/1904.03323) paper contains four unique clinicalBERT models: initialized with BERT-Base (`cased_L-12_H-768_A-12`) or BioBERT (`BioBERT-Base v1.0 + PubMed 200K + PMC 270K`) & trained on either all MIMIC notes or only discharge summaries. 5 | 6 | This model card describes the Bio+Discharge Summary BERT model, which was initialized from [BioBERT](https://arxiv.org/abs/1901.08746) & trained on only discharge summaries from MIMIC. 7 | 8 | ## Pretraining Data 9 | The `Bio_Discharge_Summary_BERT` model was trained on all discharge summaries from [MIMIC III](https://www.nature.com/articles/sdata201635), a database containing electronic health records from ICU patients at the Beth Israel Hospital in Boston, MA. For more details on MIMIC, see [here](https://mimic.physionet.org/). All notes from the `NOTEEVENTS` table were included (~880M words). 10 | 11 | ## Model Pretraining 12 | 13 | ### Note Preprocessing 14 | Each note in MIMIC was first split into sections using a rules-based section splitter (e.g. discharge summary notes were split into "History of Present Illness", "Family History", "Brief Hospital Course", etc. sections). Then each section was split into sentences using SciSpacy (`en core sci md` tokenizer). 15 | 16 | ### Pretraining Procedures 17 | The model was trained using code from [Google's BERT repository](https://github.com/google-research/bert) on a GeForce GTX TITAN X 12 GB GPU. Model parameters were initialized with BioBERT (`BioBERT-Base v1.0 + PubMed 200K + PMC 270K`). 18 | 19 | ### Pretraining Hyperparameters 20 | We used a batch size of 32, a maximum sequence length of 128, and a learning rate of 5 · 10−5 for pre-training our models. The models trained on all MIMIC notes were trained for 150,000 steps. The dup factor for duplicating input data with different masks was set to 5. All other default parameters were used (specifically, masked language model probability = 0.15 21 | and max predictions per sequence = 20). 
22 | 23 | ## How to use the model 24 | 25 | Load the model via the transformers library: 26 | ``` 27 | from transformers import AutoTokenizer, AutoModel 28 | tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT") 29 | model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT") 30 | ``` 31 | 32 | ## More Information 33 | 34 | Refer to the original paper, [Publicly Available Clinical BERT Embeddings](https://arxiv.org/abs/1904.03323) (NAACL Clinical NLP Workshop 2019) for additional details and performance on NLI and NER tasks. 35 | 36 | ## Questions? 37 | 38 | Post a Github issue on the [clinicalBERT repo](https://github.com/EmilyAlsentzer/clinicalBERT) or email emilya@mit.edu with any questions. 39 | 40 | -------------------------------------------------------------------------------- /model_cards/fmikaelian/camembert-base-fquad/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: french 3 | --- 4 | 5 | # camembert-base-fquad 6 | 7 | ## Description 8 | 9 | A baseline model for question-answering in french ([CamemBERT](https://camembert-model.fr/) model fine-tuned on [FQuAD](https://fquad.illuin.tech/)) 10 | 11 | ## Training hyperparameters 12 | 13 | ```shell 14 | python3 ./examples/run_squad.py \ 15 | --model_type camembert \ 16 | --model_name_or_path camembert-base \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_lower_case \ 20 | --train_file train.json \ 21 | --predict_file valid.json \ 22 | --learning_rate 3e-5 \ 23 | --num_train_epochs 2 \ 24 | --max_seq_length 384 \ 25 | --doc_stride 128 \ 26 | --output_dir output \ 27 | --per_gpu_eval_batch_size=3 \ 28 | --per_gpu_train_batch_size=3 \ 29 | --save_steps 10000 30 | ``` 31 | 32 | ## Evaluation results 33 | 34 | ```shell 35 | {"f1": 77.24515316052342, "exact_match": 52.82308657465496} 36 | ``` 37 | 38 | ## Usage 39 | 40 | ```python 41 | from transformers import pipeline 42 | 43 | nlp = pipeline('question-answering', model='fmikaelian/camembert-base-fquad', tokenizer='fmikaelian/camembert-base-fquad') 44 | 45 | nlp({ 46 | 'question': "Qui est Claude Monet?", 47 | 'context': "Claude Monet, né le 14 novembre 1840 à Paris et mort le 5 décembre 1926 à Giverny, est un peintre français et l’un des fondateurs de l'impressionnisme." 
48 | }) 49 | ``` -------------------------------------------------------------------------------- /model_cards/fmikaelian/camembert-base-squad/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: french 3 | --- 4 | 5 | # camembert-base-squad 6 | 7 | ## Description 8 | 9 | A baseline model for question-answering in french ([CamemBERT](https://camembert-model.fr/) model fine-tuned on [french-translated SQuAD 1.1 dataset](https://github.com/Alikabbadj/French-SQuAD)) 10 | 11 | ## Training hyperparameters 12 | 13 | ```shell 14 | python3 ./examples/run_squad.py \ 15 | --model_type camembert \ 16 | --model_name_or_path camembert-base \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_lower_case \ 20 | --train_file SQuAD-v1.1-train_fr_ss999_awstart2_net.json \ 21 | --predict_file SQuAD-v1.1-dev_fr_ss999_awstart2_net.json \ 22 | --learning_rate 3e-5 \ 23 | --num_train_epochs 2 \ 24 | --max_seq_length 384 \ 25 | --doc_stride 128 \ 26 | --output_dir output3 \ 27 | --per_gpu_eval_batch_size=3 \ 28 | --per_gpu_train_batch_size=3 \ 29 | --save_steps 10000 30 | ``` 31 | 32 | ## Evaluation results 33 | 34 | ```shell 35 | {"f1": 79.8570684959745, "exact_match": 59.21327108373895} 36 | ``` 37 | 38 | ## Usage 39 | 40 | ```python 41 | from transformers import pipeline 42 | 43 | nlp = pipeline('question-answering', model='fmikaelian/camembert-base-squad', tokenizer='fmikaelian/camembert-base-squad') 44 | 45 | nlp({ 46 | 'question': "Qui est Claude Monet?", 47 | 'context': "Claude Monet, né le 14 novembre 1840 à Paris et mort le 5 décembre 1926 à Giverny, est un peintre français et l’un des fondateurs de l'impressionnisme." 48 | }) 49 | ``` -------------------------------------------------------------------------------- /model_cards/fmikaelian/flaubert-base-uncased-squad/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: french 3 | --- 4 | 5 | # flaubert-base-uncased-squad 6 | 7 | ## Description 8 | 9 | A baseline model for question-answering in french ([flaubert](https://github.com/getalp/Flaubert) model fine-tuned on [french-translated SQuAD 1.1 dataset](https://github.com/Alikabbadj/French-SQuAD)) 10 | 11 | ## Training hyperparameters 12 | 13 | ```shell 14 | python3 ./examples/run_squad.py \ 15 | --model_type flaubert \ 16 | --model_name_or_path flaubert-base-uncased \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_lower_case \ 20 | --train_file SQuAD-v1.1-train_fr_ss999_awstart2_net.json \ 21 | --predict_file SQuAD-v1.1-dev_fr_ss999_awstart2_net.json \ 22 | --learning_rate 3e-5 \ 23 | --num_train_epochs 2 \ 24 | --max_seq_length 384 \ 25 | --doc_stride 128 \ 26 | --output_dir output \ 27 | --per_gpu_eval_batch_size=3 \ 28 | --per_gpu_train_batch_size=3 29 | ``` 30 | 31 | ## Evaluation results 32 | 33 | ```shell 34 | {"f1": 68.66174806561969, "exact_match": 49.299692063176714} 35 | ``` 36 | 37 | ## Usage 38 | 39 | ```python 40 | from transformers import pipeline 41 | 42 | nlp = pipeline('question-answering', model='fmikaelian/flaubert-base-uncased-squad', tokenizer='fmikaelian/flaubert-base-uncased-squad') 43 | 44 | nlp({ 45 | 'question': "Qui est Claude Monet?", 46 | 'context': "Claude Monet, né le 14 novembre 1840 à Paris et mort le 5 décembre 1926 à Giverny, est un peintre français et l’un des fondateurs de l'impressionnisme." 
47 | }) 48 | ``` -------------------------------------------------------------------------------- /model_cards/henryk/bert-base-multilingual-cased-finetuned-dutch-squad2/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: dutch 3 | --- 4 | 5 | # Multilingual + Dutch SQuAD2.0 6 | 7 | This model is the multilingual model provided by the Google research team with a fine-tuned dutch Q&A downstream task. 8 | 9 | ## Details of the language model(bert-base-multilingual-cased) 10 | 11 | Language model ([**bert-base-multilingual-cased**](https://github.com/google-research/bert/blob/master/multilingual.md)): 12 | 12-layer, 768-hidden, 12-heads, 110M parameters. 13 | Trained on cased text in the top 104 languages with the largest Wikipedias. 14 | 15 | ## Details of the downstream task - Dataset 16 | Using the `mtranslate` Python module, [**SQuAD2.0**](https://rajpurkar.github.io/SQuAD-explorer/) was machine-translated. In order to find the start tokens the direct translations of the answers were searched in the corresponding paragraphs. Since the answer could not always be found in the text, due to the different translations depending on the context (missing context in the pure answer), a loss of question-answer examples occurred. This is a potential problem where errors can occur in the data set (but in the end it was a quick and dirty solution that worked well enough for my task). 17 | 18 | | Dataset | # Q&A | 19 | | ---------------------- | ----- | 20 | | SQuAD2.0 Train | 130 K | 21 | | Dutch SQuAD2.0 Train | 99 K | 22 | | SQuAD2.0 Dev | 12 K | 23 | | Dutch SQuAD2.0 Dev | 10 K | 24 | 25 | ## Model training 26 | 27 | The model was trained on a Tesla V100 GPU with the following command: 28 | 29 | ```python 30 | export SQUAD_DIR=path/to/nl_squad 31 | 32 | python run_squad.py \ 33 | --model_type bert \ 34 | --model_name_or_path bert-base-multilingual-cased \ 35 | --version_2_with_negative \ 36 | --do_train \ 37 | --do_eval \ 38 | --train_file $SQUAD_DIR/train_nl-v2.0.json \ 39 | --predict_file $SQUAD_DIR/dev_nl-v2.0.json \ 40 | --per_gpu_train_batch_size 12 \ 41 | --learning_rate 3e-5 \ 42 | --num_train_epochs 2.0 \ 43 | --max_seq_length 384 \ 44 | --doc_stride 128 \ 45 | --output_dir /tmp/output_dir/ 46 | ``` 47 | 48 | **Results**: 49 | 50 | {'exact': **67.38**, 'f1': **71.36**} -------------------------------------------------------------------------------- /model_cards/jplu/tf-camembert-base/README.md: -------------------------------------------------------------------------------- 1 | # Tensorflow CamemBERT 2 | 3 | In this repository you will find different versions of the CamemBERT model for Tensorflow. 4 | 5 | ## CamemBERT 6 | 7 | [CamemBERT](https://camembert-model.fr/) is a state-of-the-art language model for French based on the RoBERTa architecture pretrained on the French subcorpus of the newly available multilingual corpus OSCAR. 
8 | 9 | ## Model Weights 10 | 11 | | Model | Downloads 12 | | -------------------------------- | --------------------------------------------------------------------------------------------------------------- 13 | | `jplu/tf-camembert-base` | [`config.json`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-camembert-base/config.json) • [`tf_model.h5`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-camembert-base/tf_model.h5) 14 | 15 | ## Usage 16 | 17 | With Transformers >= 2.4 the Tensorflow models of CamemBERT can be loaded like: 18 | 19 | ```python 20 | from transformers import TFCamembertModel 21 | 22 | model = TFCamembertModel.from_pretrained("jplu/tf-camembert-base") 23 | ``` 24 | 25 | ## Huggingface model hub 26 | 27 | All models are available on the [Huggingface model hub](https://huggingface.co/jplu). 28 | 29 | ## Acknowledgments 30 | 31 | Thanks to all the Huggingface team for the support and their amazing library! 32 | -------------------------------------------------------------------------------- /model_cards/jplu/tf-xlm-roberta-base/README.md: -------------------------------------------------------------------------------- 1 | # Tensorflow XLM-RoBERTa 2 | 3 | In this repository you will find different versions of the XLM-RoBERTa model for Tensorflow. 4 | 5 | ## XLM-RoBERTa 6 | 7 | [XLM-RoBERTa](https://ai.facebook.com/blog/-xlm-r-state-of-the-art-cross-lingual-understanding-through-self-supervision/) is a scaled cross lingual sentence encoder. It is trained on 2.5T of data across 100 languages data filtered from Common Crawl. XLM-R achieves state-of-the-arts results on multiple cross lingual benchmarks. 8 | 9 | ## Model Weights 10 | 11 | | Model | Downloads 12 | | -------------------------------- | --------------------------------------------------------------------------------------------------------------- 13 | | `jplu/tf-xlm-roberta-base` | [`config.json`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-base/config.json) • [`tf_model.h5`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-base/tf_model.h5) 14 | | `jplu/tf-xlm-roberta-large` | [`config.json`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-large/config.json) • [`tf_model.h5`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-large/tf_model.h5) 15 | 16 | ## Usage 17 | 18 | With Transformers >= 2.4 the Tensorflow models of XLM-RoBERTa can be loaded like: 19 | 20 | ```python 21 | from transformers import TFXLMRobertaModel 22 | 23 | model = TFXLMRobertaModel.from_pretrained("jplu/tf-xlm-roberta-base") 24 | ``` 25 | Or 26 | ``` 27 | model = TFXLMRobertaModel.from_pretrained("jplu/tf-xlm-roberta-large") 28 | ``` 29 | 30 | ## Huggingface model hub 31 | 32 | All models are available on the [Huggingface model hub](https://huggingface.co/jplu). 33 | 34 | ## Acknowledgments 35 | 36 | Thanks to all the Huggingface team for the support and their amazing library! 37 | -------------------------------------------------------------------------------- /model_cards/jplu/tf-xlm-roberta-large/README.md: -------------------------------------------------------------------------------- 1 | # Tensorflow XLM-RoBERTa 2 | 3 | In this repository you will find different versions of the XLM-RoBERTa model for Tensorflow. 4 | 5 | ## XLM-RoBERTa 6 | 7 | [XLM-RoBERTa](https://ai.facebook.com/blog/-xlm-r-state-of-the-art-cross-lingual-understanding-through-self-supervision/) is a scaled cross lingual sentence encoder. 
It is trained on 2.5T of data across 100 languages data filtered from Common Crawl. XLM-R achieves state-of-the-arts results on multiple cross lingual benchmarks. 8 | 9 | ## Model Weights 10 | 11 | | Model | Downloads 12 | | -------------------------------- | --------------------------------------------------------------------------------------------------------------- 13 | | `jplu/tf-xlm-roberta-base` | [`config.json`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-base/config.json) • [`tf_model.h5`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-base/tf_model.h5) 14 | | `jplu/tf-xlm-roberta-large` | [`config.json`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-large/config.json) • [`tf_model.h5`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-large/tf_model.h5) 15 | 16 | ## Usage 17 | 18 | With Transformers >= 2.4 the Tensorflow models of XLM-RoBERTa can be loaded like: 19 | 20 | ```python 21 | from transformers import TFXLMRobertaModel 22 | 23 | model = TFXLMRobertaModel.from_pretrained("jplu/tf-xlm-roberta-base") 24 | ``` 25 | Or 26 | ``` 27 | model = TFXLMRobertaModel.from_pretrained("jplu/tf-xlm-roberta-large") 28 | ``` 29 | 30 | ## Huggingface model hub 31 | 32 | All models are available on the [Huggingface model hub](https://huggingface.co/jplu). 33 | 34 | ## Acknowledgments 35 | 36 | Thanks to all the Huggingface team for the support and their amazing library! 37 | -------------------------------------------------------------------------------- /model_cards/julien-c/EsperBERTo-small-pos/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: esperanto 3 | thumbnail: https://huggingface.co/blog/assets/EsperBERTo-thumbnail-v2.png 4 | --- 5 | 6 | # EsperBERTo: RoBERTa-like Language model trained on Esperanto 7 | 8 | **Companion model to blog post https://huggingface.co/blog/how-to-train** 🔥 9 | 10 | ## Training Details 11 | 12 | - current checkpoint: 566000 13 | - machine name: `galinette` 14 | 15 | 16 | ![](https://huggingface.co/blog/assets/EsperBERTo-thumbnail-v2.png) 17 | 18 | ## Example pipeline 19 | 20 | ```python 21 | from transformers import TokenClassificationPipeline, pipeline 22 | 23 | 24 | MODEL_PATH = "./models/EsperBERTo-small-pos/" 25 | 26 | nlp = pipeline( 27 | "ner", 28 | model=MODEL_PATH, 29 | tokenizer=MODEL_PATH, 30 | ) 31 | # or instantiate a TokenClassificationPipeline directly. 
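# As an untested sketch (assumes the Auto classes resolve this local checkpoint),
# the explicit construction mentioned above would look roughly like:
#
#   from transformers import AutoModelForTokenClassification, AutoTokenizer, TokenClassificationPipeline
#   model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
#   tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
#   nlp = TokenClassificationPipeline(model=model, tokenizer=tokenizer)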
32 | 33 | nlp("Mi estas viro kej estas tago varma.") 34 | 35 | # {'entity': 'PRON', 'score': 0.9979867339134216, 'word': ' Mi'} 36 | # {'entity': 'VERB', 'score': 0.9683094620704651, 'word': ' estas'} 37 | # {'entity': 'VERB', 'score': 0.9797462821006775, 'word': ' estas'} 38 | # {'entity': 'NOUN', 'score': 0.8509314060211182, 'word': ' tago'} 39 | # {'entity': 'ADJ', 'score': 0.9996201395988464, 'word': ' varma'} 40 | ``` -------------------------------------------------------------------------------- /model_cards/julien-c/EsperBERTo-small/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: esperanto 3 | thumbnail: https://huggingface.co/blog/assets/EsperBERTo-thumbnail-v2.png 4 | --- 5 | 6 | # EsperBERTo: RoBERTa-like Language model trained on Esperanto 7 | 8 | **Companion model to blog post https://huggingface.co/blog/how-to-train** 🔥 9 | 10 | ## Training Details 11 | 12 | - current checkpoint: 566000 13 | - machine name: `galinette` 14 | 15 | 16 | ![](https://huggingface.co/blog/assets/EsperBERTo-thumbnail-v2.png) 17 | 18 | ## Example pipeline 19 | 20 | ```python 21 | from transformers import pipeline 22 | 23 | fill_mask = pipeline( 24 | "fill-mask", 25 | model="julien-c/EsperBERTo-small", 26 | tokenizer="julien-c/EsperBERTo-small" 27 | ) 28 | 29 | fill_mask("Jen la komenco de bela .") 30 | 31 | # This is the beginning of a beautiful . 32 | # => 33 | 34 | # { 35 | # 'score':0.06502299010753632 36 | # 'sequence':' Jen la komenco de bela vivo.' 37 | # 'token':1099 38 | # } 39 | # { 40 | # 'score':0.0421181358397007 41 | # 'sequence':' Jen la komenco de bela vespero.' 42 | # 'token':5100 43 | # } 44 | # { 45 | # 'score':0.024884626269340515 46 | # 'sequence':' Jen la komenco de bela laboro.' 47 | # 'token':1570 48 | # } 49 | # { 50 | # 'score':0.02324388362467289 51 | # 'sequence':' Jen la komenco de bela tago.' 52 | # 'token':1688 53 | # } 54 | # { 55 | # 'score':0.020378097891807556 56 | # 'sequence':' Jen la komenco de bela festo.' 57 | # 'token':4580 58 | # } 59 | ``` 60 | -------------------------------------------------------------------------------- /model_cards/julien-c/bert-xsmall-dummy/README.md: -------------------------------------------------------------------------------- 1 | ## How to build a dummy model 2 | 3 | 4 | ```python 5 | from transformers.configuration_bert import BertConfig 6 | from transformers.modeling_bert import BertForMaskedLM 7 | from transformers.modeling_tf_bert import TFBertForMaskedLM 8 | from transformers.tokenization_bert import BertTokenizer 9 | 10 | 11 | SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy" 12 | DIRNAME = "./bert-xsmall-dummy" 13 | 14 | config = BertConfig(10, 20, 1, 1, 40) 15 | 16 | model = BertForMaskedLM(config) 17 | model.save_pretrained(DIRNAME) 18 | 19 | tf_model = TFBertForMaskedLM.from_pretrained(DIRNAME, from_pt=True) 20 | tf_model.save_pretrained(DIRNAME) 21 | 22 | # Slightly different for tokenizer. 
23 | # tokenizer = BertTokenizer.from_pretrained(DIRNAME) 24 | # tokenizer.save_pretrained() 25 | ``` 26 | -------------------------------------------------------------------------------- /model_cards/julien-c/dummy-unknown/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | tags: 3 | - ci 4 | --- 5 | 6 | ## Dummy model used for unit testing and CI 7 | 8 | 9 | ```python 10 | import json 11 | import os 12 | from transformers.configuration_roberta import RobertaConfig 13 | from transformers import RobertaForMaskedLM, TFRobertaForMaskedLM 14 | 15 | DIRNAME = "./dummy-unknown" 16 | 17 | 18 | config = RobertaConfig(10, 20, 1, 1, 40) 19 | 20 | model = RobertaForMaskedLM(config) 21 | model.save_pretrained(DIRNAME) 22 | 23 | tf_model = TFRobertaForMaskedLM.from_pretrained(DIRNAME, from_pt=True) 24 | tf_model.save_pretrained(DIRNAME) 25 | 26 | # Tokenizer: 27 | 28 | vocab = [ 29 | "l", 30 | "o", 31 | "w", 32 | "e", 33 | "r", 34 | "s", 35 | "t", 36 | "i", 37 | "d", 38 | "n", 39 | "\u0120", 40 | "\u0120l", 41 | "\u0120n", 42 | "\u0120lo", 43 | "\u0120low", 44 | "er", 45 | "\u0120lowest", 46 | "\u0120newer", 47 | "\u0120wider", 48 | "", 49 | ] 50 | vocab_tokens = dict(zip(vocab, range(len(vocab)))) 51 | merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] 52 | 53 | vocab_file = os.path.join(DIRNAME, "vocab.json") 54 | merges_file = os.path.join(DIRNAME, "merges.txt") 55 | with open(vocab_file, "w", encoding="utf-8") as fp: 56 | fp.write(json.dumps(vocab_tokens) + "\n") 57 | with open(merges_file, "w", encoding="utf-8") as fp: 58 | fp.write("\n".join(merges)) 59 | ``` 60 | -------------------------------------------------------------------------------- /model_cards/lvwerra/gpt2-medium-taboo/README.md: -------------------------------------------------------------------------------- 1 | # GPT-2 (medium) Taboo 2 | 3 | ## What is it? 4 | A fine-tuned GPT-2 version for Taboo cards generation. 5 | 6 | ## Training setting 7 | 8 | The model was trained on ~900 Taboo cards in the following format for 100 epochs: 9 | ``` 10 | Describe the word Glitch without using the words Problem, Unexpected, Technology, Minor, Outage. 11 | ```` 12 | 13 | -------------------------------------------------------------------------------- /model_cards/lysandre/arxiv-nlp/README.md: -------------------------------------------------------------------------------- 1 | # ArXiv-NLP GPT-2 checkpoint 2 | 3 | This is a GPT-2 small checkpoint for PyTorch. It is the official `gpt2-small` fine-tuned to ArXiv paper on the computational linguistics field. 4 | 5 | ## Training data 6 | 7 | This model was trained on a subset of ArXiv papers that were parsed from PDF to txt. The resulting data is made of 80MB of text from the computational linguistics (cs.CL) field. -------------------------------------------------------------------------------- /model_cards/lysandre/arxiv/README.md: -------------------------------------------------------------------------------- 1 | # ArXiv GPT-2 checkpoint 2 | 3 | This is a GPT-2 small checkpoint for PyTorch. It is the official `gpt2-small` finetuned to ArXiv paper on physics fields. 4 | 5 | ## Training data 6 | 7 | This model was trained on a subset of ArXiv papers that were parsed from PDF to txt. The resulting data is made of 130MB of text, mostly from quantum physics (quant-ph) and other physics sub-fields. 
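The ArXiv GPT-2 cards above stop short of a usage example. A minimal generation sketch for this checkpoint (assuming it is hosted under the `lysandre/arxiv` identifier implied by the card's path and ships a standard GPT-2 tokenizer) could be:

```python
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Hypothetical hub identifier, taken from the model card path
tokenizer = GPT2Tokenizer.from_pretrained("lysandre/arxiv")
model = GPT2LMHeadModel.from_pretrained("lysandre/arxiv")

prompt = "We study the entanglement entropy of"
input_ids = tokenizer.encode(prompt, return_tensors="pt")
output = model.generate(input_ids, max_length=60, do_sample=True, top_k=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```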
8 | -------------------------------------------------------------------------------- /model_cards/microsoft/DialoGPT-large/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | thumbnail: https://huggingface.co/front/thumbnails/dialogpt.png 3 | tags: 4 | - conversational 5 | --- 6 | 7 | ## A State-of-the-Art Large-scale Pretrained Response generation model (DialoGPT) 8 | 9 | DialoGPT is a SOTA large-scale pretrained dialogue response generation model for multiturn conversations. 10 | The [human evaluation results](https://github.com/dreasysnail/Dialogpt_dev#human-evaluation) indicate that the response generated from DialoGPT is comparable to human response quality under a single-turn conversation Turing test. 11 | The model is trained on 147M multi-turn dialogue from Reddit discussion thread. 12 | 13 | * Multi-turn generation examples from an interactive environment: 14 | 15 | |Role | Response | 16 | |---------|--------| 17 | |User | Does money buy happiness? | 18 | | Bot | Depends how much money you spend on it .| 19 | |User | What is the best way to buy happiness ? | 20 | | Bot | You just have to be a millionaire by your early 20s, then you can be happy . | 21 | |User |This is so difficult ! | 22 | | Bot | You have no idea how hard it is to be a millionaire and happy . There is a reason the rich have a lot of money | 23 | 24 | Please find the information about preprocessing, training and full details of the DialoGPT in the [original DialoGPT repository](https://github.com/microsoft/DialoGPT) 25 | 26 | ArXiv paper: [https://arxiv.org/abs/1911.00536](https://arxiv.org/abs/1911.00536) 27 | 28 | ### How to use 29 | 30 | Now we are ready to try out how the model works as a chatting partner! 31 | 32 | ```python 33 | from transformers import AutoModelWithLMHead, AutoTokenizer 34 | import torch 35 | 36 | 37 | tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large") 38 | model = AutoModelWithLMHead.from_pretrained("microsoft/DialoGPT-large") 39 | 40 | # Let's chat for 5 lines 41 | for step in range(5): 42 | # encode the new user input, add the eos_token and return a tensor in Pytorch 43 | new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt') 44 | 45 | # append the new user input tokens to the chat history 46 | bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids 47 | 48 | # generated a response while limiting the total chat history to 1000 tokens, 49 | chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id) 50 | 51 | # pretty print last ouput tokens from bot 52 | print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True))) 53 | ``` 54 | -------------------------------------------------------------------------------- /model_cards/microsoft/DialoGPT-medium/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | thumbnail: https://huggingface.co/front/thumbnails/dialogpt.png 3 | tags: 4 | - conversational 5 | --- 6 | 7 | ## A State-of-the-Art Large-scale Pretrained Response generation model (DialoGPT) 8 | 9 | DialoGPT is a SOTA large-scale pretrained dialogue response generation model for multiturn conversations. 
10 | The [human evaluation results](https://github.com/dreasysnail/Dialogpt_dev#human-evaluation) indicate that the response generated from DialoGPT is comparable to human response quality under a single-turn conversation Turing test. 11 | The model is trained on 147M multi-turn dialogue from Reddit discussion thread. 12 | 13 | * Multi-turn generation examples from an interactive environment: 14 | 15 | |Role | Response | 16 | |---------|--------| 17 | |User | Does money buy happiness? | 18 | | Bot | Depends how much money you spend on it .| 19 | |User | What is the best way to buy happiness ? | 20 | | Bot | You just have to be a millionaire by your early 20s, then you can be happy . | 21 | |User |This is so difficult ! | 22 | | Bot | You have no idea how hard it is to be a millionaire and happy . There is a reason the rich have a lot of money | 23 | 24 | Please find the information about preprocessing, training and full details of the DialoGPT in the [original DialoGPT repository](https://github.com/microsoft/DialoGPT) 25 | 26 | ArXiv paper: [https://arxiv.org/abs/1911.00536](https://arxiv.org/abs/1911.00536) 27 | 28 | ### How to use 29 | 30 | Now we are ready to try out how the model works as a chatting partner! 31 | 32 | ```python 33 | from transformers import AutoModelWithLMHead, AutoTokenizer 34 | import torch 35 | 36 | 37 | tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium") 38 | model = AutoModelWithLMHead.from_pretrained("microsoft/DialoGPT-medium") 39 | 40 | # Let's chat for 5 lines 41 | for step in range(5): 42 | # encode the new user input, add the eos_token and return a tensor in Pytorch 43 | new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt') 44 | 45 | # append the new user input tokens to the chat history 46 | bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids 47 | 48 | # generated a response while limiting the total chat history to 1000 tokens, 49 | chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id) 50 | 51 | # pretty print last ouput tokens from bot 52 | print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True))) 53 | ``` 54 | -------------------------------------------------------------------------------- /model_cards/microsoft/DialoGPT-small/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | thumbnail: https://huggingface.co/front/thumbnails/dialogpt.png 3 | tags: 4 | - conversational 5 | --- 6 | 7 | ## A State-of-the-Art Large-scale Pretrained Response generation model (DialoGPT) 8 | 9 | DialoGPT is a SOTA large-scale pretrained dialogue response generation model for multiturn conversations. 10 | The [human evaluation results](https://github.com/dreasysnail/Dialogpt_dev#human-evaluation) indicate that the response generated from DialoGPT is comparable to human response quality under a single-turn conversation Turing test. 11 | The model is trained on 147M multi-turn dialogue from Reddit discussion thread. 12 | 13 | * Multi-turn generation examples from an interactive environment: 14 | 15 | |Role | Response | 16 | |---------|--------| 17 | |User | Does money buy happiness? | 18 | | Bot | Depends how much money you spend on it .| 19 | |User | What is the best way to buy happiness ? | 20 | | Bot | You just have to be a millionaire by your early 20s, then you can be happy . 
| 21 | |User |This is so difficult ! | 22 | | Bot | You have no idea how hard it is to be a millionaire and happy . There is a reason the rich have a lot of money | 23 | 24 | Please find the information about preprocessing, training and full details of DialoGPT in the [original DialoGPT repository](https://github.com/microsoft/DialoGPT). 25 | 26 | arXiv paper: [https://arxiv.org/abs/1911.00536](https://arxiv.org/abs/1911.00536) 27 | 28 | ### How to use 29 | 30 | Now we are ready to try out how the model works as a chatting partner! 31 | 32 | ```python 33 | from transformers import AutoModelWithLMHead, AutoTokenizer 34 | import torch 35 | 36 | 37 | tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small") 38 | model = AutoModelWithLMHead.from_pretrained("microsoft/DialoGPT-small") 39 | 40 | # Let's chat for 5 lines 41 | for step in range(5): 42 | # encode the new user input, add the eos_token and return a tensor in PyTorch 43 | new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt') 44 | 45 | # append the new user input tokens to the chat history 46 | bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids 47 | 48 | # generate a response while limiting the total chat history to 1000 tokens 49 | chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id) 50 | 51 | # pretty print the last output tokens from the bot 52 | print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True))) 53 | ``` 54 | -------------------------------------------------------------------------------- /model_cards/mrm8488/bert-spanish-cased-finetuned-ner/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: spanish 3 | thumbnail: https://i.imgur.com/jgBdimh.png 4 | --- 5 | 6 | # Spanish BERT (BETO) + NER 7 | 8 | This model is a fine-tuned version of the cased Spanish BERT [(BETO)](https://github.com/dccuchile/beto) on the [NER-C](https://www.kaggle.com/nltkdata/conll-corpora) dataset for the **NER** downstream task.
9 | 10 | ## Details of the downstream task (NER) - Dataset 11 | 12 | - [Dataset: CONLL Corpora ES](https://www.kaggle.com/nltkdata/conll-corpora) 13 | 14 | I preprocessed the dataset and split it into train / dev (80/20) 15 | 16 | | Dataset | # Examples | 17 | | ---------------------- | ----- | 18 | | Train | 8.7 K | 19 | | Dev | 2.2 K | 20 | 21 | 22 | - [Fine-tune on NER script provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py) 23 | 24 | - Labels covered: 25 | 26 | ``` 27 | B-LOC 28 | B-MISC 29 | B-ORG 30 | B-PER 31 | I-LOC 32 | I-MISC 33 | I-ORG 34 | I-PER 35 | O 36 | ``` 37 | 38 | ## Metrics on evaluation set: 39 | 40 | | Metric | # score | 41 | | :------------------------------------------------------------------------------------: | :-------: | 42 | | F1 | **90.17** | 43 | | Precision | **89.86** | 44 | | Recall | **90.47** | 45 | 46 | ## Comparison: 47 | 48 | | Model | # score | 49 | | :--------------------------------------------------------------------------------------------------------------: | :-------: | 50 | | bert-base-spanish-wwm-cased (BETO) | 88.43 | 51 | | [bert-spanish-cased-finetuned-ner (this one)](https://huggingface.co/mrm8488/bert-spanish-cased-finetuned-ner) | **89.65** | 52 | | Best Multilingual BERT | 87.38 | 53 | 54 | ## Model in action 55 | 56 | Fast usage with **pipelines**: 57 | 58 | ```python 59 | from transformers import pipeline 60 | 61 | nlp_ner = pipeline( 62 | "ner", 63 | model="mrm8488/bert-spanish-cased-finetuned-ner", 64 | tokenizer=( 65 | 'mrm8488/bert-spanish-cased-finetuned-ner', 66 | {"use_fast": False} 67 | )) 68 | 69 | nlp_ner(text) 70 | 71 | # Output: [{'entity': 'B-LOC', 'score': 0.9998720288276672, 'word': 'Londres'}] 72 | ``` 73 | 74 | > Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488) 75 | 76 | > Made with ♥ in Spain 77 | -------------------------------------------------------------------------------- /model_cards/mrm8488/bert-uncased-finetuned-qnli/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: english 3 | thumbnail: 4 | --- 5 | 6 | # [BERT](https://huggingface.co/deepset/bert-base-cased-squad2) fine-tuned on [QNLI](https://github.com/rhythmcao/QNLI) + compression ([BERT-of-Theseus](https://github.com/JetRunner/BERT-of-Theseus)) 7 | 8 | I used a [BERT model fine-tuned on **SQuAD v2**](https://huggingface.co/deepset/bert-base-cased-squad2) and then fine-tuned it on **QNLI** using **compression** (with a constant replacing rate), as proposed in **BERT-of-Theseus**. 9 | 10 | ## Details of the downstream task (QNLI): 11 | 12 | ### Getting the dataset 13 | ```bash 14 | wget https://raw.githubusercontent.com/rhythmcao/QNLI/master/data/QNLI/train.tsv 15 | wget https://raw.githubusercontent.com/rhythmcao/QNLI/master/data/QNLI/test.tsv 16 | wget https://raw.githubusercontent.com/rhythmcao/QNLI/master/data/QNLI/dev.tsv 17 | 18 | mkdir QNLI_dataset 19 | mv *.tsv QNLI_dataset 20 | ``` 21 | 22 | ### Model training 23 | 24 | The model was trained on a Tesla P100 GPU with 25GB of RAM using the following command: 25 | 26 | ```bash 27 | !python /content/BERT-of-Theseus/run_glue.py \ 28 | --model_name_or_path deepset/bert-base-cased-squad2 \ 29 | --task_name qnli \ 30 | --do_train \ 31 | --do_eval \ 32 | --do_lower_case \ 33 | --data_dir /content/QNLI_dataset \ 34 | --max_seq_length 128 \ 35 | --per_gpu_train_batch_size 32 \ 36 | --per_gpu_eval_batch_size 32 \ 37 | --learning_rate 2e-5 \ 38 | --save_steps 2000 \ 39 |
--num_train_epochs 50 \ 40 | --output_dir /content/ouput_dir \ 41 | --evaluate_during_training \ 42 | --replacing_rate 0.7 \ 43 | --steps_for_replacing 2500 44 | ``` 45 | 46 | ## Metrics: 47 | 48 | | Model | Accuracy | 49 | |-----------------|------| 50 | | BERT-base | 91.2 | 51 | | BERT-of-Theseus | 88.8 | 52 | | [bert-uncased-finetuned-qnli](https://huggingface.co/mrm8488/bert-uncased-finetuned-qnli) | 87.2 | 53 | | DistilBERT | 85.3 | 54 | 55 | 56 | 57 | 58 | > [See all my models](https://huggingface.co/models?search=mrm8488) 59 | 60 | > Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488) 61 | 62 | > Made with ♥ in Spain 63 | -------------------------------------------------------------------------------- /model_cards/nlptown/bert-base-multilingual-uncased-sentiment/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - english 4 | - dutch 5 | - german 6 | - french 7 | - italian 8 | - spanish 9 | --- 10 | 11 | # bert-base-multilingual-uncased-sentiment 12 | 13 | This is a bert-base-multilingual-uncased model fine-tuned for sentiment analysis on product reviews in six languages: English, Dutch, German, French, Spanish and Italian. It predicts the sentiment of the review as a number of stars (between 1 and 5). 14 | 15 | This model is intended for direct use as a sentiment analysis model for product reviews in any of the six languages above, or for further fine-tuning on related sentiment analysis tasks. 16 | 17 | ## Training data 18 | 19 | Here is the number of product reviews we used for fine-tuning the model: 20 | 21 | | Language | Number of reviews | 22 | | -------- | ----------------- | 23 | | English | 150k | 24 | | Dutch | 80k | 25 | | German | 137k | 26 | | French | 140k | 27 | | Italian | 72k | 28 | | Spanish | 50k | 29 | 30 | ## Accuracy 31 | 32 | The fine-tuned model obtained the following accuracy on 5,000 held-out product reviews in each of the languages: 33 | 34 | - Accuracy (exact) is the exact match on the number of stars. 35 | - Accuracy (off-by-1) is the percentage of reviews where the number of stars the model predicts differs by a maximum of 1 from the number given by the human reviewer. 36 | 37 | 38 | | Language | Accuracy (exact) | Accuracy (off-by-1) | 39 | | -------- | ---------------------- | ------------------- | 40 | | English | 67% | 95% | 41 | | Dutch | 57% | 93% | 42 | | German | 61% | 94% | 43 | | French | 59% | 94% | 44 | | Italian | 59% | 95% | 45 | | Spanish | 58% | 95% | 46 | 47 | ## Contact 48 | 49 | Contact [NLP Town](https://www.nlp.town) for questions, feedback and/or requests for similar models. 50 | -------------------------------------------------------------------------------- /model_cards/severinsimmler/literary-german-bert/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: german 3 | thumbnail: kfold.png 4 | --- 5 | 6 | # German BERT for literary texts 7 | 8 | This German BERT is based on `bert-base-german-dbmdz-cased`, and has been adapted to the domain of literary texts by fine-tuning the language modeling task on the [Corpus of German-Language Fiction](https://figshare.com/articles/Corpus_of_German-Language_Fiction_txt_/4524680/1). Afterwards, the model was fine-tuned for named entity recognition on the [DROC](https://gitlab2.informatik.uni-wuerzburg.de/kallimachos/DROC-Release) corpus, so you can use it to recognize protagonists in German novels.
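For a quick test, the model can be loaded through the `pipeline` API. The snippet below is only a minimal sketch: the model id `severinsimmler/literary-german-bert` is assumed from the path of this model card, and the example sentence is made up.

```python
from transformers import pipeline

# Assumed model id (derived from this model card's path on the model hub)
recognizer = pipeline(
    "ner",
    model="severinsimmler/literary-german-bert",
    tokenizer="severinsimmler/literary-german-bert",
)

# Illustrative sentence; character references are tagged with B-PER / I-PER
print(recognizer("Effi Briest begegnete Innstetten am Abend im Garten."))
```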
9 | 10 | 11 | # Stats 12 | 13 | ## Language modeling 14 | 15 | The [Corpus of German-Language Fiction](https://figshare.com/articles/Corpus_of_German-Language_Fiction_txt_/4524680/1) consists of 3,194 documents with 203,516,988 tokens or 1,520,855 types. The publication year of the texts ranges from the 18th to the 20th century: 16 | 17 | ![years](prosa-jahre.png) 18 | 19 | 20 | ### Results 21 | 22 | After one epoch: 23 | 24 | | Model | Perplexity | 25 | | ---------------- | ---------- | 26 | | Vanilla BERT | 6.82 | 27 | | Fine-tuned BERT | 4.98 | 28 | 29 | 30 | ## Named entity recognition 31 | 32 | The provided model was also fine-tuned for two epochs on 10,799 sentences for training, validated on 547 and tested on 1,845 with three labels: `B-PER`, `I-PER` and `O`. 33 | 34 | 35 | ## Results 36 | 37 | | Dataset | Precision | Recall | F1 | 38 | | ------- | --------- | ------ | ---- | 39 | | Dev | 96.4 | 87.3 | 91.6 | 40 | | Test | 92.8 | 94.9 | 93.8 | 41 | 42 | The model has also been evaluated using 10-fold cross validation and compared with a classic Conditional Random Field baseline described in [Jannidis et al.](https://opus.bibliothek.uni-wuerzburg.de/opus4-wuerzburg/frontdoor/deliver/index/docId/14333/file/Jannidis_Figurenerkennung_Roman.pdf) (2015): 43 | 44 | ![kfold](kfold.png) 45 | 46 | 47 | # References 48 | 49 | Markus Krug, Lukas Weimer, Isabella Reger, Luisa Macharowsky, Stephan Feldhaus, Frank Puppe, Fotis Jannidis, [Description of a Corpus of Character References in German Novels](http://webdoc.sub.gwdg.de/pub/mon/dariah-de/dwp-2018-27.pdf), 2018. 50 | 51 | Fotis Jannidis, Isabella Reger, Lukas Weimer, Markus Krug, Martin Toepfer, Frank Puppe, [Automatische Erkennung von Figuren in deutschsprachigen Romanen](https://opus.bibliothek.uni-wuerzburg.de/opus4-wuerzburg/frontdoor/deliver/index/docId/14333/file/Jannidis_Figurenerkennung_Roman.pdf), 2015. 
52 | -------------------------------------------------------------------------------- /model_cards/severinsimmler/literary-german-bert/kfold.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/model_cards/severinsimmler/literary-german-bert/kfold.png -------------------------------------------------------------------------------- /model_cards/severinsimmler/literary-german-bert/prosa-jahre.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/model_cards/severinsimmler/literary-german-bert/prosa-jahre.png -------------------------------------------------------------------------------- /model_cards/twmkn9/albert-base-v2-squad2/README.md: -------------------------------------------------------------------------------- 1 | This model is ALBERT base v2 trained on SQuAD v2 as: 2 | 3 | ``` 4 | python run_squad.py 5 | --model_type albert 6 | --model_name_or_path albert-base-v2 7 | --do_train --do_eval 8 | --do_lower_case 9 | --version_2_with_negative 10 | --train_file $SQUAD_DIR/train-v2.0.json 11 | --predict_file $SQUAD_DIR/dev-v2.0.json 12 | --per_gpu_train_batch_size 8 13 | --num_train_epochs 3 14 | --learning_rate 3e-5 15 | --max_seq_length 384 16 | --doc_stride 128 17 | --output_dir ./tmp/albert_base_fine/ 18 | ``` 19 | 20 | Performance on a dev subset is close to the original paper: 21 | 22 | ``` 23 | Results: 24 | { 25 | 'exact': 78.71010200723923, 26 | 'f1': 81.89228117126069, 27 | 'total': 6078, 28 | 'HasAns_exact': 75.39518900343643, 29 | 'HasAns_f1': 82.04167868004215, 30 | 'HasAns_total': 2910, 31 | 'NoAns_exact': 81.7550505050505, 32 | 'NoAns_f1': 81.7550505050505, 33 | 'NoAns_total': 3168, 34 | 'best_exact': 78.72655478775913, 35 | 'best_exact_thresh': 0.0, 36 | 'best_f1': 81.90873395178066, 37 | 'best_f1_thresh': 0.0 38 | } 39 | ``` 40 | 41 | We are hopeful this might save you time, energy, and compute. Cheers! -------------------------------------------------------------------------------- /model_cards/voidful/albert_chinese_base/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - chinese 4 | --- 5 | 6 | # albert_chinese_base 7 | 8 | This a albert_chinese_base model from [Google's github](https://github.com/google-research/ALBERT) 9 | converted by huggingface's [script](https://github.com/huggingface/transformers/blob/master/src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py) 10 | 11 | ## Attention (注意) 12 | 13 | Since sentencepiece is not used in albert_chinese_base model 14 | you have to call BertTokenizer instead of AlbertTokenizer !!! 15 | we can eval it using an example on MaskedLM 16 | 17 | 由於 albert_chinese_base 模型沒有用 sentencepiece 18 | 用AlbertTokenizer會載不進詞表,因此需要改用BertTokenizer !!! 
19 | 我們可以跑MaskedLM預測來驗證這個做法是否正確 20 | 21 | ## Justify (驗證有效性) 22 | [colab trial](https://colab.research.google.com/drive/1Wjz48Uws6-VuSHv_-DcWLilv77-AaYgj) 23 | ```python 24 | from transformers import * 25 | import torch 26 | from torch.nn.functional import softmax 27 | 28 | pretrained = 'voidful/albert_chinese_base' 29 | tokenizer = BertTokenizer.from_pretrained(pretrained) 30 | model = AlbertForMaskedLM.from_pretrained(pretrained) 31 | 32 | inputtext = "今天[MASK]情很好" 33 | 34 | maskpos = tokenizer.encode(inputtext, add_special_tokens=True).index(103) 35 | 36 | input_ids = torch.tensor(tokenizer.encode(inputtext, add_special_tokens=True)).unsqueeze(0) # Batch size 1 37 | outputs = model(input_ids, masked_lm_labels=input_ids) 38 | loss, prediction_scores = outputs[:2] 39 | logit_prob = softmax(prediction_scores[0, maskpos]).data.tolist() 40 | predicted_index = torch.argmax(prediction_scores[0, maskpos]).item() 41 | predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] 42 | print(predicted_token,logit_prob[predicted_index]) 43 | ``` 44 | Result: `感 0.36333346366882324` 45 | -------------------------------------------------------------------------------- /model_cards/voidful/albert_chinese_large/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - chinese 4 | --- 5 | 6 | # albert_chinese_large 7 | 8 | This a albert_chinese_large model from [Google's github](https://github.com/google-research/ALBERT) 9 | converted by huggingface's [script](https://github.com/huggingface/transformers/blob/master/src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py) 10 | 11 | ## Attention (注意) 12 | 13 | Since sentencepiece is not used in albert_chinese_large model 14 | you have to call BertTokenizer instead of AlbertTokenizer !!! 15 | we can eval it using an example on MaskedLM 16 | 17 | 由於 albert_chinese_large 模型沒有用 sentencepiece 18 | 用AlbertTokenizer會載不進詞表,因此需要改用BertTokenizer !!! 
19 | 我們可以跑MaskedLM預測來驗證這個做法是否正確 20 | 21 | ## Justify (驗證有效性) 22 | [colab trial](https://colab.research.google.com/drive/1Wjz48Uws6-VuSHv_-DcWLilv77-AaYgj) 23 | ```python 24 | from transformers import * 25 | import torch 26 | from torch.nn.functional import softmax 27 | 28 | pretrained = 'voidful/albert_chinese_large' 29 | tokenizer = BertTokenizer.from_pretrained(pretrained) 30 | model = AlbertForMaskedLM.from_pretrained(pretrained) 31 | 32 | inputtext = "今天[MASK]情很好" 33 | 34 | maskpos = tokenizer.encode(inputtext, add_special_tokens=True).index(103) 35 | 36 | input_ids = torch.tensor(tokenizer.encode(inputtext, add_special_tokens=True)).unsqueeze(0) # Batch size 1 37 | outputs = model(input_ids, masked_lm_labels=input_ids) 38 | loss, prediction_scores = outputs[:2] 39 | logit_prob = softmax(prediction_scores[0, maskpos]).data.tolist() 40 | predicted_index = torch.argmax(prediction_scores[0, maskpos]).item() 41 | predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] 42 | print(predicted_token,logit_prob[predicted_index]) 43 | ``` 44 | Result: `心 0.9422469735145569` 45 | -------------------------------------------------------------------------------- /model_cards/voidful/albert_chinese_small/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - chinese 4 | --- 5 | 6 | # albert_chinese_small 7 | 8 | This a albert_chinese_small model from [brightmart/albert_zh project](https://github.com/brightmart/albert_zh), albert_small_google_zh model 9 | converted by huggingface's [script](https://github.com/huggingface/transformers/blob/master/src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py) 10 | 11 | ## Attention (注意) 12 | 13 | Since sentencepiece is not used in albert_chinese_small model 14 | you have to call BertTokenizer instead of AlbertTokenizer !!! 15 | we can eval it using an example on MaskedLM 16 | 17 | 由於 albert_chinese_small 模型沒有用 sentencepiece 18 | 用AlbertTokenizer會載不進詞表,因此需要改用BertTokenizer !!! 
19 | 我們可以跑MaskedLM預測來驗證這個做法是否正確 20 | 21 | ## Justify (驗證有效性) 22 | [colab trial](https://colab.research.google.com/drive/1Wjz48Uws6-VuSHv_-DcWLilv77-AaYgj) 23 | ```python 24 | from transformers import * 25 | import torch 26 | from torch.nn.functional import softmax 27 | 28 | pretrained = 'voidful/albert_chinese_small' 29 | tokenizer = BertTokenizer.from_pretrained(pretrained) 30 | model = AlbertForMaskedLM.from_pretrained(pretrained) 31 | 32 | inputtext = "今天[MASK]情很好" 33 | 34 | maskpos = tokenizer.encode(inputtext, add_special_tokens=True).index(103) 35 | 36 | input_ids = torch.tensor(tokenizer.encode(inputtext, add_special_tokens=True)).unsqueeze(0) # Batch size 1 37 | outputs = model(input_ids, masked_lm_labels=input_ids) 38 | loss, prediction_scores = outputs[:2] 39 | logit_prob = softmax(prediction_scores[0, maskpos]).data.tolist() 40 | predicted_index = torch.argmax(prediction_scores[0, maskpos]).item() 41 | predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] 42 | print(predicted_token,logit_prob[predicted_index]) 43 | ``` 44 | Result: `感 0.6390823125839233` 45 | -------------------------------------------------------------------------------- /model_cards/voidful/albert_chinese_tiny/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - chinese 4 | --- 5 | 6 | # albert_chinese_tiny 7 | 8 | This a albert_chinese_tiny model from [brightmart/albert_zh project](https://github.com/brightmart/albert_zh), albert_tiny_google_zh model 9 | converted by huggingface's [script](https://github.com/huggingface/transformers/blob/master/src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py) 10 | 11 | ## Attention (注意) 12 | 13 | Since sentencepiece is not used in albert_chinese_tiny model 14 | you have to call BertTokenizer instead of AlbertTokenizer !!! 15 | we can eval it using an example on MaskedLM 16 | 17 | 由於 albert_chinese_tiny 模型沒有用 sentencepiece 18 | 用AlbertTokenizer會載不進詞表,因此需要改用BertTokenizer !!! 
19 | 我們可以跑MaskedLM預測來驗證這個做法是否正確 20 | 21 | ## Justify (驗證有效性) 22 | [colab trial](https://colab.research.google.com/drive/1Wjz48Uws6-VuSHv_-DcWLilv77-AaYgj) 23 | ```python 24 | from transformers import * 25 | import torch 26 | from torch.nn.functional import softmax 27 | 28 | pretrained = 'voidful/albert_chinese_tiny' 29 | tokenizer = BertTokenizer.from_pretrained(pretrained) 30 | model = AlbertForMaskedLM.from_pretrained(pretrained) 31 | 32 | inputtext = "今天[MASK]情很好" 33 | 34 | maskpos = tokenizer.encode(inputtext, add_special_tokens=True).index(103) 35 | 36 | input_ids = torch.tensor(tokenizer.encode(inputtext, add_special_tokens=True)).unsqueeze(0) # Batch size 1 37 | outputs = model(input_ids, masked_lm_labels=input_ids) 38 | loss, prediction_scores = outputs[:2] 39 | logit_prob = softmax(prediction_scores[0, maskpos]).data.tolist() 40 | predicted_index = torch.argmax(prediction_scores[0, maskpos]).item() 41 | predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] 42 | print(predicted_token,logit_prob[predicted_index]) 43 | ``` 44 | Result: `感 0.40312355756759644` 45 | -------------------------------------------------------------------------------- /model_cards/voidful/albert_chinese_xlarge/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - chinese 4 | --- 5 | 6 | # albert_chinese_xlarge 7 | 8 | This a albert_chinese_xlarge model from [Google's github](https://github.com/google-research/ALBERT) 9 | converted by huggingface's [script](https://github.com/huggingface/transformers/blob/master/src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py) 10 | 11 | ## Attention (注意) 12 | 13 | Since sentencepiece is not used in albert_chinese_xlarge model 14 | you have to call BertTokenizer instead of AlbertTokenizer !!! 15 | we can eval it using an example on MaskedLM 16 | 17 | 由於 albert_chinese_xlarge 模型沒有用 sentencepiece 18 | 用AlbertTokenizer會載不進詞表,因此需要改用BertTokenizer !!! 
19 | 我們可以跑MaskedLM預測來驗證這個做法是否正確 20 | 21 | ## Justify (驗證有效性) 22 | [colab trial](https://colab.research.google.com/drive/1Wjz48Uws6-VuSHv_-DcWLilv77-AaYgj) 23 | ```python 24 | from transformers import * 25 | import torch 26 | from torch.nn.functional import softmax 27 | 28 | pretrained = 'voidful/albert_chinese_xlarge' 29 | tokenizer = BertTokenizer.from_pretrained(pretrained) 30 | model = AlbertForMaskedLM.from_pretrained(pretrained) 31 | 32 | inputtext = "今天[MASK]情很好" 33 | 34 | maskpos = tokenizer.encode(inputtext, add_special_tokens=True).index(103) 35 | 36 | input_ids = torch.tensor(tokenizer.encode(inputtext, add_special_tokens=True)).unsqueeze(0) # Batch size 1 37 | outputs = model(input_ids, masked_lm_labels=input_ids) 38 | loss, prediction_scores = outputs[:2] 39 | logit_prob = softmax(prediction_scores[0, maskpos]).data.tolist() 40 | predicted_index = torch.argmax(prediction_scores[0, maskpos]).item() 41 | predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] 42 | print(predicted_token,logit_prob[predicted_index]) 43 | ``` 44 | Result: `心 0.9942440390586853` 45 | -------------------------------------------------------------------------------- /model_cards/voidful/albert_chinese_xxlarge/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - chinese 4 | --- 5 | 6 | # albert_chinese_xxlarge 7 | 8 | This a albert_chinese_xxlarge model from [Google's github](https://github.com/google-research/ALBERT) 9 | converted by huggingface's [script](https://github.com/huggingface/transformers/blob/master/src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py) 10 | 11 | ## Attention (注意) 12 | 13 | Since sentencepiece is not used in albert_chinese_xxlarge model 14 | you have to call BertTokenizer instead of AlbertTokenizer !!! 15 | we can eval it using an example on MaskedLM 16 | 17 | 由於 albert_chinese_xxlarge 模型沒有用 sentencepiece 18 | 用AlbertTokenizer會載不進詞表,因此需要改用BertTokenizer !!! 19 | 我們可以跑MaskedLM預測來驗證這個做法是否正確 20 | 21 | ## Justify (驗證有效性) 22 | [colab trial](https://colab.research.google.com/drive/1Wjz48Uws6-VuSHv_-DcWLilv77-AaYgj) 23 | ```python 24 | from transformers import * 25 | import torch 26 | from torch.nn.functional import softmax 27 | 28 | pretrained = 'voidful/albert_chinese_xxlarge' 29 | tokenizer = BertTokenizer.from_pretrained(pretrained) 30 | model = AlbertForMaskedLM.from_pretrained(pretrained) 31 | 32 | inputtext = "今天[MASK]情很好" 33 | 34 | maskpos = tokenizer.encode(inputtext, add_special_tokens=True).index(103) 35 | 36 | input_ids = torch.tensor(tokenizer.encode(inputtext, add_special_tokens=True)).unsqueeze(0) # Batch size 1 37 | outputs = model(input_ids, masked_lm_labels=input_ids) 38 | loss, prediction_scores = outputs[:2] 39 | logit_prob = softmax(prediction_scores[0, maskpos]).data.tolist() 40 | predicted_index = torch.argmax(prediction_scores[0, maskpos]).item() 41 | predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] 42 | print(predicted_token,logit_prob[predicted_index]) 43 | ``` 44 | Result: `心 0.995713472366333` 45 | -------------------------------------------------------------------------------- /notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Transformers Notebooks 2 | 3 | You can find here a list of the official notebooks provided by Hugging Face. 4 | 5 | Also, we would like to list here interesting content created by the community. 
6 | If you wrote some notebook(s) leveraging transformers and would like be listed here, please open a 7 | Pull Request and we'll review it so it can be included here. 8 | 9 | 10 | ## Hugging Face's notebooks :hugs: 11 | 12 | | Notebook | Description | | 13 | |:----------|:-------------:|------:| 14 | | [Getting Started Tokenizers](01-training-tokenizers.ipynb) | How to train and use your very own tokenizer |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/01-training-tokenizers.ipynb) | 15 | | [Getting Started Transformers](02-transformers.ipynb) | How to easily start using transformers | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/02-transformers.ipynb) | 16 | | [How to use Pipelines](03-pipelines.ipynb) | Simple and efficient way to use State-of-the-Art models on downstream tasks through transformers | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/03-pipelines.ipynb) | 17 | | [How to train a language model](https://github.com/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)| Highlight all the steps to effectively train Transformer model on custom data | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)| 18 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | ensure_newline_before_comments = True 3 | force_grid_wrap = 0 4 | include_trailing_comma = True 5 | known_first_party = transformers 6 | known_third_party = 7 | absl 8 | fairseq 9 | fastprogress 10 | git 11 | h5py 12 | MeCab 13 | nltk 14 | numpy 15 | packaging 16 | PIL 17 | psutil 18 | pytorch_lightning 19 | seqeval 20 | sklearn 21 | tensorboardX 22 | tensorflow 23 | tensorflow_datasets 24 | torch 25 | torchtext 26 | torchvision 27 | torch_xla 28 | 29 | line_length = 119 30 | lines_after_imports = 2 31 | multi_line_output = 3 32 | use_parentheses = True 33 | 34 | [flake8] 35 | ignore = E203, E501, W503 36 | max-line-length = 119 37 | -------------------------------------------------------------------------------- /src/transformers/activations.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | 7 | def swish(x): 8 | return x * torch.sigmoid(x) 9 | 10 | 11 | def _gelu_python(x): 12 | """ Original Implementation of the gelu activation function in Google Bert repo when initially created. 13 | For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 14 | 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 15 | This is now written in C in torch.nn.functional 16 | Also see https://arxiv.org/abs/1606.08415 17 | """ 18 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 19 | 20 | 21 | if torch.__version__ < "1.4.0": 22 | gelu = _gelu_python 23 | else: 24 | gelu = F.gelu 25 | 26 | 27 | def gelu_new(x): 28 | """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). 
29 | Also see https://arxiv.org/abs/1606.08415 30 | """ 31 | return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 32 | 33 | 34 | ACT2FN = { 35 | "relu": F.relu, 36 | "swish": swish, 37 | "gelu": gelu, 38 | "tanh": F.tanh, 39 | "gelu_new": gelu_new, 40 | } 41 | 42 | 43 | def get_activation(activation_string): 44 | if activation_string in ACT2FN: 45 | return ACT2FN[activation_string] 46 | else: 47 | raise KeyError( 48 | "function {} not found in ACT2FN mapping {} or torch.nn.functional".format( 49 | activation_string, list(ACT2FN.keys()) 50 | ) 51 | ) 52 | -------------------------------------------------------------------------------- /src/transformers/commands/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from argparse import ArgumentParser 3 | 4 | 5 | class BaseTransformersCLICommand(ABC): 6 | @staticmethod 7 | @abstractmethod 8 | def register_subcommand(parser: ArgumentParser): 9 | raise NotImplementedError() 10 | 11 | @abstractmethod 12 | def run(self): 13 | raise NotImplementedError() 14 | -------------------------------------------------------------------------------- /src/transformers/commands/download.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | from transformers.commands import BaseTransformersCLICommand 4 | 5 | 6 | def download_command_factory(args): 7 | return DownloadCommand(args.model, args.cache_dir, args.force) 8 | 9 | 10 | class DownloadCommand(BaseTransformersCLICommand): 11 | @staticmethod 12 | def register_subcommand(parser: ArgumentParser): 13 | download_parser = parser.add_parser("download") 14 | download_parser.add_argument( 15 | "--cache-dir", type=str, default=None, help="Path to location to store the models" 16 | ) 17 | download_parser.add_argument( 18 | "--force", action="store_true", help="Force the model to be download even if already in cache-dir" 19 | ) 20 | download_parser.add_argument("model", type=str, help="Name of the model to download") 21 | download_parser.set_defaults(func=download_command_factory) 22 | 23 | def __init__(self, model: str, cache: str, force: bool): 24 | self._model = model 25 | self._cache = cache 26 | self._force = force 27 | 28 | def run(self): 29 | from transformers import AutoModel, AutoTokenizer 30 | 31 | AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 32 | AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 33 | -------------------------------------------------------------------------------- /src/transformers/commands/env.py: -------------------------------------------------------------------------------- 1 | import platform 2 | from argparse import ArgumentParser 3 | 4 | from transformers import __version__ as version 5 | from transformers import is_tf_available, is_torch_available 6 | from transformers.commands import BaseTransformersCLICommand 7 | 8 | 9 | def info_command_factory(_): 10 | return EnvironmentCommand() 11 | 12 | 13 | class EnvironmentCommand(BaseTransformersCLICommand): 14 | @staticmethod 15 | def register_subcommand(parser: ArgumentParser): 16 | download_parser = parser.add_parser("env") 17 | download_parser.set_defaults(func=info_command_factory) 18 | 19 | def run(self): 20 | pt_version = "not installed" 21 | pt_cuda_available = "NA" 22 | if is_torch_available(): 23 | import torch 24 | 25 | pt_version = 
torch.__version__ 26 | pt_cuda_available = torch.cuda.is_available() 27 | 28 | tf_version = "not installed" 29 | tf_cuda_available = "NA" 30 | if is_tf_available(): 31 | import tensorflow as tf 32 | 33 | tf_version = tf.__version__ 34 | try: 35 | # deprecated in v2.1 36 | tf_cuda_available = tf.test.is_gpu_available() 37 | except AttributeError: 38 | # returns list of devices, convert to bool 39 | tf_cuda_available = bool(tf.config.list_physical_devices("GPU")) 40 | 41 | info = { 42 | "`transformers` version": version, 43 | "Platform": platform.platform(), 44 | "Python version": platform.python_version(), 45 | "PyTorch version (GPU?)": "{} ({})".format(pt_version, pt_cuda_available), 46 | "Tensorflow version (GPU?)": "{} ({})".format(tf_version, tf_cuda_available), 47 | "Using GPU in script?": "", 48 | "Using distributed or parallel set-up in script?": "", 49 | } 50 | 51 | print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n") 52 | print(self.format_dict(info)) 53 | 54 | return info 55 | 56 | @staticmethod 57 | def format_dict(d): 58 | return "\n".join(["- {}: {}".format(prop, val) for prop, val in d.items()]) + "\n" 59 | -------------------------------------------------------------------------------- /src/transformers/configuration_camembert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ CamemBERT configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_roberta import RobertaConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json", 28 | "umberto-commoncrawl-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-commoncrawl-cased-v1/config.json", 29 | "umberto-wikipedia-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-wikipedia-uncased-v1/config.json", 30 | } 31 | 32 | 33 | class CamembertConfig(RobertaConfig): 34 | """ 35 | This class overrides :class:`~transformers.RobertaConfig`. Please check the 36 | superclass for the appropriate documentation alongside usage examples. 37 | """ 38 | 39 | pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 40 | model_type = "camembert" 41 | -------------------------------------------------------------------------------- /src/transformers/configuration_mmbt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # Copyright (c) HuggingFace Inc. team. 
4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ MMBT configuration """ 17 | 18 | 19 | import logging 20 | 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | class MMBTConfig(object): 26 | """Configuration class to store the configuration of a `MMBT Model`. 27 | 28 | Args: 29 | config (:obj:`~transformers.PreTrainedConfig`): 30 | Config of the underlying Transformer models. Its values are 31 | copied over to use a single config. 32 | num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`): 33 | Size of final Linear layer for classification. 34 | modal_hidden_size (:obj:`int`, optional, defautls to 2048): 35 | Embedding dimension of the non-text modality encoder. 36 | """ 37 | 38 | def __init__(self, config, num_labels=None, modal_hidden_size=2048): 39 | self.__dict__ = config.__dict__ 40 | self.modal_hidden_size = modal_hidden_size 41 | if num_labels: 42 | self.num_labels = num_labels 43 | -------------------------------------------------------------------------------- /src/transformers/configuration_xlm_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """ XLM-RoBERTa configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_roberta import RobertaConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json", 28 | "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json", 29 | "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json", 30 | "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json", 31 | "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json", 32 | "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json", 33 | } 34 | 35 | 36 | class XLMRobertaConfig(RobertaConfig): 37 | """ 38 | This class overrides :class:`~transformers.RobertaConfig`. Please check the 39 | superclass for the appropriate documentation alongside usage examples. 40 | """ 41 | 42 | pretrained_config_archive_map = XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 43 | model_type = "xlm-roberta" 44 | -------------------------------------------------------------------------------- /src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert ALBERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = AlbertConfig.from_json_file(albert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = AlbertForMaskedLM(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_albert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 
48 | ) 49 | parser.add_argument( 50 | "--albert_config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained ALBERT model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /src/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = BertConfig.from_json_file(bert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = BertForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_bert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--bert_config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained BERT model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 
59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /src/transformers/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | 6 | from transformers.file_utils import WEIGHTS_NAME 7 | 8 | 9 | DIALOGPT_MODELS = ["small", "medium", "large"] 10 | 11 | OLD_KEY = "lm_head.decoder.weight" 12 | NEW_KEY = "lm_head.weight" 13 | 14 | 15 | def convert_dialogpt_checkpoint(checkpoint_path: str, pytorch_dump_folder_path: str): 16 | d = torch.load(checkpoint_path) 17 | d[NEW_KEY] = d.pop(OLD_KEY) 18 | os.makedirs(pytorch_dump_folder_path, exist_ok=True) 19 | torch.save(d, os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--dialogpt_path", default=".", type=str) 25 | args = parser.parse_args() 26 | for MODEL in DIALOGPT_MODELS: 27 | checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl") 28 | pytorch_dump_folder_path = f"./DialoGPT-{MODEL}" 29 | convert_dialogpt_checkpoint( 30 | checkpoint_path, pytorch_dump_folder_path, 31 | ) 32 | -------------------------------------------------------------------------------- /src/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
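# Example invocation (added for illustration; the paths below are placeholders, not files shipped with the repository):
#
#   python convert_gpt2_original_tf_checkpoint_to_pytorch.py \
#       --gpt2_checkpoint_path /path/to/gpt2/tf_checkpoint \
#       --gpt2_config_file /path/to/gpt2/config.json \
#       --pytorch_dump_folder_path /path/to/output_dir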
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import CONFIG_NAME, WEIGHTS_NAME, GPT2Config, GPT2Model, load_tf_weights_in_gpt2 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): 30 | # Construct model 31 | if gpt2_config_file == "": 32 | config = GPT2Config() 33 | else: 34 | config = GPT2Config.from_json_file(gpt2_config_file) 35 | model = GPT2Model(config) 36 | 37 | # Load weights from numpy 38 | load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) 39 | 40 | # Save pytorch-model 41 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 42 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 43 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 44 | torch.save(model.state_dict(), pytorch_weights_dump_path) 45 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 46 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 47 | f.write(config.to_json_string()) 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | # Required parameters 53 | parser.add_argument( 54 | "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 55 | ) 56 | parser.add_argument( 57 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 58 | ) 59 | parser.add_argument( 60 | "--gpt2_config_file", 61 | default="", 62 | type=str, 63 | help="An optional config json file corresponding to the pre-trained OpenAI model. \n" 64 | "This specifies the model architecture.", 65 | ) 66 | args = parser.parse_args() 67 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path) 68 | -------------------------------------------------------------------------------- /src/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import CONFIG_NAME, WEIGHTS_NAME, OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): 30 | # Construct model 31 | if openai_config_file == "": 32 | config = OpenAIGPTConfig() 33 | else: 34 | config = OpenAIGPTConfig.from_json_file(openai_config_file) 35 | model = OpenAIGPTModel(config) 36 | 37 | # Load weights from numpy 38 | load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path) 39 | 40 | # Save pytorch-model 41 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 42 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 43 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 44 | torch.save(model.state_dict(), pytorch_weights_dump_path) 45 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 46 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 47 | f.write(config.to_json_string()) 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | # Required parameters 53 | parser.add_argument( 54 | "--openai_checkpoint_folder_path", 55 | default=None, 56 | type=str, 57 | required=True, 58 | help="Path to the TensorFlow checkpoint path.", 59 | ) 60 | parser.add_argument( 61 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 62 | ) 63 | parser.add_argument( 64 | "--openai_config_file", 65 | default="", 66 | type=str, 67 | help="An optional config json file corresponding to the pre-trained OpenAI model. \n" 68 | "This specifies the model architecture.", 69 | ) 70 | args = parser.parse_args() 71 | convert_openai_checkpoint_to_pytorch( 72 | args.openai_checkpoint_folder_path, args.openai_config_file, args.pytorch_dump_folder_path 73 | ) 74 | -------------------------------------------------------------------------------- /src/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The T5 authors and HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert T5 checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import T5Config, T5Model, load_tf_weights_in_t5 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = T5Config.from_json_file(config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = T5Model(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_t5(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained T5 model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /src/transformers/data/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | from .metrics import is_sklearn_available 6 | from .processors import ( 7 | DataProcessor, 8 | InputExample, 9 | InputFeatures, 10 | SingleSentenceClassificationProcessor, 11 | SquadExample, 12 | SquadFeatures, 13 | SquadV1Processor, 14 | SquadV2Processor, 15 | glue_convert_examples_to_features, 16 | glue_output_modes, 17 | glue_processors, 18 | glue_tasks_num_labels, 19 | squad_convert_examples_to_features, 20 | xnli_output_modes, 21 | xnli_processors, 22 | xnli_tasks_num_labels, 23 | ) 24 | 25 | 26 | if is_sklearn_available(): 27 | from .metrics import glue_compute_metrics, xnli_compute_metrics 28 | -------------------------------------------------------------------------------- /src/transformers/data/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 
4 | 5 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels 6 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features 7 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor 8 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels 9 | -------------------------------------------------------------------------------- /src/transformers/tokenization_bart.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from .tokenization_roberta import RobertaTokenizer 17 | 18 | 19 | # vocab and merges same as roberta 20 | vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json" 21 | merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt" 22 | _all_bart_models = ["bart-large", "bart-large-mnli", "bart-large-cnn"] 23 | 24 | 25 | class BartTokenizer(RobertaTokenizer): 26 | # merges and vocab same as Roberta 27 | max_model_input_sizes = {m: 1024 for m in _all_bart_models} 28 | pretrained_vocab_files_map = { 29 | "vocab_file": {m: vocab_url for m in _all_bart_models}, 30 | "merges_file": {m: merges_url for m in _all_bart_models}, 31 | } 32 | -------------------------------------------------------------------------------- /src/transformers/utils_encoder_decoder.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Classes to support Encoder-Decoder architectures """ 16 | 17 | 18 | def prepare_encoder_decoder_model_kwargs(**kwargs): 19 | """ Prepare the encoder and decoder's keyword arguments. 20 | 21 | Keyword arguments come in 3 flavors: 22 | - encoder-specific (prefixed by `encoder_`) 23 | - decoder-specific (prefixed by `decoder_`) 24 | - those that apply to the model as whole. 25 | 26 | We let the specific kwargs override the common ones in case of 27 | conflict. 
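        Illustration (the variable names below are made up): a call such as
        ``prepare_encoder_decoder_model_kwargs(input_ids=ids, attention_mask=mask, decoder_input_ids=dec_ids)``
        returns ``encoder_kwargs == {"input_ids": ids, "attention_mask": mask}`` and
        ``decoder_kwargs == {"input_ids": dec_ids, "attention_mask": mask, "encoder_attention_mask": mask}``.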
28 | """ 29 | 30 | kwargs_common = { 31 | argument: value 32 | for argument, value in kwargs.items() 33 | if not argument.startswith("encoder_") and not argument.startswith("decoder_") 34 | } 35 | if "input_ids" in kwargs_common: 36 | kwargs["encoder_input_ids"] = kwargs_common.pop("input_ids") 37 | 38 | decoder_kwargs = kwargs_common.copy() 39 | encoder_kwargs = kwargs_common.copy() 40 | encoder_kwargs.update( 41 | {argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")} 42 | ) 43 | decoder_kwargs.update( 44 | {argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")} 45 | ) 46 | decoder_kwargs["encoder_attention_mask"] = encoder_kwargs.get("attention_mask", None) 47 | return encoder_kwargs, decoder_kwargs 48 | -------------------------------------------------------------------------------- /templates/adding_a_new_example_script/README.md: -------------------------------------------------------------------------------- 1 | # How to add a new example script in 🤗Transformers 2 | 3 | This folder provide a template for adding a new example script implementing a training or inference task with the models in the 🤗Transformers library. 4 | 5 | Currently only examples for PyTorch are provided which are adaptations of the library's SQuAD examples which implement single-GPU and distributed training with gradient accumulation and mixed-precision (using NVIDIA's apex library) to cover a reasonable range of use cases. 6 | -------------------------------------------------------------------------------- /templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert XXX checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import XxxConfig, XxxForPreTraining, load_tf_weights_in_xxx 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = XxxConfig.from_json_file(config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = XxxForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_xxx(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 
48 | ) 49 | parser.add_argument( 50 | "--config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /templates/adding_a_new_model/tests/test_tokenization_xxx.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 XXX Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import os 18 | import unittest 19 | 20 | from transformers.tokenization_bert import VOCAB_FILES_NAMES, XxxTokenizer 21 | 22 | from .test_tokenization_common import TokenizerTesterMixin 23 | 24 | 25 | class XxxTokenizationTest(TokenizerTesterMixin, unittest.TestCase): 26 | 27 | tokenizer_class = XxxTokenizer 28 | 29 | def setUp(self): 30 | super().setUp() 31 | 32 | vocab_tokens = [ 33 | "[UNK]", 34 | "[CLS]", 35 | "[SEP]", 36 | "want", 37 | "##want", 38 | "##ed", 39 | "wa", 40 | "un", 41 | "runn", 42 | "##ing", 43 | ",", 44 | "low", 45 | "lowest", 46 | ] 47 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) 48 | with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: 49 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) 50 | 51 | def get_tokenizer(self, **kwargs): 52 | return XxxTokenizer.from_pretrained(self.tmpdirname, **kwargs) 53 | 54 | def get_input_output_texts(self): 55 | input_text = "UNwant\u00E9d,running" 56 | output_text = "unwanted, running" 57 | return input_text, output_text 58 | 59 | def test_full_tokenizer(self): 60 | tokenizer = self.tokenizer_class(self.vocab_file) 61 | 62 | tokens = tokenizer.tokenize("UNwant\u00E9d,running") 63 | self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) 64 | self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) 65 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/tests/__init__.py -------------------------------------------------------------------------------- /tests/fixtures/dummy-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "roberta" 3 | } -------------------------------------------------------------------------------- /tests/fixtures/empty.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/tests/fixtures/empty.txt -------------------------------------------------------------------------------- /tests/fixtures/input.txt: -------------------------------------------------------------------------------- 1 | Who was Jim Henson ? ||| Jim Henson was a puppeteer 2 | -------------------------------------------------------------------------------- /tests/fixtures/spiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/tests/fixtures/spiece.model -------------------------------------------------------------------------------- /tests/fixtures/test_sentencepiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/tests/fixtures/test_sentencepiece.model -------------------------------------------------------------------------------- /tests/test_activations.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from transformers import is_torch_available 4 | 5 | from .utils import require_torch 6 | 7 | 8 | if is_torch_available(): 9 | from transformers.activations import _gelu_python, get_activation, gelu_new 10 | import torch 11 | 12 | 13 | @require_torch 14 | class TestActivations(unittest.TestCase): 15 | def test_gelu_versions(self): 16 | x = torch.Tensor([-100, -1, -0.1, 0, 0.1, 1.0, 100]) 17 | torch_builtin = get_activation("gelu") 18 | self.assertTrue(torch.eq(_gelu_python(x), torch_builtin(x)).all().item()) 19 | self.assertFalse(torch.eq(_gelu_python(x), gelu_new(x)).all().item()) 20 | 21 | def test_get_activation(self): 22 | get_activation("swish") 23 | get_activation("relu") 24 | get_activation("tanh") 25 | with self.assertRaises(KeyError): 26 | get_activation("bogus") 27 | with self.assertRaises(KeyError): 28 | get_activation(None) 29 | -------------------------------------------------------------------------------- /tests/test_configuration_auto.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import os 17 | import unittest 18 | 19 | from transformers.configuration_auto import CONFIG_MAPPING, AutoConfig 20 | from transformers.configuration_bert import BertConfig 21 | from transformers.configuration_roberta import RobertaConfig 22 | 23 | from .utils import DUMMY_UNKWOWN_IDENTIFIER 24 | 25 | 26 | SAMPLE_ROBERTA_CONFIG = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/dummy-config.json") 27 | 28 | 29 | class AutoConfigTest(unittest.TestCase): 30 | def test_config_from_model_shortcut(self): 31 | config = AutoConfig.from_pretrained("bert-base-uncased") 32 | self.assertIsInstance(config, BertConfig) 33 | 34 | def test_config_model_type_from_local_file(self): 35 | config = AutoConfig.from_pretrained(SAMPLE_ROBERTA_CONFIG) 36 | self.assertIsInstance(config, RobertaConfig) 37 | 38 | def test_config_model_type_from_model_identifier(self): 39 | config = AutoConfig.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER) 40 | self.assertIsInstance(config, RobertaConfig) 41 | 42 | def test_config_for_model_str(self): 43 | config = AutoConfig.for_model("roberta") 44 | self.assertIsInstance(config, RobertaConfig) 45 | 46 | def test_pattern_matching_fallback(self): 47 | """ 48 | In cases where config.json doesn't include a model_type, 49 | perform a few safety checks on the config mapping's order. 50 | """ 51 | # no key string should be included in a later key string (typical failure case) 52 | keys = list(CONFIG_MAPPING.keys()) 53 | for i, key in enumerate(keys): 54 | self.assertFalse(any(key in later_key for later_key in keys[i + 1 :])) 55 | -------------------------------------------------------------------------------- /tests/test_tokenization_ctrl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Salesforce and HuggingFace Inc. team. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import json 17 | import os 18 | import unittest 19 | 20 | from transformers.tokenization_ctrl import VOCAB_FILES_NAMES, CTRLTokenizer 21 | 22 | from .test_tokenization_common import TokenizerTesterMixin 23 | 24 | 25 | class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase): 26 | 27 | tokenizer_class = CTRLTokenizer 28 | 29 | def setUp(self): 30 | super().setUp() 31 | 32 | # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt 33 | vocab = ["adapt", "re@@", "a@@", "apt", "c@@", "t", "<unk>"] 34 | vocab_tokens = dict(zip(vocab, range(len(vocab)))) 35 | merges = ["#version: 0.2", "a p", "ap t", "r e", "a d", "ad apt", ""] 36 | self.special_tokens_map = {"unk_token": "<unk>"} 37 | 38 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) 39 | self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) 40 | with open(self.vocab_file, "w", encoding="utf-8") as fp: 41 | fp.write(json.dumps(vocab_tokens) + "\n") 42 | with open(self.merges_file, "w", encoding="utf-8") as fp: 43 | fp.write("\n".join(merges)) 44 | 45 | def get_tokenizer(self, **kwargs): 46 | kwargs.update(self.special_tokens_map) 47 | return CTRLTokenizer.from_pretrained(self.tmpdirname, **kwargs) 48 | 49 | def get_input_output_texts(self): 50 | input_text = "adapt react readapt apt" 51 | output_text = "adapt react readapt apt" 52 | return input_text, output_text 53 | 54 | def test_full_tokenizer(self): 55 | tokenizer = CTRLTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) 56 | text = "adapt react readapt apt" 57 | bpe_tokens = "adapt re@@ a@@ c@@ t re@@ adapt apt".split() 58 | tokens = tokenizer.tokenize(text) 59 | self.assertListEqual(tokens, bpe_tokens) 60 | 61 | input_tokens = tokens + [tokenizer.unk_token] 62 | 63 | input_bpe_tokens = [0, 1, 2, 4, 5, 1, 0, 3, 6] 64 | self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) 65 | -------------------------------------------------------------------------------- /tests/test_tokenization_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | from transformers.tokenization_distilbert import DistilBertTokenizer 18 | 19 | from .test_tokenization_bert import BertTokenizationTest 20 | from .utils import slow 21 | 22 | 23 | class DistilBertTokenizationTest(BertTokenizationTest): 24 | 25 | tokenizer_class = DistilBertTokenizer 26 | 27 | def get_tokenizer(self, **kwargs): 28 | return DistilBertTokenizer.from_pretrained(self.tmpdirname, **kwargs) 29 | 30 | @slow 31 | def test_sequence_builders(self): 32 | tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") 33 | 34 | text = tokenizer.encode("sequence builders", add_special_tokens=False) 35 | text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) 36 | 37 | encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) 38 | encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) 39 | 40 | assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] 41 | assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [ 42 | tokenizer.sep_token_id 43 | ] 44 | -------------------------------------------------------------------------------- /tests/test_tokenization_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 HuggingFace Inc.. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | import unittest 18 | 19 | from transformers import PreTrainedTokenizer 20 | from transformers.tokenization_gpt2 import GPT2Tokenizer 21 | 22 | from .utils import slow 23 | 24 | 25 | class TokenizerUtilsTest(unittest.TestCase): 26 | def check_tokenizer_from_pretrained(self, tokenizer_class): 27 | s3_models = list(tokenizer_class.max_model_input_sizes.keys()) 28 | for model_name in s3_models[:1]: 29 | tokenizer = tokenizer_class.from_pretrained(model_name) 30 | self.assertIsNotNone(tokenizer) 31 | self.assertIsInstance(tokenizer, tokenizer_class) 32 | self.assertIsInstance(tokenizer, PreTrainedTokenizer) 33 | 34 | for special_tok in tokenizer.all_special_tokens: 35 | self.assertIsInstance(special_tok, str) 36 | special_tok_id = tokenizer.convert_tokens_to_ids(special_tok) 37 | self.assertIsInstance(special_tok_id, int) 38 | 39 | @slow 40 | def test_pretrained_tokenizers(self): 41 | self.check_tokenizer_from_pretrained(GPT2Tokenizer) 42 | -------------------------------------------------------------------------------- /transformers-cli: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from argparse import ArgumentParser 3 | 4 | from transformers.commands.convert import ConvertCommand 5 | from transformers.commands.download import DownloadCommand 6 | from transformers.commands.env import EnvironmentCommand 7 | from transformers.commands.run import RunCommand 8 | from transformers.commands.serving import ServeCommand 9 | from transformers.commands.user import UserCommands 10 | 11 | if __name__ == '__main__': 12 | parser = ArgumentParser('Transformers CLI tool', usage='transformers-cli <command> [<args>]') 13 | commands_parser = parser.add_subparsers(help='transformers-cli command helpers') 14 | 15 | # Register commands 16 | ConvertCommand.register_subcommand(commands_parser) 17 | DownloadCommand.register_subcommand(commands_parser) 18 | EnvironmentCommand.register_subcommand(commands_parser) 19 | RunCommand.register_subcommand(commands_parser) 20 | ServeCommand.register_subcommand(commands_parser) 21 | UserCommands.register_subcommand(commands_parser) 22 | 23 | # Let's go 24 | args = parser.parse_args() 25 | 26 | if not hasattr(args, 'func'): 27 | parser.print_help() 28 | exit(1) 29 | 30 | # Run 31 | service = args.func(args) 32 | service.run() 33 | -------------------------------------------------------------------------------- /utils/link_tester.py: -------------------------------------------------------------------------------- 1 | """ Link tester. 2 | 3 | This little utility reads all the python files in the repository, 4 | scans for links pointing to S3 and tests the links one by one. Raises an error 5 | at the end of the scan if at least one link was reported broken. 6 | """ 7 | import os 8 | import re 9 | import sys 10 | 11 | import requests 12 | 13 | 14 | REGEXP_FIND_S3_LINKS = r"""([\"'])(https:\/\/s3)(.*)?\1""" 15 | 16 | 17 | S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert" 18 | 19 | 20 | def list_python_files_in_repository(): 21 | """ List all python files in the repository. 22 | 23 | This function assumes that the script is executed in the root folder. 
24 | """ 25 | source_code_files = [] 26 | for path, subdirs, files in os.walk("."): 27 | if "templates" in path: 28 | continue 29 | for name in files: 30 | if ".py" in name and ".pyc" not in name: 31 | path_to_files = os.path.join(path, name) 32 | source_code_files.append(path_to_files) 33 | 34 | return source_code_files 35 | 36 | 37 | def find_all_links(file_paths): 38 | links = [] 39 | for path in file_paths: 40 | links += scan_code_for_links(path) 41 | 42 | return [link for link in links if link != S3_BUCKET_PREFIX] 43 | 44 | 45 | def scan_code_for_links(source): 46 | """ Scans the file to find links using a regular expression. 47 | Returns a list of links. 48 | """ 49 | with open(source, "r") as content: 50 | content = content.read() 51 | raw_links = re.findall(REGEXP_FIND_S3_LINKS, content) 52 | links = [prefix + suffix for _, prefix, suffix in raw_links] 53 | 54 | return links 55 | 56 | 57 | def check_all_links(links): 58 | """ Check that the provided links are valid. 59 | 60 | Links are considered valid if a HEAD request to the server 61 | returns a 200 status code. 62 | """ 63 | broken_links = [] 64 | for link in links: 65 | head = requests.head(link) 66 | if head.status_code != 200: 67 | broken_links.append(link) 68 | 69 | return broken_links 70 | 71 | 72 | if __name__ == "__main__": 73 | file_paths = list_python_files_in_repository() 74 | links = find_all_links(file_paths) 75 | broken_links = check_all_links(links) 76 | print("Looking for broken links to pre-trained models/configs/tokenizers...") 77 | if broken_links: 78 | print("The following links did not respond:") 79 | for link in broken_links: 80 | print("- {}".format(link)) 81 | sys.exit(1) 82 | print("All links are ok.") 83 | --------------------------------------------------------------------------------