├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── README_transformers.md ├── deploy_multi_version_doc.sh ├── docker ├── transformers-cpu │ └── Dockerfile ├── transformers-gpu │ └── Dockerfile ├── transformers-pytorch-cpu │ └── Dockerfile ├── transformers-pytorch-gpu │ └── Dockerfile ├── transformers-tensorflow-cpu │ └── Dockerfile └── transformers-tensorflow-gpu │ └── Dockerfile ├── docs ├── Makefile ├── README.md └── source │ ├── _static │ ├── css │ │ ├── Calibre-Light.ttf │ │ ├── Calibre-Medium.otf │ │ ├── Calibre-Regular.otf │ │ ├── Calibre-Thin.otf │ │ ├── code-snippets.css │ │ └── huggingface.css │ └── js │ │ ├── custom.js │ │ └── huggingface_logo.svg │ ├── benchmarks.md │ ├── bertology.rst │ ├── conf.py │ ├── converting_tensorflow_models.rst │ ├── examples.md │ ├── favicon.ico │ ├── glossary.rst │ ├── imgs │ ├── transformers_logo_name.png │ ├── warmup_constant_schedule.png │ ├── warmup_cosine_hard_restarts_schedule.png │ ├── warmup_cosine_schedule.png │ ├── warmup_cosine_warm_restarts_schedule.png │ └── warmup_linear_schedule.png │ ├── index.rst │ ├── installation.md │ ├── main_classes │ ├── configuration.rst │ ├── model.rst │ ├── optimizer_schedules.rst │ ├── pipelines.rst │ ├── processors.rst │ └── tokenizer.rst │ ├── migration.md │ ├── model_doc │ ├── albert.rst │ ├── auto.rst │ ├── bart.rst │ ├── bert.rst │ ├── camembert.rst │ ├── ctrl.rst │ ├── distilbert.rst │ ├── flaubert.rst │ ├── gpt.rst │ ├── gpt2.rst │ ├── roberta.rst │ ├── transformerxl.rst │ ├── xlm.rst │ ├── xlmroberta.rst │ └── xlnet.rst │ ├── model_sharing.md │ ├── multilingual.rst │ ├── notebooks.rst │ ├── pretrained_models.rst │ ├── quickstart.md │ ├── serialization.rst │ ├── torchscript.rst │ └── usage.rst ├── examples ├── README.md ├── benchmarks.py ├── bert_stable_fine_tuning │ ├── Dockerfile │ ├── README.md │ ├── adamw.py │ ├── configs │ │ ├── cola-sampled │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct_full-steps.yaml │ │ │ └── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_full-steps.yaml │ │ ├── cola │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW_bias-correct_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW_bias-correct_early-stopping_20.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct_early-stopping_20.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct_early-stopping_normal-0.02.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_bias-correct_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_bias-correct_early-stopping_20.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_early-stopping.yaml │ │ │ ├── 
pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW_bias-correct_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW_bias-correct_early-stopping_20.yaml │ │ │ └── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW_early-stopping.yaml │ │ ├── mrpc-sampled │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct.yaml │ │ │ └── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_full-steps.yaml │ │ ├── mrpc │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW_bias-correct_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW_bias-correct_early-stopping_20.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct_early-stopping_20.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct_early-stopping_no-drop.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct_early-stopping_normal-0.02.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_bias-correct_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_bias-correct_early-stopping_20.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_bias-correct_early-stopping_no-drop.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW_bias-correct_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW_bias-correct_early-stopping_20.yaml │ │ │ └── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW_early-stopping.yaml │ │ ├── rte-sampled │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW.yaml │ │ │ └── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_full-steps.yaml │ │ └── rte │ │ │ ├── pooler-albert-large-v2_bsz_16_lr_1e-05_adamW.yaml │ │ │ ├── pooler-albert-large-v2_bsz_16_lr_1e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-albert-large-v2_bsz_16_lr_2e-05_adamW.yaml │ │ │ ├── pooler-albert-large-v2_bsz_16_lr_2e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-albert-large-v2_bsz_16_lr_3e-05_adamW.yaml │ │ │ ├── pooler-albert-large-v2_bsz_16_lr_3e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW_bias-correct_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW_bias-correct_early-stopping_20.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_1e-05_adamW_early-stopping.yaml │ │ │ ├── 
pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct_early-stopping_20.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_bias-correct_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_bias-correct_early-stopping_20.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_3e-05_adamW_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW_bias-correct_early-stopping.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW_bias-correct_early-stopping_20.yaml │ │ │ ├── pooler-bert-large-uncased_bsz_16_lr_5e-05_adamW_early-stopping.yaml │ │ │ ├── pooler-roberta-large_bsz_16_lr_1e-05_adamW.yaml │ │ │ ├── pooler-roberta-large_bsz_16_lr_1e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-roberta-large_bsz_16_lr_2e-05_adamW.yaml │ │ │ ├── pooler-roberta-large_bsz_16_lr_2e-05_adamW_bias-correct.yaml │ │ │ ├── pooler-roberta-large_bsz_16_lr_3e-05_adamW.yaml │ │ │ └── pooler-roberta-large_bsz_16_lr_3e-05_adamW_bias-correct.yaml │ ├── glue.py │ ├── glue_metrics.py │ ├── pooling_albert.py │ ├── pooling_bert.py │ ├── pooling_roberta.py │ ├── run_docker.txt │ ├── run_finetuning.py │ ├── scripts │ │ ├── run_scripts.txt │ │ ├── seeds.sh │ │ └── train.sh │ └── utils.py ├── contrib │ ├── README.md │ ├── run_camembert.py │ ├── run_openai_gpt.py │ ├── run_swag.py │ └── run_transfo_xl.py ├── distillation │ ├── README.md │ ├── distiller.py │ ├── grouped_batch_sampler.py │ ├── lm_seqs_dataset.py │ ├── requirements.txt │ ├── run_squad_w_distillation.py │ ├── scripts │ │ ├── binarized_data.py │ │ ├── extract.py │ │ ├── extract_distilbert.py │ │ └── token_counts.py │ ├── train.py │ ├── training_configs │ │ ├── distilbert-base-cased.json │ │ ├── distilbert-base-multilingual-cased.json │ │ ├── distilbert-base-uncased.json │ │ ├── distilgpt2.json │ │ └── distilroberta-base.json │ └── utils.py ├── hans │ ├── hans_processors.py │ ├── test_hans.py │ └── utils_hans.py ├── mm-imdb │ ├── run_mmimdb.py │ └── utils_mmimdb.py ├── ner │ ├── README.md │ ├── run.sh │ ├── run_ner.py │ ├── run_pl.sh │ ├── run_pl_ner.py │ ├── run_tf_ner.py │ ├── transformer_base.py │ └── utils_ner.py ├── pplm │ ├── README.md │ ├── imgs │ │ ├── headfigure.png │ │ └── wooly.png │ ├── pplm_classification_head.py │ ├── run_pplm.py │ └── run_pplm_discrim_train.py ├── requirements.txt ├── run_bertology.py ├── run_generation.py ├── run_glue.py ├── run_language_modeling.py ├── run_multiple_choice.py ├── run_squad.py ├── run_tf_glue.py ├── run_xnli.py ├── squad │ ├── configs │ │ ├── albert-base-v1.yaml │ │ ├── bert-base-cased.yaml │ │ └── roberta-base.yaml │ ├── run_finetuning.py │ ├── scripts │ │ ├── iterations_info.txt │ │ ├── run_scripts.txt │ │ ├── train.sh │ │ └── train_multi_gpu.sh │ └── utils.py ├── summarization │ ├── __init__.py │ ├── bart │ │ ├── README.md │ │ ├── __init__.py │ │ ├── evaluate_cnn.py │ │ └── test_bart_examples.py │ └── bertabs │ │ ├── README.md │ │ ├── __init__.py │ │ ├── 
configuration_bertabs.py │ │ ├── convert_bertabs_original_pytorch_checkpoint.py │ │ ├── modeling_bertabs.py │ │ ├── requirements.txt │ │ ├── run_summarization.py │ │ ├── test_utils_summarization.py │ │ └── utils_summarization.py ├── test_examples.py ├── tests_samples │ ├── .gitignore │ ├── MRPC │ │ ├── dev.tsv │ │ └── train.tsv │ └── SQUAD │ │ ├── dev-v2.0.json │ │ └── train-v2.0.json └── utils_multiple_choice.py ├── hubconf.py ├── images ├── fig1.png └── tab1.png ├── model_cards ├── DeepPavlov │ ├── bert-base-bg-cs-pl-ru-cased │ │ └── README.md │ ├── bert-base-cased-conversational │ │ └── README.md │ ├── bert-base-multilingual-cased-sentence │ │ └── README.md │ ├── rubert-base-cased-conversational │ │ └── README.md │ ├── rubert-base-cased-sentence │ │ └── README.md │ └── rubert-base-cased │ │ └── README.md ├── KB │ ├── albert-base-swedish-cased-alpha │ │ └── README.md │ ├── bert-base-swedish-cased-ner │ │ └── README.md │ └── bert-base-swedish-cased │ │ └── README.md ├── Musixmatch │ ├── umberto-commoncrawl-cased-v1 │ │ └── README.md │ └── umberto-wikipedia-uncased-v1 │ │ └── README.md ├── ahotrod │ ├── albert_xxlargev1_squad2_512 │ │ └── README.md │ └── xlnet_large_squad2_512 │ │ └── README.md ├── asafaya │ └── bert-base-arabic │ │ └── README.md ├── aubmindlab │ ├── bert-base-arabert │ │ └── README.md │ └── bert-base-arabertv01 │ │ └── README.md ├── bert-base-german-cased-README.md ├── binwang │ └── xlnet-base-cased │ │ └── README.md ├── camembert-base-README.md ├── canwenxu │ └── BERT-of-Theseus-MNLI │ │ └── README.md ├── dbmdz │ ├── bert-base-german-cased │ │ └── README.md │ ├── bert-base-german-europeana-cased │ │ └── README.md │ ├── bert-base-german-europeana-uncased │ │ └── README.md │ ├── bert-base-german-uncased │ │ └── README.md │ ├── bert-base-italian-cased │ │ └── README.md │ ├── bert-base-italian-uncased │ │ └── README.md │ ├── bert-base-italian-xxl-cased │ │ └── README.md │ ├── bert-base-italian-xxl-uncased │ │ └── README.md │ ├── bert-base-turkish-cased │ │ └── README.md │ └── distilbert-base-turkish-cased │ │ └── README.md ├── deepset │ └── roberta-base-squad2 │ │ └── README.md ├── djstrong │ └── bg_cs_pl_ru_cased_L-12_H-768_A-12 │ │ └── README.md ├── dkleczek │ └── bert-base-polish-uncased-v1 │ │ └── README.md ├── emilyalsentzer │ ├── Bio_ClinicalBERT │ │ └── README.md │ └── Bio_Discharge_Summary_BERT │ │ └── README.md ├── fmikaelian │ ├── camembert-base-fquad │ │ └── README.md │ ├── camembert-base-squad │ │ └── README.md │ └── flaubert-base-uncased-squad │ │ └── README.md ├── henryk │ └── bert-base-multilingual-cased-finetuned-dutch-squad2 │ │ └── README.md ├── huggingface │ ├── CodeBERTa-language-id │ │ └── README.md │ └── CodeBERTa-small-v1 │ │ └── README.md ├── jplu │ ├── tf-camembert-base │ │ └── README.md │ ├── tf-xlm-roberta-base │ │ └── README.md │ └── tf-xlm-roberta-large │ │ └── README.md ├── julien-c │ ├── EsperBERTo-small-pos │ │ └── README.md │ ├── EsperBERTo-small │ │ └── README.md │ ├── bert-xsmall-dummy │ │ └── README.md │ └── dummy-unknown │ │ └── README.md ├── lvwerra │ └── gpt2-medium-taboo │ │ └── README.md ├── lysandre │ ├── arxiv-nlp │ │ └── README.md │ └── arxiv │ │ └── README.md ├── microsoft │ ├── DialoGPT-large │ │ └── README.md │ ├── DialoGPT-medium │ │ └── README.md │ └── DialoGPT-small │ │ └── README.md ├── mrm8488 │ ├── bert-base-spanish-wwm-cased-finetuned-spa-squad2-es │ │ └── README.md │ ├── bert-multi-cased-finedtuned-xquad-tydiqa-goldp │ │ └── README.md │ ├── bert-multi-cased-finetuned-xquadv1 │ │ └── README.md │ ├── 
bert-multi-uncased-finetuned-xquadv1 │ │ └── README.md │ ├── bert-spanish-cased-finetuned-ner │ │ └── README.md │ ├── bert-spanish-cased-finetuned-pos │ │ └── README.md │ ├── bert-uncased-finetuned-qnli │ │ └── README.md │ ├── distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es │ │ └── README.md │ └── xlm-multi-finetuned-xquadv1 │ │ └── README.md ├── nlpaueb │ └── bert-base-greek-uncased-v1 │ │ └── README.md ├── nlptown │ └── bert-base-multilingual-uncased-sentiment │ │ └── README.md ├── severinsimmler │ └── literary-german-bert │ │ ├── README.md │ │ ├── kfold.png │ │ └── prosa-jahre.png ├── twmkn9 │ └── albert-base-v2-squad2 │ │ └── README.md └── voidful │ ├── albert_chinese_base │ └── README.md │ ├── albert_chinese_large │ └── README.md │ ├── albert_chinese_small │ └── README.md │ ├── albert_chinese_tiny │ └── README.md │ ├── albert_chinese_xlarge │ └── README.md │ └── albert_chinese_xxlarge │ └── README.md ├── notebooks ├── 01-training-tokenizers.ipynb ├── 02-transformers.ipynb ├── 03-pipelines.ipynb └── README.md ├── setup.cfg ├── setup.py ├── src └── transformers │ ├── __init__.py │ ├── activations.py │ ├── commands │ ├── __init__.py │ ├── convert.py │ ├── download.py │ ├── env.py │ ├── run.py │ ├── serving.py │ ├── train.py │ └── user.py │ ├── configuration_albert.py │ ├── configuration_auto.py │ ├── configuration_bart.py │ ├── configuration_bert.py │ ├── configuration_camembert.py │ ├── configuration_ctrl.py │ ├── configuration_distilbert.py │ ├── configuration_flaubert.py │ ├── configuration_gpt2.py │ ├── configuration_mmbt.py │ ├── configuration_openai.py │ ├── configuration_roberta.py │ ├── configuration_t5.py │ ├── configuration_transfo_xl.py │ ├── configuration_utils.py │ ├── configuration_xlm.py │ ├── configuration_xlm_roberta.py │ ├── configuration_xlnet.py │ ├── convert_albert_original_tf_checkpoint_to_pytorch.py │ ├── convert_bart_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_bert_original_tf_checkpoint_to_pytorch.py │ ├── convert_bert_pytorch_checkpoint_to_original_tf.py │ ├── convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py │ ├── convert_openai_original_tf_checkpoint_to_pytorch.py │ ├── convert_pytorch_checkpoint_to_tf2.py │ ├── convert_roberta_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_t5_original_tf_checkpoint_to_pytorch.py │ ├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py │ ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py │ ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py │ ├── data │ ├── __init__.py │ ├── metrics │ │ ├── __init__.py │ │ └── squad_metrics.py │ └── processors │ │ ├── __init__.py │ │ ├── glue.py │ │ ├── squad.py │ │ ├── utils.py │ │ └── xnli.py │ ├── file_utils.py │ ├── hf_api.py │ ├── modelcard.py │ ├── modeling_albert.py │ ├── modeling_auto.py │ ├── modeling_bart.py │ ├── modeling_bert.py │ ├── modeling_camembert.py │ ├── modeling_ctrl.py │ ├── modeling_distilbert.py │ ├── modeling_encoder_decoder.py │ ├── modeling_flaubert.py │ ├── modeling_gpt2.py │ ├── modeling_mmbt.py │ ├── modeling_openai.py │ ├── modeling_roberta.py │ ├── modeling_t5.py │ ├── modeling_tf_albert.py │ ├── modeling_tf_auto.py │ ├── modeling_tf_bert.py │ ├── modeling_tf_camembert.py │ ├── modeling_tf_ctrl.py │ ├── modeling_tf_distilbert.py │ ├── modeling_tf_gpt2.py │ ├── modeling_tf_openai.py │ ├── modeling_tf_pytorch_utils.py │ ├── modeling_tf_roberta.py │ ├── modeling_tf_t5.py │ ├── modeling_tf_transfo_xl.py │ ├── modeling_tf_transfo_xl_utilities.py │ ├── 
modeling_tf_utils.py │ ├── modeling_tf_xlm.py │ ├── modeling_tf_xlm_roberta.py │ ├── modeling_tf_xlnet.py │ ├── modeling_transfo_xl.py │ ├── modeling_transfo_xl_utilities.py │ ├── modeling_utils.py │ ├── modeling_xlm.py │ ├── modeling_xlm_roberta.py │ ├── modeling_xlnet.py │ ├── optimization.py │ ├── optimization_tf.py │ ├── pipelines.py │ ├── tokenization_albert.py │ ├── tokenization_auto.py │ ├── tokenization_bart.py │ ├── tokenization_bert.py │ ├── tokenization_bert_japanese.py │ ├── tokenization_camembert.py │ ├── tokenization_ctrl.py │ ├── tokenization_distilbert.py │ ├── tokenization_flaubert.py │ ├── tokenization_gpt2.py │ ├── tokenization_openai.py │ ├── tokenization_roberta.py │ ├── tokenization_t5.py │ ├── tokenization_transfo_xl.py │ ├── tokenization_utils.py │ ├── tokenization_xlm.py │ ├── tokenization_xlm_roberta.py │ ├── tokenization_xlnet.py │ └── utils_encoder_decoder.py ├── templates ├── adding_a_new_example_script │ ├── README.md │ ├── run_xxx.py │ └── utils_xxx.py └── adding_a_new_model │ ├── README.md │ ├── configuration_xxx.py │ ├── convert_xxx_original_tf_checkpoint_to_pytorch.py │ ├── modeling_tf_xxx.py │ ├── modeling_xxx.py │ ├── tests │ ├── test_modeling_tf_xxx.py │ ├── test_modeling_xxx.py │ └── test_tokenization_xxx.py │ └── tokenization_xxx.py ├── tests ├── __init__.py ├── fixtures │ ├── dummy-config.json │ ├── empty.txt │ ├── input.txt │ ├── sample_text.txt │ ├── spiece.model │ └── test_sentencepiece.model ├── test_activations.py ├── test_configuration_auto.py ├── test_configuration_common.py ├── test_doc_samples.py ├── test_hf_api.py ├── test_model_card.py ├── test_modeling_albert.py ├── test_modeling_auto.py ├── test_modeling_bart.py ├── test_modeling_bert.py ├── test_modeling_common.py ├── test_modeling_ctrl.py ├── test_modeling_distilbert.py ├── test_modeling_flaubert.py ├── test_modeling_gpt2.py ├── test_modeling_openai.py ├── test_modeling_roberta.py ├── test_modeling_t5.py ├── test_modeling_tf_albert.py ├── test_modeling_tf_auto.py ├── test_modeling_tf_bert.py ├── test_modeling_tf_common.py ├── test_modeling_tf_ctrl.py ├── test_modeling_tf_distilbert.py ├── test_modeling_tf_gpt2.py ├── test_modeling_tf_openai_gpt.py ├── test_modeling_tf_roberta.py ├── test_modeling_tf_t5.py ├── test_modeling_tf_transfo_xl.py ├── test_modeling_tf_xlm.py ├── test_modeling_tf_xlnet.py ├── test_modeling_transfo_xl.py ├── test_modeling_xlm.py ├── test_modeling_xlm_roberta.py ├── test_modeling_xlnet.py ├── test_optimization.py ├── test_optimization_tf.py ├── test_pipelines.py ├── test_tokenization_albert.py ├── test_tokenization_auto.py ├── test_tokenization_bert.py ├── test_tokenization_bert_japanese.py ├── test_tokenization_common.py ├── test_tokenization_ctrl.py ├── test_tokenization_distilbert.py ├── test_tokenization_fast.py ├── test_tokenization_gpt2.py ├── test_tokenization_openai.py ├── test_tokenization_roberta.py ├── test_tokenization_t5.py ├── test_tokenization_transfo_xl.py ├── test_tokenization_utils.py ├── test_tokenization_xlm.py ├── test_tokenization_xlm_roberta.py ├── test_tokenization_xlnet.py └── utils.py ├── transformers-cli ├── utils ├── download_glue_data.py └── link_tester.py └── valohai.yaml /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: quality style test test-examples 2 | 3 | # Check that source 
code meets quality standards 4 | 5 | quality: 6 | black --check --line-length 119 --target-version py35 examples templates tests src utils 7 | isort --check-only --recursive examples templates tests src utils 8 | flake8 examples templates tests src utils 9 | 10 | # Format source code automatically 11 | 12 | style: 13 | black --line-length 119 --target-version py35 examples templates tests src utils 14 | isort --recursive examples templates tests src utils 15 | 16 | # Run tests for the library 17 | 18 | test: 19 | python -m pytest -n auto --dist=loadfile -s -v ./tests/ 20 | 21 | # Run tests for examples 22 | 23 | test-examples: 24 | python -m pytest -n auto --dist=loadfile -s -v ./examples/ 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # On the Stability of Fine-tuning BERT: Misconceptions, Explanations, and Strong Baselines 2 | 3 | ### Marius Mosbach, Maksym Andriushchenko, Dietrich Klakow 4 | ##### Saarland University and EPFL 5 | 6 | This repository contains code for the paper [On the Stability of Fine-tuning BERT: Misconceptions, Explanations, and Strong Baselines](https://arxiv.org/abs/2006.04884). It is a fork of the [Huggingface Transformers repository](https://github.com/huggingface/transformers) (v2.5.1). 7 | 8 | ## Abstract 9 | 10 | Fine-tuning pre-trained transformer-based language models such as BERT has become a common practice dominating leaderboards across various NLP benchmarks. Despite the strong empirical performance of fine-tuned models, fine-tuning is an unstable process: training the same model with multiple random seeds can result in a large variance of the task performance. Previous literature (Devlin et al., 2019; Lee et al., 2020; Dodge et al., 2020) identified two potential reasons for the observed instability: catastrophic forgetting and a small size of the fine-tuning datasets. In this paper, we show that both hypotheses fail to explain 11 | the fine-tuning instability. We analyze BERT, RoBERTa, and ALBERT, fine-tuned on three commonly used datasets from the GLUE benchmark and show that the observed instability is caused by optimization difficulties that lead to vanishing gradients. Additionally, we show that the remaining variance of the downstream task performance can be attributed to differences in generalization where fine-tuned models with the same training loss exhibit noticeably different test performance. Based on our analysis, we present a simple but strong baseline that makes fine-tuning BERT-based models significantly more stable than previously proposed approaches. 12 | 13 | ![](images/fig1.png) 14 | 15 | ![](images/tab1.png) 16 | 17 | ## Reproducing our results 18 | 19 | See `/examples/bert_stable_fine_tuning/README.md` for how to set up Docker and run our models. 20 | -------------------------------------------------------------------------------- /deploy_multi_version_doc.sh: -------------------------------------------------------------------------------- 1 | cd docs 2 | 3 | function deploy_doc(){ 4 | echo "Creating doc at commit $1 and pushing to folder $2" 5 | git checkout $1 6 | if [ ! 
-z "$2" ] 7 | then 8 | echo "Pushing version" $2 9 | make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2 10 | else 11 | echo "Pushing master" 12 | make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir 13 | fi 14 | } 15 | 16 | deploy_doc "master" 17 | deploy_doc "b33a385" v1.0.0 18 | deploy_doc "fe02e45" v1.1.0 19 | deploy_doc "89fd345" v1.2.0 20 | deploy_doc "fc9faa8" v2.0.0 21 | deploy_doc "3ddce1d" v2.1.1 22 | deploy_doc "f2f3294" v2.2.0 23 | deploy_doc "d0f8b9a" v2.3.0 24 | -------------------------------------------------------------------------------- /docker/transformers-cpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | jupyter \ 18 | tensorflow-cpu \ 19 | torch 20 | 21 | WORKDIR /workspace 22 | COPY . transformers/ 23 | RUN cd transformers/ && \ 24 | python3 -m pip install --no-cache-dir . 25 | 26 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /docker/transformers-gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | jupyter \ 18 | tensorflow \ 19 | torch 20 | 21 | WORKDIR /workspace 22 | COPY . transformers/ 23 | RUN cd transformers/ && \ 24 | python3 -m pip install --no-cache-dir . 25 | 26 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /docker/transformers-pytorch-cpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | jupyter \ 18 | torch 19 | 20 | WORKDIR /workspace 21 | COPY . transformers/ 22 | RUN cd transformers/ && \ 23 | python3 -m pip install --no-cache-dir . 
24 | 25 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /docker/transformers-pytorch-gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | mkl \ 18 | torch 19 | 20 | WORKDIR /workspace 21 | COPY . transformers/ 22 | RUN cd transformers/ && \ 23 | python3 -m pip install --no-cache-dir . 24 | 25 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /docker/transformers-tensorflow-cpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | mkl \ 18 | tensorflow-cpu 19 | 20 | WORKDIR /workspace 21 | COPY . transformers/ 22 | RUN cd transformers/ && \ 23 | python3 -m pip install --no-cache-dir . 24 | 25 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /docker/transformers-tensorflow-gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | mkl \ 18 | tensorflow 19 | 20 | WORKDIR /workspace 21 | COPY . transformers/ 22 | RUN cd transformers/ && \ 23 | python3 -m pip install --no-cache-dir . 24 | 25 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Generating the documentation 2 | 3 | To generate the documentation, you first have to build it. Several packages are necessary to build the doc, 4 | you can install them with the following command, at the root of the code repository: 5 | 6 | ```bash 7 | pip install -e ".[docs]" 8 | ``` 9 | 10 | ## Packages installed 11 | 12 | Here's an overview of all the packages installed. If you ran the previous command installing all packages from 13 | `requirements.txt`, you do not need to run the following commands. 14 | 15 | Building it requires the package `sphinx` that you can 16 | install using: 17 | 18 | ```bash 19 | pip install -U sphinx 20 | ``` 21 | 22 | You would also need the custom installed [theme](https://github.com/readthedocs/sphinx_rtd_theme) by 23 | [Read The Docs](https://readthedocs.org/). You can install it using the following command: 24 | 25 | ```bash 26 | pip install sphinx_rtd_theme 27 | ``` 28 | 29 | The third necessary package is the `recommonmark` package to accept Markdown as well as Restructured text: 30 | 31 | ```bash 32 | pip install recommonmark 33 | ``` 34 | 35 | ## Building the documentation 36 | 37 | Make sure that there is a symlink from the `example` file (in /examples) inside the source folder. Run the following 38 | command to generate it: 39 | 40 | ```bash 41 | ln -s ../../examples/README.md examples.md 42 | ``` 43 | 44 | Once you have setup `sphinx`, you can build the documentation by running the following command in the `/docs` folder: 45 | 46 | ```bash 47 | make html 48 | ``` 49 | 50 | --- 51 | **NOTE** 52 | 53 | If you are adding/removing elements from the toc-tree or from any structural item, it is recommended to clean the build 54 | directory before rebuilding. Run the following command to clean and build: 55 | 56 | ```bash 57 | make clean && make html 58 | ``` 59 | 60 | --- 61 | 62 | It should build the static app that will be available under `/docs/_build/html` 63 | 64 | ## Adding a new element to the tree (toc-tree) 65 | 66 | Accepted files are reStructuredText (.rst) and Markdown (.md). Create a file with its extension and put it 67 | in the source directory. You can then link it to the toc-tree by putting the filename without the extension. 
68 | -------------------------------------------------------------------------------- /docs/source/_static/css/Calibre-Light.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/docs/source/_static/css/Calibre-Light.ttf -------------------------------------------------------------------------------- /docs/source/_static/css/Calibre-Medium.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/docs/source/_static/css/Calibre-Medium.otf -------------------------------------------------------------------------------- /docs/source/_static/css/Calibre-Regular.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/docs/source/_static/css/Calibre-Regular.otf -------------------------------------------------------------------------------- /docs/source/_static/css/Calibre-Thin.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/docs/source/_static/css/Calibre-Thin.otf -------------------------------------------------------------------------------- /docs/source/_static/css/code-snippets.css: -------------------------------------------------------------------------------- 1 | 2 | .highlight .c1, .highlight .sd{ 3 | color: #999 4 | } 5 | 6 | .highlight .nn, .highlight .k, .highlight .s1, .highlight .nb, .highlight .bp, .highlight .kc { 7 | color: #FB8D68; 8 | } 9 | 10 | .highlight .kn, .highlight .nv, .highlight .s2, .highlight .ow { 11 | color: #6670FF; 12 | } -------------------------------------------------------------------------------- /docs/source/benchmarks.md: -------------------------------------------------------------------------------- 1 | # Benchmarks 2 | 3 | This section is dedicated to the Benchmarks done by the library, both by maintainers, contributors and users. These 4 | benchmark will help keep track of the preformance improvements that are brought to our models across versions. 5 | 6 | ## Benchmarking all models for inference 7 | 8 | As of version 2.1 we have benchmarked all models for inference, across many different settings: using PyTorch, with 9 | and without TorchScript, using TensorFlow, with and without XLA. All of those tests were done across CPUs (except for 10 | TensorFlow XLA) and GPUs. 11 | 12 | The approach is detailed in the [following blogpost](https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2) 13 | 14 | The results are available [here](https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing). 15 | 16 | ## TF2 with mixed precision, XLA, Distribution (@tlkh) 17 | 18 | This work was done by [Timothy Liu](https://github.com/tlkh). 
19 | 20 | There are very positive results to be gained from the various TensorFlow 2.0 features: 21 | 22 | - Automatic Mixed Precision (AMP) 23 | - XLA compiler 24 | - Distribution strategies (multi-GPU) 25 | 26 | The benefits are listed here (tested on CoLA, MRPC, SST-2): 27 | 28 | - AMP: Between 1.4x to 1.6x decrease in overall time without change in batch size 29 | - AMP+XLA: Up to 2.5x decrease in overall time on SST-2 (larger dataset) 30 | - Distribution: Between 1.4x to 3.4x decrease in overall time on 4xV100 31 | - Combined: Up to 5.7x decrease in overall training time, or 9.1x training throughput 32 | 33 | The model quality (measured by the validation accuracy) fluctuates slightly. Taking an average of 4 training runs 34 | on a single GPU gives the following results: 35 | 36 | - CoLA: AMP results in slightly lower acc (0.820 vs 0.824) 37 | - MRPC: AMP results in lower acc (0.823 vs 0.835) 38 | - SST-2: AMP results in slightly lower acc (0.918 vs 0.922) 39 | 40 | However, in a distributed setting with 4xV100 (4x batch size), AMP can yield better results: 41 | 42 | - CoLA: AMP results in higher acc (0.828 vs 0.812) 43 | - MRPC: AMP results in lower acc (0.817 vs 0.827) 44 | - SST-2: AMP results in slightly lower acc (0.926 vs 0.929) 45 | 46 | The benchmark script is available [here](https://github.com/NVAITC/benchmarking/blob/master/tf2/bert_dist.py). 47 | 48 | Note: on some tasks (e.g. MRPC), the dataset is too small. The overhead due to the model compilation with XLA as well 49 | as the distribution strategy setup does not speed things up. The XLA compile time is also the reason why, although throughput 50 | can increase a lot (e.g. 2.7x for a single GPU), the overall (end-to-end) training speed-up is not as large (as low as 1.4x). 51 | 52 | The benefits seen on SST-2 (the larger dataset) are much clearer. 53 | 54 | All results can be seen on this [Google Sheet](https://docs.google.com/spreadsheets/d/1538MN224EzjbRL239sqSiUy6YY-rAjHyXhTzz_Zptls/edit#gid=960868445). 55 | -------------------------------------------------------------------------------- /docs/source/bertology.rst: -------------------------------------------------------------------------------- 1 | BERTology 2 | --------- 3 | 4 | There is a growing field of study concerned with investigating the inner workings of large-scale transformers like BERT (that some call "BERTology"). Some good examples of this field are: 5 | 6 | 7 | * BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: https://arxiv.org/abs/1905.05950 8 | * Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650 9 | * What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning: https://arxiv.org/abs/1906.04341 10 | 11 | In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted from the great work of Paul Michel (https://arxiv.org/abs/1905.10650): 12 | 13 | 14 | * accessing all the hidden-states of BERT/GPT/GPT-2, 15 | * accessing all the attention weights for each head of BERT/GPT/GPT-2, 16 | * retrieving heads output values and gradients to be able to compute head importance scores and prune heads as explained in https://arxiv.org/abs/1905.10650. 
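For illustration, the snippet below is a minimal sketch (not the bundled bertology script, and not taken from this repository) of how these outputs can be accessed in this library version by enabling the corresponding config flags; the checkpoint name and input sentence are arbitrary examples.

```python
# Hedged sketch: expose hidden states and attention weights via config flags
# (transformers v2.5.x style). "bert-base-uncased" and the text are arbitrary choices.
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained(
    "bert-base-uncased",
    output_hidden_states=True,  # return the embedding output plus every layer's hidden states
    output_attentions=True,     # return per-layer, per-head attention weights
)
model.eval()

input_ids = torch.tensor([tokenizer.encode("BERTology is fun.", add_special_tokens=True)])
with torch.no_grad():
    last_hidden, pooled, hidden_states, attentions = model(input_ids)

print(len(hidden_states))   # 13 tensors for bert-base: embeddings + 12 layers
print(attentions[0].shape)  # (batch_size, num_heads, seq_len, seq_len)
```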
17 | 18 | To help you understand and use these features, we have added a specific example script: `bertology.py `_ while extract information and prune a model pre-trained on GLUE. 19 | -------------------------------------------------------------------------------- /docs/source/examples.md: -------------------------------------------------------------------------------- 1 | ../../examples/README.md -------------------------------------------------------------------------------- /docs/source/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/docs/source/favicon.ico -------------------------------------------------------------------------------- /docs/source/imgs/transformers_logo_name.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/docs/source/imgs/transformers_logo_name.png -------------------------------------------------------------------------------- /docs/source/imgs/warmup_constant_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/docs/source/imgs/warmup_constant_schedule.png -------------------------------------------------------------------------------- /docs/source/imgs/warmup_cosine_hard_restarts_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png -------------------------------------------------------------------------------- /docs/source/imgs/warmup_cosine_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/docs/source/imgs/warmup_cosine_schedule.png -------------------------------------------------------------------------------- /docs/source/imgs/warmup_cosine_warm_restarts_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png -------------------------------------------------------------------------------- /docs/source/imgs/warmup_linear_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/docs/source/imgs/warmup_linear_schedule.png -------------------------------------------------------------------------------- /docs/source/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | Transformers is tested on Python 3.5+ and PyTorch 1.1.0 4 | 5 | ## With pip 6 | 7 | PyTorch Transformers can be installed using pip as follows: 8 | 9 | ``` bash 10 | pip install transformers 11 | ``` 12 | 13 | ## From source 14 | 15 | To install from source, clone the repository and install with: 16 | 17 | ``` bash 18 | git clone https://github.com/huggingface/transformers.git 19 | cd transformers 
20 | pip install . 21 | ``` 22 | 23 | ## Tests 24 | 25 | An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples). 26 | 27 | Refer to the [contributing guide](https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md#tests) for details about running tests. 28 | 29 | ## OpenAI GPT original tokenization workflow 30 | 31 | If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` and `SpaCy`: 32 | 33 | ``` bash 34 | pip install spacy ftfy==4.4.3 35 | python -m spacy download en 36 | ``` 37 | 38 | If you don't install `ftfy` and `SpaCy`, the `OpenAI GPT` tokenizer will default to tokenize using BERT's `BasicTokenizer` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry). 39 | 40 | ## Note on model downloads (Continuous Integration or large-scale deployments) 41 | 42 | If you expect to be downloading large volumes of models (more than 1,000) from our hosted bucket (for instance through your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way faster, and cheaper. Feel free to contact us privately if you need any help. 43 | 44 | ## Do you want to run a Transformer model on a mobile device? 45 | 46 | You should check out our [swift-coreml-transformers](https://github.com/huggingface/swift-coreml-transformers) repo. 47 | 48 | It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`, `DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices. 49 | 50 | At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML, 51 | or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting! 52 | -------------------------------------------------------------------------------- /docs/source/main_classes/configuration.rst: -------------------------------------------------------------------------------- 1 | Configuration 2 | ---------------------------------------------------- 3 | 4 | The base class ``PretrainedConfig`` implements the common methods for loading/saving a configuration either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS S3 repository). 5 | 6 | ``PretrainedConfig`` 7 | ~~~~~~~~~~~~~~~~~~~~~ 8 | 9 | .. autoclass:: transformers.PretrainedConfig 10 | :members: 11 | -------------------------------------------------------------------------------- /docs/source/main_classes/model.rst: -------------------------------------------------------------------------------- 1 | Models 2 | ---------------------------------------------------- 3 | 4 | The base class ``PreTrainedModel`` implements the common methods for loading/saving a model either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS S3 repository). 
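As a quick, hedged illustration of that load/save round trip (the checkpoint name and the `./my-bert` directory below are placeholders, not paths defined by the library):

```python
# Minimal sketch: load from the hosted weights, save locally, reload from disk.
from transformers import BertConfig, BertModel, BertTokenizer

model = BertModel.from_pretrained("bert-base-uncased")        # downloads config + weights
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

model.save_pretrained("./my-bert")      # writes pytorch_model.bin + config.json
tokenizer.save_pretrained("./my-bert")  # writes vocab + tokenizer config

# Later, reload everything from the local directory instead of the remote bucket.
config = BertConfig.from_pretrained("./my-bert")
model = BertModel.from_pretrained("./my-bert")
```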
5 | 6 | ``PreTrainedModel`` also implements a few methods which are common among all the models to: 7 | 8 | - resize the input token embeddings when new tokens are added to the vocabulary 9 | - prune the attention heads of the model. 10 | 11 | ``PreTrainedModel`` 12 | ~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | .. autoclass:: transformers.PreTrainedModel 15 | :members: 16 | 17 | ``TFPreTrainedModel`` 18 | ~~~~~~~~~~~~~~~~~~~~~ 19 | 20 | .. autoclass:: transformers.TFPreTrainedModel 21 | :members: 22 | -------------------------------------------------------------------------------- /docs/source/main_classes/optimizer_schedules.rst: -------------------------------------------------------------------------------- 1 | Optimizer 2 | ---------------------------------------------------- 3 | 4 | The ``.optimization`` module provides: 5 | 6 | - an optimizer with weight decay fixed that can be used to fine-tuned models, and 7 | - several schedules in the form of schedule objects that inherit from ``_LRSchedule``: 8 | - a gradient accumulation class to accumulate the gradients of multiple batches 9 | 10 | ``AdamW`` 11 | ~~~~~~~~~~~~~~~~ 12 | 13 | .. autoclass:: transformers.AdamW 14 | :members: 15 | 16 | ``AdamWeightDecay`` 17 | ~~~~~~~~~~~~~~~~~~~ 18 | 19 | .. autoclass:: transformers.AdamWeightDecay 20 | :members: 21 | 22 | .. autofunction:: transformers.create_optimizer 23 | 24 | Schedules 25 | ---------------------------------------------------- 26 | 27 | Learning Rate Schedules 28 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 29 | .. autofunction:: transformers.get_constant_schedule 30 | 31 | 32 | .. autofunction:: transformers.get_constant_schedule_with_warmup 33 | 34 | .. image:: /imgs/warmup_constant_schedule.png 35 | :target: /imgs/warmup_constant_schedule.png 36 | :alt: 37 | 38 | 39 | .. autofunction:: transformers.get_cosine_schedule_with_warmup 40 | 41 | .. image:: /imgs/warmup_cosine_schedule.png 42 | :target: /imgs/warmup_cosine_schedule.png 43 | :alt: 44 | 45 | 46 | .. autofunction:: transformers.get_cosine_with_hard_restarts_schedule_with_warmup 47 | 48 | .. image:: /imgs/warmup_cosine_hard_restarts_schedule.png 49 | :target: /imgs/warmup_cosine_hard_restarts_schedule.png 50 | :alt: 51 | 52 | 53 | 54 | .. autofunction:: transformers.get_linear_schedule_with_warmup 55 | 56 | .. image:: /imgs/warmup_linear_schedule.png 57 | :target: /imgs/warmup_linear_schedule.png 58 | :alt: 59 | 60 | ``Warmup`` 61 | ~~~~~~~~~~~~~~~~ 62 | 63 | .. autoclass:: transformers.WarmUp 64 | :members: 65 | 66 | Gradient Strategies 67 | ---------------------------------------------------- 68 | 69 | ``GradientAccumulator`` 70 | ~~~~~~~~~~~~~~~~~~~~~~~ 71 | 72 | .. autoclass:: transformers.GradientAccumulator 73 | -------------------------------------------------------------------------------- /docs/source/main_classes/pipelines.rst: -------------------------------------------------------------------------------- 1 | Pipelines 2 | ---------------------------------------------------- 3 | 4 | The pipelines are a great and easy way to use models for inference. These pipelines are objects that abstract most 5 | of the complex code from the library, offering a simple API dedicated to several tasks, including Named Entity 6 | Recognition, Masked Language Modeling, Sentiment Analysis, Feature Extraction and Question Answering. 
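As a rough usage sketch (the tasks, default models and example texts below are illustrative assumptions, not part of this documentation page):

```python
# Minimal sketch of the high-level `pipeline` factory.
from transformers import pipeline

# Sentiment analysis with the task's default model and tokenizer.
classifier = pipeline("sentiment-analysis")
print(classifier("We are very happy to include pipelines in the library."))
# e.g. [{'label': 'POSITIVE', 'score': 0.99...}]

# Question answering follows the same pattern with task-specific inputs.
qa = pipeline("question-answering")
print(qa(question="What do pipelines abstract?",
         context="Pipelines abstract most of the complex code from the library."))
```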
7 | 8 | There are two categories of pipeline abstractions to be aware about: 9 | 10 | - The :class:`~transformers.pipeline` which is the most powerful object encapsulating all other pipelines 11 | - The other task-specific pipelines, such as :class:`~transformers.NerPipeline` 12 | or :class:`~transformers.QuestionAnsweringPipeline` 13 | 14 | The pipeline abstraction 15 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 16 | 17 | The `pipeline` abstraction is a wrapper around all the other available pipelines. It is instantiated as any 18 | other pipeline but requires an additional argument which is the `task`. 19 | 20 | .. autoclass:: transformers.pipeline 21 | :members: 22 | 23 | 24 | The task specific pipelines 25 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 26 | 27 | Parent class: Pipeline 28 | ========================================= 29 | 30 | .. autoclass:: transformers.Pipeline 31 | :members: predict, transform, save_pretrained 32 | 33 | NerPipeline 34 | ========================================== 35 | 36 | .. autoclass:: transformers.NerPipeline 37 | 38 | TokenClassificationPipeline 39 | ========================================== 40 | 41 | This class is an alias of the :class:`~transformers.NerPipeline` defined above. Please refer to that pipeline for 42 | documentation and usage examples. 43 | 44 | FillMaskPipeline 45 | ========================================== 46 | 47 | .. autoclass:: transformers.FillMaskPipeline 48 | 49 | FeatureExtractionPipeline 50 | ========================================== 51 | 52 | .. autoclass:: transformers.FeatureExtractionPipeline 53 | 54 | TextClassificationPipeline 55 | ========================================== 56 | 57 | .. autoclass:: transformers.TextClassificationPipeline 58 | 59 | QuestionAnsweringPipeline 60 | ========================================== 61 | 62 | .. autoclass:: transformers.QuestionAnsweringPipeline 63 | 64 | -------------------------------------------------------------------------------- /docs/source/main_classes/tokenizer.rst: -------------------------------------------------------------------------------- 1 | Tokenizer 2 | ---------------------------------------------------- 3 | 4 | The base class ``PreTrainedTokenizer`` implements the common methods for loading/saving a tokenizer either from a local file or directory, or from a pretrained tokenizer provided by the library (downloaded from HuggingFace's AWS S3 repository). 5 | 6 | ``PreTrainedTokenizer`` is the main entry point into tokenizers as it also implements the main methods for using all the tokenizers: 7 | 8 | - tokenizing, converting tokens to ids and back and encoding/decoding, 9 | - adding new tokens to the vocabulary in a way that is independant of the underlying structure (BPE, SentencePiece...), 10 | - managing special tokens (adding them, assigning them to roles, making sure they are not split during tokenization) 11 | 12 | ``PreTrainedTokenizer`` 13 | ~~~~~~~~~~~~~~~~~~~~~~~~ 14 | 15 | .. autoclass:: transformers.PreTrainedTokenizer 16 | :members: 17 | -------------------------------------------------------------------------------- /docs/source/model_doc/auto.rst: -------------------------------------------------------------------------------- 1 | AutoModels 2 | ----------- 3 | 4 | In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you are supplying to the ``from_pretrained`` method. 
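A small, hedged sketch tying the tokenizer methods listed above to the Auto classes introduced here (the checkpoint name and the added tokens are arbitrary examples):

```python
# Minimal sketch: infer the architecture from the checkpoint name, then use the
# shared tokenizer API (encode/decode, adding new tokens) described above.
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModel.from_pretrained("bert-base-cased")

ids = tokenizer.encode("Hello world!", add_special_tokens=True)
print(tokenizer.convert_ids_to_tokens(ids))
print(tokenizer.decode(ids))

# Add new tokens and resize the model's input embeddings to match.
tokenizer.add_tokens(["new_tok1", "new_tok2"])
model.resize_token_embeddings(len(tokenizer))
```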
5 | 6 | AutoClasses are here to do this job for you so that you automatically retrieve the relevant model given the name/path to the pretrained weights/config/vocabulary: 7 | 8 | Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will directly create a class of the relevant architecture (ex: ``model = AutoModel.from_pretrained('bert-base-cased')`` will create a instance of ``BertModel``). 9 | 10 | 11 | ``AutoConfig`` 12 | ~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | .. autoclass:: transformers.AutoConfig 15 | :members: 16 | 17 | 18 | ``AutoTokenizer`` 19 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 20 | 21 | .. autoclass:: transformers.AutoTokenizer 22 | :members: 23 | 24 | 25 | ``AutoModel`` 26 | ~~~~~~~~~~~~~~~~~~~~~ 27 | 28 | .. autoclass:: transformers.AutoModel 29 | :members: 30 | 31 | 32 | ``AutoModelForPreTraining`` 33 | ~~~~~~~~~~~~~~~~~~~~~ 34 | 35 | .. autoclass:: transformers.AutoModelForPreTraining 36 | :members: 37 | 38 | 39 | ``AutoModelWithLMHead`` 40 | ~~~~~~~~~~~~~~~~~~~~~ 41 | 42 | .. autoclass:: transformers.AutoModelWithLMHead 43 | :members: 44 | 45 | 46 | ``AutoModelForSequenceClassification`` 47 | ~~~~~~~~~~~~~~~~~~~~~ 48 | 49 | .. autoclass:: transformers.AutoModelForSequenceClassification 50 | :members: 51 | 52 | 53 | ``AutoModelForQuestionAnswering`` 54 | ~~~~~~~~~~~~~~~~~~~~~ 55 | 56 | .. autoclass:: transformers.AutoModelForQuestionAnswering 57 | :members: 58 | 59 | 60 | ``AutoModelForTokenClassification`` 61 | ~~~~~~~~~~~~~~~~~~~~~ 62 | 63 | .. autoclass:: transformers.AutoModelForTokenClassification 64 | :members: 65 | 66 | -------------------------------------------------------------------------------- /docs/source/model_doc/bart.rst: -------------------------------------------------------------------------------- 1 | Bart 2 | ---------------------------------------------------- 3 | **DISCLAIMER:** This model is still a work in progress, if you see something strange, 4 | file a `Github Issue `__ and assign 5 | @sshleifer 6 | 7 | Paper 8 | ~~~~~ 9 | The Bart model was `proposed `_ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer on 29 Oct, 2019. 10 | According to the abstract, 11 | 12 | - Bart uses a standard seq2seq/machine translation architecture with a bidirectional encoder (like BERT) and a left-to-right decoder (like GPT). 13 | - The pretraining task involves randomly shuffling the order of the original sentences and a novel in-filling scheme, where spans of text are replaced with a single mask token. 14 | - BART is particularly effective when fine tuned for text generation but also works well for comprehension tasks. It matches the performance of RoBERTa with comparable training resources on GLUE and SQuAD, achieves new state-of-the-art results on a range of abstractive dialogue, question answering, and summarization tasks, with gains of up to 6 ROUGE. 15 | 16 | The Authors' code can be found `here `_ 17 | 18 | 19 | Implementation Notes 20 | ~~~~~~~~~~~~~~~~~~~~ 21 | - Bart doesn't use :obj:`token_type_ids` for sequence classification. Use BartTokenizer.encode to get the proper splitting. 22 | - The forward pass of ``BartModel`` will create decoder inputs (using the helper function ``transformers.modeling_bart._prepare_bart_decoder_inputs``) if they are not passed. This is different than some other modeling APIs. 23 | - Model predictions are intended to be identical to the original implementation. 
This only works, however, if the string you pass to ``fairseq.encode`` starts with a space. 24 | - ``BartForConditionalGeneration.generate`` should be used for conditional generation tasks like summarization, see the example in that docstrings 25 | - Models that load the ``"bart-large-cnn"`` weights will not have a ``mask_token_id``, or be able to perform mask filling tasks. 26 | 27 | 28 | 29 | BartModel 30 | ~~~~~~~~~~~~~ 31 | 32 | .. autoclass:: transformers.BartModel 33 | :members: forward 34 | 35 | .. autofunction:: transformers.modeling_bart._prepare_bart_decoder_inputs 36 | 37 | 38 | BartForConditionalGeneration 39 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 40 | 41 | .. autoclass:: transformers.BartForConditionalGeneration 42 | :members: generate, forward 43 | 44 | 45 | BartForSequenceClassification 46 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 47 | 48 | .. autoclass:: transformers.BartForSequenceClassification 49 | :members: forward 50 | 51 | BartConfig 52 | ~~~~~~~~~~~~~~~~~~~~~ 53 | 54 | .. autoclass:: transformers.BartConfig 55 | :members: 56 | 57 | -------------------------------------------------------------------------------- /docs/source/model_doc/flaubert.rst: -------------------------------------------------------------------------------- 1 | FlauBERT 2 | ---------------------------------------------------- 3 | 4 | The FlauBERT model was proposed in the paper 5 | `FlauBERT: Unsupervised Language Model Pre-training for French `__ by Hang Le et al. 6 | It's a transformer pre-trained using a masked language modeling (MLM) objective (BERT-like). 7 | 8 | The abstract from the paper is the following: 9 | 10 | *Language models have become a key step to achieve state-of-the art results in many different Natural Language 11 | Processing (NLP) tasks. Leveraging the huge amount of unlabeled texts nowadays available, they provide an efficient 12 | way to pre-train continuous word representations that can be fine-tuned for a downstream task, along with their 13 | contextualization at the sentence level. This has been widely demonstrated for English using contextualized 14 | representations (Dai and Le, 2015; Peters et al., 2018; Howard and Ruder, 2018; Radford et al., 2018; Devlin et 15 | al., 2019; Yang et al., 2019b). In this paper, we introduce and share FlauBERT, a model learned on a very large 16 | and heterogeneous French corpus. Models of different sizes are trained using the new CNRS (French National Centre 17 | for Scientific Research) Jean Zay supercomputer. We apply our French language models to diverse NLP tasks (text 18 | classification, paraphrasing, natural language inference, parsing, word sense disambiguation) and show that most 19 | of the time they outperform other pre-training approaches. Different versions of FlauBERT as well as a unified 20 | evaluation protocol for the downstream tasks, called FLUE (French Language Understanding Evaluation), are shared 21 | to the research community for further reproducible experiments in French NLP.* 22 | 23 | 24 | FlaubertConfig 25 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 26 | 27 | .. autoclass:: transformers.FlaubertConfig 28 | :members: 29 | 30 | 31 | FlaubertTokenizer 32 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 33 | 34 | .. autoclass:: transformers.FlaubertTokenizer 35 | :members: 36 | 37 | 38 | FlaubertModel 39 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 40 | 41 | .. autoclass:: transformers.FlaubertModel 42 | :members: 43 | 44 | 45 | FlaubertWithLMHeadModel 46 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 47 | 48 | .. 
autoclass:: transformers.FlaubertWithLMHeadModel 49 | :members: 50 | 51 | 52 | FlaubertForSequenceClassification 53 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 54 | 55 | .. autoclass:: transformers.FlaubertForSequenceClassification 56 | :members: 57 | 58 | 59 | FlaubertForQuestionAnsweringSimple 60 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 61 | 62 | .. autoclass:: transformers.FlaubertForQuestionAnsweringSimple 63 | :members: 64 | 65 | 66 | FlaubertForQuestionAnswering 67 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 68 | 69 | .. autoclass:: transformers.FlaubertForQuestionAnswering 70 | :members: 71 | 72 | 73 | -------------------------------------------------------------------------------- /docs/source/model_doc/transformerxl.rst: -------------------------------------------------------------------------------- 1 | Transformer XL 2 | ---------------------------------------------------- 3 | 4 | Overview 5 | ~~~~~~~~~~~~~~~~~~~~~ 6 | 7 | The Transformer-XL model was proposed in 8 | `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context `__ 9 | by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. 10 | It's a causal (uni-directional) transformer with relative positioning (sinusoïdal) embeddings which can reuse 11 | previously computed hidden-states to attend to longer context (memory). 12 | This model also uses adaptive softmax inputs and outputs (tied). 13 | 14 | The abstract from the paper is the following: 15 | 16 | *Transformers have a potential of learning longer-term dependency, but are limited by a fixed-length context in the 17 | setting of language modeling. We propose a novel neural architecture Transformer-XL that enables learning dependency 18 | beyond a fixed length without disrupting temporal coherence. It consists of a segment-level recurrence mechanism and 19 | a novel positional encoding scheme. Our method not only enables capturing longer-term dependency, but also resolves 20 | the context fragmentation problem. As a result, Transformer-XL learns dependency that is 80% longer than RNNs and 21 | 450% longer than vanilla Transformers, achieves better performance on both short and long sequences, and is up 22 | to 1,800+ times faster than vanilla Transformers during evaluation. Notably, we improve the state-of-the-art results 23 | of bpc/perplexity to 0.99 on enwiki8, 1.08 on text8, 18.3 on WikiText-103, 21.8 on One Billion Word, and 54.5 on 24 | Penn Treebank (without finetuning). When trained only on WikiText-103, Transformer-XL manages to generate reasonably 25 | coherent, novel text articles with thousands of tokens.* 26 | 27 | Tips: 28 | 29 | - Transformer-XL uses relative sinusoidal positional embeddings. Padding can be done on the left or on the right. 30 | The original implementation trains on SQuAD with padding on the left, therefore the padding defaults are set to left. 31 | - Transformer-XL is one of the few models that has no sequence length limit. 32 | 33 | 34 | TransfoXLConfig 35 | ~~~~~~~~~~~~~~~~~~~~~ 36 | 37 | .. autoclass:: transformers.TransfoXLConfig 38 | :members: 39 | 40 | 41 | TransfoXLTokenizer 42 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 43 | 44 | .. autoclass:: transformers.TransfoXLTokenizer 45 | :members: save_vocabulary 46 | 47 | 48 | TransfoXLModel 49 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 50 | 51 | .. autoclass:: transformers.TransfoXLModel 52 | :members: 53 | 54 | 55 | TransfoXLLMHeadModel 56 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 57 | 58 | .. 
autoclass:: transformers.TransfoXLLMHeadModel 59 | :members: 60 | 61 | 62 | TFTransfoXLModel 63 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 64 | 65 | .. autoclass:: transformers.TFTransfoXLModel 66 | :members: 67 | 68 | 69 | TFTransfoXLLMHeadModel 70 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 71 | 72 | .. autoclass:: transformers.TFTransfoXLLMHeadModel 73 | :members: 74 | -------------------------------------------------------------------------------- /docs/source/model_sharing.md: -------------------------------------------------------------------------------- 1 | # Model upload and sharing 2 | 3 | Starting with `v2.2.2`, you can now upload and share your fine-tuned models with the community, using the CLI that's built-in to the library. 4 | 5 | **First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Optionally, join an existing organization or create a new one. Then: 6 | 7 | ```shell 8 | transformers-cli login 9 | # log in using the same credentials as on huggingface.co 10 | ``` 11 | Upload your model: 12 | ```shell 13 | transformers-cli upload ./path/to/pretrained_model/ 14 | 15 | # ^^ Upload folder containing weights/tokenizer/config 16 | # saved via `.save_pretrained()` 17 | 18 | transformers-cli upload ./config.json [--filename folder/foobar.json] 19 | 20 | # ^^ Upload a single file 21 | # (you can optionally override its filename, which can be nested inside a folder) 22 | ``` 23 | 24 | If you want your model to be namespaced by your organization name rather than your username, add the following flag to any command: 25 | ```shell 26 | --organization organization_name 27 | ``` 28 | 29 | Your model will then be accessible through its identifier, a concatenation of your username (or organization name) and the folder name above: 30 | ```python 31 | "username/pretrained_model" 32 | # or if an org: 33 | "organization_name/pretrained_model" 34 | ``` 35 | 36 | **Please add a README.md model card** to the repo under `model_cards/` with: model description, training params (dataset, preprocessing, hardware used, hyperparameters), evaluation results, intended uses & limitations, etc. 37 | 38 | Your model now has a page on huggingface.co/models 🔥 39 | 40 | Anyone can load it from code: 41 | ```python 42 | tokenizer = AutoTokenizer.from_pretrained("namespace/pretrained_model") 43 | model = AutoModel.from_pretrained("namespace/pretrained_model") 44 | ``` 45 | 46 | List all your files on S3: 47 | ```shell 48 | transformers-cli s3 ls 49 | ``` 50 | 51 | You can also delete unneeded files: 52 | 53 | ```shell 54 | transformers-cli s3 rm … 55 | ``` 56 | -------------------------------------------------------------------------------- /docs/source/notebooks.rst: -------------------------------------------------------------------------------- 1 | Notebooks 2 | ================================================ 3 | 4 | We include `three Jupyter Notebooks `_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model. 5 | 6 | 7 | * 8 | The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb `_\ ) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models. 
9 | 10 | * 11 | The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb `_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models. 12 | 13 | * 14 | The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb `_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model. 15 | 16 | Please follow the instructions given in the notebooks to run and modify them. 17 | -------------------------------------------------------------------------------- /examples/bert_stable_fine_tuning/Dockerfile: -------------------------------------------------------------------------------- 1 | # Base image must at least have pytorch and CUDA installed. 2 | # We are using NVIDIA NGC's PyTorch image here, see: https://ngc.nvidia.com/catalog/containers/nvidia:pytorch for latest version 3 | ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:19.09-py3 4 | FROM $BASE_IMAGE 5 | ARG BASE_IMAGE 6 | LABEL repository="bert-stable-fine-tuning" 7 | 8 | # Set path to CUDA 9 | ENV CUDA_HOME=/usr/local/cuda 10 | 11 | # Install additional programs 12 | RUN apt update && \ 13 | apt install -y build-essential \ 14 | htop \ 15 | git \ 16 | curl \ 17 | ca-certificates \ 18 | vim \ 19 | tmux && \ 20 | rm -rf /var/lib/apt/lists 21 | 22 | # Update pip 23 | RUN SHA=ToUcHMe which python3 24 | RUN SHA=ToUcHMe python3 -m pip install --upgrade pip 25 | 26 | # See http://bugs.python.org/issue19846 27 | ENV LANG C.UTF-8 28 | 29 | # Install additional dependencies 30 | RUN python3 -m pip install wandb 31 | RUN python3 -m pip install autopep8 32 | RUN python3 -m pip install attrdict 33 | RUN conda install pylint 34 | 35 | # Specify a new user (USER_NAME and USER_UID are specified via --build-arg) 36 | ARG USER_UID 37 | ARG USER_NAME 38 | ENV USER_GID=$USER_UID 39 | ENV USER_GROUP="users" 40 | 41 | # Create the user 42 | RUN mkdir /home/$USER_NAME 43 | RUN useradd -l -d /home/$USER_NAME -u $USER_UID -g $USER_GROUP $USER_NAME 44 | 45 | # Setup VSCode stuff (comment when not using vscode) 46 | RUN mkdir /home/$USER_NAME/.vscode-server 47 | RUN mkdir /home/$USER_NAME/.vscode-server-insiders 48 | 49 | # Change owner of home dir 50 | RUN chown -R ${USER_UID}:${USER_GID} /home/$USER_NAME/ 51 | 52 | # Set workdir when starting container 53 | WORKDIR /transformers 54 | 55 | # Add workdir to PYTHONPATH 56 | ENV PYTHONPATH="$PYTHONPATH:/transformers" 57 | 58 | CMD ["/bin/bash"] 59 | -------------------------------------------------------------------------------- /examples/bert_stable_fine_tuning/README.md: -------------------------------------------------------------------------------- 1 | ## Installing & Getting Started 2 | 3 | 1. Clone the repository. 4 | 5 | ```` 6 | git clone git@github.com:uds-lsv/bert-stable-fine-tuning.git 7 | cd bert-stable-fine-tuning/examples/bert_stable_fine_tuning 8 | ```` 9 | 10 | 2. [Download datasets](https://github.com/nyu-mll/jiant/blob/master/scripts/download_glue_data.py) from the GLUE benchmark. 11 | 12 | 3. Setup a Docker image and start a container. 13 | 14 | ```` 15 | docker build -f ./Dockerfile --build-arg USER_UID=$UID --build-arg USER_NAME=$(id -un) -t bert-stable-fine-tuning:latest . 
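# The -v flags in the command below mount the repo, the downloaded pre-trained weights, the datasets,
# and the log/checkpoint directories into the container; replace every /path/to/... placeholder with
# your local paths before running.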
16 | 17 | docker run -it --rm --runtime=nvidia --pid=host --ipc=host \ 18 | -v /path/to/bert-stable-fine-tuning:/transformers \ 19 | -v /path/to/pre-trained-transformers:/pre-trained-transformers \ 20 | -v /path/to/datasets:/datasets \ 21 | -v /path/to/bert-stable-fine-tuning/logfiles:/logfiles \ 22 | -v /path/to/bert-stable-fine-tuning/checkpoints:/checkpoints \ 23 | -v /path/to/bert-stable-fine-tuning/tb-logs:/tb-logs \ 24 | -v /path/to/bert-stable-fine-tuning/wandb-logs:/wandb-logs \ 25 | bert-stable-fine-tuning:latest 26 | ```` 27 | 28 | Add `--user=` to the `docker run` command above to run the container as your user. Use `--gpus=all` instead of `--runtime=nvidia` for more recent Docker versions (starting from 19.03). More information on Docker can be found here: `/bert_stable_fine_tuning/run_docker.txt` 29 | 30 | 4. Install huggingface transformers in editable mode **inside** the container. 31 | 32 | ```` 33 | python3 -m pip install -e . --user 34 | ```` 35 | 36 | 5. Fine-tune BERT-large-uncased on RTE. (You might want to check `./bert_stable_fine_tuning/scripts/seeds.sh` first.) 37 | 38 | ```` 39 | bash /transformers/examples/bert_stable_fine_tuning/scripts/seeds.sh /transformers/examples/bert_stable_fine_tuning/configs/rte/pooler-bert-large-uncased_bsz_16_lr_2e-05_adamW_bias-correct_early-stopping_20.yaml 1 1 0 40 | ```` 41 | 42 | 6. Additional config files for RTE, MRPC, and CoLA can be found here: `./bert_stable_fine_tuning/configs`. Bash commands for every config file can be found here: `./bert_stable_fine_tuning/scripts/run_scripts.sh` 43 | 44 | **Happy stable fine-tuning!** :rocket: :metal: 45 | 46 | -------------------------------------------------------------------------------- /examples/bert_stable_fine_tuning/run_docker.txt: -------------------------------------------------------------------------------- 1 | # Build docker image from docker file (using latest base PyTorch image from NVIDIA GPU Cloud) 2 | docker build -f ./Dockerfile --build-arg USER_UID=$UID --build-arg USER_NAME=$(id -un) -t bert-stable-fine-tuning:latest . 3 | 4 | 5 | # Launch docker container from docker image as root 6 | # add --user= to run as user 7 | # /path/to/pre-trained-transformers is a dir that holds the downloaded weights for the pre-trained models 8 | 9 | docker run -it --rm --runtime=nvidia --pid=host --ipc=host \ 10 | -v /path/to/bert-stable-fine-tuning:/transformers \ 11 | -v /path/to/pre-trained-transformers:/pre-trained-transformers \ 12 | -v /path/to/datasets:/datasets \ 13 | -v /path/to/bert-stable-fine-tuning/logfiles:/logfiles \ 14 | -v /path/to/bert-stable-fine-tuning/checkpoints:/checkpoints \ 15 | -v /path/to/bert-stable-fine-tuning/tb-logs:/tb-logs \ 16 | -v /path/to/bert-stable-fine-tuning/wandb-logs:/wandb-logs \ 17 | bert-stable-fine-tuning:latest 18 | 19 | 20 | # For latest docker version (starting from 19.03) use: 21 | # --gpus=all instead of --runtime=nvidia 22 | 23 | ###################################### 24 | # Make huggingface transformers editable inside container 25 | ###################################### 26 | 27 | export PYTHONPATH="$PYTHONPATH:/transformers" 28 | python3 -m pip install -e .
--user 29 | -------------------------------------------------------------------------------- /examples/bert_stable_fine_tuning/scripts/seeds.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="$PYTHONPATH:/transformers" 3 | 4 | # Passed as arguments 5 | CONFIG_FILE=$1 6 | FIRST_SEED=$2 7 | LAST_SEED=$3 8 | export CUDA_VISIBLE_DEVICES=$4 9 | 10 | # Setup weights & biases environment variables 11 | # Comment lines below if you don't want to use wandb 12 | export WANDB_API_KEY=your-key 13 | export WANDB_USERNAME="your-username" 14 | export WANDB_ENTITY="your-entity" 15 | 16 | # Train the same model on the same dataset with different random seeds 17 | for SEED in $(seq $FIRST_SEED $LAST_SEED); 18 | do 19 | python /transformers/examples/bert_stable_fine_tuning/run_finetuning.py \ 20 | --config ${CONFIG_FILE} \ 21 | --do_train \ 22 | --do_eval \ 23 | --seed ${SEED} 24 | done -------------------------------------------------------------------------------- /examples/bert_stable_fine_tuning/scripts/train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="$PYTHONPATH:/transformers" 3 | 4 | # Passed as arguments 5 | CONFIG_FILE=$1 6 | SEED=$2 7 | export CUDA_VISIBLE_DEVICES=$3 8 | 9 | # Setup weights & biases environment variables 10 | # Comment lines below if you don't want to use wandb 11 | export WANDB_API_KEY=your-key 12 | export WANDB_USERNAME="your-username" 13 | export WANDB_ENTITY="your-entity" 14 | 15 | python /transformers/examples/bert_stable_fine_tuning/run_finetuning.py \ 16 | --config ${CONFIG_FILE} \ 17 | --do_train \ 18 | --do_eval \ 19 | --seed ${SEED} -------------------------------------------------------------------------------- /examples/contrib/README.md: -------------------------------------------------------------------------------- 1 | # Community contributed examples 2 | 3 | This folder contains examples which are not actively maintained (mostly contributed by the community). 4 | 5 | Using these examples together with a recent version of the library usually requires making small (sometimes big) adaptations to get the scripts working.
6 | -------------------------------------------------------------------------------- /examples/contrib/run_camembert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from transformers.modeling_camembert import CamembertForMaskedLM 4 | from transformers.tokenization_camembert import CamembertTokenizer 5 | 6 | 7 | def fill_mask(masked_input, model, tokenizer, topk=5): 8 | # Adapted from https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py 9 | assert masked_input.count("<mask>") == 1 10 | input_ids = torch.tensor(tokenizer.encode(masked_input, add_special_tokens=True)).unsqueeze(0) # Batch size 1 11 | logits = model(input_ids)[0] # The prediction scores are the first element of the output tuple 12 | masked_index = (input_ids.squeeze() == tokenizer.mask_token_id).nonzero().item() 13 | logits = logits[0, masked_index, :] 14 | prob = logits.softmax(dim=0) 15 | values, indices = prob.topk(k=topk, dim=0) 16 | topk_predicted_token_bpe = " ".join( 17 | [tokenizer.convert_ids_to_tokens(indices[i].item()) for i in range(len(indices))] 18 | ) 19 | masked_token = tokenizer.mask_token 20 | topk_filled_outputs = [] 21 | for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(" ")): 22 | predicted_token = predicted_token_bpe.replace("\u2581", " ") 23 | if " {0}".format(masked_token) in masked_input: 24 | topk_filled_outputs.append( 25 | ( 26 | masked_input.replace(" {0}".format(masked_token), predicted_token), 27 | values[index].item(), 28 | predicted_token, 29 | ) 30 | ) 31 | else: 32 | topk_filled_outputs.append( 33 | (masked_input.replace(masked_token, predicted_token), values[index].item(), predicted_token,) 34 | ) 35 | return topk_filled_outputs 36 | 37 | 38 | tokenizer = CamembertTokenizer.from_pretrained("camembert-base") 39 | model = CamembertForMaskedLM.from_pretrained("camembert-base") 40 | model.eval() 41 | 42 | masked_input = "Le camembert est <mask> :)" 43 | print(fill_mask(masked_input, model, tokenizer, topk=3)) 44 | -------------------------------------------------------------------------------- /examples/distillation/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | 3 | gitpython==3.0.2 4 | tensorboard>=1.14.0 5 | tensorboardX==1.8 6 | psutil==5.6.6 7 | scipy==1.3.1 8 | -------------------------------------------------------------------------------- /examples/distillation/scripts/token_counts.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ 16 | Preprocessing script before training the distilled model.
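A typical invocation (the paths are simply the argparse defaults declared below; point them at your own binarized dump) might look like:

    python token_counts.py --data_file data/dump.bert-base-uncased.pickle --token_counts_dump data/token_counts.bert-base-uncased.pickle --vocab_size 30522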
17 | """ 18 | import argparse 19 | import logging 20 | import pickle 21 | from collections import Counter 22 | 23 | 24 | logging.basicConfig( 25 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO 26 | ) 27 | logger = logging.getLogger(__name__) 28 | 29 | if __name__ == "__main__": 30 | parser = argparse.ArgumentParser( 31 | description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)" 32 | ) 33 | parser.add_argument( 34 | "--data_file", type=str, default="data/dump.bert-base-uncased.pickle", help="The binarized dataset." 35 | ) 36 | parser.add_argument( 37 | "--token_counts_dump", type=str, default="data/token_counts.bert-base-uncased.pickle", help="The dump file." 38 | ) 39 | parser.add_argument("--vocab_size", default=30522, type=int) 40 | args = parser.parse_args() 41 | 42 | logger.info(f"Loading data from {args.data_file}") 43 | with open(args.data_file, "rb") as fp: 44 | data = pickle.load(fp) 45 | 46 | logger.info("Counting occurences for MLM.") 47 | counter = Counter() 48 | for tk_ids in data: 49 | counter.update(tk_ids) 50 | counts = [0] * args.vocab_size 51 | for k, v in counter.items(): 52 | counts[k] = v 53 | 54 | logger.info(f"Dump to {args.token_counts_dump}") 55 | with open(args.token_counts_dump, "wb") as handle: 56 | pickle.dump(counts, handle, protocol=pickle.HIGHEST_PROTOCOL) 57 | -------------------------------------------------------------------------------- /examples/distillation/training_configs/distilbert-base-cased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 | "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true, 13 | "vocab_size": 28996 14 | } 15 | -------------------------------------------------------------------------------- /examples/distillation/training_configs/distilbert-base-multilingual-cased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 | "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true, 13 | "vocab_size": 119547 14 | } 15 | -------------------------------------------------------------------------------- /examples/distillation/training_configs/distilbert-base-uncased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 | "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true, 13 | "vocab_size": 30522 14 | } 15 | -------------------------------------------------------------------------------- /examples/distillation/training_configs/distilgpt2.json: -------------------------------------------------------------------------------- 1 | { 2 | "initializer_range": 0.02, 3 | "layer_norm_epsilon": 0.00001, 4 | "n_ctx": 1024, 5 | "n_embd": 768, 6 | "n_head": 12, 7 | "n_layer": 6, 8 | "n_positions": 1024, 9 | "vocab_size": 50257 10 | } 
-------------------------------------------------------------------------------- /examples/distillation/training_configs/distilroberta-base.json: -------------------------------------------------------------------------------- 1 | { 2 | "vocab_size": 50265, 3 | "hidden_size": 768, 4 | "num_hidden_layers": 6, 5 | "num_attention_heads": 12, 6 | "intermediate_size": 3072, 7 | "hidden_act": "gelu", 8 | "hidden_dropout_prob": 0.1, 9 | "attention_probs_dropout_prob": 0.1, 10 | "max_position_embeddings": 514, 11 | "type_vocab_size": 1, 12 | "initializer_range": 0.02, 13 | "layer_norm_eps": 0.00001 14 | } -------------------------------------------------------------------------------- /examples/ner/run.sh: -------------------------------------------------------------------------------- 1 | curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-train.tsv?attredirects=0&d=1' \ 2 | | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp 3 | curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-dev.tsv?attredirects=0&d=1' \ 4 | | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp 5 | curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-test.tsv?attredirects=0&d=1' \ 6 | | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp 7 | wget "https://raw.githubusercontent.com/stefan-it/fine-tuned-berts-seq/master/scripts/preprocess.py" 8 | export MAX_LENGTH=128 9 | export BERT_MODEL=bert-base-multilingual-cased 10 | python3 preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt 11 | python3 preprocess.py dev.txt.tmp $BERT_MODEL $MAX_LENGTH > dev.txt 12 | python3 preprocess.py test.txt.tmp $BERT_MODEL $MAX_LENGTH > test.txt 13 | cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt 14 | export OUTPUT_DIR=germeval-model 15 | export BATCH_SIZE=32 16 | export NUM_EPOCHS=3 17 | export SAVE_STEPS=750 18 | export SEED=1 19 | 20 | python3 run_ner.py --data_dir ./ \ 21 | --model_type bert \ 22 | --labels ./labels.txt \ 23 | --model_name_or_path $BERT_MODEL \ 24 | --output_dir $OUTPUT_DIR \ 25 | --max_seq_length $MAX_LENGTH \ 26 | --num_train_epochs $NUM_EPOCHS \ 27 | --per_gpu_train_batch_size $BATCH_SIZE \ 28 | --save_steps $SAVE_STEPS \ 29 | --seed $SEED \ 30 | --do_train \ 31 | --do_eval \ 32 | --do_predict 33 | -------------------------------------------------------------------------------- /examples/ner/run_pl.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Install newest ptl. 
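# ("ptl" = pytorch-lightning; it is installed straight from the GitHub repository so the example runs against the newest Lightning API.)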
4 | pip install -U git+http://github.com/PyTorchLightning/pytorch-lightning/ 5 | # for seqeval metrics import 6 | pip install -r ../requirements.txt 7 | 8 | curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-train.tsv?attredirects=0&d=1' \ 9 | | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp 10 | curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-dev.tsv?attredirects=0&d=1' \ 11 | | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp 12 | curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-test.tsv?attredirects=0&d=1' \ 13 | | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp 14 | wget "https://raw.githubusercontent.com/stefan-it/fine-tuned-berts-seq/master/scripts/preprocess.py" 15 | export MAX_LENGTH=128 16 | export BERT_MODEL=bert-base-multilingual-cased 17 | python3 preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt 18 | python3 preprocess.py dev.txt.tmp $BERT_MODEL $MAX_LENGTH > dev.txt 19 | python3 preprocess.py test.txt.tmp $BERT_MODEL $MAX_LENGTH > test.txt 20 | cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt 21 | export BATCH_SIZE=32 22 | export NUM_EPOCHS=3 23 | export SEED=1 24 | 25 | export OUTPUT_DIR_NAME=germeval-model 26 | export CURRENT_DIR=${PWD} 27 | export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME} 28 | mkdir -p $OUTPUT_DIR 29 | 30 | python3 run_pl_ner.py --data_dir ./ \ 31 | --model_type bert \ 32 | --labels ./labels.txt \ 33 | --model_name_or_path $BERT_MODEL \ 34 | --output_dir $OUTPUT_DIR \ 35 | --max_seq_length $MAX_LENGTH \ 36 | --num_train_epochs $NUM_EPOCHS \ 37 | --train_batch_size 32 \ 38 | --seed $SEED \ 39 | --do_train \ 40 | --do_predict -------------------------------------------------------------------------------- /examples/pplm/README.md: -------------------------------------------------------------------------------- 1 | # Plug and Play Language Models: a Simple Approach to Controlled Text Generation 2 | 3 | Authors: [Sumanth Dathathri](https://dathath.github.io/), [Andrea Madotto](https://andreamad8.github.io/), Janice Lan, Jane Hung, Eric Frank, [Piero Molino](https://w4nderlu.st/), [Jason Yosinski](http://yosinski.com/), and [Rosanne Liu](http://www.rosanneliu.com/) 4 | 5 | This folder contains the original code used to run the Plug and Play Language Model (PPLM). 6 | 7 | Paper link: https://arxiv.org/abs/1912.02164 8 | 9 | Blog link: https://eng.uber.com/pplm 10 | 11 | Please check out the repo under uber-research for more information: https://github.com/uber-research/PPLM 12 | 13 | 14 | ## Setup 15 | 16 | ```bash 17 | git clone https://github.com/huggingface/transformers && cd transformers 18 | pip install . 19 | pip install nltk torchtext # additional requirements. 20 | cd examples/pplm 21 | ``` 22 | 23 | ## PPLM-BoW 24 | 25 | ### Example command for bag-of-words control 26 | 27 | ```bash 28 | python run_pplm.py -B military --cond_text "The potato" --length 50 --gamma 1.5 --num_iterations 3 --num_samples 10 --stepsize 0.03 --window_length 5 --kl_scale 0.01 --gm_scale 0.99 --colorama --sample 29 | ``` 30 | 31 | ### Tuning hyperparameters for bag-of-words control 32 | 33 | 1. Increase `--stepsize` to intensify topic control, and decrease its value to soften the control. `--stepsize 0` recovers the original uncontrolled GPT-2 model. 34 | 35 | 2. If the language being generated is repetitive (For e.g. "science science experiment experiment"), there are several options to consider:
36 | a) Reduce the `--stepsize`
37 | b) Increase `--kl_scale` (the KL-loss coefficient) or decrease `--gm_scale` (the gm-scaling term)
38 | c) Add `--grad-length xx` where xx is an integer (<= length), e.g. `--grad-length 30`. An example command combining a) and b) is sketched below.
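Putting a) and b) together, a softer-control variant of the bag-of-words command above might look like the following (the values are illustrative starting points, not tuned settings):

```bash
python run_pplm.py -B military --cond_text "The potato" --length 50 --gamma 1.5 --num_iterations 3 --num_samples 10 --stepsize 0.01 --window_length 5 --kl_scale 0.02 --gm_scale 0.95 --colorama --sample
```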
39 | 40 | 41 | ## PPLM-Discrim 42 | 43 | ### Example command for discriminator based sentiment control 44 | 45 | ```bash 46 | python run_pplm.py -D sentiment --class_label 2 --cond_text "My dog died" --length 50 --gamma 1.0 --num_iterations 10 --num_samples 10 --stepsize 0.04 --kl_scale 0.01 --gm_scale 0.95 --sample 47 | ``` 48 | 49 | ### Tuning hyperparameters for discriminator control 50 | 51 | 1. Increase `--stepsize` to intensify topic control, and decrease its value to soften the control. `--stepsize 0` recovers the original uncontrolled GPT-2 model. 52 | 53 | 2. Use `--class_label 3` for negative, and `--class_label 2` for positive 54 | 55 | -------------------------------------------------------------------------------- /examples/pplm/imgs/headfigure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/examples/pplm/imgs/headfigure.png -------------------------------------------------------------------------------- /examples/pplm/imgs/wooly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/examples/pplm/imgs/wooly.png -------------------------------------------------------------------------------- /examples/pplm/pplm_classification_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class ClassificationHead(torch.nn.Module): 5 | """Classification Head for transformer encoders""" 6 | 7 | def __init__(self, class_size, embed_size): 8 | super().__init__() 9 | self.class_size = class_size 10 | self.embed_size = embed_size 11 | # self.mlp1 = torch.nn.Linear(embed_size, embed_size) 12 | # self.mlp2 = (torch.nn.Linear(embed_size, class_size)) 13 | self.mlp = torch.nn.Linear(embed_size, class_size) 14 | 15 | def forward(self, hidden_state): 16 | # hidden_state = F.relu(self.mlp1(hidden_state)) 17 | # hidden_state = self.mlp2(hidden_state) 18 | logits = self.mlp(hidden_state) 19 | return logits 20 | -------------------------------------------------------------------------------- /examples/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboardX 2 | tensorboard 3 | scikit-learn 4 | seqeval 5 | -------------------------------------------------------------------------------- /examples/squad/configs/albert-base-v1.yaml: -------------------------------------------------------------------------------- 1 | input: 2 | data_dir: /datasets/squad 3 | train_file: train-v1.1.json 4 | predict_file: dev-v1.1.json 5 | version_2_with_negative: false 6 | overwrite_cache: false # Overwrite the cached training and evaluation sets 7 | lang_id: 0 # language id of input for language-specific xlm models 8 | doc_stride: 128 # When splitting up a long document into chunks, how much stride to take between chunks. 9 | threads: 1 # multiple threads for converting example to features 10 | 11 | output: 12 | log_dir: /logfiles/squad11 13 | checkpoint_dir: /checkpoints/squad11 14 | verbose_logging: false # If true, all of the warnings related to data processing will be printed. 15 | 16 | tensorboard: 17 | enable: false 18 | log_dir: /tb-logs/squad11 19 | log_histograms: false # Log histograms in tensorboard. They can be quite large (file size) and slow down training. 
So one might want to disable it 20 | 21 | wandb: 22 | enable: true 23 | project_name: squad-fine-tuning 24 | log_dir: /wandb-logs/squad11 25 | 26 | model: 27 | model_type: albert 28 | model_name_or_path: albert-base-v1 # Path to pre-trained model or shortcut name of huggingface transformer models 29 | config_name: albert-base-v1 # Pretrained config name or path if not the same as model_name 30 | tokenizer_name: albert-base-v1 # Pretrained tokenizer name or path if not the same as model_name 31 | cache_dir: /pre-trained-transformers 32 | do_lower_case: true # ALBERT vocab is uncased 33 | max_seq_length: 384 34 | max_query_length: 64 # The maximum number of tokens for the question 35 | max_answer_length: 30 # The maximum length of an answer that can be generated. 36 | null_score_diff_threshold: 0.0 37 | n_best_size: 5 # The total number of n-best predictions to generate in the nbest_predictions.json output file. 38 | 39 | training: 40 | num_train_epochs: 3 41 | max_steps: -1 # overrides num_train_epochs 42 | evaluate_during_training: true 43 | per_gpu_train_batch_size: 8 44 | gradient_accumulation_steps: 1 45 | train_logging_steps: 300 # adjust based on batch_size and number of GPUs used 46 | eval_logging_steps: 2777 # adjust based on batch_size and number of GPUs used 47 | save_steps: -1 48 | 49 | optimizer: 50 | learning_rate: 0.00003 # learning-rate should not be too large 51 | learning_rate_schedule: warmup-linear 52 | warmup_steps: 0 53 | weight_decay: 0.0 54 | adam_epsilon: 0.00000001 55 | max_grad_norm: 1.0 56 | fp16: false 57 | fp16_opt_level: "01" # If fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details at https://nvidia.github.io/apex/amp.html 58 | 59 | eval: 60 | eval_all_checkpoints: false 61 | per_gpu_eval_batch_size: 100 # eval will always run on a single GPU 62 | -------------------------------------------------------------------------------- /examples/squad/configs/bert-base-cased.yaml: -------------------------------------------------------------------------------- 1 | input: 2 | data_dir: /datasets/squad 3 | train_file: train-v1.1.json 4 | predict_file: dev-v1.1.json 5 | version_2_with_negative: false 6 | overwrite_cache: false # Overwrite the cached training and evaluation sets 7 | lang_id: 0 # language id of input for language-specific xlm models 8 | doc_stride: 128 # When splitting up a long document into chunks, how much stride to take between chunks. 9 | threads: 1 # multiple threads for converting example to features 10 | 11 | output: 12 | log_dir: /logfiles/squad11 13 | checkpoint_dir: /checkpoints/squad11 14 | verbose_logging: false # If true, all of the warnings related to data processing will be printed. 15 | 16 | tensorboard: 17 | enable: false 18 | log_dir: /tb-logs/squad11 19 | log_histograms: false # Log histograms in tensorboard. They can be quite large (file size) and slow down training. 
So one might want to disable it 20 | 21 | wandb: 22 | enable: true 23 | project_name: squad-fine-tuning 24 | log_dir: /wandb-logs/squad11 25 | 26 | model: 27 | model_type: bert 28 | model_name_or_path: bert-base-cased # Path to pre-trained model or shortcut name of huggingface transformer models 29 | config_name: bert-base-cased # Pretrained config name or path if not the same as model_name 30 | tokenizer_name: bert-base-cased # Pretrained tokenizer name or path if not the same as model_name 31 | cache_dir: /pre-trained-transformers 32 | do_lower_case: false 33 | max_seq_length: 384 34 | max_query_length: 64 # The maximum number of tokens for the question 35 | max_answer_length: 30 # The maximum length of an answer that can be generated. 36 | null_score_diff_threshold: 0.0 37 | n_best_size: 5 # The total number of n-best predictions to generate in the nbest_predictions.json output file. 38 | 39 | training: 40 | num_train_epochs: 3 41 | max_steps: -1 # overrides num_train_epochs 42 | evaluate_during_training: true 43 | per_gpu_train_batch_size: 8 44 | gradient_accumulation_steps: 1 45 | train_logging_steps: 300 # adjust based on batch_size and number of GPUs used 46 | eval_logging_steps: 2777 # adjust based on batch_size and number of GPUs used 47 | save_steps: -1 48 | 49 | optimizer: 50 | learning_rate: 0.00003 # learning-rate should not be too large 51 | learning_rate_schedule: warmup-linear 52 | warmup_steps: 0 53 | weight_decay: 0.0 54 | adam_epsilon: 0.00000001 55 | max_grad_norm: 1.0 56 | fp16: false 57 | fp16_opt_level: "01" # If fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details at https://nvidia.github.io/apex/amp.html 58 | 59 | eval: 60 | eval_all_checkpoints: false 61 | per_gpu_eval_batch_size: 100 # eval will always run on a single GPU 62 | -------------------------------------------------------------------------------- /examples/squad/configs/roberta-base.yaml: -------------------------------------------------------------------------------- 1 | input: 2 | data_dir: /datasets/squad 3 | train_file: train-v1.1.json 4 | predict_file: dev-v1.1.json 5 | version_2_with_negative: false 6 | overwrite_cache: false # Overwrite the cached training and evaluation sets 7 | lang_id: 0 # language id of input for language-specific xlm models 8 | doc_stride: 128 # When splitting up a long document into chunks, how much stride to take between chunks. 9 | threads: 1 # multiple threads for converting example to features 10 | 11 | output: 12 | log_dir: /logfiles/squad11 13 | checkpoint_dir: /checkpoints/squad11 14 | verbose_logging: false # If true, all of the warnings related to data processing will be printed. 15 | 16 | tensorboard: 17 | enable: false 18 | log_dir: /tb-logs/squad11 19 | log_histograms: false # Log histograms in tensorboard. They can be quite large (file size) and slow down training. 
So one might want to disable it 20 | 21 | wandb: 22 | enable: true 23 | project_name: squad-fine-tuning 24 | log_dir: /wandb-logs/squad11 25 | 26 | model: 27 | model_type: roberta 28 | model_name_or_path: roberta-base # Path to pre-trained model or shortcut name of huggingface transformer models 29 | config_name: roberta-base # Pretrained config name or path if not the same as model_name 30 | tokenizer_name: roberta-base # Pretrained tokenizer name or path if not the same as model_name 31 | cache_dir: /pre-trained-transformers 32 | do_lower_case: false 33 | max_seq_length: 384 34 | max_query_length: 64 # The maximum number of tokens for the question 35 | max_answer_length: 30 # The maximum length of an answer that can be generated. 36 | null_score_diff_threshold: 0.0 37 | n_best_size: 5 # The total number of n-best predictions to generate in the nbest_predictions.json output file. 38 | 39 | training: 40 | num_train_epochs: 3 41 | max_steps: -1 # overrides num_train_epochs 42 | evaluate_during_training: true 43 | per_gpu_train_batch_size: 8 44 | gradient_accumulation_steps: 1 45 | train_logging_steps: 300 # adjust based on batch_size and number of GPUs used 46 | eval_logging_steps: 2777 # adjust based on batch_size and number of GPUs used 47 | save_steps: -1 48 | 49 | optimizer: 50 | learning_rate: 0.00003 # learning-rate should not be too large 51 | learning_rate_schedule: warmup-linear 52 | warmup_steps: 0 53 | weight_decay: 0.0 54 | adam_epsilon: 0.00000001 55 | max_grad_norm: 1.0 56 | fp16: false 57 | fp16_opt_level: "01" # If fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details at https://nvidia.github.io/apex/amp.html 58 | 59 | eval: 60 | eval_all_checkpoints: false 61 | per_gpu_eval_batch_size: 100 # eval will always run on a single GPU 62 | -------------------------------------------------------------------------------- /examples/squad/scripts/iterations_info.txt: -------------------------------------------------------------------------------- 1 | # SQUAD v1.1 2 | 3 | ## Single GPU training 4 | 5 | batch_size iterations 6 | 2 44432 7 | 4 22216 8 | 6 14811 9 | 8 11108 10 | 16 5554 # this is the largest batch size that fits on a P100 with base models (of course it also depends on max_seq_len) 11 | 24 3703 12 | 32 2777 13 | 14 | ## Multi GPU training 15 | 16 | gpus batch_size_per_gpu total_batch_size iterations_per_gpu 17 | 2 8 16 5554 18 | 4 8 32 2777 19 | 20 | # SQUAD v2.0 21 | 22 | ## Single GPU training 23 | 24 | ## Multi GPU training 25 | -------------------------------------------------------------------------------- /examples/squad/scripts/run_scripts.txt: -------------------------------------------------------------------------------- 1 | # Single GPU taining 2 | # - run as follows (args: config file, seed, gpu) 3 | 4 | # For SQUAD v2.0 make sure to set 'version_2_with_negative' to true in config file and change dirs accordingly 5 | 6 | # SQUAD v1.1/v2.0 7 | bash /transformers/examples/squad/scripts/train.sh /transformers/examples/squad/configs/bert-base-cased.yaml 123 7 8 | bash /transformers/examples/squad/scripts/train.sh /transformers/examples/squad/configs/albert-base-v1.yaml 123 7 9 | bash /transformers/examples/squad/scripts/train.sh /transformers/examples/squad/configs/roberta-base.yaml 123 7 10 | 11 | 12 | ------------------------------------------------------------------------------------------------------------------------------------- 13 | 14 | # Multi GPU taining 15 | # - run as follows (args: config file, seed, n_gpus, 
list_of_gpus) 16 | # - make sure to adjust config files accordingly 17 | 18 | # SQUAD v1.1/v2-0 19 | bash /transformers/examples/squad/scripts/train_multi_gpu.sh /transformers/examples/squad/configs/bert-base-cased.yaml 123 4 4,5,6,7 20 | bash /transformers/examples/squad/scripts/train_multi_gpu.sh /transformers/examples/squad/configs/albert-base-v1.yaml 123 4 4,5,6,7 21 | bash /transformers/examples/squad/scripts/train_multi_gpu.sh /transformers/examples/squad/configs/roberta-base.yaml 123 4 4,5,6,7 -------------------------------------------------------------------------------- /examples/squad/scripts/train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="$PYTHONPATH:/transformers" 3 | 4 | # Passed as arguements 5 | CONFIG_FILE=$1 6 | SEED=$2 7 | export CUDA_VISIBLE_DEVICES=$3 8 | 9 | # Setup weights & biases environment variables 10 | # Comment lines below if you don't want to use wandb 11 | export WANDB_API_KEY=your-key 12 | export WANDB_USERNAME="your-username" 13 | export WANDB_ENTITY="your-entity" 14 | 15 | python /transformers/examples/squad/run_finetuning.py \ 16 | --config ${CONFIG_FILE} \ 17 | --do_train \ 18 | --do_eval \ 19 | --seed ${SEED} -------------------------------------------------------------------------------- /examples/squad/scripts/train_multi_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="$PYTHONPATH:/transformers" 3 | 4 | # Passed as arguements 5 | CONFIG_FILE=$1 6 | SEED=$2 7 | N_GPUS=$3 8 | export CUDA_VISIBLE_DEVICES=$4 9 | 10 | # Setup weights & biases environment variables 11 | # Comment lines below if you don't want to use wandb 12 | export WANDB_API_KEY=your-key 13 | export WANDB_USERNAME="your-username" 14 | export WANDB_ENTITY="your-entity" 15 | 16 | python -m torch.distributed.launch --nproc_per_node ${N_GPUS} /transformers/examples/squad/run_finetuning.py \ 17 | --config ${CONFIG_FILE} \ 18 | --do_train \ 19 | --do_eval \ 20 | --seed ${SEED} -------------------------------------------------------------------------------- /examples/squad/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import yaml 4 | 5 | import torch 6 | import numpy as np 7 | from attrdict import AttrDict 8 | 9 | 10 | def read_config(config_file): 11 | # Source: https://stackoverflow.com/questions/1773805/how-can-i-parse-a-yaml-file-in-python 12 | with open(config_file, 'r') as stream: 13 | try: 14 | config = yaml.safe_load(stream) 15 | except yaml.YAMLError as exc: 16 | print(f"Catched the following YAMLError:\n{exc}") 17 | 18 | # Convert to AttrDict to allow acessing by dot e.g. 
config.seed 19 | config = AttrDict(config) 20 | 21 | return config 22 | 23 | 24 | def save_config(config_file, output_file): 25 | config_file = dict(config_file) 26 | with open(output_file, 'w') as yaml_file: 27 | yaml.dump(config_file, yaml_file, default_flow_style=True) 28 | 29 | 30 | def create_unique_dir(path, config, timestamp): 31 | new_dir = os.path.join(path, timestamp) 32 | 33 | for name in [config.model.model_name_or_path, config.optimizer.learning_rate_schedule]: 34 | new_dir += f'_{name}' 35 | 36 | if config.optimizer.fp16: 37 | new_dir += f'_fp16_{config.optimizer.fp16_opt_level}' 38 | 39 | if not os.path.exists(new_dir): 40 | os.makedirs(new_dir) 41 | 42 | return new_dir 43 | 44 | 45 | def set_seed(args): 46 | random.seed(args.seed) 47 | np.random.seed(args.seed) 48 | torch.manual_seed(args.seed) 49 | if args.n_gpu > 0: 50 | torch.cuda.manual_seed_all(args.seed) 51 | 52 | 53 | def to_list(tensor): 54 | return tensor.detach().cpu().tolist() -------------------------------------------------------------------------------- /examples/summarization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/examples/summarization/__init__.py -------------------------------------------------------------------------------- /examples/summarization/bart/README.md: -------------------------------------------------------------------------------- 1 | ### Get the CNN Data 2 | To be able to reproduce the authors' results on the CNN/Daily Mail dataset, you first need to download both the CNN and Daily Mail datasets [from Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/) (the links next to "Stories") into the same folder. Then uncompress the archives by running: 3 | 4 | ```bash 5 | tar -xvf cnn_stories.tgz && tar -xvf dailymail_stories.tgz 6 | ``` 7 | This should make a directory called cnn_dm/ with files like `test.source`. 8 | To use your own data, copy that file format. Each article to be summarized is on its own line. 9 | 10 | ### Usage 11 | To create summaries for each article in the dataset, run: 12 | ```bash 13 | python evaluate_cnn.py cnn_dm/test.source cnn_test_summaries.txt 14 | ``` 15 | The default batch size, 8, fits in 16GB GPU memory, but it may need to be adjusted to fit your system. 16 | 17 | ### Where is the code? 18 | The core model is in `src/transformers/modeling_bart.py`. This directory only contains examples. 19 | 20 | ### (WIP) Rouge Scores 21 | 22 | ### Stanford CoreNLP Setup 23 | ``` 24 | ptb_tokenize () { 25 | cat $1 | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > $2 26 | } 27 | 28 | sudo apt install openjdk-8-jre-headless 29 | sudo apt-get install ant 30 | wget http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip 31 | unzip stanford-corenlp-full-2018-10-05.zip 32 | cd stanford-corenlp-full-2018-10-05 33 | export CLASSPATH=stanford-corenlp-3.9.2.jar:stanford-corenlp-3.9.2-models.jar 34 | ``` 35 | Then run `ptb_tokenize` on `test.target` and your generated hypotheses. 36 | ### Rouge Setup 37 | Install `files2rouge` following the instructions [here](https://github.com/pltrdy/files2rouge).
38 | I also needed to run `sudo apt-get install libxml-parser-perl` 39 | 40 | ```python 41 | from files2rouge import files2rouge 42 | from files2rouge import settings 43 | files2rouge.run(, 44 | , 45 | saveto='rouge_output.txt') 46 | ``` 47 | -------------------------------------------------------------------------------- /examples/summarization/bart/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/examples/summarization/bart/__init__.py -------------------------------------------------------------------------------- /examples/summarization/bart/evaluate_cnn.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | import torch 5 | from tqdm import tqdm 6 | 7 | from transformers import BartForConditionalGeneration, BartTokenizer 8 | 9 | 10 | DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu" 11 | 12 | 13 | def chunks(lst, n): 14 | """Yield successive n-sized chunks from lst.""" 15 | for i in range(0, len(lst), n): 16 | yield lst[i : i + n] 17 | 18 | 19 | def generate_summaries(lns, out_file, batch_size=8, device=DEFAULT_DEVICE): 20 | fout = Path(out_file).open("w") 21 | model = BartForConditionalGeneration.from_pretrained("bart-large-cnn", output_past=True,).to(device) 22 | tokenizer = BartTokenizer.from_pretrained("bart-large") 23 | 24 | max_length = 140 25 | min_length = 55 26 | 27 | for batch in tqdm(list(chunks(lns, batch_size))): 28 | dct = tokenizer.batch_encode_plus(batch, max_length=1024, return_tensors="pt", pad_to_max_length=True) 29 | summaries = model.generate( 30 | input_ids=dct["input_ids"].to(device), 31 | attention_mask=dct["attention_mask"].to(device), 32 | num_beams=4, 33 | length_penalty=2.0, 34 | max_length=max_length + 2, # +2 from original because we start at step=1 and stop before max_length 35 | min_length=min_length + 1, # +1 from original because we start at step=1 36 | no_repeat_ngram_size=3, 37 | early_stopping=True, 38 | do_sample=False, 39 | decoder_start_token_id=model.config.eos_token_ids[0], 40 | ) 41 | dec = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries] 42 | for hypothesis in dec: 43 | fout.write(hypothesis + "\n") 44 | fout.flush() 45 | 46 | 47 | def _run_generate(): 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument( 50 | "source_path", type=str, help="like cnn_dm/test.source", 51 | ) 52 | parser.add_argument( 53 | "output_path", type=str, help="where to save summaries", 54 | ) 55 | parser.add_argument( 56 | "--device", type=str, required=False, default=DEFAULT_DEVICE, help="cuda, cuda:1, cpu etc.", 57 | ) 58 | parser.add_argument( 59 | "--bs", type=int, default=8, required=False, help="batch size: how many to summarize at a time", 60 | ) 61 | args = parser.parse_args() 62 | lns = [" " + x.rstrip() for x in open(args.source_path).readlines()] 63 | generate_summaries(lns, args.output_path, batch_size=args.bs, device=args.device) 64 | 65 | 66 | if __name__ == "__main__": 67 | _run_generate() 68 | -------------------------------------------------------------------------------- /examples/summarization/bart/test_bart_examples.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | import tempfile 4 | import unittest 5 | from pathlib import Path 6 | from unittest.mock import patch 
7 | 8 | from .evaluate_cnn import _run_generate 9 | 10 | 11 | articles = [" New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County."] 12 | 13 | logging.basicConfig(level=logging.DEBUG) 14 | 15 | logger = logging.getLogger() 16 | 17 | 18 | class TestBartExamples(unittest.TestCase): 19 | def test_bart_cnn_cli(self): 20 | stream_handler = logging.StreamHandler(sys.stdout) 21 | logger.addHandler(stream_handler) 22 | tmp = Path(tempfile.gettempdir()) / "utest_generations.hypo" 23 | with tmp.open("w") as f: 24 | f.write("\n".join(articles)) 25 | testargs = ["evaluate_cnn.py", str(tmp), "output.txt"] 26 | with patch.object(sys, "argv", testargs): 27 | _run_generate() 28 | self.assertTrue(Path("output.txt").exists()) 29 | -------------------------------------------------------------------------------- /examples/summarization/bertabs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/examples/summarization/bertabs/__init__.py -------------------------------------------------------------------------------- /examples/summarization/bertabs/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | 3 | # For ROUGE 4 | nltk 5 | py-rouge 6 | -------------------------------------------------------------------------------- /examples/tests_samples/.gitignore: -------------------------------------------------------------------------------- 1 | *.* 2 | cache* 3 | temp* 4 | !*.tsv 5 | !*.json 6 | !.gitignore -------------------------------------------------------------------------------- /examples/tests_samples/MRPC/dev.tsv: -------------------------------------------------------------------------------- 1 | Quality #1 ID #2 ID #1 String #2 String 2 | 1 1355540 1355592 He said the foodservice pie business doesn 't fit the company 's long-term growth strategy . " The foodservice pie business does not fit our long-term growth strategy . 3 | 0 2029631 2029565 Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war . His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war . 4 | 0 487993 487952 The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat . The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent . 5 | 1 1989515 1989458 The AFL-CIO is waiting until October to decide if it will endorse a candidate . The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries . 6 | 0 1783137 1782659 No dates have been set for the civil or the criminal trial . No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty . 7 | 1 3039165 3039036 Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed . It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status . 
8 | -------------------------------------------------------------------------------- /examples/tests_samples/MRPC/train.tsv: -------------------------------------------------------------------------------- 1 | Quality #1 ID #2 ID #1 String #2 String 2 | 1 1355540 1355592 He said the foodservice pie business doesn 't fit the company 's long-term growth strategy . " The foodservice pie business does not fit our long-term growth strategy . 3 | 0 2029631 2029565 Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war . His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war . 4 | 0 487993 487952 The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat . The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent . 5 | 1 1989515 1989458 The AFL-CIO is waiting until October to decide if it will endorse a candidate . The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries . 6 | 0 1783137 1782659 No dates have been set for the civil or the criminal trial . No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty . 7 | 1 3039165 3039036 Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed . It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status . 8 | -------------------------------------------------------------------------------- /images/fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/images/fig1.png -------------------------------------------------------------------------------- /images/tab1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/images/tab1.png -------------------------------------------------------------------------------- /model_cards/DeepPavlov/bert-base-bg-cs-pl-ru-cased/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - bulgarian 4 | - czech 5 | - polish 6 | - russian 7 | --- 8 | 9 | # bert-base-bg-cs-pl-ru-cased 10 | 11 | SlavicBERT\[1\] \(Slavic \(bg, cs, pl, ru\), cased, 12‑layer, 768‑hidden, 12‑heads, 180M parameters\) was trained on Russian News and four Wikipedias: Bulgarian, Czech, Polish, and Russian. Subtoken vocabulary was built using this data. Multilingual BERT was used as an initialization for SlavicBERT. 12 | 13 | 14 | \[1\]: Arkhipov M., Trofimova M., Kuratov Y., Sorokin A. \(2019\). [Tuning Multilingual Transformers for Language-Specific Named Entity Recognition](https://www.aclweb.org/anthology/W19-3712/). ACL anthology W19-3712. 
15 | -------------------------------------------------------------------------------- /model_cards/DeepPavlov/bert-base-cased-conversational/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - english 4 | --- 5 | 6 | # bert-base-cased-conversational 7 | 8 | Conversational BERT \(English, cased, 12‑layer, 768‑hidden, 12‑heads, 110M parameters\) was trained on the English part of Twitter, Reddit, DailyDialogues\[1\], OpenSubtitles\[2\], Debates\[3\], Blogs\[4\], Facebook News Comments. We used this training data to build the vocabulary of English subtokens and took English cased version of BERT‑base as an initialization for English Conversational BERT. 9 | 10 | 11 | \[1\]: Yanran Li, Hui Su, Xiaoyu Shen, Wenjie Li, Ziqiang Cao, and Shuzi Niu. DailyDialog: A Manually Labelled Multi-turn Dialogue Dataset. IJCNLP 2017. 12 | 13 | \[2\]: P. Lison and J. Tiedemann, 2016, OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles. In Proceedings of the 10th International Conference on Language Resources and Evaluation \(LREC 2016\) 14 | 15 | \[3\]: Justine Zhang, Ravi Kumar, Sujith Ravi, Cristian Danescu-Niculescu-Mizil. Proceedings of NAACL, 2016. 16 | 17 | \[4\]: J. Schler, M. Koppel, S. Argamon and J. Pennebaker \(2006\). Effects of Age and Gender on Blogging in Proceedings of 2006 AAAI Spring Symposium on Computational Approaches for Analyzing Weblogs. 18 | -------------------------------------------------------------------------------- /model_cards/DeepPavlov/bert-base-multilingual-cased-sentence/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - multilingual 4 | --- 5 | 6 | # bert-base-multilingual-cased-sentence 7 | 8 | Sentence Multilingual BERT \(101 languages, cased, 12‑layer, 768‑hidden, 12‑heads, 180M parameters\) is a representation‑based sentence encoder for 101 languages of Multilingual BERT. It is initialized with Multilingual BERT and then fine‑tuned on english MultiNLI\[1\] and on dev set of multilingual XNLI\[2\]. Sentence representations are mean pooled token embeddings in the same manner as in Sentence‑BERT\[3\]. 9 | 10 | 11 | \[1\]: Williams A., Nangia N. & Bowman S. \(2017\) A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference. arXiv preprint [arXiv:1704.05426](https://arxiv.org/abs/1704.05426) 12 | 13 | \[2\]: Williams A., Bowman S. \(2018\) XNLI: Evaluating Cross-lingual Sentence Representations. arXiv preprint [arXiv:1809.05053](https://arxiv.org/abs/1809.05053) 14 | 15 | \[3\]: N. Reimers, I. Gurevych \(2019\) Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. arXiv preprint [arXiv:1908.10084](https://arxiv.org/abs/1908.10084) 16 | -------------------------------------------------------------------------------- /model_cards/DeepPavlov/rubert-base-cased-conversational/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - russian 4 | --- 5 | 6 | # rubert-base-cased-conversational 7 | 8 | Conversational RuBERT \(Russian, cased, 12‑layer, 768‑hidden, 12‑heads, 180M parameters\) was trained on OpenSubtitles\[1\], [Dirty](https://d3.ru/), [Pikabu](https://pikabu.ru/), and a Social Media segment of Taiga corpus\[2\]. We assembled a new vocabulary for Conversational RuBERT model on this data and initialized the model with [RuBERT](../rubert-base-cased). 9 | 10 | 11 | \[1\]: P. Lison and J. 
Tiedemann, 2016, OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles. In Proceedings of the 10th International Conference on Language Resources and Evaluation \(LREC 2016\) 12 | 13 | \[2\]: Shavrina T., Shapovalova O. \(2017\) TO THE METHODOLOGY OF CORPUS CONSTRUCTION FOR MACHINE LEARNING: «TAIGA» SYNTAX TREE CORPUS AND PARSER. in proc. of “CORPORA2017”, international conference , Saint-Petersbourg, 2017. 14 | -------------------------------------------------------------------------------- /model_cards/DeepPavlov/rubert-base-cased-sentence/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - russian 4 | --- 5 | 6 | # rubert-base-cased-sentence 7 | 8 | Sentence RuBERT \(Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters\) is a representation‑based sentence encoder for Russian. It is initialized with RuBERT and fine‑tuned on SNLI\[1\] google-translated to russian and on russian part of XNLI dev set\[2\]. Sentence representations are mean pooled token embeddings in the same manner as in Sentence‑BERT\[3\]. 9 | 10 | 11 | \[1\]: S. R. Bowman, G. Angeli, C. Potts, and C. D. Manning. \(2015\) A large annotated corpus for learning natural language inference. arXiv preprint [arXiv:1508.05326](https://arxiv.org/abs/1508.05326) 12 | 13 | \[2\]: Williams A., Bowman S. \(2018\) XNLI: Evaluating Cross-lingual Sentence Representations. arXiv preprint [arXiv:1809.05053](https://arxiv.org/abs/1809.05053) 14 | 15 | \[3\]: N. Reimers, I. Gurevych \(2019\) Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. arXiv preprint [arXiv:1908.10084](https://arxiv.org/abs/1908.10084) 16 | -------------------------------------------------------------------------------- /model_cards/DeepPavlov/rubert-base-cased/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - russian 4 | --- 5 | 6 | # rubert-base-cased 7 | 8 | RuBERT \(Russian, cased, 12‑layer, 768‑hidden, 12‑heads, 180M parameters\) was trained on the Russian part of Wikipedia and news data. We used this training data to build a vocabulary of Russian subtokens and took a multilingual version of BERT‑base as an initialization for RuBERT\[1\]. 9 | 10 | 11 | \[1\]: Kuratov, Y., Arkhipov, M. \(2019\). Adaptation of Deep Bidirectional Multilingual Transformers for Russian Language. arXiv preprint [arXiv:1905.07213](https://arxiv.org/abs/1905.07213). 
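No usage snippet is included in the card above. As a minimal sketch (assuming the checkpoint is hosted under `DeepPavlov/rubert-base-cased`, the identifier implied by this card's path), the model can be loaded and queried for hidden states like this:

```python
import torch
from transformers import AutoTokenizer, AutoModel

# Hypothetical hub identifier, inferred from the model card path
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased")

input_ids = tokenizer.encode("Привет, мир!", return_tensors="pt")
with torch.no_grad():
    # BertModel returns (sequence_output, pooled_output) in Transformers 2.x
    sequence_output, pooled_output = model(input_ids)
```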
12 | -------------------------------------------------------------------------------- /model_cards/ahotrod/albert_xxlargev1_squad2_512/README.md: -------------------------------------------------------------------------------- 1 | ## Albert xxlarge version 1 language model fine-tuned on SQuAD2.0 2 | 3 | ### with the following results: 4 | 5 | ``` 6 | exact: 85.65653162637918 7 | f1: 89.260458954177 8 | total': 11873 9 | HasAns_exact': 82.6417004048583 10 | HasAns_f1': 89.8598902096736 11 | HasAns_total': 5928 12 | NoAns_exact': 88.66274179983179 13 | NoAns_f1': 88.66274179983179 14 | NoAns_total': 5945 15 | best_exact': 85.65653162637918 16 | best_exact_thresh': 0.0 17 | best_f1': 89.2604589541768 18 | best_f1_thresh': 0.0 19 | ``` 20 | 21 | ### from script: 22 | 23 | ``` 24 | python -m torch.distributed.launch --nproc_per_node=2 ${RUN_SQUAD_DIR}/run_squad.py \ 25 | --model_type albert \ 26 | --model_name_or_path albert-xxlarge-v1 \ 27 | --do_train \ 28 | --train_file ${SQUAD_DIR}/train-v2.0.json \ 29 | --predict_file ${SQUAD_DIR}/dev-v2.0.json \ 30 | --version_2_with_negative \ 31 | --num_train_epochs 3 \ 32 | --max_steps 8144 \ 33 | --warmup_steps 814 \ 34 | --do_lower_case \ 35 | --learning_rate 3e-5 \ 36 | --max_seq_length 512 \ 37 | --doc_stride 128 \ 38 | --save_steps 2000 \ 39 | --per_gpu_train_batch_size 1 \ 40 | --gradient_accumulation_steps 24 \ 41 | --output_dir ${MODEL_PATH} 42 | 43 | CUDA_VISIBLE_DEVICES=0 python ${RUN_SQUAD_DIR}/run_squad.py \ 44 | --model_type albert \ 45 | --model_name_or_path ${MODEL_PATH} \ 46 | --do_eval \ 47 | --train_file ${SQUAD_DIR}/train-v2.0.json \ 48 | --predict_file ${SQUAD_DIR}/dev-v2.0.json \ 49 | --version_2_with_negative \ 50 | --do_lower_case \ 51 | --max_seq_length 512 \ 52 | --per_gpu_eval_batch_size 48 \ 53 | --output_dir ${MODEL_PATH} 54 | ``` 55 | 56 | ### using the following system & software: 57 | 58 | ``` 59 | OS/Platform: Linux-4.15.0-76-generic-x86_64-with-debian-buster-sid 60 | GPU/CPU: 2 x NVIDIA 1080Ti / Intel i7-8700 61 | Transformers: 2.3.0 62 | PyTorch: 1.4.0 63 | TensorFlow: 2.1.0 64 | Python: 3.7.6 65 | ``` 66 | 67 | ### Inferencing / prediction works with the current Transformers v2.4.1 68 | 69 | ### Access this albert_xxlargev1_sqd2_512 fine-tuned model with "tried & true" code: 70 | 71 | ```python 72 | config_class, model_class, tokenizer_class = \ 73 | AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer 74 | 75 | model_name_or_path = "ahotrod/albert_xxlargev1_squad2_512" 76 | config = config_class.from_pretrained(model_name_or_path) 77 | tokenizer = tokenizer_class.from_pretrained(model_name_or_path, do_lower_case=True) 78 | model = model_class.from_pretrained(model_name_or_path, config=config) 79 | ``` 80 | 81 | ### or the AutoModels (AutoConfig, AutoTokenizer & AutoModel) should also work, however I have yet to use them in my app & confirm: 82 | 83 | ```python 84 | from transformers import AutoConfig, AutoTokenizer, AutoModel 85 | 86 | model_name_or_path = "ahotrod/albert_xxlargev1_squad2_512" 87 | config = AutoConfig.from_pretrained(model_name_or_path) 88 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, do_lower_case=True) 89 | model = AutoModel.from_pretrained(model_name_or_path, config=config) 90 | ``` -------------------------------------------------------------------------------- /model_cards/ahotrod/xlnet_large_squad2_512/README.md: -------------------------------------------------------------------------------- 1 | ## XLNet large language model fine-tuned on SQuAD2.0 2 | 3 | ### with the 
following results: 4 | 5 | ``` 6 | "exact": 82.07698138633876, 7 | "f1": 85.898874470488, 8 | "total": 11873, 9 | "HasAns_exact": 79.60526315789474, 10 | "HasAns_f1": 87.26000954590184, 11 | "HasAns_total": 5928, 12 | "NoAns_exact": 84.54163162321278, 13 | "NoAns_f1": 84.54163162321278, 14 | "NoAns_total": 5945, 15 | "best_exact": 83.22243746315169, 16 | "best_exact_thresh": -11.112004280090332, 17 | "best_f1": 86.88541353813282, 18 | "best_f1_thresh": -11.112004280090332 19 | ``` 20 | ### from script: 21 | ``` 22 | python -m torch.distributed.launch --nproc_per_node=2 ${RUN_SQUAD_DIR}/run_squad.py \ 23 | --model_type xlnet \ 24 | --model_name_or_path xlnet-large-cased \ 25 | --do_train \ 26 | --train_file ${SQUAD_DIR}/train-v2.0.json \ 27 | --predict_file ${SQUAD_DIR}/dev-v2.0.json \ 28 | --version_2_with_negative \ 29 | --num_train_epochs 3 \ 30 | --learning_rate 3e-5 \ 31 | --adam_epsilon 1e-6 \ 32 | --max_seq_length 512 \ 33 | --doc_stride 128 \ 34 | --save_steps 2000 \ 35 | --per_gpu_train_batch_size 1 \ 36 | --gradient_accumulation_steps 24 \ 37 | --output_dir ${MODEL_PATH} 38 | 39 | CUDA_VISIBLE_DEVICES=0 python ${RUN_SQUAD_DIR}/run_squad_II.py \ 40 | --model_type xlnet \ 41 | --model_name_or_path ${MODEL_PATH} \ 42 | --do_eval \ 43 | --train_file ${SQUAD_DIR}/train-v2.0.json \ 44 | --predict_file ${SQUAD_DIR}/dev-v2.0.json \ 45 | --version_2_with_negative \ 46 | --max_seq_length 512 \ 47 | --per_gpu_eval_batch_size 48 \ 48 | --output_dir ${MODEL_PATH} 49 | ``` 50 | ### using the following system & software: 51 | ``` 52 | OS/Platform: Linux-4.15.0-76-generic-x86_64-with-debian-buster-sid 53 | GPU/CPU: 2 x NVIDIA 1080Ti / Intel i7-8700 54 | Transformers: 2.1.1 55 | PyTorch: 1.4.0 56 | TensorFlow: 2.1.0 57 | Python: 3.7.6 58 | ``` 59 | ### Inferencing / prediction works with Transformers v2.4.1, the latest version tested 60 | 61 | ### Utilize this xlnet_large_squad2_512 fine-tuned model with: 62 | ```python 63 | config_class, model_class, tokenizer_class = \ 64 | XLNetConfig, XLNetforQuestionAnswering, XLNetTokenizer 65 | model_name_or_path = "ahotrod/xlnet_large_squad2_512" 66 | config = config_class.from_pretrained(model_name_or_path) 67 | tokenizer = tokenizer_class.from_pretrained(model_name_or_path, do_lower_case=True) 68 | model = model_class.from_pretrained(model_name_or_path, config=config) 69 | ``` 70 | ### or the AutoModels (AutoConfig, AutoTokenizer & AutoModel) should also work, however I have yet to use them in my apps & confirm: 71 | ```python 72 | from transformers import AutoConfig, AutoTokenizer, AutoModel 73 | model_name_or_path = "ahotrod/xlnet_large_squad2_512" 74 | config = AutoConfig.from_pretrained(model_name_or_path) 75 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, do_lower_case=True) 76 | model = AutoModel.from_pretrained(model_name_or_path, config=config) 77 | ``` 78 | -------------------------------------------------------------------------------- /model_cards/asafaya/bert-base-arabic/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: arabic 3 | --- 4 | 5 | # Arabic BERT Model 6 | 7 | Pretrained BERT base language model for Arabic 8 | 9 | ## Pretraining Corpus 10 | 11 | `arabic-bert-base` model was pretrained on ~8.2 Billion words: 12 | 13 | - Arabic version of [OSCAR](https://traces1.inria.fr/oscar/) - filtered from [Common Crawl](http://commoncrawl.org/) 14 | - Recent dump of Arabic [Wikipedia](https://dumps.wikimedia.org/backup-index.html) 15 | 16 | and other Arabic resources 
which sum up to ~95GB of text. 17 | 18 | __Notes on training data:__ 19 | 20 | - Our final version of corpus contains some non-Arabic words inlines, which we did not remove from sentences since that would affect some tasks like NER. 21 | - Although non-Arabic characters were lowered as a preprocessing step, since Arabic characters does not have upper or lower case, there is no cased and uncased version of the model. 22 | - The corpus and vocabulary set are not restricted to Modern Standard Arabic, they contain some dialectical Arabic too. 23 | 24 | ## Pretraining details 25 | 26 | - This model was trained using Google BERT's github [repository](https://github.com/google-research/bert) on a single TPU v3-8 provided for free from [TFRC](https://www.tensorflow.org/tfrc). 27 | - Our pretraining procedure follows training settings of bert with some changes: trained for 3M training steps with batchsize of 128, instead of 1M with batchsize of 256. 28 | 29 | ## Load Pretrained Model 30 | 31 | You can use this model by installing `torch` or `tensorflow` and Huggingface library `transformers`. And you can use it directly by initializing it like this: 32 | 33 | ```python 34 | from transformers import AutoTokenizer, AutoModel 35 | 36 | tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic") 37 | model = AutoModel.from_pretrained("asafaya/bert-base-arabic") 38 | ``` 39 | 40 | ## Results 41 | 42 | For further details on the models performance or any other queries, please refer to [Arabic-BERT](https://github.com/alisafaya/Arabic-BERT) 43 | 44 | ## Acknowledgement 45 | 46 | Thanks to Google for providing free TPU for the training process and for Huggingface for hosting this model on their servers 😊 47 | 48 | 49 | -------------------------------------------------------------------------------- /model_cards/binwang/xlnet-base-cased/README.md: -------------------------------------------------------------------------------- 1 | This model is pre-trained **XLNET** with 12 layers. 2 | 3 | It comes with paper: SBERT-WK: A Sentence Embedding Method By Dissecting BERT-based Word Models 4 | 5 | Project Page: [SBERT-WK](https://github.com/BinWang28/SBERT-WK-Sentence-Embedding) 6 | -------------------------------------------------------------------------------- /model_cards/camembert-base-README.md: -------------------------------------------------------------------------------- 1 | # CamemBERT 2 | 3 | CamemBERT is a state-of-the-art language model for French based on the RoBERTa architecture pretrained on the French subcorpus of the newly available multilingual corpus OSCAR. 4 | 5 | CamemBERT was originally evaluated on four different downstream tasks for French: part-of-speech (POS) tagging, dependency parsing, named entity recognition (NER) and natural language inference (NLI); improving the state of the art for most tasks over previous monolingual and multilingual approaches, which confirms the effectiveness of large pretrained language models for French. 6 | 7 | CamemBERT was trained and evaluated by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suárez, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 
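A loading example is not part of the original card; the sketch below assumes the canonical `camembert-base` checkpoint name and the dedicated CamemBERT classes in `transformers`:

```python
from transformers import CamembertModel, CamembertTokenizer

# "camembert-base" is assumed to be the published checkpoint described by this card
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertModel.from_pretrained("camembert-base")
```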
8 | 9 | Preprint can be found [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 10 | -------------------------------------------------------------------------------- /model_cards/canwenxu/BERT-of-Theseus-MNLI/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | thumbnail: https://raw.githubusercontent.com/JetRunner/BERT-of-Theseus/master/bert-of-theseus.png 3 | --- 4 | 5 | # BERT-of-Theseus 6 | See our paper ["BERT-of-Theseus: Compressing BERT by Progressive Module Replacing"](http://arxiv.org/abs/2002.02925). 7 | 8 | BERT-of-Theseus is a new compressed BERT by progressively replacing the components of the original BERT. 9 | 10 | ![BERT of Theseus](https://github.com/JetRunner/BERT-of-Theseus/blob/master/bert-of-theseus.png?raw=true) 11 | 12 | ## Load Pretrained Model on MNLI 13 | 14 | We provide a 6-layer pretrained model on MNLI as a general-purpose model, which can transfer to other sentence classification tasks, outperforming DistillBERT (with the same 6-layer structure) on six tasks of GLUE (dev set). 15 | 16 | | Method | MNLI | MRPC | QNLI | QQP | RTE | SST-2 | STS-B | 17 | |-----------------|------|------|------|------|------|-------|-------| 18 | | BERT-base | 83.5 | 89.5 | 91.2 | 89.8 | 71.1 | 91.5 | 88.9 | 19 | | DistillBERT | 79.0 | 87.5 | 85.3 | 84.9 | 59.9 | 90.7 | 81.2 | 20 | | BERT-of-Theseus | 82.1 | 87.5 | 88.8 | 88.8 | 70.1 | 91.8 | 87.8 | 21 | -------------------------------------------------------------------------------- /model_cards/dbmdz/bert-base-german-europeana-cased/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: german 3 | tags: 4 | - "historic german" 5 | --- 6 | 7 | # 🤗 + 📚 dbmdz BERT models 8 | 9 | In this repository the MDZ Digital Library team (dbmdz) at the Bavarian State 10 | Library open sources German Europeana BERT models 🎉 11 | 12 | # German Europeana BERT 13 | 14 | We use the open source [Europeana newspapers](http://www.europeana-newspapers.eu/) 15 | that were provided by *The European Library*. The final 16 | training corpus has a size of 51GB and consists of 8,035,986,369 tokens. 17 | 18 | Detailed information about the data and pretraining steps can be found in 19 | [this repository](https://github.com/stefan-it/europeana-bert). 20 | 21 | ## Model weights 22 | 23 | Currently only PyTorch-[Transformers](https://github.com/huggingface/transformers) 24 | compatible weights are available. If you need access to TensorFlow checkpoints, 25 | please raise an issue! 26 | 27 | | Model | Downloads 28 | | ------------------------------------------ | --------------------------------------------------------------------------------------------------------------- 29 | | `dbmdz/bert-base-german-europeana-cased` | [`config.json`](https://cdn.huggingface.co/dbmdz/bert-base-german-europeana-cased/config.json) • [`pytorch_model.bin`](https://cdn.huggingface.co/dbmdz/bert-base-german-europeana-cased/pytorch_model.bin) • [`vocab.txt`](https://cdn.huggingface.co/dbmdz/bert-base-german-europeana-cased/vocab.txt) 30 | 31 | ## Results 32 | 33 | For results on Historic NER, please refer to [this repository](https://github.com/stefan-it/europeana-bert). 
34 | 35 | ## Usage 36 | 37 | With Transformers >= 2.3 our German Europeana BERT models can be loaded like: 38 | 39 | ```python 40 | from transformers import AutoModel, AutoTokenizer 41 | 42 | tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-europeana-cased") 43 | model = AutoModel.from_pretrained("dbmdz/bert-base-german-europeana-cased") 44 | ``` 45 | 46 | # Huggingface model hub 47 | 48 | All models are available on the [Huggingface model hub](https://huggingface.co/dbmdz). 49 | 50 | # Contact (Bugs, Feedback, Contribution and more) 51 | 52 | For questions about our BERT models just open an issue 53 | [here](https://github.com/dbmdz/berts/issues/new) 🤗 54 | 55 | # Acknowledgments 56 | 57 | Research supported with Cloud TPUs from Google's TensorFlow Research Cloud (TFRC). 58 | Thanks for providing access to the TFRC ❤️ 59 | 60 | Thanks to the generous support from the [Hugging Face](https://huggingface.co/) team, 61 | it is possible to download both cased and uncased models from their S3 storage 🤗 62 | -------------------------------------------------------------------------------- /model_cards/dbmdz/bert-base-german-europeana-uncased/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: german 3 | tags: 4 | - "historic german" 5 | --- 6 | 7 | # 🤗 + 📚 dbmdz BERT models 8 | 9 | In this repository the MDZ Digital Library team (dbmdz) at the Bavarian State 10 | Library open sources German Europeana BERT models 🎉 11 | 12 | # German Europeana BERT 13 | 14 | We use the open source [Europeana newspapers](http://www.europeana-newspapers.eu/) 15 | that were provided by *The European Library*. The final 16 | training corpus has a size of 51GB and consists of 8,035,986,369 tokens. 17 | 18 | Detailed information about the data and pretraining steps can be found in 19 | [this repository](https://github.com/stefan-it/europeana-bert). 20 | 21 | ## Model weights 22 | 23 | Currently only PyTorch-[Transformers](https://github.com/huggingface/transformers) 24 | compatible weights are available. If you need access to TensorFlow checkpoints, 25 | please raise an issue! 26 | 27 | | Model | Downloads 28 | | ------------------------------------------ | --------------------------------------------------------------------------------------------------------------- 29 | | `dbmdz/bert-base-german-europeana-uncased` | [`config.json`](https://cdn.huggingface.co/dbmdz/bert-base-german-europeana-uncased/config.json) • [`pytorch_model.bin`](https://cdn.huggingface.co/dbmdz/bert-base-german-europeana-uncased/pytorch_model.bin) • [`vocab.txt`](https://cdn.huggingface.co/dbmdz/bert-base-german-europeana-uncased/vocab.txt) 30 | 31 | ## Results 32 | 33 | For results on Historic NER, please refer to [this repository](https://github.com/stefan-it/europeana-bert). 34 | 35 | ## Usage 36 | 37 | With Transformers >= 2.3 our German Europeana BERT models can be loaded like: 38 | 39 | ```python 40 | from transformers import AutoModel, AutoTokenizer 41 | 42 | tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-europeana-uncased") 43 | model = AutoModel.from_pretrained("dbmdz/bert-base-german-europeana-uncased") 44 | ``` 45 | 46 | # Huggingface model hub 47 | 48 | All models are available on the [Huggingface model hub](https://huggingface.co/dbmdz). 
49 | 50 | # Contact (Bugs, Feedback, Contribution and more) 51 | 52 | For questions about our BERT models just open an issue 53 | [here](https://github.com/dbmdz/berts/issues/new) 🤗 54 | 55 | # Acknowledgments 56 | 57 | Research supported with Cloud TPUs from Google's TensorFlow Research Cloud (TFRC). 58 | Thanks for providing access to the TFRC ❤️ 59 | 60 | Thanks to the generous support from the [Hugging Face](https://huggingface.co/) team, 61 | it is possible to download both cased and uncased models from their S3 storage 🤗 62 | -------------------------------------------------------------------------------- /model_cards/djstrong/bg_cs_pl_ru_cased_L-12_H-768_A-12/README.md: -------------------------------------------------------------------------------- 1 | Slavic BERT from https://github.com/deepmipt/Slavic-BERT-NER http://files.deeppavlov.ai/deeppavlov_data/bg_cs_pl_ru_cased_L-12_H-768_A-12.tar.gz 2 | -------------------------------------------------------------------------------- /model_cards/emilyalsentzer/Bio_ClinicalBERT/README.md: -------------------------------------------------------------------------------- 1 | 2 | # ClinicalBERT - Bio + Clinical BERT Model 3 | 4 | The [Publicly Available Clinical BERT Embeddings](https://arxiv.org/abs/1904.03323) paper contains four unique clinicalBERT models: initialized with BERT-Base (`cased_L-12_H-768_A-12`) or BioBERT (`BioBERT-Base v1.0 + PubMed 200K + PMC 270K`) & trained on either all MIMIC notes or only discharge summaries. 5 | 6 | This model card describes the Bio+Clinical BERT model, which was initialized from [BioBERT](https://arxiv.org/abs/1901.08746) & trained on all MIMIC notes. 7 | 8 | ## Pretraining Data 9 | The `Bio_ClinicalBERT` model was trained on all notes from [MIMIC III](https://www.nature.com/articles/sdata201635), a database containing electronic health records from ICU patients at the Beth Israel Hospital in Boston, MA. For more details on MIMIC, see [here](https://mimic.physionet.org/). All notes from the `NOTEEVENTS` table were included (~880M words). 10 | 11 | ## Model Pretraining 12 | 13 | ### Note Preprocessing 14 | Each note in MIMIC was first split into sections using a rules-based section splitter (e.g. discharge summary notes were split into "History of Present Illness", "Family History", "Brief Hospital Course", etc. sections). Then each section was split into sentences using SciSpacy (`en core sci md` tokenizer). 15 | 16 | ### Pretraining Procedures 17 | The model was trained using code from [Google's BERT repository](https://github.com/google-research/bert) on a GeForce GTX TITAN X 12 GB GPU. Model parameters were initialized with BioBERT (`BioBERT-Base v1.0 + PubMed 200K + PMC 270K`). 18 | 19 | ### Pretraining Hyperparameters 20 | We used a batch size of 32, a maximum sequence length of 128, and a learning rate of 5 · 10−5 for pre-training our models. The models trained on all MIMIC notes were trained for 150,000 steps. The dup factor for duplicating input data with different masks was set to 5. All other default parameters were used (specifically, masked language model probability = 0.15 21 | and max predictions per sequence = 20). 
22 | 23 | ## How to use the model 24 | 25 | Load the model via the transformers library: 26 | ``` 27 | from transformers import AutoTokenizer, AutoModel 28 | tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT") 29 | model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT") 30 | ``` 31 | 32 | ## More Information 33 | 34 | Refer to the original paper, [Publicly Available Clinical BERT Embeddings](https://arxiv.org/abs/1904.03323) (NAACL Clinical NLP Workshop 2019) for additional details and performance on NLI and NER tasks. 35 | 36 | ## Questions? 37 | 38 | Post a Github issue on the [clinicalBERT repo](https://github.com/EmilyAlsentzer/clinicalBERT) or email emilya@mit.edu with any questions. 39 | 40 | -------------------------------------------------------------------------------- /model_cards/emilyalsentzer/Bio_Discharge_Summary_BERT/README.md: -------------------------------------------------------------------------------- 1 | 2 | # ClinicalBERT - Bio + Discharge Summary BERT Model 3 | 4 | The [Publicly Available Clinical BERT Embeddings](https://arxiv.org/abs/1904.03323) paper contains four unique clinicalBERT models: initialized with BERT-Base (`cased_L-12_H-768_A-12`) or BioBERT (`BioBERT-Base v1.0 + PubMed 200K + PMC 270K`) & trained on either all MIMIC notes or only discharge summaries. 5 | 6 | This model card describes the Bio+Discharge Summary BERT model, which was initialized from [BioBERT](https://arxiv.org/abs/1901.08746) & trained on only discharge summaries from MIMIC. 7 | 8 | ## Pretraining Data 9 | The `Bio_Discharge_Summary_BERT` model was trained on all discharge summaries from [MIMIC III](https://www.nature.com/articles/sdata201635), a database containing electronic health records from ICU patients at the Beth Israel Hospital in Boston, MA. For more details on MIMIC, see [here](https://mimic.physionet.org/). All notes from the `NOTEEVENTS` table were included (~880M words). 10 | 11 | ## Model Pretraining 12 | 13 | ### Note Preprocessing 14 | Each note in MIMIC was first split into sections using a rules-based section splitter (e.g. discharge summary notes were split into "History of Present Illness", "Family History", "Brief Hospital Course", etc. sections). Then each section was split into sentences using SciSpacy (`en core sci md` tokenizer). 15 | 16 | ### Pretraining Procedures 17 | The model was trained using code from [Google's BERT repository](https://github.com/google-research/bert) on a GeForce GTX TITAN X 12 GB GPU. Model parameters were initialized with BioBERT (`BioBERT-Base v1.0 + PubMed 200K + PMC 270K`). 18 | 19 | ### Pretraining Hyperparameters 20 | We used a batch size of 32, a maximum sequence length of 128, and a learning rate of 5 · 10−5 for pre-training our models. The models trained on all MIMIC notes were trained for 150,000 steps. The dup factor for duplicating input data with different masks was set to 5. All other default parameters were used (specifically, masked language model probability = 0.15 21 | and max predictions per sequence = 20). 
22 | 23 | ## How to use the model 24 | 25 | Load the model via the transformers library: 26 | ``` 27 | from transformers import AutoTokenizer, AutoModel 28 | tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT") 29 | model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT") 30 | ``` 31 | 32 | ## More Information 33 | 34 | Refer to the original paper, [Publicly Available Clinical BERT Embeddings](https://arxiv.org/abs/1904.03323) (NAACL Clinical NLP Workshop 2019) for additional details and performance on NLI and NER tasks. 35 | 36 | ## Questions? 37 | 38 | Post a Github issue on the [clinicalBERT repo](https://github.com/EmilyAlsentzer/clinicalBERT) or email emilya@mit.edu with any questions. 39 | 40 | -------------------------------------------------------------------------------- /model_cards/fmikaelian/camembert-base-fquad/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: french 3 | --- 4 | 5 | # camembert-base-fquad 6 | 7 | ## Description 8 | 9 | A baseline model for question-answering in french ([CamemBERT](https://camembert-model.fr/) model fine-tuned on [FQuAD](https://fquad.illuin.tech/)) 10 | 11 | ## Training hyperparameters 12 | 13 | ```shell 14 | python3 ./examples/run_squad.py \ 15 | --model_type camembert \ 16 | --model_name_or_path camembert-base \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_lower_case \ 20 | --train_file train.json \ 21 | --predict_file valid.json \ 22 | --learning_rate 3e-5 \ 23 | --num_train_epochs 2 \ 24 | --max_seq_length 384 \ 25 | --doc_stride 128 \ 26 | --output_dir output \ 27 | --per_gpu_eval_batch_size=3 \ 28 | --per_gpu_train_batch_size=3 \ 29 | --save_steps 10000 30 | ``` 31 | 32 | ## Evaluation results 33 | 34 | ```shell 35 | {"f1": 77.24515316052342, "exact_match": 52.82308657465496} 36 | ``` 37 | 38 | ## Usage 39 | 40 | ```python 41 | from transformers import pipeline 42 | 43 | nlp = pipeline('question-answering', model='fmikaelian/camembert-base-fquad', tokenizer='fmikaelian/camembert-base-fquad') 44 | 45 | nlp({ 46 | 'question': "Qui est Claude Monet?", 47 | 'context': "Claude Monet, né le 14 novembre 1840 à Paris et mort le 5 décembre 1926 à Giverny, est un peintre français et l’un des fondateurs de l'impressionnisme." 
48 | }) 49 | ``` -------------------------------------------------------------------------------- /model_cards/fmikaelian/camembert-base-squad/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: french 3 | --- 4 | 5 | # camembert-base-squad 6 | 7 | ## Description 8 | 9 | A baseline model for question-answering in french ([CamemBERT](https://camembert-model.fr/) model fine-tuned on [french-translated SQuAD 1.1 dataset](https://github.com/Alikabbadj/French-SQuAD)) 10 | 11 | ## Training hyperparameters 12 | 13 | ```shell 14 | python3 ./examples/run_squad.py \ 15 | --model_type camembert \ 16 | --model_name_or_path camembert-base \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_lower_case \ 20 | --train_file SQuAD-v1.1-train_fr_ss999_awstart2_net.json \ 21 | --predict_file SQuAD-v1.1-dev_fr_ss999_awstart2_net.json \ 22 | --learning_rate 3e-5 \ 23 | --num_train_epochs 2 \ 24 | --max_seq_length 384 \ 25 | --doc_stride 128 \ 26 | --output_dir output3 \ 27 | --per_gpu_eval_batch_size=3 \ 28 | --per_gpu_train_batch_size=3 \ 29 | --save_steps 10000 30 | ``` 31 | 32 | ## Evaluation results 33 | 34 | ```shell 35 | {"f1": 79.8570684959745, "exact_match": 59.21327108373895} 36 | ``` 37 | 38 | ## Usage 39 | 40 | ```python 41 | from transformers import pipeline 42 | 43 | nlp = pipeline('question-answering', model='fmikaelian/camembert-base-squad', tokenizer='fmikaelian/camembert-base-squad') 44 | 45 | nlp({ 46 | 'question': "Qui est Claude Monet?", 47 | 'context': "Claude Monet, né le 14 novembre 1840 à Paris et mort le 5 décembre 1926 à Giverny, est un peintre français et l’un des fondateurs de l'impressionnisme." 48 | }) 49 | ``` -------------------------------------------------------------------------------- /model_cards/fmikaelian/flaubert-base-uncased-squad/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: french 3 | --- 4 | 5 | # flaubert-base-uncased-squad 6 | 7 | ## Description 8 | 9 | A baseline model for question-answering in french ([flaubert](https://github.com/getalp/Flaubert) model fine-tuned on [french-translated SQuAD 1.1 dataset](https://github.com/Alikabbadj/French-SQuAD)) 10 | 11 | ## Training hyperparameters 12 | 13 | ```shell 14 | python3 ./examples/run_squad.py \ 15 | --model_type flaubert \ 16 | --model_name_or_path flaubert-base-uncased \ 17 | --do_train \ 18 | --do_eval \ 19 | --do_lower_case \ 20 | --train_file SQuAD-v1.1-train_fr_ss999_awstart2_net.json \ 21 | --predict_file SQuAD-v1.1-dev_fr_ss999_awstart2_net.json \ 22 | --learning_rate 3e-5 \ 23 | --num_train_epochs 2 \ 24 | --max_seq_length 384 \ 25 | --doc_stride 128 \ 26 | --output_dir output \ 27 | --per_gpu_eval_batch_size=3 \ 28 | --per_gpu_train_batch_size=3 29 | ``` 30 | 31 | ## Evaluation results 32 | 33 | ```shell 34 | {"f1": 68.66174806561969, "exact_match": 49.299692063176714} 35 | ``` 36 | 37 | ## Usage 38 | 39 | ```python 40 | from transformers import pipeline 41 | 42 | nlp = pipeline('question-answering', model='fmikaelian/flaubert-base-uncased-squad', tokenizer='fmikaelian/flaubert-base-uncased-squad') 43 | 44 | nlp({ 45 | 'question': "Qui est Claude Monet?", 46 | 'context': "Claude Monet, né le 14 novembre 1840 à Paris et mort le 5 décembre 1926 à Giverny, est un peintre français et l’un des fondateurs de l'impressionnisme." 
47 | }) 48 | ``` -------------------------------------------------------------------------------- /model_cards/henryk/bert-base-multilingual-cased-finetuned-dutch-squad2/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: dutch 3 | --- 4 | 5 | # Multilingual + Dutch SQuAD2.0 6 | 7 | This model is the multilingual model provided by the Google research team with a fine-tuned dutch Q&A downstream task. 8 | 9 | ## Details of the language model(bert-base-multilingual-cased) 10 | 11 | Language model ([**bert-base-multilingual-cased**](https://github.com/google-research/bert/blob/master/multilingual.md)): 12 | 12-layer, 768-hidden, 12-heads, 110M parameters. 13 | Trained on cased text in the top 104 languages with the largest Wikipedias. 14 | 15 | ## Details of the downstream task - Dataset 16 | Using the `mtranslate` Python module, [**SQuAD2.0**](https://rajpurkar.github.io/SQuAD-explorer/) was machine-translated. In order to find the start tokens the direct translations of the answers were searched in the corresponding paragraphs. Since the answer could not always be found in the text, due to the different translations depending on the context (missing context in the pure answer), a loss of question-answer examples occurred. This is a potential problem where errors can occur in the data set (but in the end it was a quick and dirty solution that worked well enough for my task). 17 | 18 | | Dataset | # Q&A | 19 | | ---------------------- | ----- | 20 | | SQuAD2.0 Train | 130 K | 21 | | Dutch SQuAD2.0 Train | 99 K | 22 | | SQuAD2.0 Dev | 12 K | 23 | | Dutch SQuAD2.0 Dev | 10 K | 24 | 25 | ## Model training 26 | 27 | The model was trained on a Tesla V100 GPU with the following command: 28 | 29 | ```python 30 | export SQUAD_DIR=path/to/nl_squad 31 | 32 | python run_squad.py \ 33 | --model_type bert \ 34 | --model_name_or_path bert-base-multilingual-cased \ 35 | --version_2_with_negative \ 36 | --do_train \ 37 | --do_eval \ 38 | --train_file $SQUAD_DIR/train_nl-v2.0.json \ 39 | --predict_file $SQUAD_DIR/dev_nl-v2.0.json \ 40 | --per_gpu_train_batch_size 12 \ 41 | --learning_rate 3e-5 \ 42 | --num_train_epochs 2.0 \ 43 | --max_seq_length 384 \ 44 | --doc_stride 128 \ 45 | --output_dir /tmp/output_dir/ 46 | ``` 47 | 48 | **Results**: 49 | 50 | {'exact': **67.38**, 'f1': **71.36**} -------------------------------------------------------------------------------- /model_cards/jplu/tf-camembert-base/README.md: -------------------------------------------------------------------------------- 1 | # Tensorflow CamemBERT 2 | 3 | In this repository you will find different versions of the CamemBERT model for Tensorflow. 4 | 5 | ## CamemBERT 6 | 7 | [CamemBERT](https://camembert-model.fr/) is a state-of-the-art language model for French based on the RoBERTa architecture pretrained on the French subcorpus of the newly available multilingual corpus OSCAR. 
8 | 9 | ## Model Weights 10 | 11 | | Model | Downloads 12 | | -------------------------------- | --------------------------------------------------------------------------------------------------------------- 13 | | `jplu/tf-camembert-base` | [`config.json`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-camembert-base/config.json) • [`tf_model.h5`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-camembert-base/tf_model.h5) 14 | 15 | ## Usage 16 | 17 | With Transformers >= 2.4 the Tensorflow models of CamemBERT can be loaded like: 18 | 19 | ```python 20 | from transformers import TFCamembertModel 21 | 22 | model = TFCamembertModel.from_pretrained("jplu/tf-camembert-base") 23 | ``` 24 | 25 | ## Huggingface model hub 26 | 27 | All models are available on the [Huggingface model hub](https://huggingface.co/jplu). 28 | 29 | ## Acknowledgments 30 | 31 | Thanks to all the Huggingface team for the support and their amazing library! 32 | -------------------------------------------------------------------------------- /model_cards/jplu/tf-xlm-roberta-base/README.md: -------------------------------------------------------------------------------- 1 | # Tensorflow XLM-RoBERTa 2 | 3 | In this repository you will find different versions of the XLM-RoBERTa model for Tensorflow. 4 | 5 | ## XLM-RoBERTa 6 | 7 | [XLM-RoBERTa](https://ai.facebook.com/blog/-xlm-r-state-of-the-art-cross-lingual-understanding-through-self-supervision/) is a scaled cross lingual sentence encoder. It is trained on 2.5T of data across 100 languages data filtered from Common Crawl. XLM-R achieves state-of-the-arts results on multiple cross lingual benchmarks. 8 | 9 | ## Model Weights 10 | 11 | | Model | Downloads 12 | | -------------------------------- | --------------------------------------------------------------------------------------------------------------- 13 | | `jplu/tf-xlm-roberta-base` | [`config.json`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-base/config.json) • [`tf_model.h5`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-base/tf_model.h5) 14 | | `jplu/tf-xlm-roberta-large` | [`config.json`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-large/config.json) • [`tf_model.h5`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-large/tf_model.h5) 15 | 16 | ## Usage 17 | 18 | With Transformers >= 2.4 the Tensorflow models of XLM-RoBERTa can be loaded like: 19 | 20 | ```python 21 | from transformers import TFXLMRobertaModel 22 | 23 | model = TFXLMRobertaModel.from_pretrained("jplu/tf-xlm-roberta-base") 24 | ``` 25 | Or 26 | ``` 27 | model = TFXLMRobertaModel.from_pretrained("jplu/tf-xlm-roberta-large") 28 | ``` 29 | 30 | ## Huggingface model hub 31 | 32 | All models are available on the [Huggingface model hub](https://huggingface.co/jplu). 33 | 34 | ## Acknowledgments 35 | 36 | Thanks to all the Huggingface team for the support and their amazing library! 37 | -------------------------------------------------------------------------------- /model_cards/jplu/tf-xlm-roberta-large/README.md: -------------------------------------------------------------------------------- 1 | # Tensorflow XLM-RoBERTa 2 | 3 | In this repository you will find different versions of the XLM-RoBERTa model for Tensorflow. 4 | 5 | ## XLM-RoBERTa 6 | 7 | [XLM-RoBERTa](https://ai.facebook.com/blog/-xlm-r-state-of-the-art-cross-lingual-understanding-through-self-supervision/) is a scaled cross lingual sentence encoder. 
It is trained on 2.5T of data across 100 languages data filtered from Common Crawl. XLM-R achieves state-of-the-arts results on multiple cross lingual benchmarks. 8 | 9 | ## Model Weights 10 | 11 | | Model | Downloads 12 | | -------------------------------- | --------------------------------------------------------------------------------------------------------------- 13 | | `jplu/tf-xlm-roberta-base` | [`config.json`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-base/config.json) • [`tf_model.h5`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-base/tf_model.h5) 14 | | `jplu/tf-xlm-roberta-large` | [`config.json`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-large/config.json) • [`tf_model.h5`](https://s3.amazonaws.com/models.huggingface.co/bert/jplu/tf-xlm-roberta-large/tf_model.h5) 15 | 16 | ## Usage 17 | 18 | With Transformers >= 2.4 the Tensorflow models of XLM-RoBERTa can be loaded like: 19 | 20 | ```python 21 | from transformers import TFXLMRobertaModel 22 | 23 | model = TFXLMRobertaModel.from_pretrained("jplu/tf-xlm-roberta-base") 24 | ``` 25 | Or 26 | ``` 27 | model = TFXLMRobertaModel.from_pretrained("jplu/tf-xlm-roberta-large") 28 | ``` 29 | 30 | ## Huggingface model hub 31 | 32 | All models are available on the [Huggingface model hub](https://huggingface.co/jplu). 33 | 34 | ## Acknowledgments 35 | 36 | Thanks to all the Huggingface team for the support and their amazing library! 37 | -------------------------------------------------------------------------------- /model_cards/julien-c/EsperBERTo-small-pos/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: esperanto 3 | thumbnail: https://huggingface.co/blog/assets/EsperBERTo-thumbnail-v2.png 4 | --- 5 | 6 | # EsperBERTo: RoBERTa-like Language model trained on Esperanto 7 | 8 | **Companion model to blog post https://huggingface.co/blog/how-to-train** 🔥 9 | 10 | ## Training Details 11 | 12 | - current checkpoint: 566000 13 | - machine name: `galinette` 14 | 15 | 16 | ![](https://huggingface.co/blog/assets/EsperBERTo-thumbnail-v2.png) 17 | 18 | ## Example pipeline 19 | 20 | ```python 21 | from transformers import TokenClassificationPipeline, pipeline 22 | 23 | 24 | MODEL_PATH = "./models/EsperBERTo-small-pos/" 25 | 26 | nlp = pipeline( 27 | "ner", 28 | model=MODEL_PATH, 29 | tokenizer=MODEL_PATH, 30 | ) 31 | # or instantiate a TokenClassificationPipeline directly. 
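# As an untested sketch (assumes the Auto classes resolve this local checkpoint),
# the explicit construction mentioned above would look roughly like:
#
#   from transformers import AutoModelForTokenClassification, AutoTokenizer, TokenClassificationPipeline
#   model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
#   tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
#   nlp = TokenClassificationPipeline(model=model, tokenizer=tokenizer)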
32 | 33 | nlp("Mi estas viro kej estas tago varma.") 34 | 35 | # {'entity': 'PRON', 'score': 0.9979867339134216, 'word': ' Mi'} 36 | # {'entity': 'VERB', 'score': 0.9683094620704651, 'word': ' estas'} 37 | # {'entity': 'VERB', 'score': 0.9797462821006775, 'word': ' estas'} 38 | # {'entity': 'NOUN', 'score': 0.8509314060211182, 'word': ' tago'} 39 | # {'entity': 'ADJ', 'score': 0.9996201395988464, 'word': ' varma'} 40 | ``` -------------------------------------------------------------------------------- /model_cards/julien-c/EsperBERTo-small/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: esperanto 3 | thumbnail: https://huggingface.co/blog/assets/EsperBERTo-thumbnail-v2.png 4 | --- 5 | 6 | # EsperBERTo: RoBERTa-like Language model trained on Esperanto 7 | 8 | **Companion model to blog post https://huggingface.co/blog/how-to-train** 🔥 9 | 10 | ## Training Details 11 | 12 | - current checkpoint: 566000 13 | - machine name: `galinette` 14 | 15 | 16 | ![](https://huggingface.co/blog/assets/EsperBERTo-thumbnail-v2.png) 17 | 18 | ## Example pipeline 19 | 20 | ```python 21 | from transformers import pipeline 22 | 23 | fill_mask = pipeline( 24 | "fill-mask", 25 | model="julien-c/EsperBERTo-small", 26 | tokenizer="julien-c/EsperBERTo-small" 27 | ) 28 | 29 | fill_mask("Jen la komenco de bela .") 30 | 31 | # This is the beginning of a beautiful . 32 | # => 33 | 34 | # { 35 | # 'score':0.06502299010753632 36 | # 'sequence':' Jen la komenco de bela vivo.' 37 | # 'token':1099 38 | # } 39 | # { 40 | # 'score':0.0421181358397007 41 | # 'sequence':' Jen la komenco de bela vespero.' 42 | # 'token':5100 43 | # } 44 | # { 45 | # 'score':0.024884626269340515 46 | # 'sequence':' Jen la komenco de bela laboro.' 47 | # 'token':1570 48 | # } 49 | # { 50 | # 'score':0.02324388362467289 51 | # 'sequence':' Jen la komenco de bela tago.' 52 | # 'token':1688 53 | # } 54 | # { 55 | # 'score':0.020378097891807556 56 | # 'sequence':' Jen la komenco de bela festo.' 57 | # 'token':4580 58 | # } 59 | ``` 60 | -------------------------------------------------------------------------------- /model_cards/julien-c/bert-xsmall-dummy/README.md: -------------------------------------------------------------------------------- 1 | ## How to build a dummy model 2 | 3 | 4 | ```python 5 | from transformers.configuration_bert import BertConfig 6 | from transformers.modeling_bert import BertForMaskedLM 7 | from transformers.modeling_tf_bert import TFBertForMaskedLM 8 | from transformers.tokenization_bert import BertTokenizer 9 | 10 | 11 | SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy" 12 | DIRNAME = "./bert-xsmall-dummy" 13 | 14 | config = BertConfig(10, 20, 1, 1, 40) 15 | 16 | model = BertForMaskedLM(config) 17 | model.save_pretrained(DIRNAME) 18 | 19 | tf_model = TFBertForMaskedLM.from_pretrained(DIRNAME, from_pt=True) 20 | tf_model.save_pretrained(DIRNAME) 21 | 22 | # Slightly different for tokenizer. 
23 | # tokenizer = BertTokenizer.from_pretrained(DIRNAME) 24 | # tokenizer.save_pretrained() 25 | ``` 26 | -------------------------------------------------------------------------------- /model_cards/julien-c/dummy-unknown/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | tags: 3 | - ci 4 | --- 5 | 6 | ## Dummy model used for unit testing and CI 7 | 8 | 9 | ```python 10 | import json 11 | import os 12 | from transformers.configuration_roberta import RobertaConfig 13 | from transformers import RobertaForMaskedLM, TFRobertaForMaskedLM 14 | 15 | DIRNAME = "./dummy-unknown" 16 | 17 | 18 | config = RobertaConfig(10, 20, 1, 1, 40) 19 | 20 | model = RobertaForMaskedLM(config) 21 | model.save_pretrained(DIRNAME) 22 | 23 | tf_model = TFRobertaForMaskedLM.from_pretrained(DIRNAME, from_pt=True) 24 | tf_model.save_pretrained(DIRNAME) 25 | 26 | # Tokenizer: 27 | 28 | vocab = [ 29 | "l", 30 | "o", 31 | "w", 32 | "e", 33 | "r", 34 | "s", 35 | "t", 36 | "i", 37 | "d", 38 | "n", 39 | "\u0120", 40 | "\u0120l", 41 | "\u0120n", 42 | "\u0120lo", 43 | "\u0120low", 44 | "er", 45 | "\u0120lowest", 46 | "\u0120newer", 47 | "\u0120wider", 48 | "", 49 | ] 50 | vocab_tokens = dict(zip(vocab, range(len(vocab)))) 51 | merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] 52 | 53 | vocab_file = os.path.join(DIRNAME, "vocab.json") 54 | merges_file = os.path.join(DIRNAME, "merges.txt") 55 | with open(vocab_file, "w", encoding="utf-8") as fp: 56 | fp.write(json.dumps(vocab_tokens) + "\n") 57 | with open(merges_file, "w", encoding="utf-8") as fp: 58 | fp.write("\n".join(merges)) 59 | ``` 60 | -------------------------------------------------------------------------------- /model_cards/lvwerra/gpt2-medium-taboo/README.md: -------------------------------------------------------------------------------- 1 | # GPT-2 (medium) Taboo 2 | 3 | ## What is it? 4 | A fine-tuned GPT-2 version for Taboo cards generation. 5 | 6 | ## Training setting 7 | 8 | The model was trained on ~900 Taboo cards in the following format for 100 epochs: 9 | ``` 10 | Describe the word Glitch without using the words Problem, Unexpected, Technology, Minor, Outage. 11 | ```` 12 | 13 | -------------------------------------------------------------------------------- /model_cards/lysandre/arxiv-nlp/README.md: -------------------------------------------------------------------------------- 1 | # ArXiv-NLP GPT-2 checkpoint 2 | 3 | This is a GPT-2 small checkpoint for PyTorch. It is the official `gpt2-small` fine-tuned to ArXiv paper on the computational linguistics field. 4 | 5 | ## Training data 6 | 7 | This model was trained on a subset of ArXiv papers that were parsed from PDF to txt. The resulting data is made of 80MB of text from the computational linguistics (cs.CL) field. -------------------------------------------------------------------------------- /model_cards/lysandre/arxiv/README.md: -------------------------------------------------------------------------------- 1 | # ArXiv GPT-2 checkpoint 2 | 3 | This is a GPT-2 small checkpoint for PyTorch. It is the official `gpt2-small` finetuned to ArXiv paper on physics fields. 4 | 5 | ## Training data 6 | 7 | This model was trained on a subset of ArXiv papers that were parsed from PDF to txt. The resulting data is made of 130MB of text, mostly from quantum physics (quant-ph) and other physics sub-fields. 
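The ArXiv GPT-2 cards above stop short of a usage example. A minimal generation sketch for this checkpoint (assuming it is hosted under the `lysandre/arxiv` identifier implied by the card's path and ships a standard GPT-2 tokenizer) could be:

```python
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Hypothetical hub identifier, taken from the model card path
tokenizer = GPT2Tokenizer.from_pretrained("lysandre/arxiv")
model = GPT2LMHeadModel.from_pretrained("lysandre/arxiv")

prompt = "We study the entanglement entropy of"
input_ids = tokenizer.encode(prompt, return_tensors="pt")
output = model.generate(input_ids, max_length=60, do_sample=True, top_k=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```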
8 | -------------------------------------------------------------------------------- /model_cards/microsoft/DialoGPT-large/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | thumbnail: https://huggingface.co/front/thumbnails/dialogpt.png 3 | tags: 4 | - conversational 5 | --- 6 | 7 | ## A State-of-the-Art Large-scale Pretrained Response generation model (DialoGPT) 8 | 9 | DialoGPT is a SOTA large-scale pretrained dialogue response generation model for multiturn conversations. 10 | The [human evaluation results](https://github.com/dreasysnail/Dialogpt_dev#human-evaluation) indicate that the response generated from DialoGPT is comparable to human response quality under a single-turn conversation Turing test. 11 | The model is trained on 147M multi-turn dialogue from Reddit discussion thread. 12 | 13 | * Multi-turn generation examples from an interactive environment: 14 | 15 | |Role | Response | 16 | |---------|--------| 17 | |User | Does money buy happiness? | 18 | | Bot | Depends how much money you spend on it .| 19 | |User | What is the best way to buy happiness ? | 20 | | Bot | You just have to be a millionaire by your early 20s, then you can be happy . | 21 | |User |This is so difficult ! | 22 | | Bot | You have no idea how hard it is to be a millionaire and happy . There is a reason the rich have a lot of money | 23 | 24 | Please find the information about preprocessing, training and full details of the DialoGPT in the [original DialoGPT repository](https://github.com/microsoft/DialoGPT) 25 | 26 | ArXiv paper: [https://arxiv.org/abs/1911.00536](https://arxiv.org/abs/1911.00536) 27 | 28 | ### How to use 29 | 30 | Now we are ready to try out how the model works as a chatting partner! 31 | 32 | ```python 33 | from transformers import AutoModelWithLMHead, AutoTokenizer 34 | import torch 35 | 36 | 37 | tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large") 38 | model = AutoModelWithLMHead.from_pretrained("microsoft/DialoGPT-large") 39 | 40 | # Let's chat for 5 lines 41 | for step in range(5): 42 | # encode the new user input, add the eos_token and return a tensor in Pytorch 43 | new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt') 44 | 45 | # append the new user input tokens to the chat history 46 | bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids 47 | 48 | # generated a response while limiting the total chat history to 1000 tokens, 49 | chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id) 50 | 51 | # pretty print last ouput tokens from bot 52 | print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True))) 53 | ``` 54 | -------------------------------------------------------------------------------- /model_cards/microsoft/DialoGPT-medium/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | thumbnail: https://huggingface.co/front/thumbnails/dialogpt.png 3 | tags: 4 | - conversational 5 | --- 6 | 7 | ## A State-of-the-Art Large-scale Pretrained Response generation model (DialoGPT) 8 | 9 | DialoGPT is a SOTA large-scale pretrained dialogue response generation model for multiturn conversations. 
10 | The [human evaluation results](https://github.com/dreasysnail/Dialogpt_dev#human-evaluation) indicate that the response generated from DialoGPT is comparable to human response quality under a single-turn conversation Turing test. 11 | The model is trained on 147M multi-turn dialogue from Reddit discussion thread. 12 | 13 | * Multi-turn generation examples from an interactive environment: 14 | 15 | |Role | Response | 16 | |---------|--------| 17 | |User | Does money buy happiness? | 18 | | Bot | Depends how much money you spend on it .| 19 | |User | What is the best way to buy happiness ? | 20 | | Bot | You just have to be a millionaire by your early 20s, then you can be happy . | 21 | |User |This is so difficult ! | 22 | | Bot | You have no idea how hard it is to be a millionaire and happy . There is a reason the rich have a lot of money | 23 | 24 | Please find the information about preprocessing, training and full details of the DialoGPT in the [original DialoGPT repository](https://github.com/microsoft/DialoGPT) 25 | 26 | ArXiv paper: [https://arxiv.org/abs/1911.00536](https://arxiv.org/abs/1911.00536) 27 | 28 | ### How to use 29 | 30 | Now we are ready to try out how the model works as a chatting partner! 31 | 32 | ```python 33 | from transformers import AutoModelWithLMHead, AutoTokenizer 34 | import torch 35 | 36 | 37 | tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium") 38 | model = AutoModelWithLMHead.from_pretrained("microsoft/DialoGPT-medium") 39 | 40 | # Let's chat for 5 lines 41 | for step in range(5): 42 | # encode the new user input, add the eos_token and return a tensor in Pytorch 43 | new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt') 44 | 45 | # append the new user input tokens to the chat history 46 | bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids 47 | 48 | # generated a response while limiting the total chat history to 1000 tokens, 49 | chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id) 50 | 51 | # pretty print last ouput tokens from bot 52 | print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True))) 53 | ``` 54 | -------------------------------------------------------------------------------- /model_cards/microsoft/DialoGPT-small/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | thumbnail: https://huggingface.co/front/thumbnails/dialogpt.png 3 | tags: 4 | - conversational 5 | --- 6 | 7 | ## A State-of-the-Art Large-scale Pretrained Response generation model (DialoGPT) 8 | 9 | DialoGPT is a SOTA large-scale pretrained dialogue response generation model for multiturn conversations. 10 | The [human evaluation results](https://github.com/dreasysnail/Dialogpt_dev#human-evaluation) indicate that the response generated from DialoGPT is comparable to human response quality under a single-turn conversation Turing test. 11 | The model is trained on 147M multi-turn dialogue from Reddit discussion thread. 12 | 13 | * Multi-turn generation examples from an interactive environment: 14 | 15 | |Role | Response | 16 | |---------|--------| 17 | |User | Does money buy happiness? | 18 | | Bot | Depends how much money you spend on it .| 19 | |User | What is the best way to buy happiness ? | 20 | | Bot | You just have to be a millionaire by your early 20s, then you can be happy . 
| 21 | |User |This is so difficult ! | 22 | | Bot | You have no idea how hard it is to be a millionaire and happy . There is a reason the rich have a lot of money | 23 | 24 | Please find the information about preprocessing, training and full details of DialoGPT in the [original DialoGPT repository](https://github.com/microsoft/DialoGPT). 25 | 26 | arXiv paper: [https://arxiv.org/abs/1911.00536](https://arxiv.org/abs/1911.00536) 27 | 28 | ### How to use 29 | 30 | Now we are ready to try out how the model works as a chatting partner! 31 | 32 | ```python 33 | from transformers import AutoModelWithLMHead, AutoTokenizer 34 | import torch 35 | 36 | 37 | tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small") 38 | model = AutoModelWithLMHead.from_pretrained("microsoft/DialoGPT-small") 39 | 40 | # Let's chat for 5 lines 41 | for step in range(5): 42 | # encode the new user input, add the eos_token and return a tensor in PyTorch 43 | new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt') 44 | 45 | # append the new user input tokens to the chat history 46 | bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids 47 | 48 | # generate a response while limiting the total chat history to 1000 tokens 49 | chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id) 50 | 51 | # pretty print the last output tokens from the bot 52 | print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True))) 53 | ``` 54 | -------------------------------------------------------------------------------- /model_cards/mrm8488/bert-spanish-cased-finetuned-ner/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: spanish 3 | thumbnail: https://i.imgur.com/jgBdimh.png 4 | --- 5 | 6 | # Spanish BERT (BETO) + NER 7 | 8 | This model is a fine-tuned version of the cased Spanish BERT [(BETO)](https://github.com/dccuchile/beto) on the [NER-C](https://www.kaggle.com/nltkdata/conll-corpora) dataset for the **NER** downstream task.
9 | 10 | ## Details of the downstream task (NER) - Dataset 11 | 12 | - [Dataset: CONLL Corpora ES](https://www.kaggle.com/nltkdata/conll-corpora) 13 | 14 | I preprocessed the dataset and split it into train / dev (80/20) 15 | 16 | | Dataset | # Examples | 17 | | ---------------------- | ----- | 18 | | Train | 8.7 K | 19 | | Dev | 2.2 K | 20 | 21 | 22 | - [Fine-tune on NER script provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py) 23 | 24 | - Labels covered: 25 | 26 | ``` 27 | B-LOC 28 | B-MISC 29 | B-ORG 30 | B-PER 31 | I-LOC 32 | I-MISC 33 | I-ORG 34 | I-PER 35 | O 36 | ``` 37 | 38 | ## Metrics on evaluation set: 39 | 40 | | Metric | # score | 41 | | :------------------------------------------------------------------------------------: | :-------: | 42 | | F1 | **90.17** | 43 | | Precision | **89.86** | 44 | | Recall | **90.47** | 45 | 46 | ## Comparison: 47 | 48 | | Model | # score | 49 | | :--------------------------------------------------------------------------------------------------------------: | :-------: | 50 | | bert-base-spanish-wwm-cased (BETO) | 88.43 | 51 | | [bert-spanish-cased-finetuned-ner (this one)](https://huggingface.co/mrm8488/bert-spanish-cased-finetuned-ner) | **89.65** | 52 | | Best Multilingual BERT | 87.38 | 53 | 54 | ## Model in action 55 | 56 | Fast usage with **pipelines**: 57 | 58 | ```python 59 | from transformers import pipeline 60 | 61 | nlp_ner = pipeline( 62 | "ner", 63 | model="mrm8488/bert-spanish-cased-finetuned-ner", 64 | tokenizer=( 65 | 'mrm8488/bert-spanish-cased-finetuned-ner', 66 | {"use_fast": False} 67 | )) 68 | 69 | nlp_ner(text) 70 | 71 | # Output: [{'entity': 'B-LOC', 'score': 0.9998720288276672, 'word': 'Londres'}] 72 | ``` 73 | 74 | > Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488) 75 | 76 | > Made with ♥ in Spain 77 | -------------------------------------------------------------------------------- /model_cards/mrm8488/bert-uncased-finetuned-qnli/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: english 3 | thumbnail: 4 | --- 5 | 6 | # [BERT](https://huggingface.co/deepset/bert-base-cased-squad2) fine-tuned on [QNLI](https://github.com/rhythmcao/QNLI) + compression ([BERT-of-Theseus](https://github.com/JetRunner/BERT-of-Theseus)) 7 | 8 | I used a [BERT model fine-tuned on **SQuAD v2**](https://huggingface.co/deepset/bert-base-cased-squad2) and then fine-tuned it on **QNLI** using **compression** (with a constant replacing rate), as proposed in **BERT-of-Theseus**. 9 | 10 | ## Details of the downstream task (QNLI): 11 | 12 | ### Getting the dataset 13 | ```bash 14 | wget https://raw.githubusercontent.com/rhythmcao/QNLI/master/data/QNLI/train.tsv 15 | wget https://raw.githubusercontent.com/rhythmcao/QNLI/master/data/QNLI/test.tsv 16 | wget https://raw.githubusercontent.com/rhythmcao/QNLI/master/data/QNLI/dev.tsv 17 | 18 | mkdir QNLI_dataset 19 | mv *.tsv QNLI_dataset 20 | ``` 21 | 22 | ### Model training 23 | 24 | The model was trained on a Tesla P100 GPU with 25GB of RAM using the following command: 25 | 26 | ```bash 27 | !python /content/BERT-of-Theseus/run_glue.py \ 28 | --model_name_or_path deepset/bert-base-cased-squad2 \ 29 | --task_name qnli \ 30 | --do_train \ 31 | --do_eval \ 32 | --do_lower_case \ 33 | --data_dir /content/QNLI_dataset \ 34 | --max_seq_length 128 \ 35 | --per_gpu_train_batch_size 32 \ 36 | --per_gpu_eval_batch_size 32 \ 37 | --learning_rate 2e-5 \ 38 | --save_steps 2000 \ 39 |
--num_train_epochs 50 \ 40 | --output_dir /content/ouput_dir \ 41 | --evaluate_during_training \ 42 | --replacing_rate 0.7 \ 43 | --steps_for_replacing 2500 44 | ``` 45 | 46 | ## Metrics: 47 | 48 | | Model | Accuracy | 49 | |-----------------|------| 50 | | BERT-base | 91.2 | 51 | | BERT-of-Theseus | 88.8 | 52 | | [bert-uncased-finetuned-qnli](https://huggingface.co/mrm8488/bert-uncased-finetuned-qnli) | 87.2 | 53 | | DistilBERT | 85.3 | 54 | 55 | 56 | 57 | 58 | > [See all my models](https://huggingface.co/models?search=mrm8488) 59 | 60 | > Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488) 61 | 62 | > Made with ♥ in Spain 63 | -------------------------------------------------------------------------------- /model_cards/nlptown/bert-base-multilingual-uncased-sentiment/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - english 4 | - dutch 5 | - german 6 | - french 7 | - italian 8 | - spanish 9 | --- 10 | 11 | # bert-base-multilingual-uncased-sentiment 12 | 13 | This is a bert-base-multilingual-uncased model fine-tuned for sentiment analysis on product reviews in six languages: English, Dutch, German, French, Spanish and Italian. It predicts the sentiment of the review as a number of stars (between 1 and 5). 14 | 15 | This model is intended for direct use as a sentiment analysis model for product reviews in any of the six languages above, or for further fine-tuning on related sentiment analysis tasks. 16 | 17 | ## Training data 18 | 19 | Here is the number of product reviews we used for fine-tuning the model: 20 | 21 | | Language | Number of reviews | 22 | | -------- | ----------------- | 23 | | English | 150k | 24 | | Dutch | 80k | 25 | | German | 137k | 26 | | French | 140k | 27 | | Italian | 72k | 28 | | Spanish | 50k | 29 | 30 | ## Accuracy 31 | 32 | The fine-tuned model obtained the following accuracy on 5,000 held-out product reviews in each of the languages: 33 | 34 | - Accuracy (exact) is the exact match on the number of stars. 35 | - Accuracy (off-by-1) is the percentage of reviews where the number of stars the model predicts differs by a maximum of 1 from the number given by the human reviewer. 36 | 37 | 38 | | Language | Accuracy (exact) | Accuracy (off-by-1) | 39 | | -------- | ---------------------- | ------------------- | 40 | | English | 67% | 95% | 41 | | Dutch | 57% | 93% | 42 | | German | 61% | 94% | 43 | | French | 59% | 94% | 44 | | Italian | 59% | 95% | 45 | | Spanish | 58% | 95% | 46 | 47 | ## Contact 48 | 49 | Contact [NLP Town](https://www.nlp.town) for questions, feedback and/or requests for similar models. 50 | -------------------------------------------------------------------------------- /model_cards/severinsimmler/literary-german-bert/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: german 3 | thumbnail: kfold.png 4 | --- 5 | 6 | # German BERT for literary texts 7 | 8 | This German BERT is based on `bert-base-german-dbmdz-cased`, and has been adapted to the domain of literary texts by fine-tuning the language modeling task on the [Corpus of German-Language Fiction](https://figshare.com/articles/Corpus_of_German-Language_Fiction_txt_/4524680/1). Afterwards, the model was fine-tuned for named entity recognition on the [DROC](https://gitlab2.informatik.uni-wuerzburg.de/kallimachos/DROC-Release) corpus, so you can use it to recognize protagonists in German novels.
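For a quick test, the model can be loaded through the `pipeline` API. The snippet below is only a minimal sketch: the model id `severinsimmler/literary-german-bert` is assumed from the path of this model card, and the example sentence is made up.

```python
from transformers import pipeline

# Assumed model id (derived from this model card's path on the model hub)
recognizer = pipeline(
    "ner",
    model="severinsimmler/literary-german-bert",
    tokenizer="severinsimmler/literary-german-bert",
)

# Illustrative sentence; character references are tagged with B-PER / I-PER
print(recognizer("Effi Briest begegnete Innstetten am Abend im Garten."))
```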
9 | 10 | 11 | # Stats 12 | 13 | ## Language modeling 14 | 15 | The [Corpus of German-Language Fiction](https://figshare.com/articles/Corpus_of_German-Language_Fiction_txt_/4524680/1) consists of 3,194 documents with 203,516,988 tokens or 1,520,855 types. The publication year of the texts ranges from the 18th to the 20th century: 16 | 17 | ![years](prosa-jahre.png) 18 | 19 | 20 | ### Results 21 | 22 | After one epoch: 23 | 24 | | Model | Perplexity | 25 | | ---------------- | ---------- | 26 | | Vanilla BERT | 6.82 | 27 | | Fine-tuned BERT | 4.98 | 28 | 29 | 30 | ## Named entity recognition 31 | 32 | The provided model was also fine-tuned for two epochs on 10,799 sentences for training, validated on 547 and tested on 1,845 with three labels: `B-PER`, `I-PER` and `O`. 33 | 34 | 35 | ## Results 36 | 37 | | Dataset | Precision | Recall | F1 | 38 | | ------- | --------- | ------ | ---- | 39 | | Dev | 96.4 | 87.3 | 91.6 | 40 | | Test | 92.8 | 94.9 | 93.8 | 41 | 42 | The model has also been evaluated using 10-fold cross validation and compared with a classic Conditional Random Field baseline described in [Jannidis et al.](https://opus.bibliothek.uni-wuerzburg.de/opus4-wuerzburg/frontdoor/deliver/index/docId/14333/file/Jannidis_Figurenerkennung_Roman.pdf) (2015): 43 | 44 | ![kfold](kfold.png) 45 | 46 | 47 | # References 48 | 49 | Markus Krug, Lukas Weimer, Isabella Reger, Luisa Macharowsky, Stephan Feldhaus, Frank Puppe, Fotis Jannidis, [Description of a Corpus of Character References in German Novels](http://webdoc.sub.gwdg.de/pub/mon/dariah-de/dwp-2018-27.pdf), 2018. 50 | 51 | Fotis Jannidis, Isabella Reger, Lukas Weimer, Markus Krug, Martin Toepfer, Frank Puppe, [Automatische Erkennung von Figuren in deutschsprachigen Romanen](https://opus.bibliothek.uni-wuerzburg.de/opus4-wuerzburg/frontdoor/deliver/index/docId/14333/file/Jannidis_Figurenerkennung_Roman.pdf), 2015. 
52 | -------------------------------------------------------------------------------- /model_cards/severinsimmler/literary-german-bert/kfold.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/model_cards/severinsimmler/literary-german-bert/kfold.png -------------------------------------------------------------------------------- /model_cards/severinsimmler/literary-german-bert/prosa-jahre.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/model_cards/severinsimmler/literary-german-bert/prosa-jahre.png -------------------------------------------------------------------------------- /model_cards/twmkn9/albert-base-v2-squad2/README.md: -------------------------------------------------------------------------------- 1 | This model is ALBERT base v2 trained on SQuAD v2 as: 2 | 3 | ``` 4 | python run_squad.py 5 | --model_type albert 6 | --model_name_or_path albert-base-v2 7 | --do_train --do_eval 8 | --do_lower_case 9 | --version_2_with_negative 10 | --train_file $SQUAD_DIR/train-v2.0.json 11 | --predict_file $SQUAD_DIR/dev-v2.0.json 12 | --per_gpu_train_batch_size 8 13 | --num_train_epochs 3 14 | --learning_rate 3e-5 15 | --max_seq_length 384 16 | --doc_stride 128 17 | --output_dir ./tmp/albert_base_fine/ 18 | ``` 19 | 20 | Performance on a dev subset is close to the original paper: 21 | 22 | ``` 23 | Results: 24 | { 25 | 'exact': 78.71010200723923, 26 | 'f1': 81.89228117126069, 27 | 'total': 6078, 28 | 'HasAns_exact': 75.39518900343643, 29 | 'HasAns_f1': 82.04167868004215, 30 | 'HasAns_total': 2910, 31 | 'NoAns_exact': 81.7550505050505, 32 | 'NoAns_f1': 81.7550505050505, 33 | 'NoAns_total': 3168, 34 | 'best_exact': 78.72655478775913, 35 | 'best_exact_thresh': 0.0, 36 | 'best_f1': 81.90873395178066, 37 | 'best_f1_thresh': 0.0 38 | } 39 | ``` 40 | 41 | We are hopeful this might save you time, energy, and compute. Cheers! -------------------------------------------------------------------------------- /model_cards/voidful/albert_chinese_base/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - chinese 4 | --- 5 | 6 | # albert_chinese_base 7 | 8 | This a albert_chinese_base model from [Google's github](https://github.com/google-research/ALBERT) 9 | converted by huggingface's [script](https://github.com/huggingface/transformers/blob/master/src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py) 10 | 11 | ## Attention (注意) 12 | 13 | Since sentencepiece is not used in albert_chinese_base model 14 | you have to call BertTokenizer instead of AlbertTokenizer !!! 15 | we can eval it using an example on MaskedLM 16 | 17 | 由於 albert_chinese_base 模型沒有用 sentencepiece 18 | 用AlbertTokenizer會載不進詞表,因此需要改用BertTokenizer !!! 
19 | 我們可以跑MaskedLM預測來驗證這個做法是否正確 20 | 21 | ## Justify (驗證有效性) 22 | [colab trial](https://colab.research.google.com/drive/1Wjz48Uws6-VuSHv_-DcWLilv77-AaYgj) 23 | ```python 24 | from transformers import * 25 | import torch 26 | from torch.nn.functional import softmax 27 | 28 | pretrained = 'voidful/albert_chinese_base' 29 | tokenizer = BertTokenizer.from_pretrained(pretrained) 30 | model = AlbertForMaskedLM.from_pretrained(pretrained) 31 | 32 | inputtext = "今天[MASK]情很好" 33 | 34 | maskpos = tokenizer.encode(inputtext, add_special_tokens=True).index(103) 35 | 36 | input_ids = torch.tensor(tokenizer.encode(inputtext, add_special_tokens=True)).unsqueeze(0) # Batch size 1 37 | outputs = model(input_ids, masked_lm_labels=input_ids) 38 | loss, prediction_scores = outputs[:2] 39 | logit_prob = softmax(prediction_scores[0, maskpos]).data.tolist() 40 | predicted_index = torch.argmax(prediction_scores[0, maskpos]).item() 41 | predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] 42 | print(predicted_token,logit_prob[predicted_index]) 43 | ``` 44 | Result: `感 0.36333346366882324` 45 | -------------------------------------------------------------------------------- /model_cards/voidful/albert_chinese_large/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - chinese 4 | --- 5 | 6 | # albert_chinese_large 7 | 8 | This a albert_chinese_large model from [Google's github](https://github.com/google-research/ALBERT) 9 | converted by huggingface's [script](https://github.com/huggingface/transformers/blob/master/src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py) 10 | 11 | ## Attention (注意) 12 | 13 | Since sentencepiece is not used in albert_chinese_large model 14 | you have to call BertTokenizer instead of AlbertTokenizer !!! 15 | we can eval it using an example on MaskedLM 16 | 17 | 由於 albert_chinese_large 模型沒有用 sentencepiece 18 | 用AlbertTokenizer會載不進詞表,因此需要改用BertTokenizer !!! 
19 | 我們可以跑MaskedLM預測來驗證這個做法是否正確 20 | 21 | ## Justify (驗證有效性) 22 | [colab trial](https://colab.research.google.com/drive/1Wjz48Uws6-VuSHv_-DcWLilv77-AaYgj) 23 | ```python 24 | from transformers import * 25 | import torch 26 | from torch.nn.functional import softmax 27 | 28 | pretrained = 'voidful/albert_chinese_large' 29 | tokenizer = BertTokenizer.from_pretrained(pretrained) 30 | model = AlbertForMaskedLM.from_pretrained(pretrained) 31 | 32 | inputtext = "今天[MASK]情很好" 33 | 34 | maskpos = tokenizer.encode(inputtext, add_special_tokens=True).index(103) 35 | 36 | input_ids = torch.tensor(tokenizer.encode(inputtext, add_special_tokens=True)).unsqueeze(0) # Batch size 1 37 | outputs = model(input_ids, masked_lm_labels=input_ids) 38 | loss, prediction_scores = outputs[:2] 39 | logit_prob = softmax(prediction_scores[0, maskpos]).data.tolist() 40 | predicted_index = torch.argmax(prediction_scores[0, maskpos]).item() 41 | predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] 42 | print(predicted_token,logit_prob[predicted_index]) 43 | ``` 44 | Result: `心 0.9422469735145569` 45 | -------------------------------------------------------------------------------- /model_cards/voidful/albert_chinese_small/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - chinese 4 | --- 5 | 6 | # albert_chinese_small 7 | 8 | This a albert_chinese_small model from [brightmart/albert_zh project](https://github.com/brightmart/albert_zh), albert_small_google_zh model 9 | converted by huggingface's [script](https://github.com/huggingface/transformers/blob/master/src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py) 10 | 11 | ## Attention (注意) 12 | 13 | Since sentencepiece is not used in albert_chinese_small model 14 | you have to call BertTokenizer instead of AlbertTokenizer !!! 15 | we can eval it using an example on MaskedLM 16 | 17 | 由於 albert_chinese_small 模型沒有用 sentencepiece 18 | 用AlbertTokenizer會載不進詞表,因此需要改用BertTokenizer !!! 
19 | 我們可以跑MaskedLM預測來驗證這個做法是否正確 20 | 21 | ## Justify (驗證有效性) 22 | [colab trial](https://colab.research.google.com/drive/1Wjz48Uws6-VuSHv_-DcWLilv77-AaYgj) 23 | ```python 24 | from transformers import * 25 | import torch 26 | from torch.nn.functional import softmax 27 | 28 | pretrained = 'voidful/albert_chinese_small' 29 | tokenizer = BertTokenizer.from_pretrained(pretrained) 30 | model = AlbertForMaskedLM.from_pretrained(pretrained) 31 | 32 | inputtext = "今天[MASK]情很好" 33 | 34 | maskpos = tokenizer.encode(inputtext, add_special_tokens=True).index(103) 35 | 36 | input_ids = torch.tensor(tokenizer.encode(inputtext, add_special_tokens=True)).unsqueeze(0) # Batch size 1 37 | outputs = model(input_ids, masked_lm_labels=input_ids) 38 | loss, prediction_scores = outputs[:2] 39 | logit_prob = softmax(prediction_scores[0, maskpos]).data.tolist() 40 | predicted_index = torch.argmax(prediction_scores[0, maskpos]).item() 41 | predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] 42 | print(predicted_token,logit_prob[predicted_index]) 43 | ``` 44 | Result: `感 0.6390823125839233` 45 | -------------------------------------------------------------------------------- /model_cards/voidful/albert_chinese_tiny/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - chinese 4 | --- 5 | 6 | # albert_chinese_tiny 7 | 8 | This a albert_chinese_tiny model from [brightmart/albert_zh project](https://github.com/brightmart/albert_zh), albert_tiny_google_zh model 9 | converted by huggingface's [script](https://github.com/huggingface/transformers/blob/master/src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py) 10 | 11 | ## Attention (注意) 12 | 13 | Since sentencepiece is not used in albert_chinese_tiny model 14 | you have to call BertTokenizer instead of AlbertTokenizer !!! 15 | we can eval it using an example on MaskedLM 16 | 17 | 由於 albert_chinese_tiny 模型沒有用 sentencepiece 18 | 用AlbertTokenizer會載不進詞表,因此需要改用BertTokenizer !!! 
19 | 我們可以跑MaskedLM預測來驗證這個做法是否正確 20 | 21 | ## Justify (驗證有效性) 22 | [colab trial](https://colab.research.google.com/drive/1Wjz48Uws6-VuSHv_-DcWLilv77-AaYgj) 23 | ```python 24 | from transformers import * 25 | import torch 26 | from torch.nn.functional import softmax 27 | 28 | pretrained = 'voidful/albert_chinese_tiny' 29 | tokenizer = BertTokenizer.from_pretrained(pretrained) 30 | model = AlbertForMaskedLM.from_pretrained(pretrained) 31 | 32 | inputtext = "今天[MASK]情很好" 33 | 34 | maskpos = tokenizer.encode(inputtext, add_special_tokens=True).index(103) 35 | 36 | input_ids = torch.tensor(tokenizer.encode(inputtext, add_special_tokens=True)).unsqueeze(0) # Batch size 1 37 | outputs = model(input_ids, masked_lm_labels=input_ids) 38 | loss, prediction_scores = outputs[:2] 39 | logit_prob = softmax(prediction_scores[0, maskpos]).data.tolist() 40 | predicted_index = torch.argmax(prediction_scores[0, maskpos]).item() 41 | predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] 42 | print(predicted_token,logit_prob[predicted_index]) 43 | ``` 44 | Result: `感 0.40312355756759644` 45 | -------------------------------------------------------------------------------- /model_cards/voidful/albert_chinese_xlarge/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - chinese 4 | --- 5 | 6 | # albert_chinese_xlarge 7 | 8 | This a albert_chinese_xlarge model from [Google's github](https://github.com/google-research/ALBERT) 9 | converted by huggingface's [script](https://github.com/huggingface/transformers/blob/master/src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py) 10 | 11 | ## Attention (注意) 12 | 13 | Since sentencepiece is not used in albert_chinese_xlarge model 14 | you have to call BertTokenizer instead of AlbertTokenizer !!! 15 | we can eval it using an example on MaskedLM 16 | 17 | 由於 albert_chinese_xlarge 模型沒有用 sentencepiece 18 | 用AlbertTokenizer會載不進詞表,因此需要改用BertTokenizer !!! 
19 | 我們可以跑MaskedLM預測來驗證這個做法是否正確 20 | 21 | ## Justify (驗證有效性) 22 | [colab trial](https://colab.research.google.com/drive/1Wjz48Uws6-VuSHv_-DcWLilv77-AaYgj) 23 | ```python 24 | from transformers import * 25 | import torch 26 | from torch.nn.functional import softmax 27 | 28 | pretrained = 'voidful/albert_chinese_xlarge' 29 | tokenizer = BertTokenizer.from_pretrained(pretrained) 30 | model = AlbertForMaskedLM.from_pretrained(pretrained) 31 | 32 | inputtext = "今天[MASK]情很好" 33 | 34 | maskpos = tokenizer.encode(inputtext, add_special_tokens=True).index(103) 35 | 36 | input_ids = torch.tensor(tokenizer.encode(inputtext, add_special_tokens=True)).unsqueeze(0) # Batch size 1 37 | outputs = model(input_ids, masked_lm_labels=input_ids) 38 | loss, prediction_scores = outputs[:2] 39 | logit_prob = softmax(prediction_scores[0, maskpos]).data.tolist() 40 | predicted_index = torch.argmax(prediction_scores[0, maskpos]).item() 41 | predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] 42 | print(predicted_token,logit_prob[predicted_index]) 43 | ``` 44 | Result: `心 0.9942440390586853` 45 | -------------------------------------------------------------------------------- /model_cards/voidful/albert_chinese_xxlarge/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: 3 | - chinese 4 | --- 5 | 6 | # albert_chinese_xxlarge 7 | 8 | This a albert_chinese_xxlarge model from [Google's github](https://github.com/google-research/ALBERT) 9 | converted by huggingface's [script](https://github.com/huggingface/transformers/blob/master/src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py) 10 | 11 | ## Attention (注意) 12 | 13 | Since sentencepiece is not used in albert_chinese_xxlarge model 14 | you have to call BertTokenizer instead of AlbertTokenizer !!! 15 | we can eval it using an example on MaskedLM 16 | 17 | 由於 albert_chinese_xxlarge 模型沒有用 sentencepiece 18 | 用AlbertTokenizer會載不進詞表,因此需要改用BertTokenizer !!! 19 | 我們可以跑MaskedLM預測來驗證這個做法是否正確 20 | 21 | ## Justify (驗證有效性) 22 | [colab trial](https://colab.research.google.com/drive/1Wjz48Uws6-VuSHv_-DcWLilv77-AaYgj) 23 | ```python 24 | from transformers import * 25 | import torch 26 | from torch.nn.functional import softmax 27 | 28 | pretrained = 'voidful/albert_chinese_xxlarge' 29 | tokenizer = BertTokenizer.from_pretrained(pretrained) 30 | model = AlbertForMaskedLM.from_pretrained(pretrained) 31 | 32 | inputtext = "今天[MASK]情很好" 33 | 34 | maskpos = tokenizer.encode(inputtext, add_special_tokens=True).index(103) 35 | 36 | input_ids = torch.tensor(tokenizer.encode(inputtext, add_special_tokens=True)).unsqueeze(0) # Batch size 1 37 | outputs = model(input_ids, masked_lm_labels=input_ids) 38 | loss, prediction_scores = outputs[:2] 39 | logit_prob = softmax(prediction_scores[0, maskpos]).data.tolist() 40 | predicted_index = torch.argmax(prediction_scores[0, maskpos]).item() 41 | predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] 42 | print(predicted_token,logit_prob[predicted_index]) 43 | ``` 44 | Result: `心 0.995713472366333` 45 | -------------------------------------------------------------------------------- /notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Transformers Notebooks 2 | 3 | You can find here a list of the official notebooks provided by Hugging Face. 4 | 5 | Also, we would like to list here interesting content created by the community. 
6 | If you wrote some notebook(s) leveraging transformers and would like be listed here, please open a 7 | Pull Request and we'll review it so it can be included here. 8 | 9 | 10 | ## Hugging Face's notebooks :hugs: 11 | 12 | | Notebook | Description | | 13 | |:----------|:-------------:|------:| 14 | | [Getting Started Tokenizers](01-training-tokenizers.ipynb) | How to train and use your very own tokenizer |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/01-training-tokenizers.ipynb) | 15 | | [Getting Started Transformers](02-transformers.ipynb) | How to easily start using transformers | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/02-transformers.ipynb) | 16 | | [How to use Pipelines](03-pipelines.ipynb) | Simple and efficient way to use State-of-the-Art models on downstream tasks through transformers | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/03-pipelines.ipynb) | 17 | | [How to train a language model](https://github.com/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)| Highlight all the steps to effectively train Transformer model on custom data | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)| 18 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | ensure_newline_before_comments = True 3 | force_grid_wrap = 0 4 | include_trailing_comma = True 5 | known_first_party = transformers 6 | known_third_party = 7 | absl 8 | fairseq 9 | fastprogress 10 | git 11 | h5py 12 | MeCab 13 | nltk 14 | numpy 15 | packaging 16 | PIL 17 | psutil 18 | pytorch_lightning 19 | seqeval 20 | sklearn 21 | tensorboardX 22 | tensorflow 23 | tensorflow_datasets 24 | torch 25 | torchtext 26 | torchvision 27 | torch_xla 28 | 29 | line_length = 119 30 | lines_after_imports = 2 31 | multi_line_output = 3 32 | use_parentheses = True 33 | 34 | [flake8] 35 | ignore = E203, E501, W503 36 | max-line-length = 119 37 | -------------------------------------------------------------------------------- /src/transformers/activations.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | 7 | def swish(x): 8 | return x * torch.sigmoid(x) 9 | 10 | 11 | def _gelu_python(x): 12 | """ Original Implementation of the gelu activation function in Google Bert repo when initially created. 13 | For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 14 | 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 15 | This is now written in C in torch.nn.functional 16 | Also see https://arxiv.org/abs/1606.08415 17 | """ 18 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 19 | 20 | 21 | if torch.__version__ < "1.4.0": 22 | gelu = _gelu_python 23 | else: 24 | gelu = F.gelu 25 | 26 | 27 | def gelu_new(x): 28 | """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). 
29 | Also see https://arxiv.org/abs/1606.08415 30 | """ 31 | return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 32 | 33 | 34 | ACT2FN = { 35 | "relu": F.relu, 36 | "swish": swish, 37 | "gelu": gelu, 38 | "tanh": F.tanh, 39 | "gelu_new": gelu_new, 40 | } 41 | 42 | 43 | def get_activation(activation_string): 44 | if activation_string in ACT2FN: 45 | return ACT2FN[activation_string] 46 | else: 47 | raise KeyError( 48 | "function {} not found in ACT2FN mapping {} or torch.nn.functional".format( 49 | activation_string, list(ACT2FN.keys()) 50 | ) 51 | ) 52 | -------------------------------------------------------------------------------- /src/transformers/commands/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from argparse import ArgumentParser 3 | 4 | 5 | class BaseTransformersCLICommand(ABC): 6 | @staticmethod 7 | @abstractmethod 8 | def register_subcommand(parser: ArgumentParser): 9 | raise NotImplementedError() 10 | 11 | @abstractmethod 12 | def run(self): 13 | raise NotImplementedError() 14 | -------------------------------------------------------------------------------- /src/transformers/commands/download.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | from transformers.commands import BaseTransformersCLICommand 4 | 5 | 6 | def download_command_factory(args): 7 | return DownloadCommand(args.model, args.cache_dir, args.force) 8 | 9 | 10 | class DownloadCommand(BaseTransformersCLICommand): 11 | @staticmethod 12 | def register_subcommand(parser: ArgumentParser): 13 | download_parser = parser.add_parser("download") 14 | download_parser.add_argument( 15 | "--cache-dir", type=str, default=None, help="Path to location to store the models" 16 | ) 17 | download_parser.add_argument( 18 | "--force", action="store_true", help="Force the model to be download even if already in cache-dir" 19 | ) 20 | download_parser.add_argument("model", type=str, help="Name of the model to download") 21 | download_parser.set_defaults(func=download_command_factory) 22 | 23 | def __init__(self, model: str, cache: str, force: bool): 24 | self._model = model 25 | self._cache = cache 26 | self._force = force 27 | 28 | def run(self): 29 | from transformers import AutoModel, AutoTokenizer 30 | 31 | AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 32 | AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 33 | -------------------------------------------------------------------------------- /src/transformers/commands/env.py: -------------------------------------------------------------------------------- 1 | import platform 2 | from argparse import ArgumentParser 3 | 4 | from transformers import __version__ as version 5 | from transformers import is_tf_available, is_torch_available 6 | from transformers.commands import BaseTransformersCLICommand 7 | 8 | 9 | def info_command_factory(_): 10 | return EnvironmentCommand() 11 | 12 | 13 | class EnvironmentCommand(BaseTransformersCLICommand): 14 | @staticmethod 15 | def register_subcommand(parser: ArgumentParser): 16 | download_parser = parser.add_parser("env") 17 | download_parser.set_defaults(func=info_command_factory) 18 | 19 | def run(self): 20 | pt_version = "not installed" 21 | pt_cuda_available = "NA" 22 | if is_torch_available(): 23 | import torch 24 | 25 | pt_version = 
torch.__version__ 26 | pt_cuda_available = torch.cuda.is_available() 27 | 28 | tf_version = "not installed" 29 | tf_cuda_available = "NA" 30 | if is_tf_available(): 31 | import tensorflow as tf 32 | 33 | tf_version = tf.__version__ 34 | try: 35 | # deprecated in v2.1 36 | tf_cuda_available = tf.test.is_gpu_available() 37 | except AttributeError: 38 | # returns list of devices, convert to bool 39 | tf_cuda_available = bool(tf.config.list_physical_devices("GPU")) 40 | 41 | info = { 42 | "`transformers` version": version, 43 | "Platform": platform.platform(), 44 | "Python version": platform.python_version(), 45 | "PyTorch version (GPU?)": "{} ({})".format(pt_version, pt_cuda_available), 46 | "Tensorflow version (GPU?)": "{} ({})".format(tf_version, tf_cuda_available), 47 | "Using GPU in script?": "", 48 | "Using distributed or parallel set-up in script?": "", 49 | } 50 | 51 | print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n") 52 | print(self.format_dict(info)) 53 | 54 | return info 55 | 56 | @staticmethod 57 | def format_dict(d): 58 | return "\n".join(["- {}: {}".format(prop, val) for prop, val in d.items()]) + "\n" 59 | -------------------------------------------------------------------------------- /src/transformers/configuration_camembert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ CamemBERT configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_roberta import RobertaConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json", 28 | "umberto-commoncrawl-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-commoncrawl-cased-v1/config.json", 29 | "umberto-wikipedia-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-wikipedia-uncased-v1/config.json", 30 | } 31 | 32 | 33 | class CamembertConfig(RobertaConfig): 34 | """ 35 | This class overrides :class:`~transformers.RobertaConfig`. Please check the 36 | superclass for the appropriate documentation alongside usage examples. 37 | """ 38 | 39 | pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 40 | model_type = "camembert" 41 | -------------------------------------------------------------------------------- /src/transformers/configuration_mmbt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # Copyright (c) HuggingFace Inc. team. 
4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ MMBT configuration """ 17 | 18 | 19 | import logging 20 | 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | class MMBTConfig(object): 26 | """Configuration class to store the configuration of a `MMBT Model`. 27 | 28 | Args: 29 | config (:obj:`~transformers.PreTrainedConfig`): 30 | Config of the underlying Transformer models. Its values are 31 | copied over to use a single config. 32 | num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`): 33 | Size of final Linear layer for classification. 34 | modal_hidden_size (:obj:`int`, optional, defautls to 2048): 35 | Embedding dimension of the non-text modality encoder. 36 | """ 37 | 38 | def __init__(self, config, num_labels=None, modal_hidden_size=2048): 39 | self.__dict__ = config.__dict__ 40 | self.modal_hidden_size = modal_hidden_size 41 | if num_labels: 42 | self.num_labels = num_labels 43 | -------------------------------------------------------------------------------- /src/transformers/configuration_xlm_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """ XLM-RoBERTa configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_roberta import RobertaConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json", 28 | "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json", 29 | "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json", 30 | "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json", 31 | "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json", 32 | "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json", 33 | } 34 | 35 | 36 | class XLMRobertaConfig(RobertaConfig): 37 | """ 38 | This class overrides :class:`~transformers.RobertaConfig`. Please check the 39 | superclass for the appropriate documentation alongside usage examples. 40 | """ 41 | 42 | pretrained_config_archive_map = XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 43 | model_type = "xlm-roberta" 44 | -------------------------------------------------------------------------------- /src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert ALBERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = AlbertConfig.from_json_file(albert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = AlbertForMaskedLM(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_albert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 
48 | ) 49 | parser.add_argument( 50 | "--albert_config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained ALBERT model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /src/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = BertConfig.from_json_file(bert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = BertForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_bert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--bert_config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained BERT model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 
59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /src/transformers/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | 6 | from transformers.file_utils import WEIGHTS_NAME 7 | 8 | 9 | DIALOGPT_MODELS = ["small", "medium", "large"] 10 | 11 | OLD_KEY = "lm_head.decoder.weight" 12 | NEW_KEY = "lm_head.weight" 13 | 14 | 15 | def convert_dialogpt_checkpoint(checkpoint_path: str, pytorch_dump_folder_path: str): 16 | d = torch.load(checkpoint_path) 17 | d[NEW_KEY] = d.pop(OLD_KEY) 18 | os.makedirs(pytorch_dump_folder_path, exist_ok=True) 19 | torch.save(d, os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--dialogpt_path", default=".", type=str) 25 | args = parser.parse_args() 26 | for MODEL in DIALOGPT_MODELS: 27 | checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl") 28 | pytorch_dump_folder_path = f"./DialoGPT-{MODEL}" 29 | convert_dialogpt_checkpoint( 30 | checkpoint_path, pytorch_dump_folder_path, 31 | ) 32 | -------------------------------------------------------------------------------- /src/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
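# Example invocation (added for illustration; the paths below are placeholders, not files shipped with the repository):
#
#   python convert_gpt2_original_tf_checkpoint_to_pytorch.py \
#       --gpt2_checkpoint_path /path/to/gpt2/tf_checkpoint \
#       --gpt2_config_file /path/to/gpt2/config.json \
#       --pytorch_dump_folder_path /path/to/output_dir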
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import CONFIG_NAME, WEIGHTS_NAME, GPT2Config, GPT2Model, load_tf_weights_in_gpt2 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): 30 | # Construct model 31 | if gpt2_config_file == "": 32 | config = GPT2Config() 33 | else: 34 | config = GPT2Config.from_json_file(gpt2_config_file) 35 | model = GPT2Model(config) 36 | 37 | # Load weights from numpy 38 | load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) 39 | 40 | # Save pytorch-model 41 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 42 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 43 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 44 | torch.save(model.state_dict(), pytorch_weights_dump_path) 45 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 46 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 47 | f.write(config.to_json_string()) 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | # Required parameters 53 | parser.add_argument( 54 | "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 55 | ) 56 | parser.add_argument( 57 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 58 | ) 59 | parser.add_argument( 60 | "--gpt2_config_file", 61 | default="", 62 | type=str, 63 | help="An optional config json file corresponding to the pre-trained OpenAI model. \n" 64 | "This specifies the model architecture.", 65 | ) 66 | args = parser.parse_args() 67 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path) 68 | -------------------------------------------------------------------------------- /src/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import CONFIG_NAME, WEIGHTS_NAME, OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): 30 | # Construct model 31 | if openai_config_file == "": 32 | config = OpenAIGPTConfig() 33 | else: 34 | config = OpenAIGPTConfig.from_json_file(openai_config_file) 35 | model = OpenAIGPTModel(config) 36 | 37 | # Load weights from numpy 38 | load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path) 39 | 40 | # Save pytorch-model 41 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 42 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 43 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 44 | torch.save(model.state_dict(), pytorch_weights_dump_path) 45 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 46 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 47 | f.write(config.to_json_string()) 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | # Required parameters 53 | parser.add_argument( 54 | "--openai_checkpoint_folder_path", 55 | default=None, 56 | type=str, 57 | required=True, 58 | help="Path to the TensorFlow checkpoint path.", 59 | ) 60 | parser.add_argument( 61 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 62 | ) 63 | parser.add_argument( 64 | "--openai_config_file", 65 | default="", 66 | type=str, 67 | help="An optional config json file corresponding to the pre-trained OpenAI model. \n" 68 | "This specifies the model architecture.", 69 | ) 70 | args = parser.parse_args() 71 | convert_openai_checkpoint_to_pytorch( 72 | args.openai_checkpoint_folder_path, args.openai_config_file, args.pytorch_dump_folder_path 73 | ) 74 | -------------------------------------------------------------------------------- /src/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The T5 authors and HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert T5 checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import T5Config, T5Model, load_tf_weights_in_t5 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = T5Config.from_json_file(config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = T5Model(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_t5(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained T5 model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /src/transformers/data/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | from .metrics import is_sklearn_available 6 | from .processors import ( 7 | DataProcessor, 8 | InputExample, 9 | InputFeatures, 10 | SingleSentenceClassificationProcessor, 11 | SquadExample, 12 | SquadFeatures, 13 | SquadV1Processor, 14 | SquadV2Processor, 15 | glue_convert_examples_to_features, 16 | glue_output_modes, 17 | glue_processors, 18 | glue_tasks_num_labels, 19 | squad_convert_examples_to_features, 20 | xnli_output_modes, 21 | xnli_processors, 22 | xnli_tasks_num_labels, 23 | ) 24 | 25 | 26 | if is_sklearn_available(): 27 | from .metrics import glue_compute_metrics, xnli_compute_metrics 28 | -------------------------------------------------------------------------------- /src/transformers/data/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 
4 | 5 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels 6 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features 7 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor 8 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels 9 | -------------------------------------------------------------------------------- /src/transformers/tokenization_bart.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from .tokenization_roberta import RobertaTokenizer 17 | 18 | 19 | # vocab and merges same as roberta 20 | vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json" 21 | merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt" 22 | _all_bart_models = ["bart-large", "bart-large-mnli", "bart-large-cnn"] 23 | 24 | 25 | class BartTokenizer(RobertaTokenizer): 26 | # merges and vocab same as Roberta 27 | max_model_input_sizes = {m: 1024 for m in _all_bart_models} 28 | pretrained_vocab_files_map = { 29 | "vocab_file": {m: vocab_url for m in _all_bart_models}, 30 | "merges_file": {m: merges_url for m in _all_bart_models}, 31 | } 32 | -------------------------------------------------------------------------------- /src/transformers/utils_encoder_decoder.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Classes to support Encoder-Decoder architectures """ 16 | 17 | 18 | def prepare_encoder_decoder_model_kwargs(**kwargs): 19 | """ Prepare the encoder and decoder's keyword arguments. 20 | 21 | Keyword arguments come in 3 flavors: 22 | - encoder-specific (prefixed by `encoder_`) 23 | - decoder-specific (prefixed by `decoder_`) 24 | - those that apply to the model as whole. 25 | 26 | We let the specific kwargs override the common ones in case of 27 | conflict. 
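        Illustration (the variable names below are made up): a call such as
        ``prepare_encoder_decoder_model_kwargs(input_ids=ids, attention_mask=mask, decoder_input_ids=dec_ids)``
        returns ``encoder_kwargs == {"input_ids": ids, "attention_mask": mask}`` and
        ``decoder_kwargs == {"input_ids": dec_ids, "attention_mask": mask, "encoder_attention_mask": mask}``.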
28 | """ 29 | 30 | kwargs_common = { 31 | argument: value 32 | for argument, value in kwargs.items() 33 | if not argument.startswith("encoder_") and not argument.startswith("decoder_") 34 | } 35 | if "input_ids" in kwargs_common: 36 | kwargs["encoder_input_ids"] = kwargs_common.pop("input_ids") 37 | 38 | decoder_kwargs = kwargs_common.copy() 39 | encoder_kwargs = kwargs_common.copy() 40 | encoder_kwargs.update( 41 | {argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")} 42 | ) 43 | decoder_kwargs.update( 44 | {argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")} 45 | ) 46 | decoder_kwargs["encoder_attention_mask"] = encoder_kwargs.get("attention_mask", None) 47 | return encoder_kwargs, decoder_kwargs 48 | -------------------------------------------------------------------------------- /templates/adding_a_new_example_script/README.md: -------------------------------------------------------------------------------- 1 | # How to add a new example script in 🤗Transformers 2 | 3 | This folder provide a template for adding a new example script implementing a training or inference task with the models in the 🤗Transformers library. 4 | 5 | Currently only examples for PyTorch are provided which are adaptations of the library's SQuAD examples which implement single-GPU and distributed training with gradient accumulation and mixed-precision (using NVIDIA's apex library) to cover a reasonable range of use cases. 6 | -------------------------------------------------------------------------------- /templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert XXX checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import XxxConfig, XxxForPreTraining, load_tf_weights_in_xxx 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = XxxConfig.from_json_file(config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = XxxForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_xxx(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 
48 | ) 49 | parser.add_argument( 50 | "--config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /templates/adding_a_new_model/tests/test_tokenization_xxx.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 XXX Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import os 18 | import unittest 19 | 20 | from transformers.tokenization_bert import VOCAB_FILES_NAMES, XxxTokenizer 21 | 22 | from .test_tokenization_common import TokenizerTesterMixin 23 | 24 | 25 | class XxxTokenizationTest(TokenizerTesterMixin, unittest.TestCase): 26 | 27 | tokenizer_class = XxxTokenizer 28 | 29 | def setUp(self): 30 | super().setUp() 31 | 32 | vocab_tokens = [ 33 | "[UNK]", 34 | "[CLS]", 35 | "[SEP]", 36 | "want", 37 | "##want", 38 | "##ed", 39 | "wa", 40 | "un", 41 | "runn", 42 | "##ing", 43 | ",", 44 | "low", 45 | "lowest", 46 | ] 47 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) 48 | with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: 49 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) 50 | 51 | def get_tokenizer(self, **kwargs): 52 | return XxxTokenizer.from_pretrained(self.tmpdirname, **kwargs) 53 | 54 | def get_input_output_texts(self): 55 | input_text = "UNwant\u00E9d,running" 56 | output_text = "unwanted, running" 57 | return input_text, output_text 58 | 59 | def test_full_tokenizer(self): 60 | tokenizer = self.tokenizer_class(self.vocab_file) 61 | 62 | tokens = tokenizer.tokenize("UNwant\u00E9d,running") 63 | self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) 64 | self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) 65 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/tests/__init__.py -------------------------------------------------------------------------------- /tests/fixtures/dummy-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "roberta" 3 | } -------------------------------------------------------------------------------- /tests/fixtures/empty.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/tests/fixtures/empty.txt -------------------------------------------------------------------------------- /tests/fixtures/input.txt: -------------------------------------------------------------------------------- 1 | Who was Jim Henson ? ||| Jim Henson was a puppeteer 2 | -------------------------------------------------------------------------------- /tests/fixtures/spiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/tests/fixtures/spiece.model -------------------------------------------------------------------------------- /tests/fixtures/test_sentencepiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uds-lsv/bert-stable-fine-tuning/3cf27e8667aee9d1c822d747c63ce331b231f283/tests/fixtures/test_sentencepiece.model -------------------------------------------------------------------------------- /tests/test_activations.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from transformers import is_torch_available 4 | 5 | from .utils import require_torch 6 | 7 | 8 | if is_torch_available(): 9 | from transformers.activations import _gelu_python, get_activation, gelu_new 10 | import torch 11 | 12 | 13 | @require_torch 14 | class TestActivations(unittest.TestCase): 15 | def test_gelu_versions(self): 16 | x = torch.Tensor([-100, -1, -0.1, 0, 0.1, 1.0, 100]) 17 | torch_builtin = get_activation("gelu") 18 | self.assertTrue(torch.eq(_gelu_python(x), torch_builtin(x)).all().item()) 19 | self.assertFalse(torch.eq(_gelu_python(x), gelu_new(x)).all().item()) 20 | 21 | def test_get_activation(self): 22 | get_activation("swish") 23 | get_activation("relu") 24 | get_activation("tanh") 25 | with self.assertRaises(KeyError): 26 | get_activation("bogus") 27 | with self.assertRaises(KeyError): 28 | get_activation(None) 29 | -------------------------------------------------------------------------------- /tests/test_configuration_auto.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import os 17 | import unittest 18 | 19 | from transformers.configuration_auto import CONFIG_MAPPING, AutoConfig 20 | from transformers.configuration_bert import BertConfig 21 | from transformers.configuration_roberta import RobertaConfig 22 | 23 | from .utils import DUMMY_UNKWOWN_IDENTIFIER 24 | 25 | 26 | SAMPLE_ROBERTA_CONFIG = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/dummy-config.json") 27 | 28 | 29 | class AutoConfigTest(unittest.TestCase): 30 | def test_config_from_model_shortcut(self): 31 | config = AutoConfig.from_pretrained("bert-base-uncased") 32 | self.assertIsInstance(config, BertConfig) 33 | 34 | def test_config_model_type_from_local_file(self): 35 | config = AutoConfig.from_pretrained(SAMPLE_ROBERTA_CONFIG) 36 | self.assertIsInstance(config, RobertaConfig) 37 | 38 | def test_config_model_type_from_model_identifier(self): 39 | config = AutoConfig.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER) 40 | self.assertIsInstance(config, RobertaConfig) 41 | 42 | def test_config_for_model_str(self): 43 | config = AutoConfig.for_model("roberta") 44 | self.assertIsInstance(config, RobertaConfig) 45 | 46 | def test_pattern_matching_fallback(self): 47 | """ 48 | In cases where config.json doesn't include a model_type, 49 | perform a few safety checks on the config mapping's order. 50 | """ 51 | # no key string should be included in a later key string (typical failure case) 52 | keys = list(CONFIG_MAPPING.keys()) 53 | for i, key in enumerate(keys): 54 | self.assertFalse(any(key in later_key for later_key in keys[i + 1 :])) 55 | -------------------------------------------------------------------------------- /tests/test_tokenization_ctrl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Salesforce and HuggingFace Inc. team. 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import json 17 | import os 18 | import unittest 19 | 20 | from transformers.tokenization_ctrl import VOCAB_FILES_NAMES, CTRLTokenizer 21 | 22 | from .test_tokenization_common import TokenizerTesterMixin 23 | 24 | 25 | class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase): 26 | 27 | tokenizer_class = CTRLTokenizer 28 | 29 | def setUp(self): 30 | super().setUp() 31 | 32 | # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt 33 | vocab = ["adapt", "re@@", "a@@", "apt", "c@@", "t", "<unk>"] 34 | vocab_tokens = dict(zip(vocab, range(len(vocab)))) 35 | merges = ["#version: 0.2", "a p", "ap t", "r e", "a d", "ad apt", ""] 36 | self.special_tokens_map = {"unk_token": "<unk>"} 37 | 38 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) 39 | self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) 40 | with open(self.vocab_file, "w", encoding="utf-8") as fp: 41 | fp.write(json.dumps(vocab_tokens) + "\n") 42 | with open(self.merges_file, "w", encoding="utf-8") as fp: 43 | fp.write("\n".join(merges)) 44 | 45 | def get_tokenizer(self, **kwargs): 46 | kwargs.update(self.special_tokens_map) 47 | return CTRLTokenizer.from_pretrained(self.tmpdirname, **kwargs) 48 | 49 | def get_input_output_texts(self): 50 | input_text = "adapt react readapt apt" 51 | output_text = "adapt react readapt apt" 52 | return input_text, output_text 53 | 54 | def test_full_tokenizer(self): 55 | tokenizer = CTRLTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) 56 | text = "adapt react readapt apt" 57 | bpe_tokens = "adapt re@@ a@@ c@@ t re@@ adapt apt".split() 58 | tokens = tokenizer.tokenize(text) 59 | self.assertListEqual(tokens, bpe_tokens) 60 | 61 | input_tokens = tokens + [tokenizer.unk_token] 62 | 63 | input_bpe_tokens = [0, 1, 2, 4, 5, 1, 0, 3, 6] 64 | self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) 65 | -------------------------------------------------------------------------------- /tests/test_tokenization_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | from transformers.tokenization_distilbert import DistilBertTokenizer 18 | 19 | from .test_tokenization_bert import BertTokenizationTest 20 | from .utils import slow 21 | 22 | 23 | class DistilBertTokenizationTest(BertTokenizationTest): 24 | 25 | tokenizer_class = DistilBertTokenizer 26 | 27 | def get_tokenizer(self, **kwargs): 28 | return DistilBertTokenizer.from_pretrained(self.tmpdirname, **kwargs) 29 | 30 | @slow 31 | def test_sequence_builders(self): 32 | tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") 33 | 34 | text = tokenizer.encode("sequence builders", add_special_tokens=False) 35 | text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) 36 | 37 | encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) 38 | encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) 39 | 40 | assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] 41 | assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [ 42 | tokenizer.sep_token_id 43 | ] 44 | -------------------------------------------------------------------------------- /tests/test_tokenization_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 HuggingFace Inc.. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | import unittest 18 | 19 | from transformers import PreTrainedTokenizer 20 | from transformers.tokenization_gpt2 import GPT2Tokenizer 21 | 22 | from .utils import slow 23 | 24 | 25 | class TokenizerUtilsTest(unittest.TestCase): 26 | def check_tokenizer_from_pretrained(self, tokenizer_class): 27 | s3_models = list(tokenizer_class.max_model_input_sizes.keys()) 28 | for model_name in s3_models[:1]: 29 | tokenizer = tokenizer_class.from_pretrained(model_name) 30 | self.assertIsNotNone(tokenizer) 31 | self.assertIsInstance(tokenizer, tokenizer_class) 32 | self.assertIsInstance(tokenizer, PreTrainedTokenizer) 33 | 34 | for special_tok in tokenizer.all_special_tokens: 35 | self.assertIsInstance(special_tok, str) 36 | special_tok_id = tokenizer.convert_tokens_to_ids(special_tok) 37 | self.assertIsInstance(special_tok_id, int) 38 | 39 | @slow 40 | def test_pretrained_tokenizers(self): 41 | self.check_tokenizer_from_pretrained(GPT2Tokenizer) 42 | -------------------------------------------------------------------------------- /transformers-cli: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from argparse import ArgumentParser 3 | 4 | from transformers.commands.convert import ConvertCommand 5 | from transformers.commands.download import DownloadCommand 6 | from transformers.commands.env import EnvironmentCommand 7 | from transformers.commands.run import RunCommand 8 | from transformers.commands.serving import ServeCommand 9 | from transformers.commands.user import UserCommands 10 | 11 | if __name__ == '__main__': 12 | parser = ArgumentParser('Transformers CLI tool', usage='transformers-cli <command> [<args>]') 13 | commands_parser = parser.add_subparsers(help='transformers-cli command helpers') 14 | 15 | # Register commands 16 | ConvertCommand.register_subcommand(commands_parser) 17 | DownloadCommand.register_subcommand(commands_parser) 18 | EnvironmentCommand.register_subcommand(commands_parser) 19 | RunCommand.register_subcommand(commands_parser) 20 | ServeCommand.register_subcommand(commands_parser) 21 | UserCommands.register_subcommand(commands_parser) 22 | 23 | # Let's go 24 | args = parser.parse_args() 25 | 26 | if not hasattr(args, 'func'): 27 | parser.print_help() 28 | exit(1) 29 | 30 | # Run 31 | service = args.func(args) 32 | service.run() 33 | -------------------------------------------------------------------------------- /utils/link_tester.py: -------------------------------------------------------------------------------- 1 | """ Link tester. 2 | 3 | This little utility reads all the python files in the repository, 4 | scans for links pointing to S3 and tests the links one by one. Raises an error 5 | at the end of the scan if at least one link was reported broken. 6 | """ 7 | import os 8 | import re 9 | import sys 10 | 11 | import requests 12 | 13 | 14 | REGEXP_FIND_S3_LINKS = r"""([\"'])(https:\/\/s3)(.*)?\1""" 15 | 16 | 17 | S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert" 18 | 19 | 20 | def list_python_files_in_repository(): 21 | """ List all python files in the repository. 22 | 23 | This function assumes that the script is executed in the root folder. 
24 | """ 25 | source_code_files = [] 26 | for path, subdirs, files in os.walk("."): 27 | if "templates" in path: 28 | continue 29 | for name in files: 30 | if ".py" in name and ".pyc" not in name: 31 | path_to_files = os.path.join(path, name) 32 | source_code_files.append(path_to_files) 33 | 34 | return source_code_files 35 | 36 | 37 | def find_all_links(file_paths): 38 | links = [] 39 | for path in file_paths: 40 | links += scan_code_for_links(path) 41 | 42 | return [link for link in links if link != S3_BUCKET_PREFIX] 43 | 44 | 45 | def scan_code_for_links(source): 46 | """ Scans the file to find links using a regular expression. 47 | Returns a list of links. 48 | """ 49 | with open(source, "r") as content: 50 | content = content.read() 51 | raw_links = re.findall(REGEXP_FIND_S3_LINKS, content) 52 | links = [prefix + suffix for _, prefix, suffix in raw_links] 53 | 54 | return links 55 | 56 | 57 | def check_all_links(links): 58 | """ Check that the provided links are valid. 59 | 60 | Links are considered valid if a HEAD request to the server 61 | returns a 200 status code. 62 | """ 63 | broken_links = [] 64 | for link in links: 65 | head = requests.head(link) 66 | if head.status_code != 200: 67 | broken_links.append(link) 68 | 69 | return broken_links 70 | 71 | 72 | if __name__ == "__main__": 73 | file_paths = list_python_files_in_repository() 74 | links = find_all_links(file_paths) 75 | broken_links = check_all_links(links) 76 | print("Looking for broken links to pre-trained models/configs/tokenizers...") 77 | if broken_links: 78 | print("The following links did not respond:") 79 | for link in broken_links: 80 | print("- {}".format(link)) 81 | sys.exit(1) 82 | print("All links are ok.") 83 | --------------------------------------------------------------------------------