├── finetune_gpt2 ├── examples │ ├── benchmarking │ │ └── requirements.txt │ ├── research_projects │ │ ├── bertabs │ │ │ ├── __init__.py │ │ │ └── requirements.txt │ │ ├── deebert │ │ │ ├── src │ │ │ │ └── __init__.py │ │ │ ├── requirements.txt │ │ │ ├── eval_deebert.sh │ │ │ ├── entropy_eval.sh │ │ │ └── train_deebert.sh │ │ ├── bert-loses-patience │ │ │ ├── pabee │ │ │ │ └── __init__.py │ │ │ └── requirements.txt │ │ ├── bertology │ │ │ └── requirements.txt │ │ ├── adversarial │ │ │ ├── requirements.txt │ │ │ └── README.md │ │ ├── longform-qa │ │ │ ├── requirements.txt │ │ │ └── README.md │ │ ├── mlm_wwm │ │ │ └── requirements.txt │ │ ├── rag │ │ │ ├── __init__.py │ │ │ ├── requirements.txt │ │ │ ├── finetune_rag.sh │ │ │ ├── finetune_rag_ray.sh │ │ │ └── parse_dpr_relevance_data.py │ │ ├── pplm │ │ │ ├── imgs │ │ │ │ ├── wooly.png │ │ │ │ └── headfigure.png │ │ │ ├── requirements.txt │ │ │ └── pplm_classification_head.py │ │ ├── wav2vec2 │ │ │ ├── requirements.txt │ │ │ ├── finetune_base_100.sh │ │ │ ├── finetune_large_lv60_100.sh │ │ │ ├── finetune_base_timit_asr.sh │ │ │ ├── finetune_large_lv60_timit_asr.sh │ │ │ ├── finetune_wav2vec2_xlsr_turkish.sh │ │ │ ├── finetune_large_xlsr_53_arabic_speech_corpus.sh │ │ │ └── vocab │ │ │ │ └── buckwalter.json │ │ ├── distillation │ │ │ ├── requirements.txt │ │ │ └── training_configs │ │ │ │ ├── distilgpt2.json │ │ │ │ ├── distilbert-base-cased.json │ │ │ │ ├── distilbert-base-uncased.json │ │ │ │ ├── distilbert-base-multilingual-cased.json │ │ │ │ └── distilroberta-base.json │ │ ├── movement-pruning │ │ │ ├── emmental │ │ │ │ ├── modules │ │ │ │ │ └── __init__.py │ │ │ │ └── __init__.py │ │ │ └── requirements.txt │ │ ├── lxmert │ │ │ └── README.md │ │ ├── performer │ │ │ ├── full_script.sh │ │ │ ├── sanity_script.sh │ │ │ └── README.md │ │ ├── seq2seq-distillation │ │ │ ├── requirements.txt │ │ │ ├── finetune.sh │ │ │ ├── finetune_t5.sh │ │ │ ├── finetune_pegasus_xsum.sh │ │ │ ├── train_mbart_cc25_enro.sh │ │ │ ├── dynamic_bs_example.sh │ │ │ ├── sentence_splitter.py │ │ │ ├── distil_marian_no_teacher.sh │ │ │ ├── train_distilbart_cnn.sh │ │ │ ├── distil_marian_enro_teacher.sh │ │ │ ├── train_distilbart_xsum.sh │ │ │ └── finetune_bart_tiny.sh │ │ ├── mm-imdb │ │ │ └── README.md │ │ └── README.md │ ├── legacy │ │ ├── seq2seq │ │ │ ├── test_data │ │ │ │ ├── test_data │ │ │ │ ├── wmt_en_ro │ │ │ │ │ ├── val.len │ │ │ │ │ └── train.len │ │ │ │ └── fsmt │ │ │ │ │ └── build-eval-data.py │ │ │ ├── __init__.py │ │ │ ├── requirements.txt │ │ │ ├── finetune.sh │ │ │ ├── finetune_tpu.sh │ │ │ ├── minify_dataset.py │ │ │ ├── rouge_cli.py │ │ │ ├── sentence_splitter.py │ │ │ ├── convert_model_to_fp16.py │ │ │ ├── old_test_tatoeba_conversion.py │ │ │ └── train_mbart_cc25_enro.sh │ │ ├── pytorch-lightning │ │ │ ├── requirements.txt │ │ │ ├── run_glue.sh │ │ │ └── run_pos.sh │ │ ├── README.md │ │ └── token-classification │ │ │ ├── run_chunk.sh │ │ │ ├── run_pos.sh │ │ │ └── scripts │ │ │ └── preprocess.py │ ├── question-answering │ │ └── requirements.txt │ ├── multiple-choice │ │ └── requirements.txt │ ├── text-generation │ │ ├── requirements.txt │ │ └── README.md │ ├── token-classification │ │ ├── requirements.txt │ │ └── run.sh │ ├── language-modeling │ │ └── requirements.txt │ ├── text-classification │ │ └── requirements.txt │ ├── seq2seq │ │ └── requirements.txt │ ├── _tests_requirements.txt │ └── tests │ │ └── deepspeed │ │ └── ds_config.json ├── src │ ├── transformers │ │ ├── benchmark │ │ │ └── __init__.py │ │ ├── models │ │ │ ├── dialogpt │ │ │ │ └── 
__init__.py │ │ │ └── xlm_prophetnet │ │ │ │ ├── configuration_xlm_prophetnet.py │ │ │ │ └── __init__.py │ │ ├── sagemaker │ │ │ └── __init__.py │ │ ├── commands │ │ │ └── __init__.py │ │ ├── data │ │ │ ├── datasets │ │ │ │ └── __init__.py │ │ │ ├── processors │ │ │ │ └── __init__.py │ │ │ └── __init__.py │ │ └── utils │ │ │ └── dummy_flax_objects.py │ └── transformers.egg-info │ │ ├── dependency_links.txt │ │ ├── top_level.txt │ │ └── entry_points.txt ├── infinite_memory_transformer_sticky_mem │ └── config.json ├── infinite_memory_transformer │ └── config.json └── utils │ └── get_modified_files.py ├── document_grounded_generation ├── transformers │ ├── tests │ │ ├── __init__.py │ │ ├── test_pipelines_text2text_generation.py │ │ ├── test_pipelines_feature_extraction.py │ │ ├── test_pipelines_sentiment_analysis.py │ │ ├── test_cli.py │ │ └── test_activations_tf.py │ ├── MANIFEST.in │ ├── examples │ │ ├── benchmarking │ │ │ └── requirements.txt │ │ ├── research_projects │ │ │ ├── bertabs │ │ │ │ ├── __init__.py │ │ │ │ └── requirements.txt │ │ │ ├── deebert │ │ │ │ ├── src │ │ │ │ │ └── __init__.py │ │ │ │ ├── requirements.txt │ │ │ │ ├── eval_deebert.sh │ │ │ │ ├── entropy_eval.sh │ │ │ │ └── train_deebert.sh │ │ │ ├── bert-loses-patience │ │ │ │ ├── pabee │ │ │ │ │ └── __init__.py │ │ │ │ └── requirements.txt │ │ │ ├── adversarial │ │ │ │ └── requirements.txt │ │ │ ├── bertology │ │ │ │ └── requirements.txt │ │ │ ├── longform-qa │ │ │ │ ├── requirements.txt │ │ │ │ └── README.md │ │ │ ├── mlm_wwm │ │ │ │ └── requirements.txt │ │ │ ├── rag │ │ │ │ ├── __init__.py │ │ │ │ ├── requirements.txt │ │ │ │ ├── finetune_rag.sh │ │ │ │ ├── finetune_rag_ray.sh │ │ │ │ └── parse_dpr_relevance_data.py │ │ │ ├── wav2vec2 │ │ │ │ ├── requirements.txt │ │ │ │ ├── finetune_base_100.sh │ │ │ │ ├── finetune_large_lv60_100.sh │ │ │ │ ├── finetune_base_timit_asr.sh │ │ │ │ ├── finetune_large_lv60_timit_asr.sh │ │ │ │ ├── finetune_wav2vec2_xlsr_turkish.sh │ │ │ │ ├── finetune_large_xlsr_53_arabic_speech_corpus.sh │ │ │ │ └── vocab │ │ │ │ │ └── buckwalter.json │ │ │ ├── distillation │ │ │ │ ├── requirements.txt │ │ │ │ └── training_configs │ │ │ │ │ ├── distilgpt2.json │ │ │ │ │ ├── distilbert-base-cased.json │ │ │ │ │ ├── distilbert-base-uncased.json │ │ │ │ │ ├── distilbert-base-multilingual-cased.json │ │ │ │ │ └── distilroberta-base.json │ │ │ ├── pplm │ │ │ │ ├── imgs │ │ │ │ │ ├── wooly.png │ │ │ │ │ └── headfigure.png │ │ │ │ ├── requirements.txt │ │ │ │ └── pplm_classification_head.py │ │ │ ├── movement-pruning │ │ │ │ ├── emmental │ │ │ │ │ ├── modules │ │ │ │ │ │ └── __init__.py │ │ │ │ │ └── __init__.py │ │ │ │ └── requirements.txt │ │ │ ├── lxmert │ │ │ │ └── README.md │ │ │ ├── performer │ │ │ │ ├── full_script.sh │ │ │ │ ├── sanity_script.sh │ │ │ │ └── README.md │ │ │ ├── seq2seq-distillation │ │ │ │ ├── requirements.txt │ │ │ │ ├── finetune.sh │ │ │ │ ├── finetune_t5.sh │ │ │ │ ├── finetune_pegasus_xsum.sh │ │ │ │ ├── train_mbart_cc25_enro.sh │ │ │ │ ├── dynamic_bs_example.sh │ │ │ │ ├── sentence_splitter.py │ │ │ │ ├── distil_marian_no_teacher.sh │ │ │ │ ├── train_distilbart_cnn.sh │ │ │ │ ├── distil_marian_enro_teacher.sh │ │ │ │ ├── train_distilbart_xsum.sh │ │ │ │ └── finetune_bart_tiny.sh │ │ │ ├── mm-imdb │ │ │ │ └── README.md │ │ │ └── README.md │ │ ├── legacy │ │ │ ├── seq2seq │ │ │ │ ├── test_data │ │ │ │ │ ├── test_data │ │ │ │ │ ├── wmt_en_ro │ │ │ │ │ │ ├── val.len │ │ │ │ │ │ └── train.len │ │ │ │ │ └── fsmt │ │ │ │ │ │ └── build-eval-data.py │ │ │ │ ├── __init__.py │ │ │ │ ├── 
requirements.txt │ │ │ │ ├── finetune.sh │ │ │ │ ├── finetune_tpu.sh │ │ │ │ ├── minify_dataset.py │ │ │ │ ├── rouge_cli.py │ │ │ │ ├── sentence_splitter.py │ │ │ │ ├── convert_model_to_fp16.py │ │ │ │ ├── train_mbart_cc25_enro.sh │ │ │ │ └── old_test_tatoeba_conversion.py │ │ │ ├── pytorch-lightning │ │ │ │ ├── requirements.txt │ │ │ │ ├── run_glue.sh │ │ │ │ └── run_pos.sh │ │ │ ├── README.md │ │ │ └── token-classification │ │ │ │ ├── run_pos.sh │ │ │ │ ├── run_chunk.sh │ │ │ │ └── scripts │ │ │ │ └── preprocess.py │ │ ├── question-answering │ │ │ └── requirements.txt │ │ ├── token-classification │ │ │ ├── requirements.txt │ │ │ └── run.sh │ │ ├── multiple-choice │ │ │ └── requirements.txt │ │ ├── text-generation │ │ │ ├── requirements.txt │ │ │ └── README.md │ │ ├── language-modeling │ │ │ └── requirements.txt │ │ ├── text-classification │ │ │ └── requirements.txt │ │ ├── seq2seq │ │ │ └── requirements.txt │ │ ├── _tests_requirements.txt │ │ └── tests │ │ │ └── deepspeed │ │ │ └── ds_config.json │ ├── src │ │ └── transformers │ │ │ ├── benchmark │ │ │ └── __init__.py │ │ │ ├── models │ │ │ ├── dialogpt │ │ │ │ └── __init__.py │ │ │ ├── gpt2 │ │ │ │ └── pre_process_wmt19.py │ │ │ └── xlm_prophetnet │ │ │ │ ├── configuration_xlm_prophetnet.py │ │ │ │ └── __init__.py │ │ │ ├── sagemaker │ │ │ └── __init__.py │ │ │ ├── commands │ │ │ └── __init__.py │ │ │ ├── data │ │ │ ├── datasets │ │ │ │ └── __init__.py │ │ │ ├── processors │ │ │ │ └── __init__.py │ │ │ └── __init__.py │ │ │ └── utils │ │ │ └── dummy_flax_objects.py │ ├── docs │ │ ├── source │ │ │ ├── contributing.md │ │ │ ├── examples.md │ │ │ ├── notebooks.md │ │ │ ├── favicon.ico │ │ │ ├── imgs │ │ │ │ ├── ppl_full.gif │ │ │ │ ├── ppl_chunked.gif │ │ │ │ ├── ppl_sliding.gif │ │ │ │ ├── local_attention_mask.png │ │ │ │ ├── transformers_logo_name.png │ │ │ │ ├── transformers_overview.png │ │ │ │ ├── warmup_cosine_schedule.png │ │ │ │ ├── warmup_linear_schedule.png │ │ │ │ ├── warmup_constant_schedule.png │ │ │ │ ├── warmup_cosine_hard_restarts_schedule.png │ │ │ │ └── warmup_cosine_warm_restarts_schedule.png │ │ │ ├── _static │ │ │ │ └── css │ │ │ │ │ ├── Calibre-Light.ttf │ │ │ │ │ ├── Calibre-Thin.otf │ │ │ │ │ ├── Calibre-Medium.otf │ │ │ │ │ ├── Calibre-Regular.otf │ │ │ │ │ └── code-snippets.css │ │ │ └── main_classes │ │ │ │ └── configuration.rst │ │ └── Makefile │ ├── .gitattributes │ ├── pyproject.toml │ ├── .github │ │ ├── conda │ │ │ ├── build.sh │ │ │ └── meta.yaml │ │ ├── ISSUE_TEMPLATE │ │ │ ├── ---new-benchmark.md │ │ │ ├── --new-model-addition.md │ │ │ ├── question-help.md │ │ │ └── feature-request.md │ │ └── workflows │ │ │ ├── stale.yml │ │ │ ├── release-conda.yml │ │ │ └── github-torch-hub.yml │ ├── templates │ │ ├── adding_a_new_model │ │ │ ├── open_model_proposals │ │ │ │ └── README.md │ │ │ ├── tests │ │ │ │ ├── pt-seq-2-seq-bart-tokenizer.json │ │ │ │ ├── encoder-bert-tokenizer.json │ │ │ │ ├── pt-encoder-bert-tokenizer.json │ │ │ │ ├── standalone.json │ │ │ │ ├── tf-encoder-bert-tokenizer.json │ │ │ │ └── tf-seq-2-seq-bart-tokenizer.json │ │ │ ├── cookiecutter.json │ │ │ └── cookiecutter-template-{{cookiecutter.modelname}} │ │ │ │ └── configuration.json │ │ └── adding_a_new_example_script │ │ │ └── cookiecutter.json │ ├── .coveragerc │ ├── docker │ │ ├── transformers-pytorch-tpu │ │ │ ├── docker-entrypoint.sh │ │ │ ├── dataset.yaml │ │ │ └── bert-base-cased.jsonnet │ │ ├── transformers-pytorch-cpu │ │ │ └── Dockerfile │ │ ├── transformers-tensorflow-cpu │ │ │ └── Dockerfile │ │ ├── transformers-cpu │ │ │ └── 
Dockerfile │ │ ├── transformers-tensorflow-gpu │ │ │ └── Dockerfile │ │ ├── transformers-pytorch-gpu │ │ │ └── Dockerfile │ │ └── transformers-gpu │ │ │ └── Dockerfile │ ├── scripts │ │ ├── tatoeba │ │ │ └── upload_models.sh │ │ ├── fsmt │ │ │ └── tests-to-run.sh │ │ └── pegasus │ │ │ └── build_test_sample_spm_no_bos.py │ ├── setup.cfg │ └── model_cards │ │ └── README.md ├── requirements.txt └── test_special_tokens.py ├── requirements.txt └── sorting ├── run_sort_inftyformer.sh └── utils └── exp_utils.py /finetune_gpt2/examples/benchmarking/requirements.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finetune_gpt2/src/transformers/benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finetune_gpt2/src/transformers/models/dialogpt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/bertabs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/deebert/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finetune_gpt2/src/transformers.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | -------------------------------------------------------------------------------- /finetune_gpt2/src/transformers.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/test_data/test_data: -------------------------------------------------------------------------------- 1 | seq2seq/test_data -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/bert-loses-patience/pabee/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/benchmarking/requirements.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/src/transformers/benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /finetune_gpt2/examples/question-answering/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.2.1 2 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/bertabs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/src/transformers/models/dialogpt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/contributing.md: -------------------------------------------------------------------------------- 1 | ../../CONTRIBUTING.md -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/examples.md: -------------------------------------------------------------------------------- 1 | ../../examples/README.md -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/notebooks.md: -------------------------------------------------------------------------------- 1 | ../../notebooks/README.md -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/deebert/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/src/transformers/models/gpt2/pre_process_wmt19.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/bertology/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 2 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/deebert/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 2 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/multiple-choice/requirements.txt: -------------------------------------------------------------------------------- 1 | sentencepiece != 0.1.92 2 | protobuf 3 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/adversarial/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 2 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/bert-loses-patience/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 -------------------------------------------------------------------------------- /finetune_gpt2/examples/text-generation/requirements.txt: 
-------------------------------------------------------------------------------- 1 | sentencepiece != 0.1.92 2 | protobuf 3 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/token-classification/requirements.txt: -------------------------------------------------------------------------------- 1 | seqeval 2 | datasets >= 1.1.3 3 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/.gitattributes: -------------------------------------------------------------------------------- 1 | *.py eol=lf 2 | *.rst eol=lf 3 | *.md eol=lf -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/test_data/test_data: -------------------------------------------------------------------------------- 1 | seq2seq/test_data -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/question-answering/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.2.1 2 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/bert-loses-patience/pabee/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/adversarial/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 2 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/bertology/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 2 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/deebert/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 2 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/token-classification/requirements.txt: -------------------------------------------------------------------------------- 1 | seqeval 2 | datasets >= 1.1.3 3 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/language-modeling/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.1.3 2 | sentencepiece != 0.1.92 3 | protobuf 4 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/multiple-choice/requirements.txt: -------------------------------------------------------------------------------- 1 | sentencepiece != 0.1.92 2 | protobuf 3 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/bert-loses-patience/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 
-------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/text-generation/requirements.txt: -------------------------------------------------------------------------------- 1 | sentencepiece != 0.1.92 2 | protobuf 3 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 119 3 | target-version = ['py35'] 4 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/.github/conda/build.sh: -------------------------------------------------------------------------------- 1 | $PYTHON setup.py install # Python command to install the script. 2 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/bertabs/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 2 | 3 | # For ROUGE 4 | nltk 5 | py-rouge 6 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/longform-qa/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.1.3 2 | faiss-cpu 3 | streamlit 4 | elasticsearch 5 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/mlm_wwm/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.1.3 2 | sentencepiece != 0.1.92 3 | protobuf 4 | ltp 5 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/text-classification/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | datasets >= 1.1.3 3 | sentencepiece != 0.1.92 4 | protobuf 5 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/language-modeling/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.1.3 2 | sentencepiece != 0.1.92 3 | protobuf 4 | -------------------------------------------------------------------------------- /finetune_gpt2/src/transformers.egg-info/entry_points.txt: -------------------------------------------------------------------------------- 1 | [console_scripts] 2 | transformers-cli = transformers.commands.transformers_cli:main 3 | 4 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | sys.path.insert(1, os.path.dirname(os.path.realpath(__file__))) 6 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/seq2seq/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.1.3 2 | sentencepiece != 0.1.92 3 | protobuf 4 | sacrebleu >= 1.4.12 5 | rouge-score 6 | nltk -------------------------------------------------------------------------------- 
/document_grounded_generation/transformers/examples/research_projects/bertabs/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 2 | 3 | # For ROUGE 4 | nltk 5 | py-rouge 6 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/rag/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | sys.path.insert(1, os.path.dirname(os.path.realpath(__file__))) 6 | -------------------------------------------------------------------------------- /document_grounded_generation/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | pytorch-ignite 3 | transformers==2.5.1 4 | tensorboardX==1.8 5 | tensorflow # for tensorboardX 6 | spacy 7 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/longform-qa/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.1.3 2 | faiss-cpu 3 | streamlit 4 | elasticsearch 5 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/mlm_wwm/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.1.3 2 | sentencepiece != 0.1.92 3 | protobuf 4 | ltp 5 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/text-classification/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | datasets >= 1.1.3 3 | sentencepiece != 0.1.92 4 | protobuf 5 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/seq2seq/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.1.3 2 | sentencepiece != 0.1.92 3 | protobuf 4 | sacrebleu >= 1.4.12 5 | rouge-score 6 | nltk -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/pplm/imgs/wooly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/finetune_gpt2/examples/research_projects/pplm/imgs/wooly.png -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | sys.path.insert(1, os.path.dirname(os.path.realpath(__file__))) 6 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/wav2vec2/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | datasets 3 | torch>=1.5.0 4 | torchaudio 5 | jiwer==2.2.0 6 | lang-trans==0.6.0 7 | librosa==0.8.0 8 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/favicon.ico: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/favicon.ico -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/rag/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | sys.path.insert(1, os.path.dirname(os.path.realpath(__file__))) 6 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/test_data/wmt_en_ro/val.len: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/finetune_gpt2/examples/legacy/seq2seq/test_data/wmt_en_ro/val.len -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/pplm/imgs/headfigure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/finetune_gpt2/examples/research_projects/pplm/imgs/headfigure.png -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/rag/requirements.txt: -------------------------------------------------------------------------------- 1 | faiss-cpu >= 1.6.3 2 | datasets >= 1.0.1 3 | psutil >= 5.7.0 4 | torch >= 1.4.0 5 | transformers 6 | pytorch-lightning==1.0.4 7 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/test_data/wmt_en_ro/train.len: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/finetune_gpt2/examples/legacy/seq2seq/test_data/wmt_en_ro/train.len -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/distillation/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | 3 | gitpython==3.0.2 4 | tensorboard>=1.14.0 5 | tensorboardX==1.8 6 | psutil==5.6.6 7 | scipy>=1.4.1 8 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/imgs/ppl_full.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/ppl_full.gif -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/imgs/ppl_chunked.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/ppl_chunked.gif -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/imgs/ppl_sliding.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/ppl_sliding.gif -------------------------------------------------------------------------------- 
/document_grounded_generation/transformers/examples/research_projects/wav2vec2/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | datasets 3 | torch>=1.5.0 4 | torchaudio 5 | jiwer==2.2.0 6 | lang-trans==0.6.0 7 | librosa==0.8.0 8 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/templates/adding_a_new_model/open_model_proposals/README.md: -------------------------------------------------------------------------------- 1 | Currently the following model proposals are available: 2 | 3 | - [BigBird (Google)](./ADD_BIG_BIRD.md) 4 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/distillation/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | 3 | gitpython==3.0.2 4 | tensorboard>=1.14.0 5 | tensorboardX==1.8 6 | psutil==5.6.6 7 | scipy>=1.4.1 8 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/rag/requirements.txt: -------------------------------------------------------------------------------- 1 | faiss-cpu >= 1.6.3 2 | datasets >= 1.0.1 3 | psutil >= 5.7.0 4 | torch >= 1.4.0 5 | transformers 6 | pytorch-lightning==1.0.4 7 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/_static/css/Calibre-Light.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/_static/css/Calibre-Light.ttf -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/_static/css/Calibre-Thin.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/_static/css/Calibre-Thin.otf -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/imgs/local_attention_mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/local_attention_mask.png -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/movement-pruning/emmental/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .binarizer import MagnitudeBinarizer, ThresholdBinarizer, TopKBinarizer 3 | from .masked_nn import MaskedLinear 4 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/_static/css/Calibre-Medium.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/_static/css/Calibre-Medium.otf -------------------------------------------------------------------------------- 
/document_grounded_generation/transformers/docs/source/_static/css/Calibre-Regular.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/_static/css/Calibre-Regular.otf -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/imgs/transformers_logo_name.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/transformers_logo_name.png -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/imgs/transformers_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/transformers_overview.png -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/imgs/warmup_cosine_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/warmup_cosine_schedule.png -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/imgs/warmup_linear_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/warmup_linear_schedule.png -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/imgs/warmup_constant_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/warmup_constant_schedule.png -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/pplm/imgs/wooly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/examples/research_projects/pplm/imgs/wooly.png -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/test_data/wmt_en_ro/val.len: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/examples/legacy/seq2seq/test_data/wmt_en_ro/val.len -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/pplm/imgs/headfigure.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/examples/research_projects/pplm/imgs/headfigure.png -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/test_data/wmt_en_ro/train.len: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/examples/legacy/seq2seq/test_data/wmt_en_ro/train.len -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/movement-pruning/emmental/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .binarizer import MagnitudeBinarizer, ThresholdBinarizer, TopKBinarizer 3 | from .masked_nn import MaskedLinear 4 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/lxmert/README.md: -------------------------------------------------------------------------------- 1 | # LXMERT DEMO 2 | 3 | 1. make a virtualenv: ``virtualenv venv`` and activate ``source venv/bin/activate`` 4 | 2. install reqs: ``pip install -r ./requirements.txt`` 5 | 3. usage is as shown in demo.ipynb 6 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/movement-pruning/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.4.0 2 | -e git+https://github.com/huggingface/transformers.git@352d5472b0c1dec0f420d606d16747d851b4bda8#egg=transformers 3 | knockknock>=0.1.8.1 4 | h5py>=2.10.0 5 | numpy>=1.18.2 6 | scipy>=1.4.1 7 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/lxmert/README.md: -------------------------------------------------------------------------------- 1 | # LXMERT DEMO 2 | 3 | 1. make a virtualenv: ``virtualenv venv`` and activate ``source venv/bin/activate`` 4 | 2. install reqs: ``pip install -r ./requirements.txt`` 5 | 3. 
usage is as shown in demo.ipynb 6 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/distillation/training_configs/distilgpt2.json: -------------------------------------------------------------------------------- 1 | { 2 | "initializer_range": 0.02, 3 | "layer_norm_epsilon": 0.00001, 4 | "n_ctx": 1024, 5 | "n_embd": 768, 6 | "n_head": 12, 7 | "n_layer": 6, 8 | "n_positions": 1024, 9 | "vocab_size": 50257 10 | } -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/movement-pruning/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.4.0 2 | -e git+https://github.com/huggingface/transformers.git@352d5472b0c1dec0f420d606d16747d851b4bda8#egg=transformers 3 | knockknock>=0.1.8.1 4 | h5py>=2.10.0 5 | numpy>=1.18.2 6 | scipy>=1.4.1 7 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source=transformers 3 | omit = 4 | # skip convertion scripts from testing for now 5 | */convert_* 6 | */__main__.py 7 | [report] 8 | exclude_lines = 9 | pragma: no cover 10 | raise 11 | except 12 | register_parameter -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/distillation/training_configs/distilgpt2.json: -------------------------------------------------------------------------------- 1 | { 2 | "initializer_range": 0.02, 3 | "layer_norm_epsilon": 0.00001, 4 | "n_ctx": 1024, 5 | "n_embd": 768, 6 | "n_head": 12, 7 | "n_layer": 6, 8 | "n_positions": 1024, 9 | "vocab_size": 50257 10 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | click==8.0.3 2 | gensim==3.8.3 3 | ignite==1.1.0 4 | matplotlib==3.4.3 5 | numpy==1.21.3 6 | pytorch-ignite==0.4.7 7 | pytorch-lightning==1.6.0 8 | rouge-score==0.0.4 9 | sacrebleu==2.0.0 10 | scikit-learn==1.0.1 11 | scipy==1.7.1 12 | tensorboard==2.9.0 13 | tensorboardX==1.8 14 | torch==1.9.0 15 | tqdm==4.62.3 16 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docker/transformers-pytorch-tpu/docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ~/.bashrc 3 | echo "running docker-entrypoint.sh" 4 | conda activate container 5 | echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS 6 | echo "printed TPU info" 7 | export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}" 8 | exec "$@"#!/bin/bash 9 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | seqeval 4 | psutil 5 | sacrebleu 6 | rouge-score 7 | tensorflow_datasets 8 | matplotlib 9 | git-python==1.0.3 10 | faiss-cpu 11 | streamlit 12 | elasticsearch 13 | nltk 14 | pandas 15 | datasets >= 1.1.3 16 | fire 17 | pytest 18 | conllu 19 | sentencepiece != 0.1.92 20 | protobuf 21 | 
-------------------------------------------------------------------------------- /finetune_gpt2/examples/_tests_requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | seqeval 4 | psutil 5 | sacrebleu >= 1.4.12 6 | rouge-score 7 | tensorflow_datasets 8 | matplotlib 9 | git-python==1.0.3 10 | faiss-cpu 11 | streamlit 12 | elasticsearch 13 | nltk 14 | pandas 15 | datasets >= 1.1.3 16 | fire 17 | pytest 18 | conllu 19 | sentencepiece != 0.1.92 20 | protobuf 21 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/_tests_requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | seqeval 4 | psutil 5 | sacrebleu >= 1.4.12 6 | rouge-score 7 | tensorflow_datasets 8 | matplotlib 9 | git-python==1.0.3 10 | faiss-cpu 11 | streamlit 12 | elasticsearch 13 | nltk 14 | pandas 15 | datasets >= 1.1.3 16 | fire 17 | pytest 18 | conllu 19 | sentencepiece != 0.1.92 20 | protobuf 21 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | seqeval 4 | psutil 5 | sacrebleu 6 | rouge-score 7 | tensorflow_datasets 8 | matplotlib 9 | git-python==1.0.3 10 | faiss-cpu 11 | streamlit 12 | elasticsearch 13 | nltk 14 | pandas 15 | datasets >= 1.1.3 16 | fire 17 | pytest 18 | conllu 19 | sentencepiece != 0.1.92 20 | protobuf 21 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/performer/full_script.sh: -------------------------------------------------------------------------------- 1 | TOKENIZERS_PARALLELISM=true python run_mlm_performer.py --output_dir experiments --dataset_name wikipedia --dataset_config_name 20200501.en --model_name_or_path bert-large-cased --tokenizer_name bert-large-cased --do_train --overwrite_output_dir --per_device_train_batch_size 4 --learning_rate 5e-4 --warmup_steps 100 --num_train_epochs 3 --performer -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/performer/sanity_script.sh: -------------------------------------------------------------------------------- 1 | TOKENIZERS_PARALLELISM=true python run_mlm_performer.py --output_dir experiments --dataset_name wikipedia --dataset_config_name 20200501.simple --model_name_or_path bert-base-cased --tokenizer_name bert-base-cased --do_train --overwrite_output_dir --per_device_train_batch_size 4 --learning_rate 5e-4 --warmup_steps 100 --num_train_epochs 3 --performer -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/seq2seq-distillation/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | psutil 4 | sacrebleu 5 | rouge-score 6 | tensorflow_datasets 7 | pytorch-lightning==1.0.4 8 | matplotlib 9 | git-python==1.0.3 10 | faiss-cpu 11 | streamlit 12 | elasticsearch 13 | nltk 14 | pandas 15 | datasets >= 1.1.3 16 | fire 17 | pytest 18 | conllu 19 | sentencepiece != 0.1.92 20 | protobuf 21 | -------------------------------------------------------------------------------- 
/document_grounded_generation/transformers/scripts/tatoeba/upload_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for FILE in converted/*; do 4 | model_name=`basename $FILE` 5 | transformers-cli repo create $model_name -y 6 | git clone https://huggingface.co/Helsinki-NLP/$model_name 7 | mv $FILE/* $model_name/ 8 | cd $model_name 9 | git add . && git commit -m "initial commit" 10 | git push 11 | cd .. 12 | done 13 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/pytorch-lightning/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | seqeval 4 | psutil 5 | sacrebleu 6 | rouge-score 7 | tensorflow_datasets 8 | pytorch-lightning==1.0.4 9 | matplotlib 10 | git-python==1.0.3 11 | faiss-cpu 12 | streamlit 13 | elasticsearch 14 | nltk 15 | pandas 16 | datasets >= 1.1.3 17 | fire 18 | pytest 19 | conllu 20 | sentencepiece != 0.1.92 21 | protobuf 22 | ray 23 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/movement-pruning/emmental/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .configuration_bert_masked import MaskedBertConfig 3 | from .modeling_bert_masked import ( 4 | MaskedBertForMultipleChoice, 5 | MaskedBertForQuestionAnswering, 6 | MaskedBertForSequenceClassification, 7 | MaskedBertForTokenClassification, 8 | MaskedBertModel, 9 | ) 10 | from .modules import * 11 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/performer/full_script.sh: -------------------------------------------------------------------------------- 1 | TOKENIZERS_PARALLELISM=true python run_mlm_performer.py --output_dir experiments --dataset_name wikipedia --dataset_config_name 20200501.en --model_name_or_path bert-large-cased --tokenizer_name bert-large-cased --do_train --overwrite_output_dir --per_device_train_batch_size 4 --learning_rate 5e-4 --warmup_steps 100 --num_train_epochs 3 --performer -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/performer/sanity_script.sh: -------------------------------------------------------------------------------- 1 | TOKENIZERS_PARALLELISM=true python run_mlm_performer.py --output_dir experiments --dataset_name wikipedia --dataset_config_name 20200501.simple --model_name_or_path bert-base-cased --tokenizer_name bert-base-cased --do_train --overwrite_output_dir --per_device_train_batch_size 4 --learning_rate 5e-4 --warmup_steps 100 --num_train_epochs 3 --performer -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/distillation/training_configs/distilbert-base-cased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 | "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true, 13 | "vocab_size": 28996 14 | } 15 | -------------------------------------------------------------------------------- 
/finetune_gpt2/examples/research_projects/distillation/training_configs/distilbert-base-uncased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 | "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true, 13 | "vocab_size": 30522 14 | } 15 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/pplm/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | seqeval 4 | psutil 5 | sacrebleu 6 | rouge-score 7 | tensorflow_datasets 8 | pytorch-lightning==1.0.4 9 | matplotlib 10 | git-python==1.0.3 11 | faiss-cpu 12 | streamlit 13 | elasticsearch 14 | nltk 15 | pandas 16 | datasets >= 1.1.3 17 | fire 18 | pytest 19 | conllu 20 | sentencepiece != 0.1.92 21 | protobuf 22 | transformers==3.5.1 23 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | psutil 4 | sacrebleu 5 | rouge-score 6 | tensorflow_datasets 7 | pytorch-lightning==1.0.4 8 | matplotlib 9 | git-python==1.0.3 10 | faiss-cpu 11 | streamlit 12 | elasticsearch 13 | nltk 14 | pandas 15 | datasets >= 1.1.3 16 | fire 17 | pytest 18 | conllu 19 | sentencepiece != 0.1.92 20 | protobuf 21 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/_static/css/code-snippets.css: -------------------------------------------------------------------------------- 1 | 2 | .highlight .c1, .highlight .sd{ 3 | color: #999 4 | } 5 | 6 | .highlight .nn, .highlight .k, .highlight .s1, .highlight .nb, .highlight .bp, .highlight .kc { 7 | color: #FB8D68; 8 | } 9 | 10 | .highlight .kn, .highlight .nv, .highlight .s2, .highlight .ow { 11 | color: #6670FF; 12 | } 13 | 14 | .highlight .gp { 15 | color: #FB8D68; 16 | } -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/movement-pruning/emmental/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .configuration_bert_masked import MaskedBertConfig 3 | from .modeling_bert_masked import ( 4 | MaskedBertForMultipleChoice, 5 | MaskedBertForQuestionAnswering, 6 | MaskedBertForSequenceClassification, 7 | MaskedBertForTokenClassification, 8 | MaskedBertModel, 9 | ) 10 | from .modules import * 11 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/templates/adding_a_new_example_script/cookiecutter.json: -------------------------------------------------------------------------------- 1 | { 2 | "example_name": "text classification", 3 | "directory_name": "{{cookiecutter.example_name|lower|replace(' ', '-')}}", 4 | "example_shortcut": "{{cookiecutter.directory_name}}", 5 | "model_class": "AutoModel", 6 | "authors": "The HuggingFace Team", 7 | "can_train_from_scratch": ["True", "False"], 8 | "with_trainer": ["True", "False"] 9 | } 
-------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/pytorch-lightning/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | seqeval 4 | psutil 5 | sacrebleu 6 | rouge-score 7 | tensorflow_datasets 8 | pytorch-lightning==1.0.4 9 | matplotlib 10 | git-python==1.0.3 11 | faiss-cpu 12 | streamlit 13 | elasticsearch 14 | nltk 15 | pandas 16 | datasets >= 1.1.3 17 | fire 18 | pytest 19 | conllu 20 | sentencepiece != 0.1.92 21 | protobuf 22 | ray 23 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/distillation/training_configs/distilbert-base-multilingual-cased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 | "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true, 13 | "vocab_size": 119547 14 | } 15 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/distillation/training_configs/distilbert-base-cased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 | "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true, 13 | "vocab_size": 28996 14 | } 15 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/pplm/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | seqeval 4 | psutil 5 | sacrebleu 6 | rouge-score 7 | tensorflow_datasets 8 | pytorch-lightning==1.0.4 9 | matplotlib 10 | git-python==1.0.3 11 | faiss-cpu 12 | streamlit 13 | elasticsearch 14 | nltk 15 | pandas 16 | datasets >= 1.1.3 17 | fire 18 | pytest 19 | conllu 20 | sentencepiece != 0.1.92 21 | protobuf 22 | transformers==3.5.1 23 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/seq2seq-distillation/finetune.sh: -------------------------------------------------------------------------------- 1 | # the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path 2 | # run ./finetune.sh --help to see all the possible options 3 | python finetune.py \ 4 | --learning_rate=3e-5 \ 5 | --fp16 \ 6 | --gpus 1 \ 7 | --do_train \ 8 | --do_predict \ 9 | --n_val 1000 \ 10 | --val_check_interval 0.1 \ 11 | "$@" 12 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/distillation/training_configs/distilbert-base-uncased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 
| "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true, 13 | "vocab_size": 30522 14 | } 15 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/distillation/training_configs/distilbert-base-multilingual-cased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 | "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true, 13 | "vocab_size": 119547 14 | } 15 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/finetune.sh: -------------------------------------------------------------------------------- 1 | # the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path 2 | # run ./finetune.sh --help to see all the possible options 3 | python finetune.py \ 4 | --learning_rate=3e-5 \ 5 | --fp16 \ 6 | --gpus 1 \ 7 | --do_train \ 8 | --do_predict \ 9 | --n_val 1000 \ 10 | --val_check_interval 0.1 \ 11 | "$@" 12 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/seq2seq-distillation/finetune_t5.sh: -------------------------------------------------------------------------------- 1 | # Add parent directory to python path to access lightning_base.py 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | python finetune.py \ 5 | --data_dir=$CNN_DIR \ 6 | --learning_rate=3e-5 \ 7 | --train_batch_size=$BS \ 8 | --eval_batch_size=$BS \ 9 | --output_dir=$OUTPUT_DIR \ 10 | --max_source_length=512 \ 11 | --max_target_length=56 \ 12 | --val_check_interval=0.1 --n_val=200 \ 13 | --do_train --do_predict \ 14 | "$@" 15 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "modelname": "NewENCDEC", 3 | "uppercase_modelname": "NEW_ENC_DEC", 4 | "lowercase_modelname": "new_enc_dec", 5 | "camelcase_modelname": "NewEncDec", 6 | "authors": "The HuggingFace Team", 7 | "checkpoint_identifier": "new-enc-dec-base", 8 | "tokenizer_type": "Based on BART", 9 | "generate_tensorflow_and_pytorch": "PyTorch", 10 | "is_encoder_decoder_model": "True" 11 | } 12 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/distillation/training_configs/distilroberta-base.json: -------------------------------------------------------------------------------- 1 | { 2 | "vocab_size": 50265, 3 | "hidden_size": 768, 4 | "num_hidden_layers": 6, 5 | "num_attention_heads": 12, 6 | "intermediate_size": 3072, 7 | "hidden_act": "gelu", 8 | "hidden_dropout_prob": 0.1, 9 | "attention_probs_dropout_prob": 0.1, 10 | "max_position_embeddings": 514, 11 | "type_vocab_size": 1, 12 | "initializer_range": 0.02, 13 | "layer_norm_eps": 0.00001 14 | } -------------------------------------------------------------------------------- 
/document_grounded_generation/transformers/templates/adding_a_new_model/tests/encoder-bert-tokenizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "modelname": "Template", 3 | "uppercase_modelname": "TEMPLATE", 4 | "lowercase_modelname": "template", 5 | "camelcase_modelname": "Template", 6 | "authors": "The HuggingFace Team", 7 | "checkpoint_identifier": "brand-new-bert-base-cased", 8 | "tokenizer_type": "Based on BERT", 9 | "generate_tensorflow_and_pytorch": "PyTorch & TensorFlow", 10 | "is_encoder_decoder_model": "False" 11 | } 12 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "modelname": "TemplatePT", 3 | "uppercase_modelname": "TEMPLATE_PT", 4 | "lowercase_modelname": "template_pt", 5 | "camelcase_modelname": "TemplatePt", 6 | "authors": "The HuggingFace Team", 7 | "checkpoint_identifier": "brand-new-bert-base-cased", 8 | "tokenizer_type": "Based on BERT", 9 | "generate_tensorflow_and_pytorch": "PyTorch", 10 | "is_encoder_decoder_model": "False" 11 | } 12 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/templates/adding_a_new_model/tests/standalone.json: -------------------------------------------------------------------------------- 1 | { 2 | "modelname": "TemplateBI", 3 | "uppercase_modelname": "TEMPLATE_BI", 4 | "lowercase_modelname": "template_bi", 5 | "camelcase_modelname": "TemplateBi", 6 | "authors": "The HuggingFace Team", 7 | "checkpoint_identifier": "bi-brand-new-bert-base-cased", 8 | "tokenizer_type": "Standalone", 9 | "generate_tensorflow_and_pytorch": "PyTorch & TensorFlow", 10 | "is_encoder_decoder_model": "False" 11 | } 12 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "modelname": "TemplateTF", 3 | "uppercase_modelname": "TEMPLATE_TF", 4 | "lowercase_modelname": "template_tf", 5 | "camelcase_modelname": "TemplateTf", 6 | "authors": "The HuggingFace Team", 7 | "checkpoint_identifier": "brand-new-bert-base-cased", 8 | "tokenizer_type": "Based on BERT", 9 | "generate_tensorflow_and_pytorch": "TensorFlow", 10 | "is_encoder_decoder_model": "False" 11 | } 12 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "modelname": "NewTFENCDEC", 3 | "uppercase_modelname": "NEW_TF_ENC_DEC", 4 | "lowercase_modelname": "new_tf_enc_dec", 5 | "camelcase_modelname": "NewTFEncDec", 6 | "authors": "The HuggingFace Team", 7 | "checkpoint_identifier": "new-tf-enc-dec-base", 8 | "tokenizer_type": "Based on BART", 9 | "generate_tensorflow_and_pytorch": "TensorFlow", 10 | "is_encoder_decoder_model": "True" 11 | } 12 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/finetune_t5.sh: 
-------------------------------------------------------------------------------- 1 | # Add parent directory to python path to access lightning_base.py 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | python finetune.py \ 5 | --data_dir=$CNN_DIR \ 6 | --learning_rate=3e-5 \ 7 | --train_batch_size=$BS \ 8 | --eval_batch_size=$BS \ 9 | --output_dir=$OUTPUT_DIR \ 10 | --max_source_length=512 \ 11 | --max_target_length=56 \ 12 | --val_check_interval=0.1 --n_val=200 \ 13 | --do_train --do_predict \ 14 | "$@" 15 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/distillation/training_configs/distilroberta-base.json: -------------------------------------------------------------------------------- 1 | { 2 | "vocab_size": 50265, 3 | "hidden_size": 768, 4 | "num_hidden_layers": 6, 5 | "num_attention_heads": 12, 6 | "intermediate_size": 3072, 7 | "hidden_act": "gelu", 8 | "hidden_dropout_prob": 0.1, 9 | "attention_probs_dropout_prob": 0.1, 10 | "max_position_embeddings": 514, 11 | "type_vocab_size": 1, 12 | "initializer_range": 0.02, 13 | "layer_norm_eps": 0.00001 14 | } -------------------------------------------------------------------------------- /document_grounded_generation/transformers/templates/adding_a_new_model/cookiecutter.json: -------------------------------------------------------------------------------- 1 | { 2 | "modelname": "BrandNewBERT", 3 | "uppercase_modelname": "BRAND_NEW_BERT", 4 | "lowercase_modelname": "brand_new_bert", 5 | "camelcase_modelname": "BrandNewBert", 6 | "authors": "The HuggingFace Team", 7 | "checkpoint_identifier": "brand-new-bert-base-cased", 8 | "tokenizer_type": ["Based on BERT", "Based on BART", "Standalone"], 9 | "generate_tensorflow_and_pytorch": ["PyTorch & TensorFlow", "PyTorch", "TensorFlow"], 10 | "is_encoder_decoder_model": ["True", "False"] 11 | } 12 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/.github/ISSUE_TEMPLATE/---new-benchmark.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F5A5 New benchmark" 3 | about: Benchmark a part of this library and share your results 4 | title: "[Benchmark]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # 🖥 Benchmarking `transformers` 11 | 12 | ## Benchmark 13 | 14 | Which part of `transformers` did you benchmark? 15 | 16 | ## Set-up 17 | 18 | What did you run your benchmarks on? Please include details, such as: CPU, GPU? If using multiple GPUs, which parallelization did you use? 19 | 20 | ## Results 21 | 22 | Put your results here! 
23 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/.github/ISSUE_TEMPLATE/--new-model-addition.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F31F New model addition" 3 | about: Submit a proposal/request to implement a new Transformer-based model 4 | title: '' 5 | labels: New model 6 | assignees: '' 7 | 8 | --- 9 | 10 | # 🌟 New model addition 11 | 12 | ## Model description 13 | 14 | 15 | 16 | ## Open source status 17 | 18 | * [ ] the model implementation is available: (give details) 19 | * [ ] the model weights are available: (give details) 20 | * [ ] who are the authors: (mention them, if possible by @gh-username) 21 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/seq2seq-distillation/finetune_pegasus_xsum.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | # From appendix C of paper https://arxiv.org/abs/1912.08777 5 | # Set --gradient_accumulation_steps so that effective batch size is 256 (2*128, 4*64, 8*32, 16*16) 6 | python finetune.py \ 7 | --learning_rate=1e-4 \ 8 | --do_train \ 9 | --do_predict \ 10 | --n_val 1000 \ 11 | --val_check_interval 0.25 \ 12 | --max_source_length 512 --max_target_length 56 \ 13 | --freeze_embeds --label_smoothing 0.1 --adafactor --task summarization_xsum \ 14 | "$@" 15 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/finetune_pegasus_xsum.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | # From appendix C of paper https://arxiv.org/abs/1912.08777 5 | # Set --gradient_accumulation_steps so that effective batch size is 256 (2*128, 4*64, 8*32, 16*16) 6 | python finetune.py \ 7 | --learning_rate=1e-4 \ 8 | --do_train \ 9 | --do_predict \ 10 | --n_val 1000 \ 11 | --val_check_interval 0.25 \ 12 | --max_source_length 512 --max_target_length 56 \ 13 | --freeze_embeds --label_smoothing 0.1 --adafactor --task summarization_xsum \ 14 | "$@" 15 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /document_grounded_generation/transformers/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration.json: -------------------------------------------------------------------------------- 1 | { 2 | "modelname": "{{cookiecutter.modelname}}", 3 | "uppercase_modelname": "{{cookiecutter.uppercase_modelname}}", 4 | "lowercase_modelname": "{{cookiecutter.lowercase_modelname}}", 5 | "camelcase_modelname": "{{cookiecutter.camelcase_modelname}}", 6 | "authors": "{{cookiecutter.authors}}", 7 | "checkpoint_identifier": "{{cookiecutter.checkpoint_identifier}}", 8 | "tokenizer_type": "{{cookiecutter.tokenizer_type}}", 9 | "generate_tensorflow_and_pytorch": "{{cookiecutter.generate_tensorflow_and_pytorch}}", 10 | "is_encoder_decoder_model": ["True", "False"] 11 | } 12 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | name: Stale Bot 2 | 3 | on: 4 | schedule: 5 | - cron: "0 0 * * *" 6 | 7 | jobs: 8 | close_stale_issues: 9 | name: Close Stale Issues 10 | if: github.repository == 'huggingface/transformers' 11 | runs-on: ubuntu-latest 12 | env: 13 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 14 | steps: 15 | - uses: actions/checkout@v2 16 | 17 | - name: Setup Python 18 | uses: actions/setup-python@v1 19 | with: 20 | python-version: 3.7 21 | 22 | - name: Install requirements 23 | run: | 24 | pip install PyGithub 25 | - name: Close stale issues 26 | run: | 27 | python scripts/stale.py -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/longform-qa/README.md: -------------------------------------------------------------------------------- 1 | # Long Form Question Answering 2 | 3 | Author: @yjernite 4 | 5 | This folder contains the code for the Long Form Question answering [demo](http://35.226.96.115:8080/) as well as methods to train and use a fully end-to-end Long Form Question Answering system using the [🤗transformers](https://github.com/huggingface/transformers) and [🤗datasets](https://github.com/huggingface/datasets) libraries. 6 | 7 | You can use these methods to train your own system by following along the associate [notebook](https://github.com/huggingface/notebooks/blob/master/longform-qa/Long_Form_Question_Answering_with_ELI5_and_Wikipedia.ipynb) or [blog post](https://yjernite.github.io/lfqa.html). 
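The long-form QA README above describes an end-to-end system built on 🤗transformers and 🤗datasets. As a rough illustration of the generation half only, the sketch below feeds a question plus a hand-written support passage to a seq2seq model; the checkpoint name and the `question:`/`context:` input format are assumptions rather than something stated in this README, so follow the linked notebook for the authoritative pipeline (retrieval included):

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Checkpoint name is an assumption; substitute whatever ELI5 BART checkpoint
# the notebook points you to.
model_name = "yjernite/bart_eli5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# In the full system the context comes from a dense retriever over Wikipedia;
# here it is a hand-written stand-in.
question = "question: why does the sky look blue during the day?"
support = "context: Rayleigh scattering affects short (blue) wavelengths more strongly than long (red) ones."

inputs = tokenizer(question + " " + support, return_tensors="pt", truncation=True, max_length=1024)
answer_ids = model.generate(
    inputs["input_ids"],
    min_length=64,
    max_length=256,
    num_beams=4,
    no_repeat_ngram_size=3,
)
print(tokenizer.decode(answer_ids[0], skip_special_tokens=True))
```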
8 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/wav2vec2/finetune_base_100.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python run_asr.py \ 3 | --output_dir="./wav2vec2-base-100h" \ 4 | --num_train_epochs="30" \ 5 | --per_device_train_batch_size="32" \ 6 | --per_device_eval_batch_size="32" \ 7 | --evaluation_strategy="steps" \ 8 | --save_total_limit="3" \ 9 | --save_steps="500" \ 10 | --eval_steps="100" \ 11 | --logging_steps="50" \ 12 | --learning_rate="5e-4" \ 13 | --warmup_steps="3000" \ 14 | --model_name_or_path="facebook/wav2vec2-base" \ 15 | --fp16 \ 16 | --dataset_name="librispeech_asr" \ 17 | --dataset_config_name="clean" \ 18 | --train_split_name="train.100" \ 19 | --preprocessing_num_workers="32" \ 20 | --group_by_length \ 21 | --freeze_feature_extractor 22 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/longform-qa/README.md: -------------------------------------------------------------------------------- 1 | # Long Form Question Answering 2 | 3 | Author: @yjernite 4 | 5 | This folder contains the code for the Long Form Question answering [demo](http://35.226.96.115:8080/) as well as methods to train and use a fully end-to-end Long Form Question Answering system using the [🤗transformers](https://github.com/huggingface/transformers) and [🤗datasets](https://github.com/huggingface/datasets) libraries. 6 | 7 | You can use these methods to train your own system by following along the associate [notebook](https://github.com/huggingface/notebooks/blob/master/longform-qa/Long_Form_Question_Answering_with_ELI5_and_Wikipedia.ipynb) or [blog post](https://yjernite.github.io/lfqa.html). 
8 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/seq2seq-distillation/train_mbart_cc25_enro.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | python finetune.py \ 5 | --learning_rate=3e-5 \ 6 | --fp16 \ 7 | --do_train \ 8 | --val_check_interval=0.25 \ 9 | --adam_eps 1e-06 \ 10 | --num_train_epochs 6 --src_lang en_XX --tgt_lang ro_RO \ 11 | --data_dir $ENRO_DIR \ 12 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \ 13 | --train_batch_size=$BS --eval_batch_size=$BS \ 14 | --task translation \ 15 | --warmup_steps 500 \ 16 | --freeze_embeds \ 17 | --model_name_or_path=facebook/mbart-large-cc25 \ 18 | "$@" 19 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/wav2vec2/finetune_large_lv60_100.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python run_asr.py \ 3 | --output_dir="./wav2vec2-large-lv60-100h" \ 4 | --num_train_epochs="30" \ 5 | --per_device_train_batch_size="16" \ 6 | --per_device_eval_batch_size="16" \ 7 | --evaluation_strategy="steps" \ 8 | --save_total_limit="3" \ 9 | --save_steps="500" \ 10 | --eval_steps="100" \ 11 | --logging_steps="50" \ 12 | --learning_rate="5e-4" \ 13 | --warmup_steps="3000" \ 14 | --model_name_or_path="facebook/wav2vec2-large-lv60" \ 15 | --fp16 \ 16 | --dataset_name="librispeech_asr" \ 17 | --dataset_config_name="clean" \ 18 | --train_split_name="train.100" \ 19 | --preprocessing_num_workers="32" \ 20 | --group_by_length \ 21 | --freeze_feature_extractor 22 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/wav2vec2/finetune_base_100.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python run_asr.py \ 3 | --output_dir="./wav2vec2-base-100h" \ 4 | --num_train_epochs="30" \ 5 | --per_device_train_batch_size="32" \ 6 | --per_device_eval_batch_size="32" \ 7 | --evaluation_strategy="steps" \ 8 | --save_total_limit="3" \ 9 | --save_steps="500" \ 10 | --eval_steps="100" \ 11 | --logging_steps="50" \ 12 | --learning_rate="5e-4" \ 13 | --warmup_steps="3000" \ 14 | --model_name_or_path="facebook/wav2vec2-base" \ 15 | --fp16 \ 16 | --dataset_name="librispeech_asr" \ 17 | --dataset_config_name="clean" \ 18 | --train_split_name="train.100" \ 19 | --preprocessing_num_workers="32" \ 20 | --group_by_length \ 21 | --freeze_feature_extractor 22 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/wav2vec2/finetune_base_timit_asr.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python run_asr.py \ 3 | --output_dir="./wav2vec2-base-timit-asr" \ 4 | --num_train_epochs="30" \ 5 | --per_device_train_batch_size="20" \ 6 | --per_device_eval_batch_size="20" \ 7 | --evaluation_strategy="steps" \ 8 | --save_steps="500" \ 9 | --eval_steps="100" \ 10 | --logging_steps="50" \ 11 | --learning_rate="5e-4" \ 12 | --warmup_steps="3000" \ 13 | --model_name_or_path="facebook/wav2vec2-base" \ 14 | --fp16 \ 15 | --dataset_name="timit_asr" \ 16 | --train_split_name="train" \ 17 | 
--validation_split_name="test" \ 18 | --orthography="timit" \ 19 | --preprocessing_num_workers="$(nproc)" \ 20 | --group_by_length \ 21 | --freeze_feature_extractor \ 22 | --verbose_logging \ 23 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/train_mbart_cc25_enro.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | python finetune.py \ 5 | --learning_rate=3e-5 \ 6 | --fp16 \ 7 | --do_train \ 8 | --val_check_interval=0.25 \ 9 | --adam_eps 1e-06 \ 10 | --num_train_epochs 6 --src_lang en_XX --tgt_lang ro_RO \ 11 | --data_dir $ENRO_DIR \ 12 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \ 13 | --train_batch_size=$BS --eval_batch_size=$BS \ 14 | --task translation \ 15 | --warmup_steps 500 \ 16 | --freeze_embeds \ 17 | --model_name_or_path=facebook/mbart-large-cc25 \ 18 | "$@" 19 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/seq2seq-distillation/dynamic_bs_example.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | export WANDB_PROJECT=dmar 4 | export MAX_LEN=128 5 | export m=sshleifer/student_marian_en_ro_6_1 6 | python finetune.py \ 7 | --learning_rate=3e-4 \ 8 | --do_train \ 9 | --fp16 \ 10 | --data_dir wmt_en_ro \ 11 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \ 12 | --freeze_encoder --freeze_embeds \ 13 | --train_batch_size=48 --eval_batch_size=64 \ 14 | --tokenizer_name $m --model_name_or_path $m --num_train_epochs=1 \ 15 | --warmup_steps 500 --logger_name wandb --gpus 1 \ 16 | --fp16_opt_level=O1 --task translation \ 17 | "$@" 18 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docker/transformers-pytorch-cpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | jupyter \ 18 | torch 19 | 20 | WORKDIR /workspace 21 | COPY . transformers/ 22 | RUN cd transformers/ && \ 23 | python3 -m pip install --no-cache-dir . 
24 | 25 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/wav2vec2/finetune_large_lv60_100.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python run_asr.py \ 3 | --output_dir="./wav2vec2-large-lv60-100h" \ 4 | --num_train_epochs="30" \ 5 | --per_device_train_batch_size="16" \ 6 | --per_device_eval_batch_size="16" \ 7 | --evaluation_strategy="steps" \ 8 | --save_total_limit="3" \ 9 | --save_steps="500" \ 10 | --eval_steps="100" \ 11 | --logging_steps="50" \ 12 | --learning_rate="5e-4" \ 13 | --warmup_steps="3000" \ 14 | --model_name_or_path="facebook/wav2vec2-large-lv60" \ 15 | --fp16 \ 16 | --dataset_name="librispeech_asr" \ 17 | --dataset_config_name="clean" \ 18 | --train_split_name="train.100" \ 19 | --preprocessing_num_workers="32" \ 20 | --group_by_length \ 21 | --freeze_feature_extractor 22 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docker/transformers-tensorflow-cpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | mkl \ 18 | tensorflow-cpu 19 | 20 | WORKDIR /workspace 21 | COPY . transformers/ 22 | RUN cd transformers/ && \ 23 | python3 -m pip install --no-cache-dir . 
24 | 25 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/dynamic_bs_example.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | export WANDB_PROJECT=dmar 4 | export MAX_LEN=128 5 | export m=sshleifer/student_marian_en_ro_6_1 6 | python finetune.py \ 7 | --learning_rate=3e-4 \ 8 | --do_train \ 9 | --fp16 \ 10 | --data_dir wmt_en_ro \ 11 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \ 12 | --freeze_encoder --freeze_embeds \ 13 | --train_batch_size=48 --eval_batch_size=64 \ 14 | --tokenizer_name $m --model_name_or_path $m --num_train_epochs=1 \ 15 | --warmup_steps 500 --logger_name wandb --gpus 1 \ 16 | --fp16_opt_level=O1 --task translation \ 17 | "$@" 18 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/wav2vec2/finetune_base_timit_asr.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python run_asr.py \ 3 | --output_dir="./wav2vec2-base-timit-asr" \ 4 | --num_train_epochs="30" \ 5 | --per_device_train_batch_size="20" \ 6 | --per_device_eval_batch_size="20" \ 7 | --evaluation_strategy="steps" \ 8 | --save_steps="500" \ 9 | --eval_steps="100" \ 10 | --logging_steps="50" \ 11 | --learning_rate="5e-4" \ 12 | --warmup_steps="3000" \ 13 | --model_name_or_path="facebook/wav2vec2-base" \ 14 | --fp16 \ 15 | --dataset_name="timit_asr" \ 16 | --train_split_name="train" \ 17 | --validation_split_name="test" \ 18 | --orthography="timit" \ 19 | --preprocessing_num_workers="$(nproc)" \ 20 | --group_by_length \ 21 | --freeze_feature_extractor \ 22 | --verbose_logging \ 23 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/pplm/pplm_classification_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class ClassificationHead(torch.nn.Module): 5 | """Classification Head for transformer encoders""" 6 | 7 | def __init__(self, class_size, embed_size): 8 | super().__init__() 9 | self.class_size = class_size 10 | self.embed_size = embed_size 11 | # self.mlp1 = torch.nn.Linear(embed_size, embed_size) 12 | # self.mlp2 = (torch.nn.Linear(embed_size, class_size)) 13 | self.mlp = torch.nn.Linear(embed_size, class_size) 14 | 15 | def forward(self, hidden_state): 16 | # hidden_state = F.relu(self.mlp1(hidden_state)) 17 | # hidden_state = self.mlp2(hidden_state) 18 | logits = self.mlp(hidden_state) 19 | return logits 20 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/seq2seq-distillation/sentence_splitter.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from filelock import FileLock 4 | 5 | 6 | try: 7 | import nltk 8 | 9 | NLTK_AVAILABLE = True 10 | except (ImportError, ModuleNotFoundError): 11 | NLTK_AVAILABLE = False 12 | 13 | if NLTK_AVAILABLE: 14 | with FileLock(".lock") as lock: 15 | nltk.download("punkt", quiet=True) 16 | 17 | 18 | def add_newline_to_end_of_each_sentence(x: str) -> str: 19 | """This was added to get rougeLsum scores matching published rougeL 
scores for BART and PEGASUS.""" 20 | re.sub("", "", x) # remove pegasus newline char 21 | assert NLTK_AVAILABLE, "nltk must be installed to separate newlines between sentences. (pip install nltk)" 22 | return "\n".join(nltk.sent_tokenize(x)) 23 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docker/transformers-cpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | jupyter \ 18 | tensorflow-cpu \ 19 | torch 20 | 21 | WORKDIR /workspace 22 | COPY . transformers/ 23 | RUN cd transformers/ && \ 24 | python3 -m pip install --no-cache-dir . 25 | 26 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/mm-imdb/README.md: -------------------------------------------------------------------------------- 1 | ## MM-IMDb 2 | 3 | Based on the script [`run_mmimdb.py`](https://github.com/huggingface/transformers/blob/master/examples/contrib/mm-imdb/run_mmimdb.py). 4 | 5 | [MM-IMDb](http://lisi1.unal.edu.co/mmimdb/) is a Multimodal dataset with around 26,000 movies including images, plots and other metadata. 6 | 7 | ### Training on MM-IMDb 8 | 9 | ``` 10 | python run_mmimdb.py \ 11 | --data_dir /path/to/mmimdb/dataset/ \ 12 | --model_type bert \ 13 | --model_name_or_path bert-base-uncased \ 14 | --output_dir /path/to/save/dir/ \ 15 | --do_train \ 16 | --do_eval \ 17 | --max_seq_len 512 \ 18 | --gradient_accumulation_steps 20 \ 19 | --num_image_embeds 3 \ 20 | --num_train_epochs 100 \ 21 | --patience 5 22 | ``` 23 | 24 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/seq2seq-distillation/distil_marian_no_teacher.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | export WANDB_PROJECT=dmar 4 | export MAX_LEN=128 5 | python finetune.py \ 6 | --learning_rate=3e-4 \ 7 | --do_train \ 8 | --do_predict \ 9 | --fp16 \ 10 | --val_check_interval 0.25 \ 11 | --data_dir $ENRO_DIR \ 12 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \ 13 | --freeze_encoder --freeze_embeds \ 14 | --train_batch_size=$BS --eval_batch_size=$BS \ 15 | --tokenizer_name $m --model_name_or_path $m \ 16 | --warmup_steps 500 --sortish_sampler --logger_name wandb \ 17 | --gpus 1 --fp16_opt_level=O1 --task translation --num_sanity_val_steps=0 \ 18 | "$@" 19 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docker/transformers-tensorflow-gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm 
-rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | mkl \ 18 | tensorflow 19 | 20 | WORKDIR /workspace 21 | COPY . transformers/ 22 | RUN cd transformers/ && \ 23 | python3 -m pip install --no-cache-dir . 24 | 25 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/pplm/pplm_classification_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class ClassificationHead(torch.nn.Module): 5 | """Classification Head for transformer encoders""" 6 | 7 | def __init__(self, class_size, embed_size): 8 | super().__init__() 9 | self.class_size = class_size 10 | self.embed_size = embed_size 11 | # self.mlp1 = torch.nn.Linear(embed_size, embed_size) 12 | # self.mlp2 = (torch.nn.Linear(embed_size, class_size)) 13 | self.mlp = torch.nn.Linear(embed_size, class_size) 14 | 15 | def forward(self, hidden_state): 16 | # hidden_state = F.relu(self.mlp1(hidden_state)) 17 | # hidden_state = self.mlp2(hidden_state) 18 | logits = self.mlp(hidden_state) 19 | return logits 20 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/sentence_splitter.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from filelock import FileLock 4 | 5 | 6 | try: 7 | import nltk 8 | 9 | NLTK_AVAILABLE = True 10 | except (ImportError, ModuleNotFoundError): 11 | NLTK_AVAILABLE = False 12 | 13 | if NLTK_AVAILABLE: 14 | with FileLock(".lock") as lock: 15 | nltk.download("punkt", quiet=True) 16 | 17 | 18 | def add_newline_to_end_of_each_sentence(x: str) -> str: 19 | """This was added to get rougeLsum scores matching published rougeL scores for BART and PEGASUS.""" 20 | re.sub("", "", x) # remove pegasus newline char 21 | assert NLTK_AVAILABLE, "nltk must be installed to separate newlines between sentences. 
(pip install nltk)" 22 | return "\n".join(nltk.sent_tokenize(x)) 23 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/seq2seq-distillation/train_distilbart_cnn.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | export BS=32 5 | export GAS=1 6 | 7 | python finetune.py \ 8 | --learning_rate=3e-5 \ 9 | --fp16 \ 10 | --gpus 1 \ 11 | --do_train \ 12 | --do_predict \ 13 | --val_check_interval 0.25 \ 14 | --n_val 500 \ 15 | --num_train_epochs 2 \ 16 | --freeze_encoder --freeze_embeds --data_dir cnn_dm \ 17 | --max_target_length 142 --val_max_target_length=142 \ 18 | --train_batch_size=$BS --eval_batch_size=$BS --gradient_accumulation_steps=$GAS \ 19 | --model_name_or_path sshleifer/student_cnn_12_6 \ 20 | --tokenizer_name facebook/bart-large \ 21 | --warmup_steps 500 \ 22 | --output_dir distilbart-cnn-12-6 \ 23 | "$@" 24 | 25 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/wav2vec2/finetune_large_lv60_timit_asr.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python run_asr.py \ 3 | --output_dir="./wav2vec2-large-lv60-timit-asr" \ 4 | --num_train_epochs="30" \ 5 | --per_device_train_batch_size="2" \ 6 | --per_device_eval_batch_size="2" \ 7 | --gradient_accumulation_steps="4" \ 8 | --evaluation_strategy="steps" \ 9 | --save_steps="500" \ 10 | --eval_steps="100" \ 11 | --logging_steps="50" \ 12 | --learning_rate="5e-4" \ 13 | --warmup_steps="3000" \ 14 | --model_name_or_path="facebook/wav2vec2-large-lv60" \ 15 | --fp16 \ 16 | --dataset_name="timit_asr" \ 17 | --train_split_name="train" \ 18 | --validation_split_name="test" \ 19 | --orthography="timit" \ 20 | --preprocessing_num_workers="$(nproc)" \ 21 | --group_by_length \ 22 | --freeze_feature_extractor \ 23 | --verbose_logging \ 24 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/mm-imdb/README.md: -------------------------------------------------------------------------------- 1 | ## MM-IMDb 2 | 3 | Based on the script [`run_mmimdb.py`](https://github.com/huggingface/transformers/blob/master/examples/contrib/mm-imdb/run_mmimdb.py). 4 | 5 | [MM-IMDb](http://lisi1.unal.edu.co/mmimdb/) is a Multimodal dataset with around 26,000 movies including images, plots and other metadata. 
6 | 7 | ### Training on MM-IMDb 8 | 9 | ``` 10 | python run_mmimdb.py \ 11 | --data_dir /path/to/mmimdb/dataset/ \ 12 | --model_type bert \ 13 | --model_name_or_path bert-base-uncased \ 14 | --output_dir /path/to/save/dir/ \ 15 | --do_train \ 16 | --do_eval \ 17 | --max_seq_len 512 \ 18 | --gradient_accumulation_steps 20 \ 19 | --num_image_embeds 3 \ 20 | --num_train_epochs 100 \ 21 | --patience 5 22 | ``` 23 | 24 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/distil_marian_no_teacher.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | export WANDB_PROJECT=dmar 4 | export MAX_LEN=128 5 | python finetune.py \ 6 | --learning_rate=3e-4 \ 7 | --do_train \ 8 | --do_predict \ 9 | --fp16 \ 10 | --val_check_interval 0.25 \ 11 | --data_dir $ENRO_DIR \ 12 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \ 13 | --freeze_encoder --freeze_embeds \ 14 | --train_batch_size=$BS --eval_batch_size=$BS \ 15 | --tokenizer_name $m --model_name_or_path $m \ 16 | --warmup_steps 500 --sortish_sampler --logger_name wandb \ 17 | --gpus 1 --fp16_opt_level=O1 --task translation --num_sanity_val_steps=0 \ 18 | "$@" 19 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/wav2vec2/finetune_wav2vec2_xlsr_turkish.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python run_common_voice.py \ 3 | --model_name_or_path="facebook/wav2vec2-large-xlsr-53" \ 4 | --dataset_config_name="tr" \ 5 | --output_dir=./wav2vec2-large-xlsr-turkish-demo \ 6 | --overwrite_output_dir \ 7 | --num_train_epochs="5" \ 8 | --per_device_train_batch_size="16" \ 9 | --evaluation_strategy="steps" \ 10 | --learning_rate="3e-4" \ 11 | --warmup_steps="500" \ 12 | --fp16 \ 13 | --freeze_feature_extractor \ 14 | --save_steps="400" \ 15 | --eval_steps="400" \ 16 | --save_total_limit="3" \ 17 | --logging_steps="400" \ 18 | --group_by_length \ 19 | --feat_proj_dropout="0.0" \ 20 | --layerdrop="0.1" \ 21 | --gradient_checkpointing \ 22 | --do_train --do_eval 23 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/train_distilbart_cnn.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | export BS=32 5 | export GAS=1 6 | 7 | python finetune.py \ 8 | --learning_rate=3e-5 \ 9 | --fp16 \ 10 | --gpus 1 \ 11 | --do_train \ 12 | --do_predict \ 13 | --val_check_interval 0.25 \ 14 | --n_val 500 \ 15 | --num_train_epochs 2 \ 16 | --freeze_encoder --freeze_embeds --data_dir cnn_dm \ 17 | --max_target_length 142 --val_max_target_length=142 \ 18 | --train_batch_size=$BS --eval_batch_size=$BS --gradient_accumulation_steps=$GAS \ 19 | --model_name_or_path sshleifer/student_cnn_12_6 \ 20 | --tokenizer_name facebook/bart-large \ 21 | --warmup_steps 500 \ 22 | --output_dir distilbart-cnn-12-6 \ 23 | "$@" 24 | 25 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/wav2vec2/finetune_large_lv60_timit_asr.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python run_asr.py \ 3 | --output_dir="./wav2vec2-large-lv60-timit-asr" \ 4 | --num_train_epochs="30" \ 5 | --per_device_train_batch_size="2" \ 6 | --per_device_eval_batch_size="2" \ 7 | --gradient_accumulation_steps="4" \ 8 | --evaluation_strategy="steps" \ 9 | --save_steps="500" \ 10 | --eval_steps="100" \ 11 | --logging_steps="50" \ 12 | --learning_rate="5e-4" \ 13 | --warmup_steps="3000" \ 14 | --model_name_or_path="facebook/wav2vec2-large-lv60" \ 15 | --fp16 \ 16 | --dataset_name="timit_asr" \ 17 | --train_split_name="train" \ 18 | --validation_split_name="test" \ 19 | --orthography="timit" \ 20 | --preprocessing_num_workers="$(nproc)" \ 21 | --group_by_length \ 22 | --freeze_feature_extractor \ 23 | --verbose_logging \ 24 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/wav2vec2/finetune_wav2vec2_xlsr_turkish.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python run_common_voice.py \ 3 | --model_name_or_path="facebook/wav2vec2-large-xlsr-53" \ 4 | --dataset_config_name="tr" \ 5 | --output_dir=./wav2vec2-large-xlsr-turkish-demo \ 6 | --overwrite_output_dir \ 7 | --num_train_epochs="5" \ 8 | --per_device_train_batch_size="16" \ 9 | --evaluation_strategy="steps" \ 10 | --learning_rate="3e-4" \ 11 | --warmup_steps="500" \ 12 | --fp16 \ 13 | --freeze_feature_extractor \ 14 | --save_steps="400" \ 15 | --eval_steps="400" \ 16 | --save_total_limit="3" \ 17 | --logging_steps="400" \ 18 | --group_by_length \ 19 | --feat_proj_dropout="0.0" \ 20 | --layerdrop="0.1" \ 21 | --gradient_checkpointing \ 22 | --do_train --do_eval 23 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/token-classification/run.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | python3 run_ner.py \ 16 | --model_name_or_path bert-base-uncased \ 17 | --dataset_name conll2003 \ 18 | --output_dir /tmp/test-ner \ 19 | --do_train \ 20 | --do_eval 21 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/token-classification/run.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | python3 run_ner.py \ 16 | --model_name_or_path bert-base-uncased \ 17 | --dataset_name conll2003 \ 18 | --output_dir /tmp/test-ner \ 19 | --do_train \ 20 | --do_eval 21 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/.github/ISSUE_TEMPLATE/question-help.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "❓ Questions & Help" 3 | about: Post your general questions on the Hugging Face forum: https://discuss.huggingface.co/ 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # ❓ Questions & Help 11 | 12 | 16 | 17 | ## Details 18 | 19 | 20 | 21 | 23 | 24 | **A link to original question on the forum**: 25 | 26 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/seq2seq-distillation/distil_marian_enro_teacher.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | export WANDB_PROJECT=dmar 4 | # export MAX_LEN=128 5 | python distillation.py \ 6 | --learning_rate=3e-4 \ 7 | --do_train \ 8 | --fp16 \ 9 | --val_check_interval 0.25 \ 10 | --teacher Helsinki-NLP/opus-mt-en-ro \ 11 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \ 12 | --student_decoder_layers 3 --student_encoder_layers 6 \ 13 | --freeze_encoder --freeze_embeds \ 14 | --model_name_or_path IGNORED \ 15 | --alpha_hid=3. 
\ 16 | --train_batch_size=$BS --eval_batch_size=$BS \ 17 | --tokenizer_name Helsinki-NLP/opus-mt-en-ro \ 18 | --warmup_steps 500 --logger_name wandb \ 19 | --fp16_opt_level O1 --task translation --normalize_hidden --num_sanity_val_steps=0 \ 20 | "$@" 21 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F680 Feature request" 3 | about: Submit a proposal/request for a new transformers feature 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # 🚀 Feature request 11 | 12 | 14 | 15 | ## Motivation 16 | 17 | 20 | 21 | ## Your contribution 22 | 23 | 26 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/seq2seq-distillation/train_distilbart_xsum.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | python distillation.py \ 4 | --teacher facebook/bart-large-xsum --data_dir xsum \ 5 | --tokenizer_name facebook/bart-large-xsum \ 6 | --student_decoder_layers 6 --student_encoder_layers 12 \ 7 | --freeze_encoder --freeze_embeds \ 8 | --learning_rate=3e-4 \ 9 | --do_train \ 10 | --do_predict \ 11 | --fp16 --fp16_opt_level=O1 \ 12 | --val_check_interval 0.1 --n_val 1000 --eval_beams 2 --length_penalty=0.5 \ 13 | --max_target_length=60 --val_max_target_length=60 --test_max_target_length=100 \ 14 | --model_name_or_path IGNORED \ 15 | --alpha_hid=3. \ 16 | --train_batch_size=16 --eval_batch_size=16 --gradient_accumulation_steps=2 \ 17 | --sortish_sampler \ 18 | --num_train_epochs=6 \ 19 | --warmup_steps 500 \ 20 | --output_dir distilbart_xsum_12_6 \ 21 | "$@" 22 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/distil_marian_enro_teacher.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | export WANDB_PROJECT=dmar 4 | # export MAX_LEN=128 5 | python distillation.py \ 6 | --learning_rate=3e-4 \ 7 | --do_train \ 8 | --fp16 \ 9 | --val_check_interval 0.25 \ 10 | --teacher Helsinki-NLP/opus-mt-en-ro \ 11 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \ 12 | --student_decoder_layers 3 --student_encoder_layers 6 \ 13 | --freeze_encoder --freeze_embeds \ 14 | --model_name_or_path IGNORED \ 15 | --alpha_hid=3. 
\ 16 | --train_batch_size=$BS --eval_batch_size=$BS \ 17 | --tokenizer_name Helsinki-NLP/opus-mt-en-ro \ 18 | --warmup_steps 500 --logger_name wandb \ 19 | --fp16_opt_level O1 --task translation --normalize_hidden --num_sanity_val_steps=0 \ 20 | "$@" 21 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/wav2vec2/finetune_large_xlsr_53_arabic_speech_corpus.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python run_asr.py \ 3 | --output_dir="./wav2vec2-large-xlsr-53-arabic-speech-corpus" \ 4 | --num_train_epochs="50" \ 5 | --per_device_train_batch_size="1" \ 6 | --per_device_eval_batch_size="1" \ 7 | --gradient_accumulation_steps="8" \ 8 | --evaluation_strategy="steps" \ 9 | --save_steps="500" \ 10 | --eval_steps="100" \ 11 | --logging_steps="50" \ 12 | --learning_rate="5e-4" \ 13 | --warmup_steps="3000" \ 14 | --model_name_or_path="elgeish/wav2vec2-large-xlsr-53-arabic" \ 15 | --fp16 \ 16 | --dataset_name="arabic_speech_corpus" \ 17 | --train_split_name="train" \ 18 | --validation_split_name="test" \ 19 | --max_duration_in_seconds="15" \ 20 | --orthography="buckwalter" \ 21 | --preprocessing_num_workers="$(nproc)" \ 22 | --group_by_length \ 23 | --freeze_feature_extractor \ 24 | --target_feature_extractor_sampling_rate \ 25 | --verbose_logging \ 26 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/train_distilbart_xsum.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | python distillation.py \ 4 | --teacher facebook/bart-large-xsum --data_dir xsum \ 5 | --tokenizer_name facebook/bart-large-xsum \ 6 | --student_decoder_layers 6 --student_encoder_layers 12 \ 7 | --freeze_encoder --freeze_embeds \ 8 | --learning_rate=3e-4 \ 9 | --do_train \ 10 | --do_predict \ 11 | --fp16 --fp16_opt_level=O1 \ 12 | --val_check_interval 0.1 --n_val 1000 --eval_beams 2 --length_penalty=0.5 \ 13 | --max_target_length=60 --val_max_target_length=60 --test_max_target_length=100 \ 14 | --model_name_or_path IGNORED \ 15 | --alpha_hid=3. 
\ 16 | --train_batch_size=16 --eval_batch_size=16 --gradient_accumulation_steps=2 \ 17 | --sortish_sampler \ 18 | --num_train_epochs=6 \ 19 | --warmup_steps 500 \ 20 | --output_dir distilbart_xsum_12_6 \ 21 | "$@" 22 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/deebert/eval_deebert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | PATH_TO_DATA=/h/xinji/projects/GLUE 5 | 6 | MODEL_TYPE=bert # bert or roberta 7 | MODEL_SIZE=base # base or large 8 | DATASET=MRPC # SST-2, MRPC, RTE, QNLI, QQP, or MNLI 9 | 10 | MODEL_NAME=${MODEL_TYPE}-${MODEL_SIZE} 11 | if [ $MODEL_TYPE = 'bert' ] 12 | then 13 | MODEL_NAME=${MODEL_NAME}-uncased 14 | fi 15 | 16 | 17 | python -u run_glue_deebert.py \ 18 | --model_type $MODEL_TYPE \ 19 | --model_name_or_path ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 20 | --task_name $DATASET \ 21 | --do_eval \ 22 | --do_lower_case \ 23 | --data_dir $PATH_TO_DATA/$DATASET \ 24 | --output_dir ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 25 | --plot_data_dir ./results/ \ 26 | --max_seq_length 128 \ 27 | --eval_each_highway \ 28 | --eval_highway \ 29 | --overwrite_cache \ 30 | --per_gpu_eval_batch_size=1 31 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | # Legacy examples 18 | 19 | This folder contains examples which are not actively maintained (mostly contributed by the community). 20 | 21 | Using these examples together with a recent version of the library usually requires to make small (sometimes big) adaptations to get the scripts working. 
22 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/wav2vec2/finetune_large_xlsr_53_arabic_speech_corpus.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python run_asr.py \ 3 | --output_dir="./wav2vec2-large-xlsr-53-arabic-speech-corpus" \ 4 | --num_train_epochs="50" \ 5 | --per_device_train_batch_size="1" \ 6 | --per_device_eval_batch_size="1" \ 7 | --gradient_accumulation_steps="8" \ 8 | --evaluation_strategy="steps" \ 9 | --save_steps="500" \ 10 | --eval_steps="100" \ 11 | --logging_steps="50" \ 12 | --learning_rate="5e-4" \ 13 | --warmup_steps="3000" \ 14 | --model_name_or_path="elgeish/wav2vec2-large-xlsr-53-arabic" \ 15 | --fp16 \ 16 | --dataset_name="arabic_speech_corpus" \ 17 | --train_split_name="train" \ 18 | --validation_split_name="test" \ 19 | --max_duration_in_seconds="15" \ 20 | --orthography="buckwalter" \ 21 | --preprocessing_num_workers="$(nproc)" \ 22 | --group_by_length \ 23 | --freeze_feature_extractor \ 24 | --target_feature_extractor_sampling_rate \ 25 | --verbose_logging \ 26 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/deebert/eval_deebert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | PATH_TO_DATA=/h/xinji/projects/GLUE 5 | 6 | MODEL_TYPE=bert # bert or roberta 7 | MODEL_SIZE=base # base or large 8 | DATASET=MRPC # SST-2, MRPC, RTE, QNLI, QQP, or MNLI 9 | 10 | MODEL_NAME=${MODEL_TYPE}-${MODEL_SIZE} 11 | if [ $MODEL_TYPE = 'bert' ] 12 | then 13 | MODEL_NAME=${MODEL_NAME}-uncased 14 | fi 15 | 16 | 17 | python -u run_glue_deebert.py \ 18 | --model_type $MODEL_TYPE \ 19 | --model_name_or_path ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 20 | --task_name $DATASET \ 21 | --do_eval \ 22 | --do_lower_case \ 23 | --data_dir $PATH_TO_DATA/$DATASET \ 24 | --output_dir ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 25 | --plot_data_dir ./results/ \ 26 | --max_seq_length 128 \ 27 | --eval_each_highway \ 28 | --eval_highway \ 29 | --overwrite_cache \ 30 | --per_gpu_eval_batch_size=1 31 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docker/transformers-pytorch-tpu/dataset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: huggingface-cluster-disk 5 | spec: 6 | storageClassName: "" 7 | capacity: 8 | storage: 500Gi 9 | accessModes: 10 | - ReadOnlyMany 11 | claimRef: 12 | namespace: default 13 | name: huggingface-cluster-disk-claim 14 | gcePersistentDisk: 15 | pdName: huggingface-cluster-disk 16 | fsType: ext4 17 | readOnly: true 18 | --- 19 | apiVersion: v1 20 | kind: PersistentVolumeClaim 21 | metadata: 22 | name: huggingface-cluster-disk-claim 23 | spec: 24 | # Specify "" as the storageClassName so it matches the PersistentVolume's StorageClass. 25 | # A nil storageClassName value uses the default StorageClass. 
For details, see 26 | # https://kubernetes.io/docs/concepts/storage/persistent-volumes/#class-1 27 | storageClassName: "" 28 | accessModes: 29 | - ReadOnlyMany 30 | resources: 31 | requests: 32 | storage: 1Ki 33 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | # Legacy examples 18 | 19 | This folder contains examples which are not actively maintained (mostly contributed by the community). 20 | 21 | Using these examples together with a recent version of the library usually requires to make small (sometimes big) adaptations to get the scripts working. 22 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docker/transformers-pytorch-gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | mkl \ 18 | torch 19 | 20 | RUN git clone https://github.com/NVIDIA/apex 21 | RUN cd apex && \ 22 | python3 setup.py install && \ 23 | pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ 24 | 25 | WORKDIR /workspace 26 | COPY . transformers/ 27 | RUN cd transformers/ && \ 28 | python3 -m pip install --no-cache-dir . 29 | 30 | CMD ["/bin/bash"] 31 | -------------------------------------------------------------------------------- /finetune_gpt2/src/transformers/sagemaker/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2021 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from .trainer_sm import SageMakerTrainer 20 | from .training_args_sm import SageMakerTrainingArguments, is_sagemaker_distributed_available 21 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docker/transformers-gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | jupyter \ 18 | tensorflow \ 19 | torch 20 | 21 | RUN git clone https://github.com/NVIDIA/apex 22 | RUN cd apex && \ 23 | python3 setup.py install && \ 24 | pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ 25 | 26 | WORKDIR /workspace 27 | COPY . transformers/ 28 | RUN cd transformers/ && \ 29 | python3 -m pip install --no-cache-dir . 30 | 31 | CMD ["/bin/bash"] 32 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/src/transformers/sagemaker/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2021 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from .trainer_sm import SageMakerTrainer 20 | from .training_args_sm import SageMakerTrainingArguments, is_sagemaker_distributed_available 21 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/seq2seq-distillation/finetune_bart_tiny.sh: -------------------------------------------------------------------------------- 1 | # Script for verifying that run_bart_sum can be invoked from its directory 2 | 3 | # Get tiny dataset with cnn_dm format (4 examples for train, val, test) 4 | wget https://cdn-datasets.huggingface.co/summarization/cnn_tiny.tgz 5 | tar -xzvf cnn_tiny.tgz 6 | rm cnn_tiny.tgz 7 | 8 | export OUTPUT_DIR_NAME=bart_utest_output 9 | export CURRENT_DIR=${PWD} 10 | export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME} 11 | 12 | # Make output directory if it doesn't exist 13 | mkdir -p $OUTPUT_DIR 14 | 15 | # Add parent directory to python path to access lightning_base.py and testing_utils.py 16 | export PYTHONPATH="../":"${PYTHONPATH}" 17 | python finetune.py \ 18 | --data_dir=cnn_tiny/ \ 19 | --model_name_or_path=sshleifer/bart-tiny-random \ 20 | --learning_rate=3e-5 \ 21 | --train_batch_size=2 \ 22 | --eval_batch_size=2 \ 23 | --output_dir=$OUTPUT_DIR \ 24 | --num_train_epochs=1 \ 25 | --gpus=0 \ 26 | --do_train "$@" 27 | 28 | rm -rf cnn_tiny 29 | rm -rf $OUTPUT_DIR 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/wav2vec2/vocab/buckwalter.json: -------------------------------------------------------------------------------- 1 | { 2 | "": 0, 3 | "": 1, 4 | "": 2, 5 | "": 3, 6 | "/": 4, 7 | "'": 5, 8 | "|": 6, 9 | ">": 7, 10 | "&": 8, 11 | "<": 9, 12 | "}": 10, 13 | "A": 11, 14 | "b": 12, 15 | "p": 13, 16 | "t": 14, 17 | "v": 15, 18 | "j": 16, 19 | "H": 17, 20 | "x": 18, 21 | "d": 19, 22 | "*": 20, 23 | "r": 21, 24 | "z": 22, 25 | "s": 23, 26 | "$": 24, 27 | "S": 25, 28 | "D": 26, 29 | "T": 27, 30 | "Z": 28, 31 | "E": 29, 32 | "g": 30, 33 | "_": 31, 34 | "f": 32, 35 | "q": 33, 36 | "k": 34, 37 | "l": 35, 38 | "m": 36, 39 | "n": 37, 40 | "h": 38, 41 | "w": 39, 42 | "Y": 40, 43 | "y": 41, 44 | "F": 42, 45 | "N": 43, 46 | "K": 44, 47 | "a": 45, 48 | "u": 46, 49 | "i": 47, 50 | "~": 48, 51 | "o": 49, 52 | "`": 50, 53 | "{": 51, 54 | "P": 52, 55 | "J": 53, 56 | "V": 54, 57 | "G": 55 58 | } -------------------------------------------------------------------------------- /document_grounded_generation/transformers/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | default_section = FIRSTPARTY 3 | ensure_newline_before_comments = True 4 | force_grid_wrap = 0 5 | include_trailing_comma = True 6 | known_first_party = transformers 7 | known_third_party = 8 | absl 9 | conllu 10 | datasets 11 | elasticsearch 12 | fairseq 13 | faiss-cpu 14 | fastprogress 15 | fire 16 | fugashi 17 | git 18 | h5py 19 | matplotlib 20 | nltk 21 | numpy 22 | packaging 23 | pandas 24 | PIL 25 | psutil 26 | pytest 27 | pytorch_lightning 28 | rouge_score 29 | sacrebleu 30 | seqeval 31 | sklearn 32 | streamlit 33 | tensorboardX 34 | tensorflow 35 | tensorflow_datasets 36 | timeout_decorator 37 | torch 38 | torchaudio 39 | torchtext 40 | torchvision 41 | torch_xla 42 | tqdm 43 | 44 | line_length = 119 45 | lines_after_imports = 2 46 | multi_line_output = 3 47 | use_parentheses = True 48 | 49 | [flake8] 50 | ignore = E203, E501, E741, W503, W605 51 | max-line-length = 119 52 | 
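Both config sections above are picked up automatically when the corresponding tools are run from the repository root; a minimal sketch of the style checks this file backs (the target paths are illustrative, not taken from this repo):

```bash
# isort reads the [isort] section, flake8 the [flake8] section of setup.cfg
pip install isort flake8
isort --check-only src tests examples   # import ordering, line_length = 119
flake8 src tests examples               # lint with the ignore list and max-line-length above
```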
-------------------------------------------------------------------------------- /finetune_gpt2/src/transformers/commands/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import ABC, abstractmethod 16 | from argparse import ArgumentParser 17 | 18 | 19 | class BaseTransformersCLICommand(ABC): 20 | @staticmethod 21 | @abstractmethod 22 | def register_subcommand(parser: ArgumentParser): 23 | raise NotImplementedError() 24 | 25 | @abstractmethod 26 | def run(self): 27 | raise NotImplementedError() 28 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/wav2vec2/vocab/buckwalter.json: -------------------------------------------------------------------------------- 1 | { 2 | "": 0, 3 | "": 1, 4 | "": 2, 5 | "": 3, 6 | "/": 4, 7 | "'": 5, 8 | "|": 6, 9 | ">": 7, 10 | "&": 8, 11 | "<": 9, 12 | "}": 10, 13 | "A": 11, 14 | "b": 12, 15 | "p": 13, 16 | "t": 14, 17 | "v": 15, 18 | "j": 16, 19 | "H": 17, 20 | "x": 18, 21 | "d": 19, 22 | "*": 20, 23 | "r": 21, 24 | "z": 22, 25 | "s": 23, 26 | "$": 24, 27 | "S": 25, 28 | "D": 26, 29 | "T": 27, 30 | "Z": 28, 31 | "E": 29, 32 | "g": 30, 33 | "_": 31, 34 | "f": 32, 35 | "q": 33, 36 | "k": 34, 37 | "l": 35, 38 | "m": 36, 39 | "n": 37, 40 | "h": 38, 41 | "w": 39, 42 | "Y": 40, 43 | "y": 41, 44 | "F": 42, 45 | "N": 43, 46 | "K": 44, 47 | "a": 45, 48 | "u": 46, 49 | "i": 47, 50 | "~": 48, 51 | "o": 49, 52 | "`": 50, 53 | "{": 51, 54 | "P": 52, 55 | "J": 53, 56 | "V": 54, 57 | "G": 55 58 | } -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/test_data/fsmt/build-eval-data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import io 4 | import json 5 | import subprocess 6 | 7 | 8 | pairs = [ 9 | ["en", "ru"], 10 | ["ru", "en"], 11 | ["en", "de"], 12 | ["de", "en"], 13 | ] 14 | 15 | n_objs = 8 16 | 17 | 18 | def get_all_data(pairs, n_objs): 19 | text = {} 20 | for src, tgt in pairs: 21 | pair = f"{src}-{tgt}" 22 | cmd = f"sacrebleu -t wmt19 -l {pair} --echo src".split() 23 | src_lines = subprocess.run(cmd, stdout=subprocess.PIPE).stdout.decode("utf-8").splitlines() 24 | cmd = f"sacrebleu -t wmt19 -l {pair} --echo ref".split() 25 | tgt_lines = subprocess.run(cmd, stdout=subprocess.PIPE).stdout.decode("utf-8").splitlines() 26 | text[pair] = {"src": src_lines[:n_objs], "tgt": tgt_lines[:n_objs]} 27 | return text 28 | 29 | 30 | text = get_all_data(pairs, n_objs) 31 | filename = "./fsmt_val_data.json" 32 | with io.open(filename, "w", encoding="utf-8") as f: 33 | bleu_data = json.dump(text, f, indent=2, ensure_ascii=False) 34 | -------------------------------------------------------------------------------- 
/document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/finetune_bart_tiny.sh: -------------------------------------------------------------------------------- 1 | # Script for verifying that run_bart_sum can be invoked from its directory 2 | 3 | # Get tiny dataset with cnn_dm format (4 examples for train, val, test) 4 | wget https://cdn-datasets.huggingface.co/summarization/cnn_tiny.tgz 5 | tar -xzvf cnn_tiny.tgz 6 | rm cnn_tiny.tgz 7 | 8 | export OUTPUT_DIR_NAME=bart_utest_output 9 | export CURRENT_DIR=${PWD} 10 | export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME} 11 | 12 | # Make output directory if it doesn't exist 13 | mkdir -p $OUTPUT_DIR 14 | 15 | # Add parent directory to python path to access lightning_base.py and testing_utils.py 16 | export PYTHONPATH="../":"${PYTHONPATH}" 17 | python finetune.py \ 18 | --data_dir=cnn_tiny/ \ 19 | --model_name_or_path=sshleifer/bart-tiny-random \ 20 | --learning_rate=3e-5 \ 21 | --train_batch_size=2 \ 22 | --eval_batch_size=2 \ 23 | --output_dir=$OUTPUT_DIR \ 24 | --num_train_epochs=1 \ 25 | --gpus=0 \ 26 | --do_train "$@" 27 | 28 | rm -rf cnn_tiny 29 | rm -rf $OUTPUT_DIR 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/src/transformers/commands/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
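# Illustrative sketch (hedged; the exact registration pattern varies per command): concrete CLI
# commands subclass the BaseTransformersCLICommand ABC defined below. `register_subcommand`
# attaches an argparse sub-parser plus a factory, and `run` performs the actual work:
#
#   class HelloCommand(BaseTransformersCLICommand):
#       @staticmethod
#       def register_subcommand(parser: ArgumentParser):
#           hello_parser = parser.add_parser("hello")
#           hello_parser.set_defaults(func=lambda args: HelloCommand())
#
#       def run(self):
#           print("Hello from a transformers CLI subcommand")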
14 | 15 | from abc import ABC, abstractmethod 16 | from argparse import ArgumentParser 17 | 18 | 19 | class BaseTransformersCLICommand(ABC): 20 | @staticmethod 21 | @abstractmethod 22 | def register_subcommand(parser: ArgumentParser): 23 | raise NotImplementedError() 24 | 25 | @abstractmethod 26 | def run(self): 27 | raise NotImplementedError() 28 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/pytorch-lightning/run_glue.sh: -------------------------------------------------------------------------------- 1 | # Install example requirements 2 | pip install -r ../requirements.txt 3 | 4 | # Download glue data 5 | python3 ../../utils/download_glue_data.py 6 | 7 | export TASK=mrpc 8 | export DATA_DIR=./glue_data/MRPC/ 9 | export MAX_LENGTH=128 10 | export LEARNING_RATE=2e-5 11 | export BERT_MODEL=bert-base-cased 12 | export BATCH_SIZE=32 13 | export NUM_EPOCHS=3 14 | export SEED=2 15 | export OUTPUT_DIR_NAME=mrpc-pl-bert 16 | export CURRENT_DIR=${PWD} 17 | export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME} 18 | 19 | # Make output directory if it doesn't exist 20 | mkdir -p $OUTPUT_DIR 21 | # Add parent directory to python path to access lightning_base.py 22 | export PYTHONPATH="../":"${PYTHONPATH}" 23 | 24 | python3 run_glue.py --gpus 1 --data_dir $DATA_DIR \ 25 | --task $TASK \ 26 | --model_name_or_path $BERT_MODEL \ 27 | --output_dir $OUTPUT_DIR \ 28 | --max_seq_length $MAX_LENGTH \ 29 | --learning_rate $LEARNING_RATE \ 30 | --num_train_epochs $NUM_EPOCHS \ 31 | --train_batch_size $BATCH_SIZE \ 32 | --seed $SEED \ 33 | --do_train \ 34 | --do_predict 35 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/deebert/entropy_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | PATH_TO_DATA=/h/xinji/projects/GLUE 5 | 6 | MODEL_TYPE=bert # bert or roberta 7 | MODEL_SIZE=base # base or large 8 | DATASET=MRPC # SST-2, MRPC, RTE, QNLI, QQP, or MNLI 9 | 10 | MODEL_NAME=${MODEL_TYPE}-${MODEL_SIZE} 11 | if [ $MODEL_TYPE = 'bert' ] 12 | then 13 | MODEL_NAME=${MODEL_NAME}-uncased 14 | fi 15 | 16 | ENTROPIES="0 0.1 0.2 0.3 0.4 0.5 0.6 0.7" 17 | 18 | for ENTROPY in $ENTROPIES; do 19 | python -u run_glue_deebert.py \ 20 | --model_type $MODEL_TYPE \ 21 | --model_name_or_path ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 22 | --task_name $DATASET \ 23 | --do_eval \ 24 | --do_lower_case \ 25 | --data_dir $PATH_TO_DATA/$DATASET \ 26 | --output_dir ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 27 | --plot_data_dir ./results/ \ 28 | --max_seq_length 128 \ 29 | --early_exit_entropy $ENTROPY \ 30 | --eval_highway \ 31 | --overwrite_cache \ 32 | --per_gpu_eval_batch_size=1 33 | done 34 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/test_data/fsmt/build-eval-data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import io 4 | import json 5 | import subprocess 6 | 7 | 8 | pairs = [ 9 | ["en", "ru"], 10 | ["ru", "en"], 11 | ["en", "de"], 12 | ["de", "en"], 13 | ] 14 | 15 | n_objs = 8 16 | 17 | 18 | def get_all_data(pairs, n_objs): 19 | text = {} 20 | for src, tgt in pairs: 21 | pair = f"{src}-{tgt}" 22 | cmd = f"sacrebleu -t wmt19 -l {pair} --echo src".split() 23 | src_lines = 
subprocess.run(cmd, stdout=subprocess.PIPE).stdout.decode("utf-8").splitlines() 24 | cmd = f"sacrebleu -t wmt19 -l {pair} --echo ref".split() 25 | tgt_lines = subprocess.run(cmd, stdout=subprocess.PIPE).stdout.decode("utf-8").splitlines() 26 | text[pair] = {"src": src_lines[:n_objs], "tgt": tgt_lines[:n_objs]} 27 | return text 28 | 29 | 30 | text = get_all_data(pairs, n_objs) 31 | filename = "./fsmt_val_data.json" 32 | with io.open(filename, "w", encoding="utf-8") as f: 33 | bleu_data = json.dump(text, f, indent=2, ensure_ascii=False) 34 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/.github/conda/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "transformers" %} 2 | 3 | package: 4 | name: "{{ name|lower }}" 5 | version: "{{ TRANSFORMERS_VERSION }}" 6 | 7 | source: 8 | path: ../../ 9 | 10 | build: 11 | noarch: python 12 | 13 | requirements: 14 | host: 15 | - python 16 | - pip 17 | - numpy >=1.17 18 | - dataclasses 19 | - packaging 20 | - filelock 21 | - requests 22 | - tqdm >=4.27 23 | - sacremoses 24 | - regex !=2019.12.17 25 | - protobuf 26 | - tokenizers >=0.10.1,<0.11.0 27 | run: 28 | - python 29 | - numpy >=1.17 30 | - dataclasses 31 | - packaging 32 | - filelock 33 | - requests 34 | - tqdm >=4.27 35 | - sacremoses 36 | - regex !=2019.12.17 37 | - protobuf 38 | - tokenizers >=0.10.1,<0.11.0 39 | 40 | test: 41 | imports: 42 | - transformers 43 | 44 | about: 45 | home: https://huggingface.co 46 | license: Apache License 2.0 47 | license_file: LICENSE 48 | summary: "🤗Transformers: State-of-the-art Natural Language Processing for Pytorch and TensorFlow 2.0." 49 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/finetune.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
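# Illustrative invocation (values are placeholders; see the usage note below and the README):
#   ./finetune.sh \
#     --data_dir ./wmt_en_ro \
#     --model_name_or_path t5-small \
#     --output_dir ./seq2seq_finetune_output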
14 | 15 | # the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path 16 | # run ./finetune.sh --help to see all the possible options 17 | python finetune_trainer.py \ 18 | --learning_rate=3e-5 \ 19 | --fp16 \ 20 | --do_train --do_eval --do_predict \ 21 | --evaluation_strategy steps \ 22 | --predict_with_generate \ 23 | --n_val 1000 \ 24 | "$@" 25 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/deebert/entropy_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | PATH_TO_DATA=/h/xinji/projects/GLUE 5 | 6 | MODEL_TYPE=bert # bert or roberta 7 | MODEL_SIZE=base # base or large 8 | DATASET=MRPC # SST-2, MRPC, RTE, QNLI, QQP, or MNLI 9 | 10 | MODEL_NAME=${MODEL_TYPE}-${MODEL_SIZE} 11 | if [ $MODEL_TYPE = 'bert' ] 12 | then 13 | MODEL_NAME=${MODEL_NAME}-uncased 14 | fi 15 | 16 | ENTROPIES="0 0.1 0.2 0.3 0.4 0.5 0.6 0.7" 17 | 18 | for ENTROPY in $ENTROPIES; do 19 | python -u run_glue_deebert.py \ 20 | --model_type $MODEL_TYPE \ 21 | --model_name_or_path ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 22 | --task_name $DATASET \ 23 | --do_eval \ 24 | --do_lower_case \ 25 | --data_dir $PATH_TO_DATA/$DATASET \ 26 | --output_dir ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 27 | --plot_data_dir ./results/ \ 28 | --max_seq_length 128 \ 29 | --early_exit_entropy $ENTROPY \ 30 | --eval_highway \ 31 | --overwrite_cache \ 32 | --per_gpu_eval_batch_size=1 33 | done 34 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/pytorch-lightning/run_glue.sh: -------------------------------------------------------------------------------- 1 | # Install example requirements 2 | pip install -r ../requirements.txt 3 | 4 | # Download glue data 5 | python3 ../../utils/download_glue_data.py 6 | 7 | export TASK=mrpc 8 | export DATA_DIR=./glue_data/MRPC/ 9 | export MAX_LENGTH=128 10 | export LEARNING_RATE=2e-5 11 | export BERT_MODEL=bert-base-cased 12 | export BATCH_SIZE=32 13 | export NUM_EPOCHS=3 14 | export SEED=2 15 | export OUTPUT_DIR_NAME=mrpc-pl-bert 16 | export CURRENT_DIR=${PWD} 17 | export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME} 18 | 19 | # Make output directory if it doesn't exist 20 | mkdir -p $OUTPUT_DIR 21 | # Add parent directory to python path to access lightning_base.py 22 | export PYTHONPATH="../":"${PYTHONPATH}" 23 | 24 | python3 run_glue.py --gpus 1 --data_dir $DATA_DIR \ 25 | --task $TASK \ 26 | --model_name_or_path $BERT_MODEL \ 27 | --output_dir $OUTPUT_DIR \ 28 | --max_seq_length $MAX_LENGTH \ 29 | --learning_rate $LEARNING_RATE \ 30 | --num_train_epochs $NUM_EPOCHS \ 31 | --train_batch_size $BATCH_SIZE \ 32 | --seed $SEED \ 33 | --do_train \ 34 | --do_predict 35 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/deebert/train_deebert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | PATH_TO_DATA=/h/xinji/projects/GLUE 5 | 6 | MODEL_TYPE=bert # bert or roberta 7 | MODEL_SIZE=base # base or large 8 | DATASET=MRPC # SST-2, MRPC, RTE, QNLI, QQP, or MNLI 9 | 10 | MODEL_NAME=${MODEL_TYPE}-${MODEL_SIZE} 11 | EPOCHS=10 12 | if [ $MODEL_TYPE = 'bert' ] 13 | then 14 | 
EPOCHS=3 15 | MODEL_NAME=${MODEL_NAME}-uncased 16 | fi 17 | 18 | 19 | python -u run_glue_deebert.py \ 20 | --model_type $MODEL_TYPE \ 21 | --model_name_or_path $MODEL_NAME \ 22 | --task_name $DATASET \ 23 | --do_train \ 24 | --do_eval \ 25 | --do_lower_case \ 26 | --data_dir $PATH_TO_DATA/$DATASET \ 27 | --max_seq_length 128 \ 28 | --per_gpu_eval_batch_size=1 \ 29 | --per_gpu_train_batch_size=8 \ 30 | --learning_rate 2e-5 \ 31 | --num_train_epochs $EPOCHS \ 32 | --overwrite_output_dir \ 33 | --seed 42 \ 34 | --output_dir ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 35 | --plot_data_dir ./results/ \ 36 | --save_steps 0 \ 37 | --overwrite_cache \ 38 | --eval_after_first_stage 39 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/finetune.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path 16 | # run ./finetune.sh --help to see all the possible options 17 | python finetune_trainer.py \ 18 | --learning_rate=3e-5 \ 19 | --fp16 \ 20 | --do_train --do_eval --do_predict \ 21 | --evaluation_strategy steps \ 22 | --predict_with_generate \ 23 | --n_val 1000 \ 24 | "$@" 25 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/deebert/train_deebert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | PATH_TO_DATA=/h/xinji/projects/GLUE 5 | 6 | MODEL_TYPE=bert # bert or roberta 7 | MODEL_SIZE=base # base or large 8 | DATASET=MRPC # SST-2, MRPC, RTE, QNLI, QQP, or MNLI 9 | 10 | MODEL_NAME=${MODEL_TYPE}-${MODEL_SIZE} 11 | EPOCHS=10 12 | if [ $MODEL_TYPE = 'bert' ] 13 | then 14 | EPOCHS=3 15 | MODEL_NAME=${MODEL_NAME}-uncased 16 | fi 17 | 18 | 19 | python -u run_glue_deebert.py \ 20 | --model_type $MODEL_TYPE \ 21 | --model_name_or_path $MODEL_NAME \ 22 | --task_name $DATASET \ 23 | --do_train \ 24 | --do_eval \ 25 | --do_lower_case \ 26 | --data_dir $PATH_TO_DATA/$DATASET \ 27 | --max_seq_length 128 \ 28 | --per_gpu_eval_batch_size=1 \ 29 | --per_gpu_train_batch_size=8 \ 30 | --learning_rate 2e-5 \ 31 | --num_train_epochs $EPOCHS \ 32 | --overwrite_output_dir \ 33 | --seed 42 \ 34 | --output_dir ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 35 | --plot_data_dir ./results/ \ 36 | --save_steps 0 \ 37 | --overwrite_cache \ 38 | --eval_after_first_stage 39 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/finetune_tpu.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 
The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | export TPU_NUM_CORES=8 16 | 17 | # the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path 18 | # run ./finetune_tpu.sh --help to see all the possible options 19 | python xla_spawn.py --num_cores $TPU_NUM_CORES \ 20 | finetune_trainer.py \ 21 | --learning_rate=3e-5 \ 22 | --do_train --do_eval \ 23 | --evaluation_strategy steps \ 24 | --prediction_loss_only \ 25 | --n_val 1000 \ 26 | "$@" 27 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/tests/deepspeed/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": true, 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 32, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | 11 | "zero_optimization": { 12 | "stage": 2, 13 | "allgather_partitions": true, 14 | "allgather_bucket_size": 2e8, 15 | "overlap_comm": true, 16 | "reduce_scatter": true, 17 | "reduce_bucket_size": 2e8, 18 | "contiguous_gradients": true, 19 | "cpu_offload": true 20 | }, 21 | 22 | "optimizer": { 23 | "type": "AdamW", 24 | "params": { 25 | "lr": 3e-5, 26 | "betas": [0.8, 0.999], 27 | "eps": 1e-8, 28 | "weight_decay": 3e-7 29 | } 30 | }, 31 | 32 | "scheduler": { 33 | "type": "WarmupLR", 34 | "params": { 35 | "warmup_min_lr": 0, 36 | "warmup_max_lr": 3e-5, 37 | "warmup_num_steps": 500 38 | } 39 | }, 40 | 41 | "steps_per_print": 2000, 42 | "wall_clock_breakdown": false 43 | } 44 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docker/transformers-pytorch-tpu/bert-base-cased.jsonnet: -------------------------------------------------------------------------------- 1 | local base = import 'templates/base.libsonnet'; 2 | local tpus = import 'templates/tpus.libsonnet'; 3 | local utils = import "templates/utils.libsonnet"; 4 | local volumes = import "templates/volumes.libsonnet"; 5 | 6 | local bertBaseCased = base.BaseTest { 7 | frameworkPrefix: "hf", 8 | modelName: "bert-base-cased", 9 | mode: "example", 10 | configMaps: [], 11 | 12 | timeout: 3600, # 1 hour, in seconds 13 | 14 | image: std.extVar('image'), 15 | imageTag: std.extVar('image-tag'), 16 | 17 | tpuSettings+: { 18 | softwareVersion: "pytorch-nightly", 19 | }, 20 | accelerator: tpus.v3_8, 21 | 22 | volumeMap+: { 23 | datasets: volumes.PersistentVolumeSpec { 24 | name: "huggingface-cluster-disk", 25 | mountPath: "/datasets", 26 | }, 27 | }, 28 | command: utils.scriptCommand( 29 | ||| 30 | python -m pytest -s transformers/examples/test_xla_examples.py -v 31 | test_exit_code=$? 
32 | echo "\nFinished running commands.\n" 33 | test $test_exit_code -eq 0 34 | ||| 35 | ), 36 | }; 37 | 38 | bertBaseCased.oneshotJob 39 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/finetune_tpu.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | export TPU_NUM_CORES=8 16 | 17 | # the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path 18 | # run ./finetune_tpu.sh --help to see all the possible options 19 | python xla_spawn.py --num_cores $TPU_NUM_CORES \ 20 | finetune_trainer.py \ 21 | --learning_rate=3e-5 \ 22 | --do_train --do_eval \ 23 | --evaluation_strategy steps \ 24 | --prediction_loss_only \ 25 | --n_val 1000 \ 26 | "$@" 27 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/tests/deepspeed/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": true, 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 32, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | 11 | "zero_optimization": { 12 | "stage": 2, 13 | "allgather_partitions": true, 14 | "allgather_bucket_size": 2e8, 15 | "overlap_comm": true, 16 | "reduce_scatter": true, 17 | "reduce_bucket_size": 2e8, 18 | "contiguous_gradients": true, 19 | "cpu_offload": true 20 | }, 21 | 22 | "optimizer": { 23 | "type": "AdamW", 24 | "params": { 25 | "lr": 3e-5, 26 | "betas": [0.8, 0.999], 27 | "eps": 1e-8, 28 | "weight_decay": 3e-7 29 | } 30 | }, 31 | 32 | "scheduler": { 33 | "type": "WarmupLR", 34 | "params": { 35 | "warmup_min_lr": 0, 36 | "warmup_max_lr": 3e-5, 37 | "warmup_num_steps": 500 38 | } 39 | }, 40 | 41 | "steps_per_print": 2000, 42 | "wall_clock_breakdown": false 43 | } 44 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/scripts/fsmt/tests-to-run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
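# Note on the two invocations below: the first sets CUDA_VISIBLE_DEVICES="" so the selected
# FSMT tests run on CPU only, the second repeats them with GPUs visible; RUN_SLOW=1 also
# enables the tests marked with the @slow decorator in both runs.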
15 | 16 | # these scripts need to be run before any changes to FSMT-related code - it should cover all bases 17 | 18 | CUDA_VISIBLE_DEVICES="" RUN_SLOW=1 pytest --disable-warnings tests/test_tokenization_fsmt.py tests/test_configuration_auto.py tests/test_modeling_fsmt.py examples/seq2seq/test_fsmt_bleu_score.py 19 | RUN_SLOW=1 pytest --disable-warnings tests/test_tokenization_fsmt.py tests/test_configuration_auto.py tests/test_modeling_fsmt.py examples/seq2seq/test_fsmt_bleu_score.py 20 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/rag/finetune_rag.sh: -------------------------------------------------------------------------------- 1 | # Add parent directory to python path to access lightning_base.py 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | # A sample finetuning run, you need to specify data_dir, output_dir and model_name_or_path 5 | # run ./examples/rag/finetune_rag.sh --help to see all the possible options 6 | 7 | python examples/rag/finetune_rag.py \ 8 | --data_dir $DATA_DIR \ 9 | --output_dir $OUTPUT_DIR \ 10 | --model_name_or_path $MODEL_NAME_OR_PATH \ 11 | --model_type rag_sequence \ 12 | --fp16 \ 13 | --gpus 8 \ 14 | --profile \ 15 | --do_train \ 16 | --do_predict \ 17 | --n_val -1 \ 18 | --train_batch_size 8 \ 19 | --eval_batch_size 1 \ 20 | --max_source_length 128 \ 21 | --max_target_length 25 \ 22 | --val_max_target_length 25 \ 23 | --test_max_target_length 25 \ 24 | --label_smoothing 0.1 \ 25 | --dropout 0.1 \ 26 | --attention_dropout 0.1 \ 27 | --weight_decay 0.001 \ 28 | --adam_epsilon 1e-08 \ 29 | --max_grad_norm 0.1 \ 30 | --lr_scheduler polynomial \ 31 | --learning_rate 3e-05 \ 32 | --num_train_epochs 100 \ 33 | --warmup_steps 500 \ 34 | --gradient_accumulation_steps 1 \ 35 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/token-classification/run_chunk.sh: -------------------------------------------------------------------------------- 1 | if ! [ -f ./dev.txt ]; then 2 | echo "Downloading CONLL2003 dev dataset...." 3 | curl -L -o ./dev.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/valid.txt' 4 | fi 5 | 6 | if ! [ -f ./test.txt ]; then 7 | echo "Downloading CONLL2003 test dataset...." 8 | curl -L -o ./test.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/test.txt' 9 | fi 10 | 11 | if ! [ -f ./train.txt ]; then 12 | echo "Downloading CONLL2003 train dataset...." 13 | curl -L -o ./train.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/train.txt' 14 | fi 15 | 16 | export MAX_LENGTH=200 17 | export BERT_MODEL=bert-base-uncased 18 | export OUTPUT_DIR=chunker-model 19 | export BATCH_SIZE=32 20 | export NUM_EPOCHS=3 21 | export SAVE_STEPS=750 22 | export SEED=1 23 | 24 | python3 run_ner.py \ 25 | --task_type Chunk \ 26 | --data_dir . \ 27 | --model_name_or_path $BERT_MODEL \ 28 | --output_dir $OUTPUT_DIR \ 29 | --max_seq_length $MAX_LENGTH \ 30 | --num_train_epochs $NUM_EPOCHS \ 31 | --per_gpu_train_batch_size $BATCH_SIZE \ 32 | --save_steps $SAVE_STEPS \ 33 | --seed $SEED \ 34 | --do_train \ 35 | --do_eval \ 36 | --do_predict 37 | 38 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/token-classification/run_pos.sh: -------------------------------------------------------------------------------- 1 | if ! [ -f ./dev.txt ]; then 2 | echo "Download dev dataset...." 
3 | curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu' 4 | fi 5 | 6 | if ! [ -f ./test.txt ]; then 7 | echo "Download test dataset...." 8 | curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu' 9 | fi 10 | 11 | if ! [ -f ./train.txt ]; then 12 | echo "Download train dataset...." 13 | curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu' 14 | fi 15 | 16 | export MAX_LENGTH=200 17 | export BERT_MODEL=bert-base-uncased 18 | export OUTPUT_DIR=postagger-model 19 | export BATCH_SIZE=32 20 | export NUM_EPOCHS=3 21 | export SAVE_STEPS=750 22 | export SEED=1 23 | 24 | python3 run_ner.py \ 25 | --task_type POS \ 26 | --data_dir . \ 27 | --model_name_or_path $BERT_MODEL \ 28 | --output_dir $OUTPUT_DIR \ 29 | --max_seq_length $MAX_LENGTH \ 30 | --num_train_epochs $NUM_EPOCHS \ 31 | --per_gpu_train_batch_size $BATCH_SIZE \ 32 | --save_steps $SAVE_STEPS \ 33 | --seed $SEED \ 34 | --do_train \ 35 | --do_eval \ 36 | --do_predict 37 | 38 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/token-classification/scripts/preprocess.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from transformers import AutoTokenizer 4 | 5 | 6 | dataset = sys.argv[1] 7 | model_name_or_path = sys.argv[2] 8 | max_len = int(sys.argv[3]) 9 | 10 | subword_len_counter = 0 11 | 12 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) 13 | max_len -= tokenizer.num_special_tokens_to_add() 14 | 15 | with open(dataset, "rt") as f_p: 16 | for line in f_p: 17 | line = line.rstrip() 18 | 19 | if not line: 20 | print(line) 21 | subword_len_counter = 0 22 | continue 23 | 24 | token = line.split()[0] 25 | 26 | current_subwords_len = len(tokenizer.tokenize(token)) 27 | 28 | # Token contains strange control characters like \x96 or \x95 29 | # Just filter out the complete line 30 | if current_subwords_len == 0: 31 | continue 32 | 33 | if (subword_len_counter + current_subwords_len) > max_len: 34 | print("") 35 | print(line) 36 | subword_len_counter = current_subwords_len 37 | continue 38 | 39 | subword_len_counter += current_subwords_len 40 | 41 | print(line) 42 | -------------------------------------------------------------------------------- /finetune_gpt2/src/transformers/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
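# Usage sketch (illustrative; assumes the legacy dataset signatures re-exported below):
#
#   from transformers import AutoTokenizer, TextDataset
#
#   tokenizer = AutoTokenizer.from_pretrained("gpt2")
#   train_dataset = TextDataset(tokenizer=tokenizer, file_path="train.txt", block_size=128)
#
# LineByLineTextDataset takes the same arguments but keeps one example per input line.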
18 | 19 | from .glue import GlueDataset, GlueDataTrainingArguments 20 | from .language_modeling import ( 21 | LineByLineTextDataset, 22 | LineByLineWithRefDataset, 23 | LineByLineWithSOPTextDataset, 24 | TextDataset, 25 | TextDatasetForNextSentencePrediction, 26 | ) 27 | from .squad import SquadDataset, SquadDataTrainingArguments 28 | -------------------------------------------------------------------------------- /sorting/run_sort_inftyformer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $1 == 'train' ]]; then 4 | echo 'Run training...' 5 | python3 train.py \ 6 | --cuda \ 7 | --data ../data/ \ 8 | --dataset ../data_sort_8000 \ 9 | --n_layer 3 \ 10 | --d_model 300 \ 11 | --n_head 6 \ 12 | --d_head 50 \ 13 | --d_inner 300 \ 14 | --dropout 0.1 \ 15 | --dropatt 0.0 \ 16 | --optim adam \ 17 | --lr 0.0002 \ 18 | --warmup_step 0 \ 19 | --max_step 20000 \ 20 | --tgt_len 1024 \ 21 | --mem_len 1024 \ 22 | --eval_tgt_len 1024 \ 23 | --batch_size 8 \ 24 | --gpu0_bsz 8 \ 25 | --continuous \ 26 | --long_term_attention \ 27 | --long_term_attention_norm='softmax' \ 28 | --long_term_attention_basis 512 \ 29 | --affines \ 30 | --augment \ 31 | --augment_len 1024 \ 32 | --infinite_memory \ 33 | --mask \ 34 | --mask_type 'cnn' \ 35 | --kl_regularizer \ 36 | --kl_m .000001 \ 37 | --sigma_0 .05 \ 38 | --name infty_former \ 39 | --work_dir ./sort_8000 \ 40 | ${@:2} 41 | echo 'unknown argment 1' 42 | fi 43 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/tests/test_pipelines_text2text_generation.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import unittest 16 | 17 | from .test_pipelines_common import MonoInputPipelineCommonMixin 18 | 19 | 20 | class Text2TextGenerationPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase): 21 | pipeline_task = "text2text-generation" 22 | small_models = ["patrickvonplaten/t5-tiny-random"] # Default model - Models tested without the @slow decorator 23 | large_models = [] # Models tested with the @slow decorator 24 | invalid_inputs = [4, ""] 25 | mandatory_keys = ["generated_text"] 26 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/token-classification/run_pos.sh: -------------------------------------------------------------------------------- 1 | if ! [ -f ./dev.txt ]; then 2 | echo "Download dev dataset...." 3 | curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu' 4 | fi 5 | 6 | if ! [ -f ./test.txt ]; then 7 | echo "Download test dataset...." 8 | curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu' 9 | fi 10 | 11 | if ! 
[ -f ./train.txt ]; then 12 | echo "Download train dataset...." 13 | curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu' 14 | fi 15 | 16 | export MAX_LENGTH=200 17 | export BERT_MODEL=bert-base-uncased 18 | export OUTPUT_DIR=postagger-model 19 | export BATCH_SIZE=32 20 | export NUM_EPOCHS=3 21 | export SAVE_STEPS=750 22 | export SEED=1 23 | 24 | python3 run_ner.py \ 25 | --task_type POS \ 26 | --data_dir . \ 27 | --model_name_or_path $BERT_MODEL \ 28 | --output_dir $OUTPUT_DIR \ 29 | --max_seq_length $MAX_LENGTH \ 30 | --num_train_epochs $NUM_EPOCHS \ 31 | --per_gpu_train_batch_size $BATCH_SIZE \ 32 | --save_steps $SAVE_STEPS \ 33 | --seed $SEED \ 34 | --do_train \ 35 | --do_eval \ 36 | --do_predict 37 | 38 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/rag/finetune_rag.sh: -------------------------------------------------------------------------------- 1 | # Add parent directory to python path to access lightning_base.py 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | # A sample finetuning run, you need to specify data_dir, output_dir and model_name_or_path 5 | # run ./examples/rag/finetune_rag.sh --help to see all the possible options 6 | 7 | python examples/rag/finetune_rag.py \ 8 | --data_dir $DATA_DIR \ 9 | --output_dir $OUTPUT_DIR \ 10 | --model_name_or_path $MODEL_NAME_OR_PATH \ 11 | --model_type rag_sequence \ 12 | --fp16 \ 13 | --gpus 8 \ 14 | --profile \ 15 | --do_train \ 16 | --do_predict \ 17 | --n_val -1 \ 18 | --train_batch_size 8 \ 19 | --eval_batch_size 1 \ 20 | --max_source_length 128 \ 21 | --max_target_length 25 \ 22 | --val_max_target_length 25 \ 23 | --test_max_target_length 25 \ 24 | --label_smoothing 0.1 \ 25 | --dropout 0.1 \ 26 | --attention_dropout 0.1 \ 27 | --weight_decay 0.001 \ 28 | --adam_epsilon 1e-08 \ 29 | --max_grad_norm 0.1 \ 30 | --lr_scheduler polynomial \ 31 | --learning_rate 3e-05 \ 32 | --num_train_epochs 100 \ 33 | --warmup_steps 500 \ 34 | --gradient_accumulation_steps 1 \ 35 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/tests/test_pipelines_feature_extraction.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
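# For reference, the pipeline exercised here can be built directly (illustrative sketch
# using the same tiny checkpoint listed in `small_models` below):
#
#   from transformers import pipeline
#
#   extractor = pipeline("feature-extraction", model="sshleifer/tiny-distilbert-base-cased")
#   features = extractor("Hello world")  # nested lists of per-token hidden states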
14 | 15 | import unittest 16 | 17 | from .test_pipelines_common import MonoInputPipelineCommonMixin 18 | 19 | 20 | class FeatureExtractionPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase): 21 | pipeline_task = "feature-extraction" 22 | small_models = [ 23 | "sshleifer/tiny-distilbert-base-cased" 24 | ] # Default model - Models tested without the @slow decorator 25 | large_models = [None] # Models tested with the @slow decorator 26 | mandatory_keys = {} # Keys which should be in the output 27 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/token-classification/run_chunk.sh: -------------------------------------------------------------------------------- 1 | if ! [ -f ./dev.txt ]; then 2 | echo "Downloading CONLL2003 dev dataset...." 3 | curl -L -o ./dev.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/valid.txt' 4 | fi 5 | 6 | if ! [ -f ./test.txt ]; then 7 | echo "Downloading CONLL2003 test dataset...." 8 | curl -L -o ./test.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/test.txt' 9 | fi 10 | 11 | if ! [ -f ./train.txt ]; then 12 | echo "Downloading CONLL2003 train dataset...." 13 | curl -L -o ./train.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/train.txt' 14 | fi 15 | 16 | export MAX_LENGTH=200 17 | export BERT_MODEL=bert-base-uncased 18 | export OUTPUT_DIR=chunker-model 19 | export BATCH_SIZE=32 20 | export NUM_EPOCHS=3 21 | export SAVE_STEPS=750 22 | export SEED=1 23 | 24 | python3 run_ner.py \ 25 | --task_type Chunk \ 26 | --data_dir . \ 27 | --model_name_or_path $BERT_MODEL \ 28 | --output_dir $OUTPUT_DIR \ 29 | --max_seq_length $MAX_LENGTH \ 30 | --num_train_epochs $NUM_EPOCHS \ 31 | --per_gpu_train_batch_size $BATCH_SIZE \ 32 | --save_steps $SAVE_STEPS \ 33 | --seed $SEED \ 34 | --do_train \ 35 | --do_eval \ 36 | --do_predict 37 | 38 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/token-classification/scripts/preprocess.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from transformers import AutoTokenizer 4 | 5 | 6 | dataset = sys.argv[1] 7 | model_name_or_path = sys.argv[2] 8 | max_len = int(sys.argv[3]) 9 | 10 | subword_len_counter = 0 11 | 12 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) 13 | max_len -= tokenizer.num_special_tokens_to_add() 14 | 15 | with open(dataset, "rt") as f_p: 16 | for line in f_p: 17 | line = line.rstrip() 18 | 19 | if not line: 20 | print(line) 21 | subword_len_counter = 0 22 | continue 23 | 24 | token = line.split()[0] 25 | 26 | current_subwords_len = len(tokenizer.tokenize(token)) 27 | 28 | # Token contains strange control characters like \x96 or \x95 29 | # Just filter out the complete line 30 | if current_subwords_len == 0: 31 | continue 32 | 33 | if (subword_len_counter + current_subwords_len) > max_len: 34 | print("") 35 | print(line) 36 | subword_len_counter = current_subwords_len 37 | continue 38 | 39 | subword_len_counter += current_subwords_len 40 | 41 | print(line) 42 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/src/transformers/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' 
imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from .glue import GlueDataset, GlueDataTrainingArguments 20 | from .language_modeling import ( 21 | LineByLineTextDataset, 22 | LineByLineWithRefDataset, 23 | LineByLineWithSOPTextDataset, 24 | TextDataset, 25 | TextDatasetForNextSentencePrediction, 26 | ) 27 | from .squad import SquadDataset, SquadDataTrainingArguments 28 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/.github/workflows/release-conda.yml: -------------------------------------------------------------------------------- 1 | name: Release - Conda 2 | 3 | on: 4 | push: 5 | tags: 6 | - v* 7 | 8 | env: 9 | ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} 10 | 11 | jobs: 12 | build_and_package: 13 | runs-on: ubuntu-latest 14 | defaults: 15 | run: 16 | shell: bash -l {0} 17 | 18 | steps: 19 | - name: Checkout repository 20 | uses: actions/checkout@v1 21 | 22 | - name: Install miniconda 23 | uses: conda-incubator/setup-miniconda@v2 24 | with: 25 | auto-update-conda: true 26 | auto-activate-base: false 27 | activate-environment: "build-transformers" 28 | channels: huggingface 29 | 30 | - name: Setup conda env 31 | run: | 32 | conda install -c defaults anaconda-client conda-build 33 | 34 | - name: Extract version 35 | run: echo "TRANSFORMERS_VERSION=`python setup.py --version`" >> $GITHUB_ENV 36 | 37 | - name: Build conda packages 38 | run: | 39 | conda info 40 | conda list 41 | conda-build .github/conda 42 | 43 | - name: Upload to Anaconda 44 | run: anaconda upload `conda-build .github/conda --output` --force 45 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | # Research projects 18 | 19 | This folder contains various research projects using 🤗 Transformers. They are not maintained and require a specific 20 | version of 🤗 Transformers that is indicated in the requirements file of each folder. Updating them to the most recent version of the library will require some work. 21 | 22 | To use any of them, just run the command 23 | ``` 24 | pip install -r requirements.txt 25 | ``` 26 | inside the folder of your choice. 27 | 28 | If you need help with any of those, contact the author(s), indicated at the top of the `README` of each folder. 29 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/tests/test_pipelines_sentiment_analysis.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import unittest 16 | 17 | from .test_pipelines_common import MonoInputPipelineCommonMixin 18 | 19 | 20 | class SentimentAnalysisPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase): 21 | pipeline_task = "sentiment-analysis" 22 | small_models = [ 23 | "sshleifer/tiny-distilbert-base-uncased-finetuned-sst-2-english" 24 | ] # Default model - Models tested without the @slow decorator 25 | large_models = [None] # Models tested with the @slow decorator 26 | mandatory_keys = {"label", "score"} # Keys which should be in the output 27 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | # Research projects 18 | 19 | This folder contains various research projects using 🤗 Transformers. They are not maintained and require a specific 20 | version of 🤗 Transformers that is indicated in the requirements file of each folder. Updating them to the most recent version of the library will require some work. 21 | 22 | To use any of them, just run the command 23 | ``` 24 | pip install -r requirements.txt 25 | ``` 26 | inside the folder of your choice. 27 | 28 | If you need help with any of those, contact the author(s), indicated at the top of the `README` of each folder. 29 | -------------------------------------------------------------------------------- /finetune_gpt2/src/transformers/data/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
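# Usage sketch (illustrative; assumes the legacy GLUE processor API re-exported below):
#
#   from transformers import AutoTokenizer, glue_convert_examples_to_features, glue_processors
#
#   tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
#   processor = glue_processors["mrpc"]()
#   examples = processor.get_train_examples("/path/to/glue/MRPC")
#   features = glue_convert_examples_to_features(examples, tokenizer, max_length=128, task="mrpc")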
18 | 19 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels 20 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features 21 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor 22 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels 23 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/pytorch-lightning/run_pos.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if ! [ -f ./dev.txt ]; then 3 | echo "Download dev dataset...." 4 | curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu' 5 | fi 6 | 7 | if ! [ -f ./test.txt ]; then 8 | echo "Download test dataset...." 9 | curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu' 10 | fi 11 | 12 | if ! [ -f ./train.txt ]; then 13 | echo "Download train dataset...." 14 | curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu' 15 | fi 16 | 17 | export MAX_LENGTH=200 18 | export BERT_MODEL=bert-base-uncased 19 | export OUTPUT_DIR=postagger-model 20 | export BATCH_SIZE=32 21 | export NUM_EPOCHS=3 22 | export SAVE_STEPS=750 23 | export SEED=1 24 | 25 | 26 | # Add parent directory to python path to access lightning_base.py 27 | export PYTHONPATH="../":"${PYTHONPATH}" 28 | 29 | python3 run_ner.py --data_dir ./ \ 30 | --task_type POS \ 31 | --model_name_or_path $BERT_MODEL \ 32 | --output_dir $OUTPUT_DIR \ 33 | --max_seq_length $MAX_LENGTH \ 34 | --num_train_epochs $NUM_EPOCHS \ 35 | --train_batch_size $BATCH_SIZE \ 36 | --seed $SEED \ 37 | --gpus 1 \ 38 | --do_train \ 39 | --do_predict 40 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/text-generation/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | ## Language generation 18 | 19 | Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/master/examples/text-generation/run_generation.py). 20 | 21 | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL, XLNet, CTRL. 22 | A similar script is used for our official demo [Write With Transfomer](https://transformer.huggingface.co), where you 23 | can try out the different models available in the library. 24 | 25 | Example usage: 26 | 27 | ```bash 28 | python run_generation.py \ 29 | --model_type=gpt2 \ 30 | --model_name_or_path=gpt2 31 | ``` 32 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/src/transformers/data/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 
9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels 20 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features 21 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor 22 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels 23 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/tests/test_cli.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import unittest 17 | from unittest.mock import patch 18 | 19 | from transformers.testing_utils import CaptureStd 20 | 21 | 22 | class CLITest(unittest.TestCase): 23 | @patch("sys.argv", ["fakeprogrampath", "env"]) 24 | def test_cli_env(self): 25 | # test transformers-cli env 26 | import transformers.commands.transformers_cli 27 | 28 | with CaptureStd() as cs: 29 | transformers.commands.transformers_cli.main() 30 | assert "Python version" in cs.out 31 | assert "Platform" in cs.out 32 | assert "Using distributed or parallel set-up in script?" in cs.out 33 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/minify_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
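`CLITest` above relies on the `CaptureStd` helper from `transformers.testing_utils`; here is a standalone sketch of that helper (the command-line equivalent of the test is simply running `transformers-cli env` and reading the report):

```python
from transformers.testing_utils import CaptureStd

with CaptureStd() as cs:
    print("Python version: 3.7 (example output)")

# cs.out holds everything written to stdout inside the block, which is what
# CLITest inspects after invoking the `env` command.
assert "Python version" in cs.out
```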
15 | 16 | from pathlib import Path 17 | 18 | import fire 19 | 20 | 21 | def minify(src_dir: str, dest_dir: str, n: int): 22 | """Write first n lines of each file f in src_dir to dest_dir/f """ 23 | src_dir = Path(src_dir) 24 | dest_dir = Path(dest_dir) 25 | dest_dir.mkdir(exist_ok=True) 26 | for path in src_dir.iterdir(): 27 | new = [x.rstrip() for x in list(path.open().readlines())][:n] 28 | dest_path = dest_dir.joinpath(path.name) 29 | print(dest_path) 30 | dest_path.open("w").write("\n".join(new)) 31 | 32 | 33 | if __name__ == "__main__": 34 | fire.Fire(minify) 35 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/rouge_cli.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import fire 16 | 17 | from utils import calculate_rouge, save_json 18 | 19 | 20 | def calculate_rouge_path(pred_path, tgt_path, save_path=None, **kwargs): 21 | """Kwargs will be passed to calculate_rouge""" 22 | pred_lns = [x.strip() for x in open(pred_path).readlines()] 23 | tgt_lns = [x.strip() for x in open(tgt_path).readlines()][: len(pred_lns)] 24 | metrics = calculate_rouge(pred_lns, tgt_lns, **kwargs) 25 | if save_path is not None: 26 | save_json(metrics, save_path, indent=None) 27 | return metrics # these print nicely 28 | 29 | 30 | if __name__ == "__main__": 31 | fire.Fire(calculate_rouge_path) 32 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/pytorch-lightning/run_pos.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if ! [ -f ./dev.txt ]; then 3 | echo "Download dev dataset...." 4 | curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu' 5 | fi 6 | 7 | if ! [ -f ./test.txt ]; then 8 | echo "Download test dataset...." 9 | curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu' 10 | fi 11 | 12 | if ! [ -f ./train.txt ]; then 13 | echo "Download train dataset...." 
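`minify_dataset.py` and `rouge_cli.py` above wrap plain functions with python-fire, so they can be driven from the command line or called directly. A direct-call sketch, assuming it runs from `examples/legacy/seq2seq` and that the dataset directory and file names (placeholders) exist:

```python
from minify_dataset import minify
from rouge_cli import calculate_rouge_path

# Keep only the first 64 lines of every file in the dataset directory.
minify("wmt_en_ro", "wmt_en_ro_mini", 64)

# Score generated summaries against references and save the metrics as JSON.
metrics = calculate_rouge_path("preds.txt", "targets.txt", save_path="rouge.json")
print(metrics)
```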
14 | curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu' 15 | fi 16 | 17 | export MAX_LENGTH=200 18 | export BERT_MODEL=bert-base-uncased 19 | export OUTPUT_DIR=postagger-model 20 | export BATCH_SIZE=32 21 | export NUM_EPOCHS=3 22 | export SAVE_STEPS=750 23 | export SEED=1 24 | 25 | 26 | # Add parent directory to python path to access lightning_base.py 27 | export PYTHONPATH="../":"${PYTHONPATH}" 28 | 29 | python3 run_ner.py --data_dir ./ \ 30 | --task_type POS \ 31 | --model_name_or_path $BERT_MODEL \ 32 | --output_dir $OUTPUT_DIR \ 33 | --max_seq_length $MAX_LENGTH \ 34 | --num_train_epochs $NUM_EPOCHS \ 35 | --train_batch_size $BATCH_SIZE \ 36 | --seed $SEED \ 37 | --gpus 1 \ 38 | --do_train \ 39 | --do_predict 40 | -------------------------------------------------------------------------------- /finetune_gpt2/src/transformers/utils/dummy_flax_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 2 | from ..file_utils import requires_flax 3 | 4 | 5 | class FlaxPreTrainedModel: 6 | def __init__(self, *args, **kwargs): 7 | requires_flax(self) 8 | 9 | @classmethod 10 | def from_pretrained(self, *args, **kwargs): 11 | requires_flax(self) 12 | 13 | 14 | FLAX_MODEL_MAPPING = None 15 | 16 | 17 | class FlaxAutoModel: 18 | def __init__(self, *args, **kwargs): 19 | requires_flax(self) 20 | 21 | @classmethod 22 | def from_pretrained(self, *args, **kwargs): 23 | requires_flax(self) 24 | 25 | 26 | class FlaxBertForMaskedLM: 27 | def __init__(self, *args, **kwargs): 28 | requires_flax(self) 29 | 30 | @classmethod 31 | def from_pretrained(self, *args, **kwargs): 32 | requires_flax(self) 33 | 34 | 35 | class FlaxBertModel: 36 | def __init__(self, *args, **kwargs): 37 | requires_flax(self) 38 | 39 | @classmethod 40 | def from_pretrained(self, *args, **kwargs): 41 | requires_flax(self) 42 | 43 | 44 | class FlaxRobertaModel: 45 | def __init__(self, *args, **kwargs): 46 | requires_flax(self) 47 | 48 | @classmethod 49 | def from_pretrained(self, *args, **kwargs): 50 | requires_flax(self) 51 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/text-generation/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | ## Language generation 18 | 19 | Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/master/examples/text-generation/run_generation.py). 20 | 21 | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL, XLNet, CTRL. 22 | A similar script is used for our official demo [Write With Transfomer](https://transformer.huggingface.co), where you 23 | can try out the different models available in the library. 24 | 25 | Example usage: 26 | 27 | ```bash 28 | python run_generation.py \ 29 | --model_type=gpt2 \ 30 | --model_name_or_path=gpt2 31 | ``` 32 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/minify_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from pathlib import Path 17 | 18 | import fire 19 | 20 | 21 | def minify(src_dir: str, dest_dir: str, n: int): 22 | """Write first n lines of each file f in src_dir to dest_dir/f """ 23 | src_dir = Path(src_dir) 24 | dest_dir = Path(dest_dir) 25 | dest_dir.mkdir(exist_ok=True) 26 | for path in src_dir.iterdir(): 27 | new = [x.rstrip() for x in list(path.open().readlines())][:n] 28 | dest_path = dest_dir.joinpath(path.name) 29 | print(dest_path) 30 | dest_path.open("w").write("\n".join(new)) 31 | 32 | 33 | if __name__ == "__main__": 34 | fire.Fire(minify) 35 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/rouge_cli.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import fire 16 | 17 | from utils import calculate_rouge, save_json 18 | 19 | 20 | def calculate_rouge_path(pred_path, tgt_path, save_path=None, **kwargs): 21 | """Kwargs will be passed to calculate_rouge""" 22 | pred_lns = [x.strip() for x in open(pred_path).readlines()] 23 | tgt_lns = [x.strip() for x in open(tgt_path).readlines()][: len(pred_lns)] 24 | metrics = calculate_rouge(pred_lns, tgt_lns, **kwargs) 25 | if save_path is not None: 26 | save_json(metrics, save_path, indent=None) 27 | return metrics # these print nicely 28 | 29 | 30 | if __name__ == "__main__": 31 | fire.Fire(calculate_rouge_path) 32 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/src/transformers/utils/dummy_flax_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 
2 | from ..file_utils import requires_flax 3 | 4 | 5 | class FlaxPreTrainedModel: 6 | def __init__(self, *args, **kwargs): 7 | requires_flax(self) 8 | 9 | @classmethod 10 | def from_pretrained(self, *args, **kwargs): 11 | requires_flax(self) 12 | 13 | 14 | FLAX_MODEL_MAPPING = None 15 | 16 | 17 | class FlaxAutoModel: 18 | def __init__(self, *args, **kwargs): 19 | requires_flax(self) 20 | 21 | @classmethod 22 | def from_pretrained(self, *args, **kwargs): 23 | requires_flax(self) 24 | 25 | 26 | class FlaxBertForMaskedLM: 27 | def __init__(self, *args, **kwargs): 28 | requires_flax(self) 29 | 30 | @classmethod 31 | def from_pretrained(self, *args, **kwargs): 32 | requires_flax(self) 33 | 34 | 35 | class FlaxBertModel: 36 | def __init__(self, *args, **kwargs): 37 | requires_flax(self) 38 | 39 | @classmethod 40 | def from_pretrained(self, *args, **kwargs): 41 | requires_flax(self) 42 | 43 | 44 | class FlaxRobertaModel: 45 | def __init__(self, *args, **kwargs): 46 | requires_flax(self) 47 | 48 | @classmethod 49 | def from_pretrained(self, *args, **kwargs): 50 | requires_flax(self) 51 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/model_cards/README.md: -------------------------------------------------------------------------------- 1 | ## 🔥 Model cards now live inside each huggingface.co model repo 🔥 2 | 3 | 4 | For consistency, ease of use and scalability, `README.md` model cards now live directly inside each model repo on the HuggingFace model hub. 5 | 6 | ### How to update a model card 7 | 8 | You can directly update a model card inside any model repo you have **write access** to, i.e.: 9 | - a model under your username namespace 10 | - a model under any organization you are a part of. 11 | 12 | You can either: 13 | - update it, commit and push using your usual git workflow (command line, GUI, etc.) 14 | - or edit it directly from the website's UI. 15 | 16 | **What if you want to create or update a model card for a model you don't have write access to?** 17 | 18 | In that case, given that we don't have a Pull request system yet on huggingface.co (🤯), 19 | you can open an issue here, post the card's content, and tag the model author(s) and/or the Hugging Face team. 20 | 21 | We might implement a more seamless process at some point, so your early feedback is precious! 22 | Please let us know of any suggestion. 23 | 24 | ### What happened to the model cards here? 25 | 26 | We migrated every model card from the repo to its corresponding huggingface.co model repo. Individual commits were preserved, and they link back to the original commit on GitHub. 27 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/main_classes/configuration.rst: -------------------------------------------------------------------------------- 1 | .. 2 | Copyright 2020 The HuggingFace Team. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with 5 | the License. You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on 10 | an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the 11 | specific language governing permissions and limitations under the License. 
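A sketch of the dummy-object pattern in `utils/dummy_flax_objects.py` above: when Flax/JAX are not installed, the placeholder classes raise an informative `ImportError` instead of breaking `import transformers`. Illustrative only, assuming the `is_flax_available` helper from `file_utils`:

```python
from transformers.file_utils import is_flax_available

if is_flax_available():
    from transformers import FlaxBertModel          # the real implementation
else:
    from transformers.utils.dummy_flax_objects import FlaxBertModel
    try:
        FlaxBertModel()
    except ImportError as err:
        print(err)  # points the user at the Flax installation instructions
```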
12 | 13 | Configuration 14 | ----------------------------------------------------------------------------------------------------------------------- 15 | 16 | The base class :class:`~transformers.PretrainedConfig` implements the common methods for loading/saving a configuration 17 | either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded 18 | from HuggingFace's AWS S3 repository). 19 | 20 | 21 | PretrainedConfig 22 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 23 | 24 | .. autoclass:: transformers.PretrainedConfig 25 | :members: 26 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/sentence_splitter.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from filelock import FileLock 17 | 18 | 19 | try: 20 | import nltk 21 | 22 | NLTK_AVAILABLE = True 23 | except (ImportError, ModuleNotFoundError): 24 | NLTK_AVAILABLE = False 25 | 26 | if NLTK_AVAILABLE: 27 | with FileLock(".lock") as lock: 28 | nltk.download("punkt", quiet=True) 29 | 30 | 31 | def add_newline_to_end_of_each_sentence(x: str) -> str: 32 | """This was added to get rougeLsum scores matching published rougeL scores for BART and PEGASUS.""" 33 | re.sub("", "", x) # remove pegasus newline char 34 | assert NLTK_AVAILABLE, "nltk must be installed to separate newlines between sentences. (pip install nltk)" 35 | return "\n".join(nltk.sent_tokenize(x)) 36 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/sentence_splitter.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
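A load/tweak/save round trip for the `PretrainedConfig` API documented in `docs/source/main_classes/configuration.rst` above; `gpt2` is just a convenient public checkpoint and the output directory is a placeholder:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("gpt2")   # downloaded, or read from the local cache
config.n_ctx = 512                            # tweak a field before saving
config.save_pretrained("./my-gpt2-config")    # writes config.json into the directory
reloaded = AutoConfig.from_pretrained("./my-gpt2-config")
assert reloaded.n_ctx == 512
```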
14 | import re 15 | 16 | from filelock import FileLock 17 | 18 | 19 | try: 20 | import nltk 21 | 22 | NLTK_AVAILABLE = True 23 | except (ImportError, ModuleNotFoundError): 24 | NLTK_AVAILABLE = False 25 | 26 | if NLTK_AVAILABLE: 27 | with FileLock(".lock") as lock: 28 | nltk.download("punkt", quiet=True) 29 | 30 | 31 | def add_newline_to_end_of_each_sentence(x: str) -> str: 32 | """This was added to get rougeLsum scores matching published rougeL scores for BART and PEGASUS.""" 33 | re.sub("", "", x) # remove pegasus newline char 34 | assert NLTK_AVAILABLE, "nltk must be installed to separate newlines between sentences. (pip install nltk)" 35 | return "\n".join(nltk.sent_tokenize(x)) 36 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/scripts/pegasus/build_test_sample_spm_no_bos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # this script builds a small sample spm file tests/fixtures/test_sentencepiece_no_bos.model, with features needed by pegasus 17 | 18 | # 1. pip install sentencepiece 19 | # 20 | # 2. wget https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt 21 | 22 | # 3. build 23 | import sentencepiece as spm 24 | 25 | # pegasus: 26 | # 1. no bos 27 | # 2. eos_id is 1 28 | # 3. unk_id is 2 29 | # build a sample spm file accordingly 30 | spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=test_sentencepiece_no_bos --bos_id=-1 --unk_id=2 --eos_id=1 --vocab_size=1000') 31 | 32 | # 4. now update the fixture 33 | # mv test_sentencepiece_no_bos.model ../../tests/fixtures/ 34 | -------------------------------------------------------------------------------- /finetune_gpt2/src/transformers/data/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
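A usage sketch for `add_newline_to_end_of_each_sentence` from `sentence_splitter.py` above, assuming it runs from `examples/legacy/seq2seq` with `nltk` installed (the module downloads the `punkt` tokenizer on import):

```python
from sentence_splitter import add_newline_to_end_of_each_sentence

summary = "The model was fine-tuned on XSum. ROUGE-Lsum is computed per sentence."
print(add_newline_to_end_of_each_sentence(summary))
# The model was fine-tuned on XSum.
# ROUGE-Lsum is computed per sentence.
```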
18 | 19 | from .metrics import glue_compute_metrics, xnli_compute_metrics 20 | from .processors import ( 21 | DataProcessor, 22 | InputExample, 23 | InputFeatures, 24 | SingleSentenceClassificationProcessor, 25 | SquadExample, 26 | SquadFeatures, 27 | SquadV1Processor, 28 | SquadV2Processor, 29 | glue_convert_examples_to_features, 30 | glue_output_modes, 31 | glue_processors, 32 | glue_tasks_num_labels, 33 | squad_convert_examples_to_features, 34 | xnli_output_modes, 35 | xnli_processors, 36 | xnli_tasks_num_labels, 37 | ) 38 | -------------------------------------------------------------------------------- /finetune_gpt2/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ XLM-ProphetNet model configuration """ 16 | 17 | 18 | from ...utils import logging 19 | from ..prophetnet.configuration_prophetnet import ProphetNetConfig 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { 25 | "microsoft/xprophetnet-large-wiki100-cased": "https://huggingface.co/microsoft/xprophetnet-large-wiki100-cased/resolve/main/config.json", 26 | } 27 | 28 | 29 | class XLMProphetNetConfig(ProphetNetConfig): 30 | """ 31 | This class overrides :class:`~transformers.ProphetNetConfig`. Please check the superclass for the appropriate 32 | documentation alongside usage examples. 
33 | """ 34 | 35 | model_type = "xlm-prophetnet" 36 | -------------------------------------------------------------------------------- /sorting/utils/exp_utils.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import os, shutil 3 | 4 | import numpy as np 5 | 6 | import torch 7 | 8 | 9 | def logging(s, log_path, print_=True, log_=True): 10 | if print_: 11 | print(s) 12 | if log_: 13 | with open(log_path, 'a+') as f_log: 14 | f_log.write(s + '\n') 15 | 16 | def get_logger(log_path, **kwargs): 17 | return functools.partial(logging, log_path=log_path, **kwargs) 18 | 19 | def create_exp_dir(dir_path, scripts_to_save=None, debug=False): 20 | if debug: 21 | print('Debug Mode : no experiment dir created') 22 | return functools.partial(logging, log_path=None, log_=False) 23 | 24 | if not os.path.exists(dir_path): 25 | os.makedirs(dir_path) 26 | 27 | print('Experiment dir : {}'.format(dir_path)) 28 | if scripts_to_save is not None: 29 | script_path = os.path.join(dir_path, 'scripts') 30 | if not os.path.exists(script_path): 31 | os.makedirs(script_path) 32 | for script in scripts_to_save: 33 | dst_file = os.path.join(dir_path, 'scripts', os.path.basename(script)) 34 | shutil.copyfile(script, dst_file) 35 | 36 | return get_logger(log_path=os.path.join(dir_path, 'log.txt')) 37 | 38 | def save_checkpoint(model, optimizer, path, epoch): 39 | torch.save(model, os.path.join(path, 'model_{}.pt'.format(epoch))) 40 | torch.save(optimizer.state_dict(), os.path.join(path, 'optimizer_{}.pt'.format(epoch))) 41 | -------------------------------------------------------------------------------- /finetune_gpt2/infinite_memory_transformer_sticky_mem/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "gpt2", 3 | "activation_function": "gelu_new", 4 | "affines": true, 5 | "architectures": [ 6 | "GPT2LMHeadModel" 7 | ], 8 | "attn_drop": 0.1, 9 | "attn_pdrop": 0.1, 10 | "bos_token_id": 50256, 11 | "continuous": true, 12 | "embd_pdrop": 0.1, 13 | "eos_token_id": 50256, 14 | "gradient_checkpointing": false, 15 | "infinite_memory": true, 16 | "initializer_range": 0.02, 17 | "kl_regularizer": true, 18 | "layer_norm_epsilon": 1e-05, 19 | "long_term_attention": true, 20 | "long_term_attention_basis": 512, 21 | "long_term_attention_norm": "softmax", 22 | "mask": true, 23 | "mask_dropout": 0, 24 | "mask_type": "cnn", 25 | "model_type": "gpt2", 26 | "mu_0": -1, 27 | "n_ctx": 1024, 28 | "n_embd": 768, 29 | "n_head": 12, 30 | "n_inner": null, 31 | "n_layer": 12, 32 | "n_positions": 1024, 33 | "n_special": 0, 34 | "predict_special_tokens": true, 35 | "resid_pdrop": 0.1, 36 | "sigma_0": 0.05, 37 | "sticky_memories": true, 38 | "summary_activation": null, 39 | "summary_first_dropout": 0.1, 40 | "summary_proj_to_labels": true, 41 | "summary_type": "cls_index", 42 | "summary_use_proj": true, 43 | "task_specific_params": { 44 | "text-generation": { 45 | "do_sample": true, 46 | "max_length": 50 47 | } 48 | }, 49 | "transformers_version": "4.5.0.dev0", 50 | "use_cache": true, 51 | "vocab_size": 50257 52 | } 53 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/src/transformers/data/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. 
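A sketch for `sorting/utils/exp_utils.py` above: create an experiment directory, snapshot a script into it, and get a logger that both prints and appends to `log.txt`. The import path assumes the `sorting` directory is on `PYTHONPATH`, and the script name is a placeholder:

```python
from utils.exp_utils import create_exp_dir

log = create_exp_dir("experiments/sort-run1", scripts_to_save=["train.py"])
log("starting training")      # printed and appended to experiments/sort-run1/log.txt
log("epoch 1 | loss 2.31")
```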
So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from .metrics import glue_compute_metrics, xnli_compute_metrics 20 | from .processors import ( 21 | DataProcessor, 22 | InputExample, 23 | InputFeatures, 24 | SingleSentenceClassificationProcessor, 25 | SquadExample, 26 | SquadFeatures, 27 | SquadV1Processor, 28 | SquadV2Processor, 29 | glue_convert_examples_to_features, 30 | glue_output_modes, 31 | glue_processors, 32 | glue_tasks_num_labels, 33 | squad_convert_examples_to_features, 34 | xnli_output_modes, 35 | xnli_processors, 36 | xnli_tasks_num_labels, 37 | ) 38 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/performer/README.md: -------------------------------------------------------------------------------- 1 | # Performer fine-tuning 2 | 3 | Example authors: @TevenLeScao, @Patrickvonplaten 4 | 5 | Paper authors: Krzysztof Choromanski, Valerii Likhosherstov, David Dohan, Xingyou Song, Andreea Gane, Tamas Sarlos, Peter Hawkins, Jared Davis, Afroz Mohiuddin, Lukasz Kaiser, David Belanger, Lucy Colwell, Adrian Weller 6 | 7 | ## Requirements 8 | 9 | `datasets`, `flax` and `jax`. `wandb` integration is built-in if you want to use it. 10 | 11 | ## Examples 12 | 13 | `sanity_script.sh` will launch performer fine-tuning from the bert-base-cased checkpoint on the Simple Wikipedia dataset (a small, easy-language English Wikipedia) from `datasets`. 14 | `full_script.sh` will launch performer fine-tuning from the bert-large-cased checkpoint on the English Wikipedia dataset from `datasets`. 15 | 16 | Here are a few key arguments: 17 | - Remove the `--performer` argument to use a standard Bert model. 18 | 19 | - Add `--reinitialize` to start from a blank model rather than a Bert checkpoint. 20 | 21 | - You may change the Bert size by passing a different [checkpoint](https://huggingface.co/transformers/pretrained_models.html) to the `--model_name_or_path` argument. 22 | 23 | - Passing your user name to the `--wandb_user_name` argument will trigger weights and biases logging. 24 | 25 | - You can choose a dataset with `--dataset_name` and `--dataset_config`. Our [viewer](https://huggingface.co/datasets/viewer/) will help you find what you need. -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/rag/finetune_rag_ray.sh: -------------------------------------------------------------------------------- 1 | # Sample script to finetune RAG using Ray for distributed retrieval. 2 | 3 | # Add parent directory to python path to access lightning_base.py 4 | export PYTHONPATH="../":"${PYTHONPATH}" 5 | 6 | # Start a single-node Ray cluster. 
7 | ray start --head 8 | 9 | # A sample finetuning run, you need to specify data_dir, output_dir and model_name_or_path 10 | # run ./examples/rag/finetune_rag_ray.sh --help to see all the possible options 11 | 12 | python examples/rag/finetune_rag.py \ 13 | --data_dir $DATA_DIR \ 14 | --output_dir $OUTPUT_DIR \ 15 | --model_name_or_path $MODEL_NAME_OR_PATH \ 16 | --model_type rag_sequence \ 17 | --fp16 \ 18 | --gpus 8 \ 19 | --profile \ 20 | --do_train \ 21 | --do_predict \ 22 | --n_val -1 \ 23 | --train_batch_size 8 \ 24 | --eval_batch_size 1 \ 25 | --max_source_length 128 \ 26 | --max_target_length 25 \ 27 | --val_max_target_length 25 \ 28 | --test_max_target_length 25 \ 29 | --label_smoothing 0.1 \ 30 | --dropout 0.1 \ 31 | --attention_dropout 0.1 \ 32 | --weight_decay 0.001 \ 33 | --adam_epsilon 1e-08 \ 34 | --max_grad_norm 0.1 \ 35 | --lr_scheduler polynomial \ 36 | --learning_rate 3e-05 \ 37 | --num_train_epochs 100 \ 38 | --warmup_steps 500 \ 39 | --gradient_accumulation_steps 1 \ 40 | --distributed_retriever ray \ 41 | --num_retrieval_workers 4 42 | 43 | # Stop the Ray cluster. 44 | ray stop 45 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ XLM-ProphetNet model configuration """ 16 | 17 | 18 | from ...utils import logging 19 | from ..prophetnet.configuration_prophetnet import ProphetNetConfig 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { 25 | "microsoft/xprophetnet-large-wiki100-cased": "https://huggingface.co/microsoft/xprophetnet-large-wiki100-cased/resolve/main/config.json", 26 | } 27 | 28 | 29 | class XLMProphetNetConfig(ProphetNetConfig): 30 | """ 31 | This class overrides :class:`~transformers.ProphetNetConfig`. Please check the superclass for the appropriate 32 | documentation alongside usage examples. 33 | """ 34 | 35 | model_type = "xlm-prophetnet" 36 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/performer/README.md: -------------------------------------------------------------------------------- 1 | # Performer fine-tuning 2 | 3 | Example authors: @TevenLeScao, @Patrickvonplaten 4 | 5 | Paper authors: Krzysztof Choromanski, Valerii Likhosherstov, David Dohan, Xingyou Song, Andreea Gane, Tamas Sarlos, Peter Hawkins, Jared Davis, Afroz Mohiuddin, Lukasz Kaiser, David Belanger, Lucy Colwell, Adrian Weller 6 | 7 | ## Requirements 8 | 9 | `datasets`, `flax` and `jax`. `wandb` integration is built-in if you want to use it. 
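`XLMProphetNetConfig` above only overrides `model_type`, so every field is inherited from `ProphetNetConfig`. A short sketch, assuming network access for the pretrained configuration:

```python
from transformers import XLMProphetNetConfig

config = XLMProphetNetConfig.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
print(config.model_type)                              # "xlm-prophetnet"
print(config.vocab_size, config.num_encoder_layers)   # inherited ProphetNet fields
```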
10 | 11 | ## Examples 12 | 13 | `sanity_script.sh` will launch performer fine-tuning from the bert-base-cased checkpoint on the Simple Wikipedia dataset (a small, easy-language English Wikipedia) from `datasets`. 14 | `full_script.sh` will launch performer fine-tuning from the bert-large-cased checkpoint on the English Wikipedia dataset from `datasets`. 15 | 16 | Here are a few key arguments: 17 | - Remove the `--performer` argument to use a standard Bert model. 18 | 19 | - Add `--reinitialize` to start from a blank model rather than a Bert checkpoint. 20 | 21 | - You may change the Bert size by passing a different [checkpoint](https://huggingface.co/transformers/pretrained_models.html) to the `--model_name_or_path` argument. 22 | 23 | - Passing your user name to the `--wandb_user_name` argument will trigger weights and biases logging. 24 | 25 | - You can choose a dataset with `--dataset_name` and `--dataset_config`. Our [viewer](https://huggingface.co/datasets/viewer/) will help you find what you need. -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/rag/finetune_rag_ray.sh: -------------------------------------------------------------------------------- 1 | # Sample script to finetune RAG using Ray for distributed retrieval. 2 | 3 | # Add parent directory to python path to access lightning_base.py 4 | export PYTHONPATH="../":"${PYTHONPATH}" 5 | 6 | # Start a single-node Ray cluster. 7 | ray start --head 8 | 9 | # A sample finetuning run, you need to specify data_dir, output_dir and model_name_or_path 10 | # run ./examples/rag/finetune_rag_ray.sh --help to see all the possible options 11 | 12 | python examples/rag/finetune_rag.py \ 13 | --data_dir $DATA_DIR \ 14 | --output_dir $OUTPUT_DIR \ 15 | --model_name_or_path $MODEL_NAME_OR_PATH \ 16 | --model_type rag_sequence \ 17 | --fp16 \ 18 | --gpus 8 \ 19 | --profile \ 20 | --do_train \ 21 | --do_predict \ 22 | --n_val -1 \ 23 | --train_batch_size 8 \ 24 | --eval_batch_size 1 \ 25 | --max_source_length 128 \ 26 | --max_target_length 25 \ 27 | --val_max_target_length 25 \ 28 | --test_max_target_length 25 \ 29 | --label_smoothing 0.1 \ 30 | --dropout 0.1 \ 31 | --attention_dropout 0.1 \ 32 | --weight_decay 0.001 \ 33 | --adam_epsilon 1e-08 \ 34 | --max_grad_norm 0.1 \ 35 | --lr_scheduler polynomial \ 36 | --learning_rate 3e-05 \ 37 | --num_train_epochs 100 \ 38 | --warmup_steps 500 \ 39 | --gradient_accumulation_steps 1 \ 40 | --distributed_retriever ray \ 41 | --num_retrieval_workers 4 42 | 43 | # Stop the Ray cluster. 
44 | ray stop 45 | -------------------------------------------------------------------------------- /finetune_gpt2/infinite_memory_transformer/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "gpt2", 3 | "activation_function": "gelu_new", 4 | "affines": true, 5 | "architectures": [ 6 | "GPT2LMHeadModel" 7 | ], 8 | "attn_drop": 0.1, 9 | "attn_pdrop": 0.1, 10 | "bos_token_id": 50256, 11 | "compression_rate": 2, 12 | "compressive": false, 13 | "continuous": true, 14 | "embd_pdrop": 0.1, 15 | "eos_token_id": 50256, 16 | "gradient_checkpointing": false, 17 | "infinite_memory": true, 18 | "initializer_range": 0.02, 19 | "kl_regularizer": true, 20 | "layer_norm_epsilon": 1e-05, 21 | "long_term_attention": true, 22 | "long_term_attention_basis": 512, 23 | "long_term_attention_norm": "softmax", 24 | "mask": true, 25 | "mask_dropout": 0, 26 | "mask_type": "cnn", 27 | "model_type": "gpt2", 28 | "mu_0": -1, 29 | "n_ctx": 1024, 30 | "n_embd": 768, 31 | "n_head": 12, 32 | "n_inner": null, 33 | "n_layer": 12, 34 | "n_positions": 1024, 35 | "n_special": 0, 36 | "predict_special_tokens": true, 37 | "resid_pdrop": 0.1, 38 | "sigma_0": 0.05, 39 | "sticky_memories": false, 40 | "summary_activation": null, 41 | "summary_first_dropout": 0.1, 42 | "summary_proj_to_labels": true, 43 | "summary_type": "cls_index", 44 | "summary_use_proj": true, 45 | "task_specific_params": { 46 | "text-generation": { 47 | "do_sample": true, 48 | "max_length": 50 49 | } 50 | }, 51 | "transformers_version": "4.5.0.dev0", 52 | "use_cache": true, 53 | "vocab_size": 50257 54 | } 55 | 56 | -------------------------------------------------------------------------------- /finetune_gpt2/src/transformers/models/xlm_prophetnet/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
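The `infinite_memory_transformer/config.json` above is a stock GPT-2 configuration extended with long-term-memory fields (`infinite_memory`, `long_term_attention`, `sticky_memories`, ...). A loading sketch; the extra fields appear to be consumed only by the patched GPT-2 code vendored under `finetune_gpt2/src/transformers`, so with an unmodified library they load but have no effect:

```python
from transformers import GPT2Config, GPT2LMHeadModel

config = GPT2Config.from_pretrained("./infinite_memory_transformer")
print(config.long_term_attention, config.sticky_memories)   # True, False

# Initialize from the public gpt2 weights with the memory-augmented configuration.
model = GPT2LMHeadModel.from_pretrained("gpt2", config=config)
```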
18 | 19 | from ...file_utils import is_sentencepiece_available, is_torch_available 20 | from .configuration_xlm_prophetnet import XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMProphetNetConfig 21 | 22 | 23 | if is_sentencepiece_available(): 24 | from .tokenization_xlm_prophetnet import XLMProphetNetTokenizer 25 | 26 | if is_torch_available(): 27 | from .modeling_xlm_prophetnet import ( 28 | XLM_PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST, 29 | XLMProphetNetDecoder, 30 | XLMProphetNetEncoder, 31 | XLMProphetNetForCausalLM, 32 | XLMProphetNetForConditionalGeneration, 33 | XLMProphetNetModel, 34 | ) 35 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/tests/test_activations_tf.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import unittest 16 | 17 | from transformers import is_tf_available 18 | from transformers.testing_utils import require_tf 19 | 20 | 21 | if is_tf_available(): 22 | from transformers.activations_tf import get_tf_activation 23 | 24 | 25 | @require_tf 26 | class TestTFActivations(unittest.TestCase): 27 | def test_get_activation(self): 28 | get_tf_activation("swish") 29 | get_tf_activation("silu") 30 | get_tf_activation("gelu") 31 | get_tf_activation("relu") 32 | get_tf_activation("tanh") 33 | get_tf_activation("gelu_new") 34 | get_tf_activation("gelu_fast") 35 | get_tf_activation("mish") 36 | with self.assertRaises(KeyError): 37 | get_tf_activation("bogus") 38 | with self.assertRaises(KeyError): 39 | get_tf_activation(None) 40 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/.github/workflows/github-torch-hub.yml: -------------------------------------------------------------------------------- 1 | name: Torch hub integration 2 | 3 | on: 4 | push: 5 | branches: 6 | - "*" 7 | 8 | jobs: 9 | torch_hub_integration: 10 | runs-on: ubuntu-latest 11 | env: 12 | # TODO quickfix but may need more investigation 13 | ACTIONS_ALLOW_UNSECURE_COMMANDS: True 14 | steps: 15 | # no checkout necessary here. 
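A sketch of the activation registry exercised by `TestTFActivations` above, assuming TensorFlow is installed:

```python
import tensorflow as tf

from transformers.activations_tf import get_tf_activation

gelu = get_tf_activation("gelu")
print(gelu(tf.constant([-1.0, 0.0, 1.0])).numpy())

try:
    get_tf_activation("bogus")
except KeyError as err:
    print(err)   # unknown names raise KeyError, as the test asserts
```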
16 | - name: Extract branch name 17 | run: echo "::set-env name=BRANCH::${GITHUB_REF#refs/heads/}" 18 | - name: Check branch name 19 | run: echo $BRANCH 20 | - name: Set up Python 21 | uses: actions/setup-python@v1 22 | with: 23 | python-version: 3.7 24 | 25 | - name: Loading cache 26 | uses: actions/cache@v2 27 | id: cache 28 | with: 29 | path: ~/.cache/pip 30 | key: v0-torch_hub-${{ hashFiles('setup.py') }} 31 | 32 | - name: Install dependencies 33 | run: | 34 | pip install --upgrade pip 35 | # install torch-hub specific dependencies 36 | pip install -e git+https://github.com/huggingface/transformers.git#egg=transformers[torchhub] 37 | # no longer needed 38 | pip uninstall -y transformers 39 | 40 | - name: Torch hub list 41 | run: | 42 | python -c "import torch; print(torch.hub.list('huggingface/transformers:$BRANCH'))" 43 | 44 | - name: Torch hub help 45 | run: | 46 | python -c "import torch; print(torch.hub.help('huggingface/transformers:$BRANCH', 'modelForSequenceClassification'))" 47 | -------------------------------------------------------------------------------- /document_grounded_generation/test_special_tokens.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import shutil 3 | import unittest 4 | 5 | from transformers import OpenAIGPTTokenizer, GPT2Tokenizer 6 | from train import ATTR_TO_SPECIAL_TOKEN, SPECIAL_TOKENS 7 | 8 | class TestSpecialTokenTreatment(unittest.TestCase): 9 | 10 | def setUp(self): 11 | self.save_dir = Path('utest_save_dir') 12 | self.save_dir.mkdir(exist_ok=True) 13 | 14 | def tearDown(self): 15 | shutil.rmtree(self.save_dir) 16 | 17 | def test_special_tokens_checkpoint_behavior(self): 18 | toks = [OpenAIGPTTokenizer.from_pretrained('openai-gpt'), GPT2Tokenizer.from_pretrained('gpt2')] 19 | for tok in toks: 20 | self.assertEqual(len(tok.added_tokens_encoder), 0) 21 | tok.add_special_tokens(ATTR_TO_SPECIAL_TOKEN) 22 | self.assertEqual(len(tok.added_tokens_encoder), 5) 23 | # Make sure we never split 24 | self.assertEqual(len(tok.tokenize(" ")), 2) 25 | ids = tok.convert_tokens_to_ids(SPECIAL_TOKENS) 26 | self.assertTrue(all([x > 0 for x in ids]), 27 | f'some tokens failed to tokenize {SPECIAL_TOKENS} -> {ids}') 28 | # Need to mantain indices through save. (this is also tested in pytorch-transformers) 29 | tok.save_pretrained(self.save_dir) 30 | tok_loaded = tok.from_pretrained(str(self.save_dir)) 31 | ids2 = tok_loaded.convert_tokens_to_ids(SPECIAL_TOKENS) 32 | self.assertListEqual(ids, ids2) 33 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/src/transformers/models/xlm_prophetnet/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
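A standalone sketch of the save/reload property checked by `TestSpecialTokenTreatment` above. The token strings here are hypothetical placeholders; the project's actual tokens come from `ATTR_TO_SPECIAL_TOKEN` in `train.py`, which is not shown here:

```python
from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("gpt2")
tok.add_special_tokens({"additional_special_tokens": ["<speaker1>", "<speaker2>"]})  # hypothetical tokens

tokens = ["<speaker1>", "<speaker2>"]
ids = tok.convert_tokens_to_ids(tokens)
assert all(i != tok.unk_token_id for i in ids)        # both resolve to real ids

tok.save_pretrained("./utest_tok_dir")
reloaded = GPT2Tokenizer.from_pretrained("./utest_tok_dir")
assert ids == reloaded.convert_tokens_to_ids(tokens)  # indices survive the round trip
```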
16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_sentencepiece_available, is_torch_available 20 | from .configuration_xlm_prophetnet import XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMProphetNetConfig 21 | 22 | 23 | if is_sentencepiece_available(): 24 | from .tokenization_xlm_prophetnet import XLMProphetNetTokenizer 25 | 26 | if is_torch_available(): 27 | from .modeling_xlm_prophetnet import ( 28 | XLM_PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST, 29 | XLMProphetNetDecoder, 30 | XLMProphetNetEncoder, 31 | XLMProphetNetForCausalLM, 32 | XLMProphetNetForConditionalGeneration, 33 | XLMProphetNetModel, 34 | ) 35 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/convert_model_to_fp16.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from typing import Union 17 | 18 | import fire 19 | import torch 20 | from tqdm import tqdm 21 | 22 | 23 | def convert(src_path: str, map_location: str = "cpu", save_path: Union[str, None] = None) -> None: 24 | """Convert a pytorch_model.bin or model.pt file to torch.float16 for faster downloads, less disk space.""" 25 | state_dict = torch.load(src_path, map_location=map_location) 26 | for k, v in tqdm(state_dict.items()): 27 | if not isinstance(v, torch.Tensor): 28 | raise TypeError("FP16 conversion only works on paths that are saved state dicts, like pytorch_model.bin") 29 | state_dict[k] = v.half() 30 | if save_path is None: # overwrite src_path 31 | save_path = src_path 32 | torch.save(state_dict, save_path) 33 | 34 | 35 | if __name__ == "__main__": 36 | fire.Fire(convert) 37 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/old_test_tatoeba_conversion.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
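A direct-call sketch for `convert_model_to_fp16.py` above (the script exposes the same `convert` function on the command line through python-fire). The checkpoint names are placeholders, and it assumes running from `examples/legacy/seq2seq`:

```python
from convert_model_to_fp16 import convert

# Write an fp16 copy next to the original; omit save_path to overwrite in place.
convert("pytorch_model.bin", save_path="pytorch_model.fp16.bin")
```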
14 | 15 | import os 16 | import tempfile 17 | import unittest 18 | 19 | from transformers.file_utils import cached_property 20 | from transformers.models.marian.convert_marian_tatoeba_to_pytorch import DEFAULT_REPO, TatoebaConverter 21 | from transformers.testing_utils import slow 22 | 23 | 24 | @unittest.skipUnless(os.path.exists(DEFAULT_REPO), "Tatoeba directory does not exist.") 25 | class TatoebaConversionTester(unittest.TestCase): 26 | @cached_property 27 | def resolver(self): 28 | tmp_dir = tempfile.mkdtemp() 29 | return TatoebaConverter(save_dir=tmp_dir) 30 | 31 | @slow 32 | def test_resolver(self): 33 | self.resolver.convert_models(["heb-eng"]) 34 | 35 | @slow 36 | def test_model_card(self): 37 | content, mmeta = self.resolver.write_model_card("opus-mt-he-en", dry_run=True) 38 | assert mmeta["long_pair"] == "heb-eng" 39 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/train_mbart_cc25_enro.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | python finetune_trainer.py \ 16 | --model_name_or_path=facebook/mbart-large-cc25 \ 17 | --data_dir $ENRO_DIR \ 18 | --output_dir mbart_cc25_enro --overwrite_output_dir \ 19 | --learning_rate=3e-5 \ 20 | --warmup_steps 500 \ 21 | --fp16 \ 22 | --label_smoothing 0.1 \ 23 | --adam_eps 1e-06 \ 24 | --src_lang en_XX --tgt_lang ro_RO \ 25 | --freeze_embeds \ 26 | --per_device_train_batch_size=4 --per_device_eval_batch_size=4 \ 27 | --max_source_length 128 --max_target_length 128 --val_max_target_length 128 --test_max_target_length 128\ 28 | --sortish_sampler \ 29 | --num_train_epochs 6 \ 30 | --save_steps 25000 --eval_steps 25000 --logging_steps 1000 \ 31 | --do_train --do_eval --do_predict \ 32 | --evaluation_strategy steps \ 33 | --predict_with_generate --logging_first_step \ 34 | --task translation \ 35 | "$@" 36 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/convert_model_to_fp16.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | from typing import Union 17 | 18 | import fire 19 | import torch 20 | from tqdm import tqdm 21 | 22 | 23 | def convert(src_path: str, map_location: str = "cpu", save_path: Union[str, None] = None) -> None: 24 | """Convert a pytorch_model.bin or model.pt file to torch.float16 for faster downloads, less disk space.""" 25 | state_dict = torch.load(src_path, map_location=map_location) 26 | for k, v in tqdm(state_dict.items()): 27 | if not isinstance(v, torch.Tensor): 28 | raise TypeError("FP16 conversion only works on paths that are saved state dicts, like pytorch_model.bin") 29 | state_dict[k] = v.half() 30 | if save_path is None: # overwrite src_path 31 | save_path = src_path 32 | torch.save(state_dict, save_path) 33 | 34 | 35 | if __name__ == "__main__": 36 | fire.Fire(convert) 37 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/rag/parse_dpr_relevance_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script reads DPR retriever training data and parses each datapoint. We save a line per datapoint. 3 | Each line consists of the query followed by a tab-separated list of Wikipedia page titles constituting 4 | positive contexts for a given query. 5 | """ 6 | 7 | import argparse 8 | import json 9 | 10 | from tqdm import tqdm 11 | 12 | 13 | def main(): 14 | parser = argparse.ArgumentParser() 15 | 16 | # Required parameters 17 | parser.add_argument( 18 | "--src_path", 19 | type=str, 20 | default="biencoder-nq-dev.json", 21 | help="Path to raw DPR training data", 22 | ) 23 | parser.add_argument( 24 | "--evaluation_set", 25 | type=str, 26 | help="where to store parsed evaluation_set file", 27 | ) 28 | parser.add_argument( 29 | "--gold_data_path", 30 | type=str, 31 | help="where to store parsed gold_data_path file", 32 | ) 33 | args = parser.parse_args() 34 | 35 | with open(args.src_path, "r") as src_file, open(args.evaluation_set, "w") as eval_file, open( 36 | args.gold_data_path, "w" 37 | ) as gold_file: 38 | dpr_records = json.load(src_file) 39 | for dpr_record in tqdm(dpr_records): 40 | question = dpr_record["question"] 41 | contexts = [context["title"] for context in dpr_record["positive_ctxs"]] 42 | eval_file.write(question + "\n") 43 | gold_file.write("\t".join(contexts) + "\n") 44 | 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/train_mbart_cc25_enro.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 

python finetune_trainer.py \
    --model_name_or_path=facebook/mbart-large-cc25 \
    --data_dir $ENRO_DIR \
    --output_dir mbart_cc25_enro --overwrite_output_dir \
    --learning_rate=3e-5 \
    --warmup_steps 500 \
    --fp16 \
    --label_smoothing 0.1 \
    --adam_eps 1e-06 \
    --src_lang en_XX --tgt_lang ro_RO \
    --freeze_embeds \
    --per_device_train_batch_size=4 --per_device_eval_batch_size=4 \
    --max_source_length 128 --max_target_length 128 --val_max_target_length 128 --test_max_target_length 128 \
    --sortish_sampler \
    --num_train_epochs 6 \
    --save_steps 25000 --eval_steps 25000 --logging_steps 1000 \
    --do_train --do_eval --do_predict \
    --evaluation_strategy steps \
    --predict_with_generate --logging_first_step \
    --task translation \
    "$@"
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/legacy/seq2seq/old_test_tatoeba_conversion.py:
--------------------------------------------------------------------------------
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import tempfile
import unittest

from transformers.file_utils import cached_property
from transformers.models.marian.convert_marian_tatoeba_to_pytorch import DEFAULT_REPO, TatoebaConverter
from transformers.testing_utils import slow


@unittest.skipUnless(os.path.exists(DEFAULT_REPO), "Tatoeba directory does not exist.")
class TatoebaConversionTester(unittest.TestCase):
    @cached_property
    def resolver(self):
        tmp_dir = tempfile.mkdtemp()
        return TatoebaConverter(save_dir=tmp_dir)

    @slow
    def test_resolver(self):
        self.resolver.convert_models(["heb-eng"])

    @slow
    def test_model_card(self):
        content, mmeta = self.resolver.write_model_card("opus-mt-he-en", dry_run=True)
        assert mmeta["long_pair"] == "heb-eng"
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/adversarial/README.md:
--------------------------------------------------------------------------------
## Adversarial evaluation of model performance

Here is an example of evaluating a model with adversarial evaluation of natural language inference, using the Heuristic Analysis for NLI Systems (HANS) dataset ([McCoy et al., 2019](https://arxiv.org/abs/1902.01007)). The example was kindly provided by [Nafise Sadat Moosavi](https://github.com/ns-moosavi).

The HANS dataset can be downloaded from [this location](https://github.com/tommccoy1/hans).
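
For instance, you can clone that repository and point the `HANS_DIR` variable used in the command below at the checkout (the clone location itself is up to you):

```bash
git clone https://github.com/tommccoy1/hans
```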

This is an example of using `run_hans.py`:

```bash
export HANS_DIR=path-to-hans
export MODEL_TYPE=type-of-the-model-e.g.-bert-roberta-xlnet-etc
export MODEL_PATH=path-to-the-model-directory-that-is-trained-on-NLI-e.g.-by-using-run_glue.py

python run_hans.py \
    --task_name hans \
    --model_type $MODEL_TYPE \
    --do_eval \
    --data_dir $HANS_DIR \
    --model_name_or_path $MODEL_PATH \
    --max_seq_length 128 \
    --output_dir $MODEL_PATH
```

This will create the `hans_predictions.txt` file in `MODEL_PATH`, which can then be evaluated using `hans/evaluate_heur_output.py` from the HANS dataset.

The results of a BERT-base model trained on MNLI with batch size 8 and random seed 42, evaluated on the HANS dataset, are as follows:

```
Heuristic entailed results:
lexical_overlap: 0.9702
subsequence: 0.9942
constituent: 0.9962

Heuristic non-entailed results:
lexical_overlap: 0.199
subsequence: 0.0396
constituent: 0.118
```
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/rag/parse_dpr_relevance_data.py:
--------------------------------------------------------------------------------
"""
This script reads DPR retriever training data and parses each datapoint. We save a line per datapoint.
Each line consists of the query followed by a tab-separated list of Wikipedia page titles constituting
positive contexts for a given query.
"""

import argparse
import json

from tqdm import tqdm


def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--src_path",
        type=str,
        default="biencoder-nq-dev.json",
        help="Path to raw DPR training data",
    )
    parser.add_argument(
        "--evaluation_set",
        type=str,
        help="where to store parsed evaluation_set file",
    )
    parser.add_argument(
        "--gold_data_path",
        type=str,
        help="where to store parsed gold_data_path file",
    )
    args = parser.parse_args()

    with open(args.src_path, "r") as src_file, open(args.evaluation_set, "w") as eval_file, open(
        args.gold_data_path, "w"
    ) as gold_file:
        dpr_records = json.load(src_file)
        for dpr_record in tqdm(dpr_records):
            question = dpr_record["question"]
            contexts = [context["title"] for context in dpr_record["positive_ctxs"]]
            eval_file.write(question + "\n")
            gold_file.write("\t".join(contexts) + "\n")


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/finetune_gpt2/utils/get_modified_files.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# this script reports modified .py files under the desired list of top-level sub-dirs passed as a list of arguments, e.g.:
# python ./utils/get_modified_files.py utils src tests examples
#
# it uses git to find the forking point and which files were modified - i.e. files not under git won't be considered
# since the output of this script is fed into Makefile commands it doesn't print a newline after the results

import re
import subprocess
import sys


fork_point_sha = subprocess.check_output("git merge-base master HEAD".split()).decode("utf-8")
modified_files = subprocess.check_output(f"git diff --name-only {fork_point_sha}".split()).decode("utf-8").split()

joined_dirs = "|".join(sys.argv[1:])
regex = re.compile(fr"^({joined_dirs}).*?\.py$")

relevant_modified_files = [x for x in modified_files if regex.match(x)]
print(" ".join(relevant_modified_files), end="")
--------------------------------------------------------------------------------
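
A quick illustration of how `get_modified_files.py` above is meant to be driven (the output line is hypothetical; real output depends on which files your branch modified relative to the `master` fork point):

```bash
# run from the repository root; matching .py paths are printed on one line without a trailing newline
python ./utils/get_modified_files.py utils src tests examples
# hypothetical output: src/transformers/trainer.py examples/seq2seq/finetune.py
```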