├── finetune_gpt2 ├── examples │ ├── benchmarking │ │ └── requirements.txt │ ├── research_projects │ │ ├── bertabs │ │ │ ├── __init__.py │ │ │ └── requirements.txt │ │ ├── deebert │ │ │ ├── src │ │ │ │ └── __init__.py │ │ │ ├── requirements.txt │ │ │ ├── eval_deebert.sh │ │ │ ├── entropy_eval.sh │ │ │ └── train_deebert.sh │ │ ├── bert-loses-patience │ │ │ ├── pabee │ │ │ │ └── __init__.py │ │ │ └── requirements.txt │ │ ├── bertology │ │ │ └── requirements.txt │ │ ├── adversarial │ │ │ ├── requirements.txt │ │ │ └── README.md │ │ ├── longform-qa │ │ │ ├── requirements.txt │ │ │ └── README.md │ │ ├── mlm_wwm │ │ │ └── requirements.txt │ │ ├── rag │ │ │ ├── __init__.py │ │ │ ├── requirements.txt │ │ │ ├── finetune_rag.sh │ │ │ ├── finetune_rag_ray.sh │ │ │ └── parse_dpr_relevance_data.py │ │ ├── pplm │ │ │ ├── imgs │ │ │ │ ├── wooly.png │ │ │ │ └── headfigure.png │ │ │ ├── requirements.txt │ │ │ └── pplm_classification_head.py │ │ ├── wav2vec2 │ │ │ ├── requirements.txt │ │ │ ├── finetune_base_100.sh │ │ │ ├── finetune_large_lv60_100.sh │ │ │ ├── finetune_base_timit_asr.sh │ │ │ ├── finetune_large_lv60_timit_asr.sh │ │ │ ├── finetune_wav2vec2_xlsr_turkish.sh │ │ │ ├── finetune_large_xlsr_53_arabic_speech_corpus.sh │ │ │ └── vocab │ │ │ │ └── buckwalter.json │ │ ├── distillation │ │ │ ├── requirements.txt │ │ │ └── training_configs │ │ │ │ ├── distilgpt2.json │ │ │ │ ├── distilbert-base-cased.json │ │ │ │ ├── distilbert-base-uncased.json │ │ │ │ ├── distilbert-base-multilingual-cased.json │ │ │ │ └── distilroberta-base.json │ │ ├── movement-pruning │ │ │ ├── emmental │ │ │ │ ├── modules │ │ │ │ │ └── __init__.py │ │ │ │ └── __init__.py │ │ │ └── requirements.txt │ │ ├── lxmert │ │ │ └── README.md │ │ ├── performer │ │ │ ├── full_script.sh │ │ │ ├── sanity_script.sh │ │ │ └── README.md │ │ ├── seq2seq-distillation │ │ │ ├── requirements.txt │ │ │ ├── finetune.sh │ │ │ ├── finetune_t5.sh │ │ │ ├── finetune_pegasus_xsum.sh │ │ │ ├── train_mbart_cc25_enro.sh │ │ │ ├── dynamic_bs_example.sh │ │ │ ├── sentence_splitter.py │ │ │ ├── distil_marian_no_teacher.sh │ │ │ ├── train_distilbart_cnn.sh │ │ │ ├── distil_marian_enro_teacher.sh │ │ │ ├── train_distilbart_xsum.sh │ │ │ └── finetune_bart_tiny.sh │ │ ├── mm-imdb │ │ │ └── README.md │ │ └── README.md │ ├── legacy │ │ ├── seq2seq │ │ │ ├── test_data │ │ │ │ ├── test_data │ │ │ │ ├── wmt_en_ro │ │ │ │ │ ├── val.len │ │ │ │ │ └── train.len │ │ │ │ └── fsmt │ │ │ │ │ └── build-eval-data.py │ │ │ ├── __init__.py │ │ │ ├── requirements.txt │ │ │ ├── finetune.sh │ │ │ ├── finetune_tpu.sh │ │ │ ├── minify_dataset.py │ │ │ ├── rouge_cli.py │ │ │ ├── sentence_splitter.py │ │ │ ├── convert_model_to_fp16.py │ │ │ ├── old_test_tatoeba_conversion.py │ │ │ └── train_mbart_cc25_enro.sh │ │ ├── pytorch-lightning │ │ │ ├── requirements.txt │ │ │ ├── run_glue.sh │ │ │ └── run_pos.sh │ │ ├── README.md │ │ └── token-classification │ │ │ ├── run_chunk.sh │ │ │ ├── run_pos.sh │ │ │ └── scripts │ │ │ └── preprocess.py │ ├── question-answering │ │ └── requirements.txt │ ├── multiple-choice │ │ └── requirements.txt │ ├── text-generation │ │ ├── requirements.txt │ │ └── README.md │ ├── token-classification │ │ ├── requirements.txt │ │ └── run.sh │ ├── language-modeling │ │ └── requirements.txt │ ├── text-classification │ │ └── requirements.txt │ ├── seq2seq │ │ └── requirements.txt │ ├── _tests_requirements.txt │ └── tests │ │ └── deepspeed │ │ └── ds_config.json ├── src │ ├── transformers │ │ ├── benchmark │ │ │ └── __init__.py │ │ ├── models │ │ │ ├── dialogpt │ │ │ │ └── 
__init__.py │ │ │ └── xlm_prophetnet │ │ │ │ ├── configuration_xlm_prophetnet.py │ │ │ │ └── __init__.py │ │ ├── sagemaker │ │ │ └── __init__.py │ │ ├── commands │ │ │ └── __init__.py │ │ ├── data │ │ │ ├── datasets │ │ │ │ └── __init__.py │ │ │ ├── processors │ │ │ │ └── __init__.py │ │ │ └── __init__.py │ │ └── utils │ │ │ └── dummy_flax_objects.py │ └── transformers.egg-info │ │ ├── dependency_links.txt │ │ ├── top_level.txt │ │ └── entry_points.txt ├── infinite_memory_transformer_sticky_mem │ └── config.json ├── infinite_memory_transformer │ └── config.json └── utils │ └── get_modified_files.py ├── document_grounded_generation ├── transformers │ ├── tests │ │ ├── __init__.py │ │ ├── test_pipelines_text2text_generation.py │ │ ├── test_pipelines_feature_extraction.py │ │ ├── test_pipelines_sentiment_analysis.py │ │ ├── test_cli.py │ │ └── test_activations_tf.py │ ├── MANIFEST.in │ ├── examples │ │ ├── benchmarking │ │ │ └── requirements.txt │ │ ├── research_projects │ │ │ ├── bertabs │ │ │ │ ├── __init__.py │ │ │ │ └── requirements.txt │ │ │ ├── deebert │ │ │ │ ├── src │ │ │ │ │ └── __init__.py │ │ │ │ ├── requirements.txt │ │ │ │ ├── eval_deebert.sh │ │ │ │ ├── entropy_eval.sh │ │ │ │ └── train_deebert.sh │ │ │ ├── bert-loses-patience │ │ │ │ ├── pabee │ │ │ │ │ └── __init__.py │ │ │ │ └── requirements.txt │ │ │ ├── adversarial │ │ │ │ └── requirements.txt │ │ │ ├── bertology │ │ │ │ └── requirements.txt │ │ │ ├── longform-qa │ │ │ │ ├── requirements.txt │ │ │ │ └── README.md │ │ │ ├── mlm_wwm │ │ │ │ └── requirements.txt │ │ │ ├── rag │ │ │ │ ├── __init__.py │ │ │ │ ├── requirements.txt │ │ │ │ ├── finetune_rag.sh │ │ │ │ ├── finetune_rag_ray.sh │ │ │ │ └── parse_dpr_relevance_data.py │ │ │ ├── wav2vec2 │ │ │ │ ├── requirements.txt │ │ │ │ ├── finetune_base_100.sh │ │ │ │ ├── finetune_large_lv60_100.sh │ │ │ │ ├── finetune_base_timit_asr.sh │ │ │ │ ├── finetune_large_lv60_timit_asr.sh │ │ │ │ ├── finetune_wav2vec2_xlsr_turkish.sh │ │ │ │ ├── finetune_large_xlsr_53_arabic_speech_corpus.sh │ │ │ │ └── vocab │ │ │ │ │ └── buckwalter.json │ │ │ ├── distillation │ │ │ │ ├── requirements.txt │ │ │ │ └── training_configs │ │ │ │ │ ├── distilgpt2.json │ │ │ │ │ ├── distilbert-base-cased.json │ │ │ │ │ ├── distilbert-base-uncased.json │ │ │ │ │ ├── distilbert-base-multilingual-cased.json │ │ │ │ │ └── distilroberta-base.json │ │ │ ├── pplm │ │ │ │ ├── imgs │ │ │ │ │ ├── wooly.png │ │ │ │ │ └── headfigure.png │ │ │ │ ├── requirements.txt │ │ │ │ └── pplm_classification_head.py │ │ │ ├── movement-pruning │ │ │ │ ├── emmental │ │ │ │ │ ├── modules │ │ │ │ │ │ └── __init__.py │ │ │ │ │ └── __init__.py │ │ │ │ └── requirements.txt │ │ │ ├── lxmert │ │ │ │ └── README.md │ │ │ ├── performer │ │ │ │ ├── full_script.sh │ │ │ │ ├── sanity_script.sh │ │ │ │ └── README.md │ │ │ ├── seq2seq-distillation │ │ │ │ ├── requirements.txt │ │ │ │ ├── finetune.sh │ │ │ │ ├── finetune_t5.sh │ │ │ │ ├── finetune_pegasus_xsum.sh │ │ │ │ ├── train_mbart_cc25_enro.sh │ │ │ │ ├── dynamic_bs_example.sh │ │ │ │ ├── sentence_splitter.py │ │ │ │ ├── distil_marian_no_teacher.sh │ │ │ │ ├── train_distilbart_cnn.sh │ │ │ │ ├── distil_marian_enro_teacher.sh │ │ │ │ ├── train_distilbart_xsum.sh │ │ │ │ └── finetune_bart_tiny.sh │ │ │ ├── mm-imdb │ │ │ │ └── README.md │ │ │ └── README.md │ │ ├── legacy │ │ │ ├── seq2seq │ │ │ │ ├── test_data │ │ │ │ │ ├── test_data │ │ │ │ │ ├── wmt_en_ro │ │ │ │ │ │ ├── val.len │ │ │ │ │ │ └── train.len │ │ │ │ │ └── fsmt │ │ │ │ │ │ └── build-eval-data.py │ │ │ │ ├── __init__.py │ │ │ │ ├── 
requirements.txt │ │ │ │ ├── finetune.sh │ │ │ │ ├── finetune_tpu.sh │ │ │ │ ├── minify_dataset.py │ │ │ │ ├── rouge_cli.py │ │ │ │ ├── sentence_splitter.py │ │ │ │ ├── convert_model_to_fp16.py │ │ │ │ ├── train_mbart_cc25_enro.sh │ │ │ │ └── old_test_tatoeba_conversion.py │ │ │ ├── pytorch-lightning │ │ │ │ ├── requirements.txt │ │ │ │ ├── run_glue.sh │ │ │ │ └── run_pos.sh │ │ │ ├── README.md │ │ │ └── token-classification │ │ │ │ ├── run_pos.sh │ │ │ │ ├── run_chunk.sh │ │ │ │ └── scripts │ │ │ │ └── preprocess.py │ │ ├── question-answering │ │ │ └── requirements.txt │ │ ├── token-classification │ │ │ ├── requirements.txt │ │ │ └── run.sh │ │ ├── multiple-choice │ │ │ └── requirements.txt │ │ ├── text-generation │ │ │ ├── requirements.txt │ │ │ └── README.md │ │ ├── language-modeling │ │ │ └── requirements.txt │ │ ├── text-classification │ │ │ └── requirements.txt │ │ ├── seq2seq │ │ │ └── requirements.txt │ │ ├── _tests_requirements.txt │ │ └── tests │ │ │ └── deepspeed │ │ │ └── ds_config.json │ ├── src │ │ └── transformers │ │ │ ├── benchmark │ │ │ └── __init__.py │ │ │ ├── models │ │ │ ├── dialogpt │ │ │ │ └── __init__.py │ │ │ ├── gpt2 │ │ │ │ └── pre_process_wmt19.py │ │ │ └── xlm_prophetnet │ │ │ │ ├── configuration_xlm_prophetnet.py │ │ │ │ └── __init__.py │ │ │ ├── sagemaker │ │ │ └── __init__.py │ │ │ ├── commands │ │ │ └── __init__.py │ │ │ ├── data │ │ │ ├── datasets │ │ │ │ └── __init__.py │ │ │ ├── processors │ │ │ │ └── __init__.py │ │ │ └── __init__.py │ │ │ └── utils │ │ │ └── dummy_flax_objects.py │ ├── docs │ │ ├── source │ │ │ ├── contributing.md │ │ │ ├── examples.md │ │ │ ├── notebooks.md │ │ │ ├── favicon.ico │ │ │ ├── imgs │ │ │ │ ├── ppl_full.gif │ │ │ │ ├── ppl_chunked.gif │ │ │ │ ├── ppl_sliding.gif │ │ │ │ ├── local_attention_mask.png │ │ │ │ ├── transformers_logo_name.png │ │ │ │ ├── transformers_overview.png │ │ │ │ ├── warmup_cosine_schedule.png │ │ │ │ ├── warmup_linear_schedule.png │ │ │ │ ├── warmup_constant_schedule.png │ │ │ │ ├── warmup_cosine_hard_restarts_schedule.png │ │ │ │ └── warmup_cosine_warm_restarts_schedule.png │ │ │ ├── _static │ │ │ │ └── css │ │ │ │ │ ├── Calibre-Light.ttf │ │ │ │ │ ├── Calibre-Thin.otf │ │ │ │ │ ├── Calibre-Medium.otf │ │ │ │ │ ├── Calibre-Regular.otf │ │ │ │ │ └── code-snippets.css │ │ │ └── main_classes │ │ │ │ └── configuration.rst │ │ └── Makefile │ ├── .gitattributes │ ├── pyproject.toml │ ├── .github │ │ ├── conda │ │ │ ├── build.sh │ │ │ └── meta.yaml │ │ ├── ISSUE_TEMPLATE │ │ │ ├── ---new-benchmark.md │ │ │ ├── --new-model-addition.md │ │ │ ├── question-help.md │ │ │ └── feature-request.md │ │ └── workflows │ │ │ ├── stale.yml │ │ │ ├── release-conda.yml │ │ │ └── github-torch-hub.yml │ ├── templates │ │ ├── adding_a_new_model │ │ │ ├── open_model_proposals │ │ │ │ └── README.md │ │ │ ├── tests │ │ │ │ ├── pt-seq-2-seq-bart-tokenizer.json │ │ │ │ ├── encoder-bert-tokenizer.json │ │ │ │ ├── pt-encoder-bert-tokenizer.json │ │ │ │ ├── standalone.json │ │ │ │ ├── tf-encoder-bert-tokenizer.json │ │ │ │ └── tf-seq-2-seq-bart-tokenizer.json │ │ │ ├── cookiecutter.json │ │ │ └── cookiecutter-template-{{cookiecutter.modelname}} │ │ │ │ └── configuration.json │ │ └── adding_a_new_example_script │ │ │ └── cookiecutter.json │ ├── .coveragerc │ ├── docker │ │ ├── transformers-pytorch-tpu │ │ │ ├── docker-entrypoint.sh │ │ │ ├── dataset.yaml │ │ │ └── bert-base-cased.jsonnet │ │ ├── transformers-pytorch-cpu │ │ │ └── Dockerfile │ │ ├── transformers-tensorflow-cpu │ │ │ └── Dockerfile │ │ ├── transformers-cpu │ │ │ └── 
Dockerfile │ │ ├── transformers-tensorflow-gpu │ │ │ └── Dockerfile │ │ ├── transformers-pytorch-gpu │ │ │ └── Dockerfile │ │ └── transformers-gpu │ │ │ └── Dockerfile │ ├── scripts │ │ ├── tatoeba │ │ │ └── upload_models.sh │ │ ├── fsmt │ │ │ └── tests-to-run.sh │ │ └── pegasus │ │ │ └── build_test_sample_spm_no_bos.py │ ├── setup.cfg │ └── model_cards │ │ └── README.md ├── requirements.txt └── test_special_tokens.py ├── requirements.txt └── sorting ├── run_sort_inftyformer.sh └── utils └── exp_utils.py /finetune_gpt2/examples/benchmarking/requirements.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finetune_gpt2/src/transformers/benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finetune_gpt2/src/transformers/models/dialogpt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/bertabs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/deebert/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finetune_gpt2/src/transformers.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | -------------------------------------------------------------------------------- /finetune_gpt2/src/transformers.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/test_data/test_data: -------------------------------------------------------------------------------- 1 | seq2seq/test_data -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/bert-loses-patience/pabee/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/benchmarking/requirements.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/src/transformers/benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /finetune_gpt2/examples/question-answering/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.2.1 2 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/bertabs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/src/transformers/models/dialogpt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/contributing.md: -------------------------------------------------------------------------------- 1 | ../../CONTRIBUTING.md -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/examples.md: -------------------------------------------------------------------------------- 1 | ../../examples/README.md -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/notebooks.md: -------------------------------------------------------------------------------- 1 | ../../notebooks/README.md -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/deebert/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/src/transformers/models/gpt2/pre_process_wmt19.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/bertology/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 2 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/deebert/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 2 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/multiple-choice/requirements.txt: -------------------------------------------------------------------------------- 1 | sentencepiece != 0.1.92 2 | protobuf 3 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/adversarial/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 2 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/bert-loses-patience/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 -------------------------------------------------------------------------------- /finetune_gpt2/examples/text-generation/requirements.txt: 
-------------------------------------------------------------------------------- 1 | sentencepiece != 0.1.92 2 | protobuf 3 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/token-classification/requirements.txt: -------------------------------------------------------------------------------- 1 | seqeval 2 | datasets >= 1.1.3 3 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/.gitattributes: -------------------------------------------------------------------------------- 1 | *.py eol=lf 2 | *.rst eol=lf 3 | *.md eol=lf -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/test_data/test_data: -------------------------------------------------------------------------------- 1 | seq2seq/test_data -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/question-answering/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.2.1 2 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/bert-loses-patience/pabee/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/adversarial/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 2 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/bertology/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 2 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/deebert/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 2 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/token-classification/requirements.txt: -------------------------------------------------------------------------------- 1 | seqeval 2 | datasets >= 1.1.3 3 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/language-modeling/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.1.3 2 | sentencepiece != 0.1.92 3 | protobuf 4 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/multiple-choice/requirements.txt: -------------------------------------------------------------------------------- 1 | sentencepiece != 0.1.92 2 | protobuf 3 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/bert-loses-patience/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 
-------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/text-generation/requirements.txt: -------------------------------------------------------------------------------- 1 | sentencepiece != 0.1.92 2 | protobuf 3 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 119 3 | target-version = ['py35'] 4 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/.github/conda/build.sh: -------------------------------------------------------------------------------- 1 | $PYTHON setup.py install # Python command to install the script. 2 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/bertabs/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 2 | 3 | # For ROUGE 4 | nltk 5 | py-rouge 6 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/longform-qa/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.1.3 2 | faiss-cpu 3 | streamlit 4 | elasticsearch 5 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/mlm_wwm/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.1.3 2 | sentencepiece != 0.1.92 3 | protobuf 4 | ltp 5 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/text-classification/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | datasets >= 1.1.3 3 | sentencepiece != 0.1.92 4 | protobuf 5 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/language-modeling/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.1.3 2 | sentencepiece != 0.1.92 3 | protobuf 4 | -------------------------------------------------------------------------------- /finetune_gpt2/src/transformers.egg-info/entry_points.txt: -------------------------------------------------------------------------------- 1 | [console_scripts] 2 | transformers-cli = transformers.commands.transformers_cli:main 3 | 4 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | sys.path.insert(1, os.path.dirname(os.path.realpath(__file__))) 6 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/seq2seq/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.1.3 2 | sentencepiece != 0.1.92 3 | protobuf 4 | sacrebleu >= 1.4.12 5 | rouge-score 6 | nltk -------------------------------------------------------------------------------- 
/document_grounded_generation/transformers/examples/research_projects/bertabs/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 2 | 3 | # For ROUGE 4 | nltk 5 | py-rouge 6 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/rag/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | sys.path.insert(1, os.path.dirname(os.path.realpath(__file__))) 6 | -------------------------------------------------------------------------------- /document_grounded_generation/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | pytorch-ignite 3 | transformers==2.5.1 4 | tensorboardX==1.8 5 | tensorflow # for tensorboardX 6 | spacy 7 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/longform-qa/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.1.3 2 | faiss-cpu 3 | streamlit 4 | elasticsearch 5 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/mlm_wwm/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.1.3 2 | sentencepiece != 0.1.92 3 | protobuf 4 | ltp 5 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/text-classification/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | datasets >= 1.1.3 3 | sentencepiece != 0.1.92 4 | protobuf 5 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/seq2seq/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.1.3 2 | sentencepiece != 0.1.92 3 | protobuf 4 | sacrebleu >= 1.4.12 5 | rouge-score 6 | nltk -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/pplm/imgs/wooly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/finetune_gpt2/examples/research_projects/pplm/imgs/wooly.png -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | sys.path.insert(1, os.path.dirname(os.path.realpath(__file__))) 6 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/wav2vec2/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | datasets 3 | torch>=1.5.0 4 | torchaudio 5 | jiwer==2.2.0 6 | lang-trans==0.6.0 7 | librosa==0.8.0 8 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/favicon.ico: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/favicon.ico -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/rag/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | sys.path.insert(1, os.path.dirname(os.path.realpath(__file__))) 6 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/test_data/wmt_en_ro/val.len: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/finetune_gpt2/examples/legacy/seq2seq/test_data/wmt_en_ro/val.len -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/pplm/imgs/headfigure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/finetune_gpt2/examples/research_projects/pplm/imgs/headfigure.png -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/rag/requirements.txt: -------------------------------------------------------------------------------- 1 | faiss-cpu >= 1.6.3 2 | datasets >= 1.0.1 3 | psutil >= 5.7.0 4 | torch >= 1.4.0 5 | transformers 6 | pytorch-lightning==1.0.4 7 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/test_data/wmt_en_ro/train.len: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/finetune_gpt2/examples/legacy/seq2seq/test_data/wmt_en_ro/train.len -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/distillation/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | 3 | gitpython==3.0.2 4 | tensorboard>=1.14.0 5 | tensorboardX==1.8 6 | psutil==5.6.6 7 | scipy>=1.4.1 8 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/imgs/ppl_full.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/ppl_full.gif -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/imgs/ppl_chunked.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/ppl_chunked.gif -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/imgs/ppl_sliding.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/ppl_sliding.gif -------------------------------------------------------------------------------- 
/document_grounded_generation/transformers/examples/research_projects/wav2vec2/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | datasets 3 | torch>=1.5.0 4 | torchaudio 5 | jiwer==2.2.0 6 | lang-trans==0.6.0 7 | librosa==0.8.0 8 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/templates/adding_a_new_model/open_model_proposals/README.md: -------------------------------------------------------------------------------- 1 | Currently the following model proposals are available: 2 | 3 | - [BigBird (Google)](./ADD_BIG_BIRD.md) 4 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/distillation/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | 3 | gitpython==3.0.2 4 | tensorboard>=1.14.0 5 | tensorboardX==1.8 6 | psutil==5.6.6 7 | scipy>=1.4.1 8 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/rag/requirements.txt: -------------------------------------------------------------------------------- 1 | faiss-cpu >= 1.6.3 2 | datasets >= 1.0.1 3 | psutil >= 5.7.0 4 | torch >= 1.4.0 5 | transformers 6 | pytorch-lightning==1.0.4 7 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/_static/css/Calibre-Light.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/_static/css/Calibre-Light.ttf -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/_static/css/Calibre-Thin.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/_static/css/Calibre-Thin.otf -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/imgs/local_attention_mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/local_attention_mask.png -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/movement-pruning/emmental/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .binarizer import MagnitudeBinarizer, ThresholdBinarizer, TopKBinarizer 3 | from .masked_nn import MaskedLinear 4 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/_static/css/Calibre-Medium.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/_static/css/Calibre-Medium.otf -------------------------------------------------------------------------------- 
/document_grounded_generation/transformers/docs/source/_static/css/Calibre-Regular.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/_static/css/Calibre-Regular.otf -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/imgs/transformers_logo_name.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/transformers_logo_name.png -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/imgs/transformers_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/transformers_overview.png -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/imgs/warmup_cosine_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/warmup_cosine_schedule.png -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/imgs/warmup_linear_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/warmup_linear_schedule.png -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/imgs/warmup_constant_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/warmup_constant_schedule.png -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/pplm/imgs/wooly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/examples/research_projects/pplm/imgs/wooly.png -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/test_data/wmt_en_ro/val.len: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/examples/legacy/seq2seq/test_data/wmt_en_ro/val.len -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/pplm/imgs/headfigure.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/examples/research_projects/pplm/imgs/headfigure.png -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/test_data/wmt_en_ro/train.len: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/examples/legacy/seq2seq/test_data/wmt_en_ro/train.len -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/movement-pruning/emmental/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .binarizer import MagnitudeBinarizer, ThresholdBinarizer, TopKBinarizer 3 | from .masked_nn import MaskedLinear 4 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/lxmert/README.md: -------------------------------------------------------------------------------- 1 | # LXMERT DEMO 2 | 3 | 1. make a virtualenv: ``virtualenv venv`` and activate ``source venv/bin/activate`` 4 | 2. install reqs: ``pip install -r ./requirements.txt`` 5 | 3. usage is as shown in demo.ipynb 6 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/movement-pruning/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.4.0 2 | -e git+https://github.com/huggingface/transformers.git@352d5472b0c1dec0f420d606d16747d851b4bda8#egg=transformers 3 | knockknock>=0.1.8.1 4 | h5py>=2.10.0 5 | numpy>=1.18.2 6 | scipy>=1.4.1 7 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/lxmert/README.md: -------------------------------------------------------------------------------- 1 | # LXMERT DEMO 2 | 3 | 1. make a virtualenv: ``virtualenv venv`` and activate ``source venv/bin/activate`` 4 | 2. install reqs: ``pip install -r ./requirements.txt`` 5 | 3. 
usage is as shown in demo.ipynb 6 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/distillation/training_configs/distilgpt2.json: -------------------------------------------------------------------------------- 1 | { 2 | "initializer_range": 0.02, 3 | "layer_norm_epsilon": 0.00001, 4 | "n_ctx": 1024, 5 | "n_embd": 768, 6 | "n_head": 12, 7 | "n_layer": 6, 8 | "n_positions": 1024, 9 | "vocab_size": 50257 10 | } -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/movement-pruning/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.4.0 2 | -e git+https://github.com/huggingface/transformers.git@352d5472b0c1dec0f420d606d16747d851b4bda8#egg=transformers 3 | knockknock>=0.1.8.1 4 | h5py>=2.10.0 5 | numpy>=1.18.2 6 | scipy>=1.4.1 7 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source=transformers 3 | omit = 4 | # skip convertion scripts from testing for now 5 | */convert_* 6 | */__main__.py 7 | [report] 8 | exclude_lines = 9 | pragma: no cover 10 | raise 11 | except 12 | register_parameter -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/distillation/training_configs/distilgpt2.json: -------------------------------------------------------------------------------- 1 | { 2 | "initializer_range": 0.02, 3 | "layer_norm_epsilon": 0.00001, 4 | "n_ctx": 1024, 5 | "n_embd": 768, 6 | "n_head": 12, 7 | "n_layer": 6, 8 | "n_positions": 1024, 9 | "vocab_size": 50257 10 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | click==8.0.3 2 | gensim==3.8.3 3 | ignite==1.1.0 4 | matplotlib==3.4.3 5 | numpy==1.21.3 6 | pytorch-ignite==0.4.7 7 | pytorch-lightning==1.6.0 8 | rouge-score==0.0.4 9 | sacrebleu==2.0.0 10 | scikit-learn==1.0.1 11 | scipy==1.7.1 12 | tensorboard==2.9.0 13 | tensorboardX==1.8 14 | torch==1.9.0 15 | tqdm==4.62.3 16 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docker/transformers-pytorch-tpu/docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ~/.bashrc 3 | echo "running docker-entrypoint.sh" 4 | conda activate container 5 | echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS 6 | echo "printed TPU info" 7 | export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}" 8 | exec "$@"#!/bin/bash 9 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | seqeval 4 | psutil 5 | sacrebleu 6 | rouge-score 7 | tensorflow_datasets 8 | matplotlib 9 | git-python==1.0.3 10 | faiss-cpu 11 | streamlit 12 | elasticsearch 13 | nltk 14 | pandas 15 | datasets >= 1.1.3 16 | fire 17 | pytest 18 | conllu 19 | sentencepiece != 0.1.92 20 | protobuf 21 | 
-------------------------------------------------------------------------------- /finetune_gpt2/examples/_tests_requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | seqeval 4 | psutil 5 | sacrebleu >= 1.4.12 6 | rouge-score 7 | tensorflow_datasets 8 | matplotlib 9 | git-python==1.0.3 10 | faiss-cpu 11 | streamlit 12 | elasticsearch 13 | nltk 14 | pandas 15 | datasets >= 1.1.3 16 | fire 17 | pytest 18 | conllu 19 | sentencepiece != 0.1.92 20 | protobuf 21 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/_tests_requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | seqeval 4 | psutil 5 | sacrebleu >= 1.4.12 6 | rouge-score 7 | tensorflow_datasets 8 | matplotlib 9 | git-python==1.0.3 10 | faiss-cpu 11 | streamlit 12 | elasticsearch 13 | nltk 14 | pandas 15 | datasets >= 1.1.3 16 | fire 17 | pytest 18 | conllu 19 | sentencepiece != 0.1.92 20 | protobuf 21 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | seqeval 4 | psutil 5 | sacrebleu 6 | rouge-score 7 | tensorflow_datasets 8 | matplotlib 9 | git-python==1.0.3 10 | faiss-cpu 11 | streamlit 12 | elasticsearch 13 | nltk 14 | pandas 15 | datasets >= 1.1.3 16 | fire 17 | pytest 18 | conllu 19 | sentencepiece != 0.1.92 20 | protobuf 21 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/performer/full_script.sh: -------------------------------------------------------------------------------- 1 | TOKENIZERS_PARALLELISM=true python run_mlm_performer.py --output_dir experiments --dataset_name wikipedia --dataset_config_name 20200501.en --model_name_or_path bert-large-cased --tokenizer_name bert-large-cased --do_train --overwrite_output_dir --per_device_train_batch_size 4 --learning_rate 5e-4 --warmup_steps 100 --num_train_epochs 3 --performer -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/performer/sanity_script.sh: -------------------------------------------------------------------------------- 1 | TOKENIZERS_PARALLELISM=true python run_mlm_performer.py --output_dir experiments --dataset_name wikipedia --dataset_config_name 20200501.simple --model_name_or_path bert-base-cased --tokenizer_name bert-base-cased --do_train --overwrite_output_dir --per_device_train_batch_size 4 --learning_rate 5e-4 --warmup_steps 100 --num_train_epochs 3 --performer -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/seq2seq-distillation/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | psutil 4 | sacrebleu 5 | rouge-score 6 | tensorflow_datasets 7 | pytorch-lightning==1.0.4 8 | matplotlib 9 | git-python==1.0.3 10 | faiss-cpu 11 | streamlit 12 | elasticsearch 13 | nltk 14 | pandas 15 | datasets >= 1.1.3 16 | fire 17 | pytest 18 | conllu 19 | sentencepiece != 0.1.92 20 | protobuf 21 | -------------------------------------------------------------------------------- 
/document_grounded_generation/transformers/scripts/tatoeba/upload_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for FILE in converted/*; do 4 | model_name=`basename $FILE` 5 | transformers-cli repo create $model_name -y 6 | git clone https://huggingface.co/Helsinki-NLP/$model_name 7 | mv $FILE/* $model_name/ 8 | cd $model_name 9 | git add . && git commit -m "initial commit" 10 | git push 11 | cd .. 12 | done 13 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/pytorch-lightning/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | seqeval 4 | psutil 5 | sacrebleu 6 | rouge-score 7 | tensorflow_datasets 8 | pytorch-lightning==1.0.4 9 | matplotlib 10 | git-python==1.0.3 11 | faiss-cpu 12 | streamlit 13 | elasticsearch 14 | nltk 15 | pandas 16 | datasets >= 1.1.3 17 | fire 18 | pytest 19 | conllu 20 | sentencepiece != 0.1.92 21 | protobuf 22 | ray 23 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/movement-pruning/emmental/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .configuration_bert_masked import MaskedBertConfig 3 | from .modeling_bert_masked import ( 4 | MaskedBertForMultipleChoice, 5 | MaskedBertForQuestionAnswering, 6 | MaskedBertForSequenceClassification, 7 | MaskedBertForTokenClassification, 8 | MaskedBertModel, 9 | ) 10 | from .modules import * 11 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/performer/full_script.sh: -------------------------------------------------------------------------------- 1 | TOKENIZERS_PARALLELISM=true python run_mlm_performer.py --output_dir experiments --dataset_name wikipedia --dataset_config_name 20200501.en --model_name_or_path bert-large-cased --tokenizer_name bert-large-cased --do_train --overwrite_output_dir --per_device_train_batch_size 4 --learning_rate 5e-4 --warmup_steps 100 --num_train_epochs 3 --performer -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/performer/sanity_script.sh: -------------------------------------------------------------------------------- 1 | TOKENIZERS_PARALLELISM=true python run_mlm_performer.py --output_dir experiments --dataset_name wikipedia --dataset_config_name 20200501.simple --model_name_or_path bert-base-cased --tokenizer_name bert-base-cased --do_train --overwrite_output_dir --per_device_train_batch_size 4 --learning_rate 5e-4 --warmup_steps 100 --num_train_epochs 3 --performer -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/distillation/training_configs/distilbert-base-cased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 | "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true, 13 | "vocab_size": 28996 14 | } 15 | -------------------------------------------------------------------------------- 
/finetune_gpt2/examples/research_projects/distillation/training_configs/distilbert-base-uncased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 | "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true, 13 | "vocab_size": 30522 14 | } 15 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/pplm/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | seqeval 4 | psutil 5 | sacrebleu 6 | rouge-score 7 | tensorflow_datasets 8 | pytorch-lightning==1.0.4 9 | matplotlib 10 | git-python==1.0.3 11 | faiss-cpu 12 | streamlit 13 | elasticsearch 14 | nltk 15 | pandas 16 | datasets >= 1.1.3 17 | fire 18 | pytest 19 | conllu 20 | sentencepiece != 0.1.92 21 | protobuf 22 | transformers==3.5.1 23 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | psutil 4 | sacrebleu 5 | rouge-score 6 | tensorflow_datasets 7 | pytorch-lightning==1.0.4 8 | matplotlib 9 | git-python==1.0.3 10 | faiss-cpu 11 | streamlit 12 | elasticsearch 13 | nltk 14 | pandas 15 | datasets >= 1.1.3 16 | fire 17 | pytest 18 | conllu 19 | sentencepiece != 0.1.92 20 | protobuf 21 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/_static/css/code-snippets.css: -------------------------------------------------------------------------------- 1 | 2 | .highlight .c1, .highlight .sd{ 3 | color: #999 4 | } 5 | 6 | .highlight .nn, .highlight .k, .highlight .s1, .highlight .nb, .highlight .bp, .highlight .kc { 7 | color: #FB8D68; 8 | } 9 | 10 | .highlight .kn, .highlight .nv, .highlight .s2, .highlight .ow { 11 | color: #6670FF; 12 | } 13 | 14 | .highlight .gp { 15 | color: #FB8D68; 16 | } -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/movement-pruning/emmental/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .configuration_bert_masked import MaskedBertConfig 3 | from .modeling_bert_masked import ( 4 | MaskedBertForMultipleChoice, 5 | MaskedBertForQuestionAnswering, 6 | MaskedBertForSequenceClassification, 7 | MaskedBertForTokenClassification, 8 | MaskedBertModel, 9 | ) 10 | from .modules import * 11 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/templates/adding_a_new_example_script/cookiecutter.json: -------------------------------------------------------------------------------- 1 | { 2 | "example_name": "text classification", 3 | "directory_name": "{{cookiecutter.example_name|lower|replace(' ', '-')}}", 4 | "example_shortcut": "{{cookiecutter.directory_name}}", 5 | "model_class": "AutoModel", 6 | "authors": "The HuggingFace Team", 7 | "can_train_from_scratch": ["True", "False"], 8 | "with_trainer": ["True", "False"] 9 | } 
-------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/pytorch-lightning/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | seqeval 4 | psutil 5 | sacrebleu 6 | rouge-score 7 | tensorflow_datasets 8 | pytorch-lightning==1.0.4 9 | matplotlib 10 | git-python==1.0.3 11 | faiss-cpu 12 | streamlit 13 | elasticsearch 14 | nltk 15 | pandas 16 | datasets >= 1.1.3 17 | fire 18 | pytest 19 | conllu 20 | sentencepiece != 0.1.92 21 | protobuf 22 | ray 23 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/distillation/training_configs/distilbert-base-multilingual-cased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 | "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true, 13 | "vocab_size": 119547 14 | } 15 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/distillation/training_configs/distilbert-base-cased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 | "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true, 13 | "vocab_size": 28996 14 | } 15 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/pplm/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | seqeval 4 | psutil 5 | sacrebleu 6 | rouge-score 7 | tensorflow_datasets 8 | pytorch-lightning==1.0.4 9 | matplotlib 10 | git-python==1.0.3 11 | faiss-cpu 12 | streamlit 13 | elasticsearch 14 | nltk 15 | pandas 16 | datasets >= 1.1.3 17 | fire 18 | pytest 19 | conllu 20 | sentencepiece != 0.1.92 21 | protobuf 22 | transformers==3.5.1 23 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/seq2seq-distillation/finetune.sh: -------------------------------------------------------------------------------- 1 | # the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path 2 | # run ./finetune.sh --help to see all the possible options 3 | python finetune.py \ 4 | --learning_rate=3e-5 \ 5 | --fp16 \ 6 | --gpus 1 \ 7 | --do_train \ 8 | --do_predict \ 9 | --n_val 1000 \ 10 | --val_check_interval 0.1 \ 11 | "$@" 12 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/distillation/training_configs/distilbert-base-uncased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 
| "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true, 13 | "vocab_size": 30522 14 | } 15 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/distillation/training_configs/distilbert-base-multilingual-cased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 | "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true, 13 | "vocab_size": 119547 14 | } 15 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/finetune.sh: -------------------------------------------------------------------------------- 1 | # the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path 2 | # run ./finetune.sh --help to see all the possible options 3 | python finetune.py \ 4 | --learning_rate=3e-5 \ 5 | --fp16 \ 6 | --gpus 1 \ 7 | --do_train \ 8 | --do_predict \ 9 | --n_val 1000 \ 10 | --val_check_interval 0.1 \ 11 | "$@" 12 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/seq2seq-distillation/finetune_t5.sh: -------------------------------------------------------------------------------- 1 | # Add parent directory to python path to access lightning_base.py 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | python finetune.py \ 5 | --data_dir=$CNN_DIR \ 6 | --learning_rate=3e-5 \ 7 | --train_batch_size=$BS \ 8 | --eval_batch_size=$BS \ 9 | --output_dir=$OUTPUT_DIR \ 10 | --max_source_length=512 \ 11 | --max_target_length=56 \ 12 | --val_check_interval=0.1 --n_val=200 \ 13 | --do_train --do_predict \ 14 | "$@" 15 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "modelname": "NewENCDEC", 3 | "uppercase_modelname": "NEW_ENC_DEC", 4 | "lowercase_modelname": "new_enc_dec", 5 | "camelcase_modelname": "NewEncDec", 6 | "authors": "The HuggingFace Team", 7 | "checkpoint_identifier": "new-enc-dec-base", 8 | "tokenizer_type": "Based on BART", 9 | "generate_tensorflow_and_pytorch": "PyTorch", 10 | "is_encoder_decoder_model": "True" 11 | } 12 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/distillation/training_configs/distilroberta-base.json: -------------------------------------------------------------------------------- 1 | { 2 | "vocab_size": 50265, 3 | "hidden_size": 768, 4 | "num_hidden_layers": 6, 5 | "num_attention_heads": 12, 6 | "intermediate_size": 3072, 7 | "hidden_act": "gelu", 8 | "hidden_dropout_prob": 0.1, 9 | "attention_probs_dropout_prob": 0.1, 10 | "max_position_embeddings": 514, 11 | "type_vocab_size": 1, 12 | "initializer_range": 0.02, 13 | "layer_norm_eps": 0.00001 14 | } -------------------------------------------------------------------------------- 
/document_grounded_generation/transformers/templates/adding_a_new_model/tests/encoder-bert-tokenizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "modelname": "Template", 3 | "uppercase_modelname": "TEMPLATE", 4 | "lowercase_modelname": "template", 5 | "camelcase_modelname": "Template", 6 | "authors": "The HuggingFace Team", 7 | "checkpoint_identifier": "brand-new-bert-base-cased", 8 | "tokenizer_type": "Based on BERT", 9 | "generate_tensorflow_and_pytorch": "PyTorch & TensorFlow", 10 | "is_encoder_decoder_model": "False" 11 | } 12 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "modelname": "TemplatePT", 3 | "uppercase_modelname": "TEMPLATE_PT", 4 | "lowercase_modelname": "template_pt", 5 | "camelcase_modelname": "TemplatePt", 6 | "authors": "The HuggingFace Team", 7 | "checkpoint_identifier": "brand-new-bert-base-cased", 8 | "tokenizer_type": "Based on BERT", 9 | "generate_tensorflow_and_pytorch": "PyTorch", 10 | "is_encoder_decoder_model": "False" 11 | } 12 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/templates/adding_a_new_model/tests/standalone.json: -------------------------------------------------------------------------------- 1 | { 2 | "modelname": "TemplateBI", 3 | "uppercase_modelname": "TEMPLATE_BI", 4 | "lowercase_modelname": "template_bi", 5 | "camelcase_modelname": "TemplateBi", 6 | "authors": "The HuggingFace Team", 7 | "checkpoint_identifier": "bi-brand-new-bert-base-cased", 8 | "tokenizer_type": "Standalone", 9 | "generate_tensorflow_and_pytorch": "PyTorch & TensorFlow", 10 | "is_encoder_decoder_model": "False" 11 | } 12 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "modelname": "TemplateTF", 3 | "uppercase_modelname": "TEMPLATE_TF", 4 | "lowercase_modelname": "template_tf", 5 | "camelcase_modelname": "TemplateTf", 6 | "authors": "The HuggingFace Team", 7 | "checkpoint_identifier": "brand-new-bert-base-cased", 8 | "tokenizer_type": "Based on BERT", 9 | "generate_tensorflow_and_pytorch": "TensorFlow", 10 | "is_encoder_decoder_model": "False" 11 | } 12 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "modelname": "NewTFENCDEC", 3 | "uppercase_modelname": "NEW_TF_ENC_DEC", 4 | "lowercase_modelname": "new_tf_enc_dec", 5 | "camelcase_modelname": "NewTFEncDec", 6 | "authors": "The HuggingFace Team", 7 | "checkpoint_identifier": "new-tf-enc-dec-base", 8 | "tokenizer_type": "Based on BART", 9 | "generate_tensorflow_and_pytorch": "TensorFlow", 10 | "is_encoder_decoder_model": "True" 11 | } 12 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/finetune_t5.sh: 
-------------------------------------------------------------------------------- 1 | # Add parent directory to python path to access lightning_base.py 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | python finetune.py \ 5 | --data_dir=$CNN_DIR \ 6 | --learning_rate=3e-5 \ 7 | --train_batch_size=$BS \ 8 | --eval_batch_size=$BS \ 9 | --output_dir=$OUTPUT_DIR \ 10 | --max_source_length=512 \ 11 | --max_target_length=56 \ 12 | --val_check_interval=0.1 --n_val=200 \ 13 | --do_train --do_predict \ 14 | "$@" 15 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/distillation/training_configs/distilroberta-base.json: -------------------------------------------------------------------------------- 1 | { 2 | "vocab_size": 50265, 3 | "hidden_size": 768, 4 | "num_hidden_layers": 6, 5 | "num_attention_heads": 12, 6 | "intermediate_size": 3072, 7 | "hidden_act": "gelu", 8 | "hidden_dropout_prob": 0.1, 9 | "attention_probs_dropout_prob": 0.1, 10 | "max_position_embeddings": 514, 11 | "type_vocab_size": 1, 12 | "initializer_range": 0.02, 13 | "layer_norm_eps": 0.00001 14 | } -------------------------------------------------------------------------------- /document_grounded_generation/transformers/templates/adding_a_new_model/cookiecutter.json: -------------------------------------------------------------------------------- 1 | { 2 | "modelname": "BrandNewBERT", 3 | "uppercase_modelname": "BRAND_NEW_BERT", 4 | "lowercase_modelname": "brand_new_bert", 5 | "camelcase_modelname": "BrandNewBert", 6 | "authors": "The HuggingFace Team", 7 | "checkpoint_identifier": "brand-new-bert-base-cased", 8 | "tokenizer_type": ["Based on BERT", "Based on BART", "Standalone"], 9 | "generate_tensorflow_and_pytorch": ["PyTorch & TensorFlow", "PyTorch", "TensorFlow"], 10 | "is_encoder_decoder_model": ["True", "False"] 11 | } 12 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/.github/ISSUE_TEMPLATE/---new-benchmark.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F5A5 New benchmark" 3 | about: Benchmark a part of this library and share your results 4 | title: "[Benchmark]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # 🖥 Benchmarking `transformers` 11 | 12 | ## Benchmark 13 | 14 | Which part of `transformers` did you benchmark? 15 | 16 | ## Set-up 17 | 18 | What did you run your benchmarks on? Please include details, such as: CPU, GPU? If using multiple GPUs, which parallelization did you use? 19 | 20 | ## Results 21 | 22 | Put your results here! 
23 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/.github/ISSUE_TEMPLATE/--new-model-addition.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F31F New model addition" 3 | about: Submit a proposal/request to implement a new Transformer-based model 4 | title: '' 5 | labels: New model 6 | assignees: '' 7 | 8 | --- 9 | 10 | # 🌟 New model addition 11 | 12 | ## Model description 13 | 14 | 15 | 16 | ## Open source status 17 | 18 | * [ ] the model implementation is available: (give details) 19 | * [ ] the model weights are available: (give details) 20 | * [ ] who are the authors: (mention them, if possible by @gh-username) 21 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/seq2seq-distillation/finetune_pegasus_xsum.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | # From appendix C of paper https://arxiv.org/abs/1912.08777 5 | # Set --gradient_accumulation_steps so that effective batch size is 256 (2*128, 4*64, 8*32, 16*16) 6 | python finetune.py \ 7 | --learning_rate=1e-4 \ 8 | --do_train \ 9 | --do_predict \ 10 | --n_val 1000 \ 11 | --val_check_interval 0.25 \ 12 | --max_source_length 512 --max_target_length 56 \ 13 | --freeze_embeds --label_smoothing 0.1 --adafactor --task summarization_xsum \ 14 | "$@" 15 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/finetune_pegasus_xsum.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | # From appendix C of paper https://arxiv.org/abs/1912.08777 5 | # Set --gradient_accumulation_steps so that effective batch size is 256 (2*128, 4*64, 8*32, 16*16) 6 | python finetune.py \ 7 | --learning_rate=1e-4 \ 8 | --do_train \ 9 | --do_predict \ 10 | --n_val 1000 \ 11 | --val_check_interval 0.25 \ 12 | --max_source_length 512 --max_target_length 56 \ 13 | --freeze_embeds --label_smoothing 0.1 --adafactor --task summarization_xsum \ 14 | "$@" 15 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /document_grounded_generation/transformers/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration.json: -------------------------------------------------------------------------------- 1 | { 2 | "modelname": "{{cookiecutter.modelname}}", 3 | "uppercase_modelname": "{{cookiecutter.uppercase_modelname}}", 4 | "lowercase_modelname": "{{cookiecutter.lowercase_modelname}}", 5 | "camelcase_modelname": "{{cookiecutter.camelcase_modelname}}", 6 | "authors": "{{cookiecutter.authors}}", 7 | "checkpoint_identifier": "{{cookiecutter.checkpoint_identifier}}", 8 | "tokenizer_type": "{{cookiecutter.tokenizer_type}}", 9 | "generate_tensorflow_and_pytorch": "{{cookiecutter.generate_tensorflow_and_pytorch}}", 10 | "is_encoder_decoder_model": ["True", "False"] 11 | } 12 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | name: Stale Bot 2 | 3 | on: 4 | schedule: 5 | - cron: "0 0 * * *" 6 | 7 | jobs: 8 | close_stale_issues: 9 | name: Close Stale Issues 10 | if: github.repository == 'huggingface/transformers' 11 | runs-on: ubuntu-latest 12 | env: 13 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 14 | steps: 15 | - uses: actions/checkout@v2 16 | 17 | - name: Setup Python 18 | uses: actions/setup-python@v1 19 | with: 20 | python-version: 3.7 21 | 22 | - name: Install requirements 23 | run: | 24 | pip install PyGithub 25 | - name: Close stale issues 26 | run: | 27 | python scripts/stale.py -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/longform-qa/README.md: -------------------------------------------------------------------------------- 1 | # Long Form Question Answering 2 | 3 | Author: @yjernite 4 | 5 | This folder contains the code for the Long Form Question answering [demo](http://35.226.96.115:8080/) as well as methods to train and use a fully end-to-end Long Form Question Answering system using the [🤗transformers](https://github.com/huggingface/transformers) and [🤗datasets](https://github.com/huggingface/datasets) libraries. 6 | 7 | You can use these methods to train your own system by following along the associate [notebook](https://github.com/huggingface/notebooks/blob/master/longform-qa/Long_Form_Question_Answering_with_ELI5_and_Wikipedia.ipynb) or [blog post](https://yjernite.github.io/lfqa.html). 
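The long-form QA README above describes an end-to-end system built on 🤗transformers and 🤗datasets. As a rough illustration of the generation half only, the sketch below feeds a question plus a hand-written support passage to a seq2seq model; the checkpoint name and the `question:`/`context:` input format are assumptions rather than something stated in this README, so follow the linked notebook for the authoritative pipeline (retrieval included):

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Checkpoint name is an assumption; substitute whatever ELI5 BART checkpoint
# the notebook points you to.
model_name = "yjernite/bart_eli5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# In the full system the context comes from a dense retriever over Wikipedia;
# here it is a hand-written stand-in.
question = "question: why does the sky look blue during the day?"
support = "context: Rayleigh scattering affects short (blue) wavelengths more strongly than long (red) ones."

inputs = tokenizer(question + " " + support, return_tensors="pt", truncation=True, max_length=1024)
answer_ids = model.generate(
    inputs["input_ids"],
    min_length=64,
    max_length=256,
    num_beams=4,
    no_repeat_ngram_size=3,
)
print(tokenizer.decode(answer_ids[0], skip_special_tokens=True))
```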
8 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/wav2vec2/finetune_base_100.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python run_asr.py \ 3 | --output_dir="./wav2vec2-base-100h" \ 4 | --num_train_epochs="30" \ 5 | --per_device_train_batch_size="32" \ 6 | --per_device_eval_batch_size="32" \ 7 | --evaluation_strategy="steps" \ 8 | --save_total_limit="3" \ 9 | --save_steps="500" \ 10 | --eval_steps="100" \ 11 | --logging_steps="50" \ 12 | --learning_rate="5e-4" \ 13 | --warmup_steps="3000" \ 14 | --model_name_or_path="facebook/wav2vec2-base" \ 15 | --fp16 \ 16 | --dataset_name="librispeech_asr" \ 17 | --dataset_config_name="clean" \ 18 | --train_split_name="train.100" \ 19 | --preprocessing_num_workers="32" \ 20 | --group_by_length \ 21 | --freeze_feature_extractor 22 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/longform-qa/README.md: -------------------------------------------------------------------------------- 1 | # Long Form Question Answering 2 | 3 | Author: @yjernite 4 | 5 | This folder contains the code for the Long Form Question answering [demo](http://35.226.96.115:8080/) as well as methods to train and use a fully end-to-end Long Form Question Answering system using the [🤗transformers](https://github.com/huggingface/transformers) and [🤗datasets](https://github.com/huggingface/datasets) libraries. 6 | 7 | You can use these methods to train your own system by following along the associate [notebook](https://github.com/huggingface/notebooks/blob/master/longform-qa/Long_Form_Question_Answering_with_ELI5_and_Wikipedia.ipynb) or [blog post](https://yjernite.github.io/lfqa.html). 
8 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/seq2seq-distillation/train_mbart_cc25_enro.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | python finetune.py \ 5 | --learning_rate=3e-5 \ 6 | --fp16 \ 7 | --do_train \ 8 | --val_check_interval=0.25 \ 9 | --adam_eps 1e-06 \ 10 | --num_train_epochs 6 --src_lang en_XX --tgt_lang ro_RO \ 11 | --data_dir $ENRO_DIR \ 12 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \ 13 | --train_batch_size=$BS --eval_batch_size=$BS \ 14 | --task translation \ 15 | --warmup_steps 500 \ 16 | --freeze_embeds \ 17 | --model_name_or_path=facebook/mbart-large-cc25 \ 18 | "$@" 19 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/wav2vec2/finetune_large_lv60_100.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python run_asr.py \ 3 | --output_dir="./wav2vec2-large-lv60-100h" \ 4 | --num_train_epochs="30" \ 5 | --per_device_train_batch_size="16" \ 6 | --per_device_eval_batch_size="16" \ 7 | --evaluation_strategy="steps" \ 8 | --save_total_limit="3" \ 9 | --save_steps="500" \ 10 | --eval_steps="100" \ 11 | --logging_steps="50" \ 12 | --learning_rate="5e-4" \ 13 | --warmup_steps="3000" \ 14 | --model_name_or_path="facebook/wav2vec2-large-lv60" \ 15 | --fp16 \ 16 | --dataset_name="librispeech_asr" \ 17 | --dataset_config_name="clean" \ 18 | --train_split_name="train.100" \ 19 | --preprocessing_num_workers="32" \ 20 | --group_by_length \ 21 | --freeze_feature_extractor 22 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/wav2vec2/finetune_base_100.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python run_asr.py \ 3 | --output_dir="./wav2vec2-base-100h" \ 4 | --num_train_epochs="30" \ 5 | --per_device_train_batch_size="32" \ 6 | --per_device_eval_batch_size="32" \ 7 | --evaluation_strategy="steps" \ 8 | --save_total_limit="3" \ 9 | --save_steps="500" \ 10 | --eval_steps="100" \ 11 | --logging_steps="50" \ 12 | --learning_rate="5e-4" \ 13 | --warmup_steps="3000" \ 14 | --model_name_or_path="facebook/wav2vec2-base" \ 15 | --fp16 \ 16 | --dataset_name="librispeech_asr" \ 17 | --dataset_config_name="clean" \ 18 | --train_split_name="train.100" \ 19 | --preprocessing_num_workers="32" \ 20 | --group_by_length \ 21 | --freeze_feature_extractor 22 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/wav2vec2/finetune_base_timit_asr.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python run_asr.py \ 3 | --output_dir="./wav2vec2-base-timit-asr" \ 4 | --num_train_epochs="30" \ 5 | --per_device_train_batch_size="20" \ 6 | --per_device_eval_batch_size="20" \ 7 | --evaluation_strategy="steps" \ 8 | --save_steps="500" \ 9 | --eval_steps="100" \ 10 | --logging_steps="50" \ 11 | --learning_rate="5e-4" \ 12 | --warmup_steps="3000" \ 13 | --model_name_or_path="facebook/wav2vec2-base" \ 14 | --fp16 \ 15 | --dataset_name="timit_asr" \ 16 | --train_split_name="train" \ 17 | 
--validation_split_name="test" \ 18 | --orthography="timit" \ 19 | --preprocessing_num_workers="$(nproc)" \ 20 | --group_by_length \ 21 | --freeze_feature_extractor \ 22 | --verbose_logging \ 23 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/train_mbart_cc25_enro.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | python finetune.py \ 5 | --learning_rate=3e-5 \ 6 | --fp16 \ 7 | --do_train \ 8 | --val_check_interval=0.25 \ 9 | --adam_eps 1e-06 \ 10 | --num_train_epochs 6 --src_lang en_XX --tgt_lang ro_RO \ 11 | --data_dir $ENRO_DIR \ 12 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \ 13 | --train_batch_size=$BS --eval_batch_size=$BS \ 14 | --task translation \ 15 | --warmup_steps 500 \ 16 | --freeze_embeds \ 17 | --model_name_or_path=facebook/mbart-large-cc25 \ 18 | "$@" 19 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/seq2seq-distillation/dynamic_bs_example.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | export WANDB_PROJECT=dmar 4 | export MAX_LEN=128 5 | export m=sshleifer/student_marian_en_ro_6_1 6 | python finetune.py \ 7 | --learning_rate=3e-4 \ 8 | --do_train \ 9 | --fp16 \ 10 | --data_dir wmt_en_ro \ 11 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \ 12 | --freeze_encoder --freeze_embeds \ 13 | --train_batch_size=48 --eval_batch_size=64 \ 14 | --tokenizer_name $m --model_name_or_path $m --num_train_epochs=1 \ 15 | --warmup_steps 500 --logger_name wandb --gpus 1 \ 16 | --fp16_opt_level=O1 --task translation \ 17 | "$@" 18 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docker/transformers-pytorch-cpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | jupyter \ 18 | torch 19 | 20 | WORKDIR /workspace 21 | COPY . transformers/ 22 | RUN cd transformers/ && \ 23 | python3 -m pip install --no-cache-dir . 
24 | 25 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/wav2vec2/finetune_large_lv60_100.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python run_asr.py \ 3 | --output_dir="./wav2vec2-large-lv60-100h" \ 4 | --num_train_epochs="30" \ 5 | --per_device_train_batch_size="16" \ 6 | --per_device_eval_batch_size="16" \ 7 | --evaluation_strategy="steps" \ 8 | --save_total_limit="3" \ 9 | --save_steps="500" \ 10 | --eval_steps="100" \ 11 | --logging_steps="50" \ 12 | --learning_rate="5e-4" \ 13 | --warmup_steps="3000" \ 14 | --model_name_or_path="facebook/wav2vec2-large-lv60" \ 15 | --fp16 \ 16 | --dataset_name="librispeech_asr" \ 17 | --dataset_config_name="clean" \ 18 | --train_split_name="train.100" \ 19 | --preprocessing_num_workers="32" \ 20 | --group_by_length \ 21 | --freeze_feature_extractor 22 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docker/transformers-tensorflow-cpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | mkl \ 18 | tensorflow-cpu 19 | 20 | WORKDIR /workspace 21 | COPY . transformers/ 22 | RUN cd transformers/ && \ 23 | python3 -m pip install --no-cache-dir . 
24 | 25 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/dynamic_bs_example.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | export WANDB_PROJECT=dmar 4 | export MAX_LEN=128 5 | export m=sshleifer/student_marian_en_ro_6_1 6 | python finetune.py \ 7 | --learning_rate=3e-4 \ 8 | --do_train \ 9 | --fp16 \ 10 | --data_dir wmt_en_ro \ 11 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \ 12 | --freeze_encoder --freeze_embeds \ 13 | --train_batch_size=48 --eval_batch_size=64 \ 14 | --tokenizer_name $m --model_name_or_path $m --num_train_epochs=1 \ 15 | --warmup_steps 500 --logger_name wandb --gpus 1 \ 16 | --fp16_opt_level=O1 --task translation \ 17 | "$@" 18 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/wav2vec2/finetune_base_timit_asr.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python run_asr.py \ 3 | --output_dir="./wav2vec2-base-timit-asr" \ 4 | --num_train_epochs="30" \ 5 | --per_device_train_batch_size="20" \ 6 | --per_device_eval_batch_size="20" \ 7 | --evaluation_strategy="steps" \ 8 | --save_steps="500" \ 9 | --eval_steps="100" \ 10 | --logging_steps="50" \ 11 | --learning_rate="5e-4" \ 12 | --warmup_steps="3000" \ 13 | --model_name_or_path="facebook/wav2vec2-base" \ 14 | --fp16 \ 15 | --dataset_name="timit_asr" \ 16 | --train_split_name="train" \ 17 | --validation_split_name="test" \ 18 | --orthography="timit" \ 19 | --preprocessing_num_workers="$(nproc)" \ 20 | --group_by_length \ 21 | --freeze_feature_extractor \ 22 | --verbose_logging \ 23 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/pplm/pplm_classification_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class ClassificationHead(torch.nn.Module): 5 | """Classification Head for transformer encoders""" 6 | 7 | def __init__(self, class_size, embed_size): 8 | super().__init__() 9 | self.class_size = class_size 10 | self.embed_size = embed_size 11 | # self.mlp1 = torch.nn.Linear(embed_size, embed_size) 12 | # self.mlp2 = (torch.nn.Linear(embed_size, class_size)) 13 | self.mlp = torch.nn.Linear(embed_size, class_size) 14 | 15 | def forward(self, hidden_state): 16 | # hidden_state = F.relu(self.mlp1(hidden_state)) 17 | # hidden_state = self.mlp2(hidden_state) 18 | logits = self.mlp(hidden_state) 19 | return logits 20 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/seq2seq-distillation/sentence_splitter.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from filelock import FileLock 4 | 5 | 6 | try: 7 | import nltk 8 | 9 | NLTK_AVAILABLE = True 10 | except (ImportError, ModuleNotFoundError): 11 | NLTK_AVAILABLE = False 12 | 13 | if NLTK_AVAILABLE: 14 | with FileLock(".lock") as lock: 15 | nltk.download("punkt", quiet=True) 16 | 17 | 18 | def add_newline_to_end_of_each_sentence(x: str) -> str: 19 | """This was added to get rougeLsum scores matching published rougeL 
scores for BART and PEGASUS.""" 20 | re.sub("", "", x) # remove pegasus newline char 21 | assert NLTK_AVAILABLE, "nltk must be installed to separate newlines between sentences. (pip install nltk)" 22 | return "\n".join(nltk.sent_tokenize(x)) 23 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docker/transformers-cpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | jupyter \ 18 | tensorflow-cpu \ 19 | torch 20 | 21 | WORKDIR /workspace 22 | COPY . transformers/ 23 | RUN cd transformers/ && \ 24 | python3 -m pip install --no-cache-dir . 25 | 26 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/mm-imdb/README.md: -------------------------------------------------------------------------------- 1 | ## MM-IMDb 2 | 3 | Based on the script [`run_mmimdb.py`](https://github.com/huggingface/transformers/blob/master/examples/contrib/mm-imdb/run_mmimdb.py). 4 | 5 | [MM-IMDb](http://lisi1.unal.edu.co/mmimdb/) is a Multimodal dataset with around 26,000 movies including images, plots and other metadata. 6 | 7 | ### Training on MM-IMDb 8 | 9 | ``` 10 | python run_mmimdb.py \ 11 | --data_dir /path/to/mmimdb/dataset/ \ 12 | --model_type bert \ 13 | --model_name_or_path bert-base-uncased \ 14 | --output_dir /path/to/save/dir/ \ 15 | --do_train \ 16 | --do_eval \ 17 | --max_seq_len 512 \ 18 | --gradient_accumulation_steps 20 \ 19 | --num_image_embeds 3 \ 20 | --num_train_epochs 100 \ 21 | --patience 5 22 | ``` 23 | 24 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/seq2seq-distillation/distil_marian_no_teacher.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | export WANDB_PROJECT=dmar 4 | export MAX_LEN=128 5 | python finetune.py \ 6 | --learning_rate=3e-4 \ 7 | --do_train \ 8 | --do_predict \ 9 | --fp16 \ 10 | --val_check_interval 0.25 \ 11 | --data_dir $ENRO_DIR \ 12 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \ 13 | --freeze_encoder --freeze_embeds \ 14 | --train_batch_size=$BS --eval_batch_size=$BS \ 15 | --tokenizer_name $m --model_name_or_path $m \ 16 | --warmup_steps 500 --sortish_sampler --logger_name wandb \ 17 | --gpus 1 --fp16_opt_level=O1 --task translation --num_sanity_val_steps=0 \ 18 | "$@" 19 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docker/transformers-tensorflow-gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm 
-rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | mkl \ 18 | tensorflow 19 | 20 | WORKDIR /workspace 21 | COPY . transformers/ 22 | RUN cd transformers/ && \ 23 | python3 -m pip install --no-cache-dir . 24 | 25 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/pplm/pplm_classification_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class ClassificationHead(torch.nn.Module): 5 | """Classification Head for transformer encoders""" 6 | 7 | def __init__(self, class_size, embed_size): 8 | super().__init__() 9 | self.class_size = class_size 10 | self.embed_size = embed_size 11 | # self.mlp1 = torch.nn.Linear(embed_size, embed_size) 12 | # self.mlp2 = (torch.nn.Linear(embed_size, class_size)) 13 | self.mlp = torch.nn.Linear(embed_size, class_size) 14 | 15 | def forward(self, hidden_state): 16 | # hidden_state = F.relu(self.mlp1(hidden_state)) 17 | # hidden_state = self.mlp2(hidden_state) 18 | logits = self.mlp(hidden_state) 19 | return logits 20 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/sentence_splitter.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from filelock import FileLock 4 | 5 | 6 | try: 7 | import nltk 8 | 9 | NLTK_AVAILABLE = True 10 | except (ImportError, ModuleNotFoundError): 11 | NLTK_AVAILABLE = False 12 | 13 | if NLTK_AVAILABLE: 14 | with FileLock(".lock") as lock: 15 | nltk.download("punkt", quiet=True) 16 | 17 | 18 | def add_newline_to_end_of_each_sentence(x: str) -> str: 19 | """This was added to get rougeLsum scores matching published rougeL scores for BART and PEGASUS.""" 20 | re.sub("", "", x) # remove pegasus newline char 21 | assert NLTK_AVAILABLE, "nltk must be installed to separate newlines between sentences. 
(pip install nltk)" 22 | return "\n".join(nltk.sent_tokenize(x)) 23 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/seq2seq-distillation/train_distilbart_cnn.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | export BS=32 5 | export GAS=1 6 | 7 | python finetune.py \ 8 | --learning_rate=3e-5 \ 9 | --fp16 \ 10 | --gpus 1 \ 11 | --do_train \ 12 | --do_predict \ 13 | --val_check_interval 0.25 \ 14 | --n_val 500 \ 15 | --num_train_epochs 2 \ 16 | --freeze_encoder --freeze_embeds --data_dir cnn_dm \ 17 | --max_target_length 142 --val_max_target_length=142 \ 18 | --train_batch_size=$BS --eval_batch_size=$BS --gradient_accumulation_steps=$GAS \ 19 | --model_name_or_path sshleifer/student_cnn_12_6 \ 20 | --tokenizer_name facebook/bart-large \ 21 | --warmup_steps 500 \ 22 | --output_dir distilbart-cnn-12-6 \ 23 | "$@" 24 | 25 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/wav2vec2/finetune_large_lv60_timit_asr.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python run_asr.py \ 3 | --output_dir="./wav2vec2-large-lv60-timit-asr" \ 4 | --num_train_epochs="30" \ 5 | --per_device_train_batch_size="2" \ 6 | --per_device_eval_batch_size="2" \ 7 | --gradient_accumulation_steps="4" \ 8 | --evaluation_strategy="steps" \ 9 | --save_steps="500" \ 10 | --eval_steps="100" \ 11 | --logging_steps="50" \ 12 | --learning_rate="5e-4" \ 13 | --warmup_steps="3000" \ 14 | --model_name_or_path="facebook/wav2vec2-large-lv60" \ 15 | --fp16 \ 16 | --dataset_name="timit_asr" \ 17 | --train_split_name="train" \ 18 | --validation_split_name="test" \ 19 | --orthography="timit" \ 20 | --preprocessing_num_workers="$(nproc)" \ 21 | --group_by_length \ 22 | --freeze_feature_extractor \ 23 | --verbose_logging \ 24 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/mm-imdb/README.md: -------------------------------------------------------------------------------- 1 | ## MM-IMDb 2 | 3 | Based on the script [`run_mmimdb.py`](https://github.com/huggingface/transformers/blob/master/examples/contrib/mm-imdb/run_mmimdb.py). 4 | 5 | [MM-IMDb](http://lisi1.unal.edu.co/mmimdb/) is a Multimodal dataset with around 26,000 movies including images, plots and other metadata. 
6 | 7 | ### Training on MM-IMDb 8 | 9 | ``` 10 | python run_mmimdb.py \ 11 | --data_dir /path/to/mmimdb/dataset/ \ 12 | --model_type bert \ 13 | --model_name_or_path bert-base-uncased \ 14 | --output_dir /path/to/save/dir/ \ 15 | --do_train \ 16 | --do_eval \ 17 | --max_seq_len 512 \ 18 | --gradient_accumulation_steps 20 \ 19 | --num_image_embeds 3 \ 20 | --num_train_epochs 100 \ 21 | --patience 5 22 | ``` 23 | 24 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/distil_marian_no_teacher.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | export WANDB_PROJECT=dmar 4 | export MAX_LEN=128 5 | python finetune.py \ 6 | --learning_rate=3e-4 \ 7 | --do_train \ 8 | --do_predict \ 9 | --fp16 \ 10 | --val_check_interval 0.25 \ 11 | --data_dir $ENRO_DIR \ 12 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \ 13 | --freeze_encoder --freeze_embeds \ 14 | --train_batch_size=$BS --eval_batch_size=$BS \ 15 | --tokenizer_name $m --model_name_or_path $m \ 16 | --warmup_steps 500 --sortish_sampler --logger_name wandb \ 17 | --gpus 1 --fp16_opt_level=O1 --task translation --num_sanity_val_steps=0 \ 18 | "$@" 19 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/wav2vec2/finetune_wav2vec2_xlsr_turkish.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python run_common_voice.py \ 3 | --model_name_or_path="facebook/wav2vec2-large-xlsr-53" \ 4 | --dataset_config_name="tr" \ 5 | --output_dir=./wav2vec2-large-xlsr-turkish-demo \ 6 | --overwrite_output_dir \ 7 | --num_train_epochs="5" \ 8 | --per_device_train_batch_size="16" \ 9 | --evaluation_strategy="steps" \ 10 | --learning_rate="3e-4" \ 11 | --warmup_steps="500" \ 12 | --fp16 \ 13 | --freeze_feature_extractor \ 14 | --save_steps="400" \ 15 | --eval_steps="400" \ 16 | --save_total_limit="3" \ 17 | --logging_steps="400" \ 18 | --group_by_length \ 19 | --feat_proj_dropout="0.0" \ 20 | --layerdrop="0.1" \ 21 | --gradient_checkpointing \ 22 | --do_train --do_eval 23 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/train_distilbart_cnn.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | export BS=32 5 | export GAS=1 6 | 7 | python finetune.py \ 8 | --learning_rate=3e-5 \ 9 | --fp16 \ 10 | --gpus 1 \ 11 | --do_train \ 12 | --do_predict \ 13 | --val_check_interval 0.25 \ 14 | --n_val 500 \ 15 | --num_train_epochs 2 \ 16 | --freeze_encoder --freeze_embeds --data_dir cnn_dm \ 17 | --max_target_length 142 --val_max_target_length=142 \ 18 | --train_batch_size=$BS --eval_batch_size=$BS --gradient_accumulation_steps=$GAS \ 19 | --model_name_or_path sshleifer/student_cnn_12_6 \ 20 | --tokenizer_name facebook/bart-large \ 21 | --warmup_steps 500 \ 22 | --output_dir distilbart-cnn-12-6 \ 23 | "$@" 24 | 25 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/wav2vec2/finetune_large_lv60_timit_asr.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python run_asr.py \ 3 | --output_dir="./wav2vec2-large-lv60-timit-asr" \ 4 | --num_train_epochs="30" \ 5 | --per_device_train_batch_size="2" \ 6 | --per_device_eval_batch_size="2" \ 7 | --gradient_accumulation_steps="4" \ 8 | --evaluation_strategy="steps" \ 9 | --save_steps="500" \ 10 | --eval_steps="100" \ 11 | --logging_steps="50" \ 12 | --learning_rate="5e-4" \ 13 | --warmup_steps="3000" \ 14 | --model_name_or_path="facebook/wav2vec2-large-lv60" \ 15 | --fp16 \ 16 | --dataset_name="timit_asr" \ 17 | --train_split_name="train" \ 18 | --validation_split_name="test" \ 19 | --orthography="timit" \ 20 | --preprocessing_num_workers="$(nproc)" \ 21 | --group_by_length \ 22 | --freeze_feature_extractor \ 23 | --verbose_logging \ 24 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/wav2vec2/finetune_wav2vec2_xlsr_turkish.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python run_common_voice.py \ 3 | --model_name_or_path="facebook/wav2vec2-large-xlsr-53" \ 4 | --dataset_config_name="tr" \ 5 | --output_dir=./wav2vec2-large-xlsr-turkish-demo \ 6 | --overwrite_output_dir \ 7 | --num_train_epochs="5" \ 8 | --per_device_train_batch_size="16" \ 9 | --evaluation_strategy="steps" \ 10 | --learning_rate="3e-4" \ 11 | --warmup_steps="500" \ 12 | --fp16 \ 13 | --freeze_feature_extractor \ 14 | --save_steps="400" \ 15 | --eval_steps="400" \ 16 | --save_total_limit="3" \ 17 | --logging_steps="400" \ 18 | --group_by_length \ 19 | --feat_proj_dropout="0.0" \ 20 | --layerdrop="0.1" \ 21 | --gradient_checkpointing \ 22 | --do_train --do_eval 23 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/token-classification/run.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | python3 run_ner.py \ 16 | --model_name_or_path bert-base-uncased \ 17 | --dataset_name conll2003 \ 18 | --output_dir /tmp/test-ner \ 19 | --do_train \ 20 | --do_eval 21 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/token-classification/run.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | python3 run_ner.py \ 16 | --model_name_or_path bert-base-uncased \ 17 | --dataset_name conll2003 \ 18 | --output_dir /tmp/test-ner \ 19 | --do_train \ 20 | --do_eval 21 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/.github/ISSUE_TEMPLATE/question-help.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "❓ Questions & Help" 3 | about: Post your general questions on the Hugging Face forum: https://discuss.huggingface.co/ 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # ❓ Questions & Help 11 | 12 | 16 | 17 | ## Details 18 | 19 | 20 | 21 | 23 | 24 | **A link to original question on the forum**: 25 | 26 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/seq2seq-distillation/distil_marian_enro_teacher.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | export WANDB_PROJECT=dmar 4 | # export MAX_LEN=128 5 | python distillation.py \ 6 | --learning_rate=3e-4 \ 7 | --do_train \ 8 | --fp16 \ 9 | --val_check_interval 0.25 \ 10 | --teacher Helsinki-NLP/opus-mt-en-ro \ 11 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \ 12 | --student_decoder_layers 3 --student_encoder_layers 6 \ 13 | --freeze_encoder --freeze_embeds \ 14 | --model_name_or_path IGNORED \ 15 | --alpha_hid=3. 
\ 16 | --train_batch_size=$BS --eval_batch_size=$BS \ 17 | --tokenizer_name Helsinki-NLP/opus-mt-en-ro \ 18 | --warmup_steps 500 --logger_name wandb \ 19 | --fp16_opt_level O1 --task translation --normalize_hidden --num_sanity_val_steps=0 \ 20 | "$@" 21 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F680 Feature request" 3 | about: Submit a proposal/request for a new transformers feature 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # 🚀 Feature request 11 | 12 | 14 | 15 | ## Motivation 16 | 17 | 20 | 21 | ## Your contribution 22 | 23 | 26 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/seq2seq-distillation/train_distilbart_xsum.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | python distillation.py \ 4 | --teacher facebook/bart-large-xsum --data_dir xsum \ 5 | --tokenizer_name facebook/bart-large-xsum \ 6 | --student_decoder_layers 6 --student_encoder_layers 12 \ 7 | --freeze_encoder --freeze_embeds \ 8 | --learning_rate=3e-4 \ 9 | --do_train \ 10 | --do_predict \ 11 | --fp16 --fp16_opt_level=O1 \ 12 | --val_check_interval 0.1 --n_val 1000 --eval_beams 2 --length_penalty=0.5 \ 13 | --max_target_length=60 --val_max_target_length=60 --test_max_target_length=100 \ 14 | --model_name_or_path IGNORED \ 15 | --alpha_hid=3. \ 16 | --train_batch_size=16 --eval_batch_size=16 --gradient_accumulation_steps=2 \ 17 | --sortish_sampler \ 18 | --num_train_epochs=6 \ 19 | --warmup_steps 500 \ 20 | --output_dir distilbart_xsum_12_6 \ 21 | "$@" 22 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/distil_marian_enro_teacher.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | export WANDB_PROJECT=dmar 4 | # export MAX_LEN=128 5 | python distillation.py \ 6 | --learning_rate=3e-4 \ 7 | --do_train \ 8 | --fp16 \ 9 | --val_check_interval 0.25 \ 10 | --teacher Helsinki-NLP/opus-mt-en-ro \ 11 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \ 12 | --student_decoder_layers 3 --student_encoder_layers 6 \ 13 | --freeze_encoder --freeze_embeds \ 14 | --model_name_or_path IGNORED \ 15 | --alpha_hid=3. 
\ 16 | --train_batch_size=$BS --eval_batch_size=$BS \ 17 | --tokenizer_name Helsinki-NLP/opus-mt-en-ro \ 18 | --warmup_steps 500 --logger_name wandb \ 19 | --fp16_opt_level O1 --task translation --normalize_hidden --num_sanity_val_steps=0 \ 20 | "$@" 21 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/wav2vec2/finetune_large_xlsr_53_arabic_speech_corpus.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python run_asr.py \ 3 | --output_dir="./wav2vec2-large-xlsr-53-arabic-speech-corpus" \ 4 | --num_train_epochs="50" \ 5 | --per_device_train_batch_size="1" \ 6 | --per_device_eval_batch_size="1" \ 7 | --gradient_accumulation_steps="8" \ 8 | --evaluation_strategy="steps" \ 9 | --save_steps="500" \ 10 | --eval_steps="100" \ 11 | --logging_steps="50" \ 12 | --learning_rate="5e-4" \ 13 | --warmup_steps="3000" \ 14 | --model_name_or_path="elgeish/wav2vec2-large-xlsr-53-arabic" \ 15 | --fp16 \ 16 | --dataset_name="arabic_speech_corpus" \ 17 | --train_split_name="train" \ 18 | --validation_split_name="test" \ 19 | --max_duration_in_seconds="15" \ 20 | --orthography="buckwalter" \ 21 | --preprocessing_num_workers="$(nproc)" \ 22 | --group_by_length \ 23 | --freeze_feature_extractor \ 24 | --target_feature_extractor_sampling_rate \ 25 | --verbose_logging \ 26 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/train_distilbart_xsum.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | python distillation.py \ 4 | --teacher facebook/bart-large-xsum --data_dir xsum \ 5 | --tokenizer_name facebook/bart-large-xsum \ 6 | --student_decoder_layers 6 --student_encoder_layers 12 \ 7 | --freeze_encoder --freeze_embeds \ 8 | --learning_rate=3e-4 \ 9 | --do_train \ 10 | --do_predict \ 11 | --fp16 --fp16_opt_level=O1 \ 12 | --val_check_interval 0.1 --n_val 1000 --eval_beams 2 --length_penalty=0.5 \ 13 | --max_target_length=60 --val_max_target_length=60 --test_max_target_length=100 \ 14 | --model_name_or_path IGNORED \ 15 | --alpha_hid=3. 
\ 16 | --train_batch_size=16 --eval_batch_size=16 --gradient_accumulation_steps=2 \ 17 | --sortish_sampler \ 18 | --num_train_epochs=6 \ 19 | --warmup_steps 500 \ 20 | --output_dir distilbart_xsum_12_6 \ 21 | "$@" 22 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/deebert/eval_deebert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | PATH_TO_DATA=/h/xinji/projects/GLUE 5 | 6 | MODEL_TYPE=bert # bert or roberta 7 | MODEL_SIZE=base # base or large 8 | DATASET=MRPC # SST-2, MRPC, RTE, QNLI, QQP, or MNLI 9 | 10 | MODEL_NAME=${MODEL_TYPE}-${MODEL_SIZE} 11 | if [ $MODEL_TYPE = 'bert' ] 12 | then 13 | MODEL_NAME=${MODEL_NAME}-uncased 14 | fi 15 | 16 | 17 | python -u run_glue_deebert.py \ 18 | --model_type $MODEL_TYPE \ 19 | --model_name_or_path ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 20 | --task_name $DATASET \ 21 | --do_eval \ 22 | --do_lower_case \ 23 | --data_dir $PATH_TO_DATA/$DATASET \ 24 | --output_dir ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 25 | --plot_data_dir ./results/ \ 26 | --max_seq_length 128 \ 27 | --eval_each_highway \ 28 | --eval_highway \ 29 | --overwrite_cache \ 30 | --per_gpu_eval_batch_size=1 31 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | # Legacy examples 18 | 19 | This folder contains examples which are not actively maintained (mostly contributed by the community). 20 | 21 | Using these examples together with a recent version of the library usually requires to make small (sometimes big) adaptations to get the scripts working. 
22 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/wav2vec2/finetune_large_xlsr_53_arabic_speech_corpus.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python run_asr.py \ 3 | --output_dir="./wav2vec2-large-xlsr-53-arabic-speech-corpus" \ 4 | --num_train_epochs="50" \ 5 | --per_device_train_batch_size="1" \ 6 | --per_device_eval_batch_size="1" \ 7 | --gradient_accumulation_steps="8" \ 8 | --evaluation_strategy="steps" \ 9 | --save_steps="500" \ 10 | --eval_steps="100" \ 11 | --logging_steps="50" \ 12 | --learning_rate="5e-4" \ 13 | --warmup_steps="3000" \ 14 | --model_name_or_path="elgeish/wav2vec2-large-xlsr-53-arabic" \ 15 | --fp16 \ 16 | --dataset_name="arabic_speech_corpus" \ 17 | --train_split_name="train" \ 18 | --validation_split_name="test" \ 19 | --max_duration_in_seconds="15" \ 20 | --orthography="buckwalter" \ 21 | --preprocessing_num_workers="$(nproc)" \ 22 | --group_by_length \ 23 | --freeze_feature_extractor \ 24 | --target_feature_extractor_sampling_rate \ 25 | --verbose_logging \ 26 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/deebert/eval_deebert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | PATH_TO_DATA=/h/xinji/projects/GLUE 5 | 6 | MODEL_TYPE=bert # bert or roberta 7 | MODEL_SIZE=base # base or large 8 | DATASET=MRPC # SST-2, MRPC, RTE, QNLI, QQP, or MNLI 9 | 10 | MODEL_NAME=${MODEL_TYPE}-${MODEL_SIZE} 11 | if [ $MODEL_TYPE = 'bert' ] 12 | then 13 | MODEL_NAME=${MODEL_NAME}-uncased 14 | fi 15 | 16 | 17 | python -u run_glue_deebert.py \ 18 | --model_type $MODEL_TYPE \ 19 | --model_name_or_path ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 20 | --task_name $DATASET \ 21 | --do_eval \ 22 | --do_lower_case \ 23 | --data_dir $PATH_TO_DATA/$DATASET \ 24 | --output_dir ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 25 | --plot_data_dir ./results/ \ 26 | --max_seq_length 128 \ 27 | --eval_each_highway \ 28 | --eval_highway \ 29 | --overwrite_cache \ 30 | --per_gpu_eval_batch_size=1 31 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docker/transformers-pytorch-tpu/dataset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: huggingface-cluster-disk 5 | spec: 6 | storageClassName: "" 7 | capacity: 8 | storage: 500Gi 9 | accessModes: 10 | - ReadOnlyMany 11 | claimRef: 12 | namespace: default 13 | name: huggingface-cluster-disk-claim 14 | gcePersistentDisk: 15 | pdName: huggingface-cluster-disk 16 | fsType: ext4 17 | readOnly: true 18 | --- 19 | apiVersion: v1 20 | kind: PersistentVolumeClaim 21 | metadata: 22 | name: huggingface-cluster-disk-claim 23 | spec: 24 | # Specify "" as the storageClassName so it matches the PersistentVolume's StorageClass. 25 | # A nil storageClassName value uses the default StorageClass. 
For details, see 26 | # https://kubernetes.io/docs/concepts/storage/persistent-volumes/#class-1 27 | storageClassName: "" 28 | accessModes: 29 | - ReadOnlyMany 30 | resources: 31 | requests: 32 | storage: 1Ki 33 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | # Legacy examples 18 | 19 | This folder contains examples which are not actively maintained (mostly contributed by the community). 20 | 21 | Using these examples together with a recent version of the library usually requires to make small (sometimes big) adaptations to get the scripts working. 22 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docker/transformers-pytorch-gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | mkl \ 18 | torch 19 | 20 | RUN git clone https://github.com/NVIDIA/apex 21 | RUN cd apex && \ 22 | python3 setup.py install && \ 23 | pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ 24 | 25 | WORKDIR /workspace 26 | COPY . transformers/ 27 | RUN cd transformers/ && \ 28 | python3 -m pip install --no-cache-dir . 29 | 30 | CMD ["/bin/bash"] 31 | -------------------------------------------------------------------------------- /finetune_gpt2/src/transformers/sagemaker/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2021 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from .trainer_sm import SageMakerTrainer 20 | from .training_args_sm import SageMakerTrainingArguments, is_sagemaker_distributed_available 21 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docker/transformers-gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | jupyter \ 18 | tensorflow \ 19 | torch 20 | 21 | RUN git clone https://github.com/NVIDIA/apex 22 | RUN cd apex && \ 23 | python3 setup.py install && \ 24 | pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ 25 | 26 | WORKDIR /workspace 27 | COPY . transformers/ 28 | RUN cd transformers/ && \ 29 | python3 -m pip install --no-cache-dir . 30 | 31 | CMD ["/bin/bash"] 32 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/src/transformers/sagemaker/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2021 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from .trainer_sm import SageMakerTrainer 20 | from .training_args_sm import SageMakerTrainingArguments, is_sagemaker_distributed_available 21 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/seq2seq-distillation/finetune_bart_tiny.sh: -------------------------------------------------------------------------------- 1 | # Script for verifying that run_bart_sum can be invoked from its directory 2 | 3 | # Get tiny dataset with cnn_dm format (4 examples for train, val, test) 4 | wget https://cdn-datasets.huggingface.co/summarization/cnn_tiny.tgz 5 | tar -xzvf cnn_tiny.tgz 6 | rm cnn_tiny.tgz 7 | 8 | export OUTPUT_DIR_NAME=bart_utest_output 9 | export CURRENT_DIR=${PWD} 10 | export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME} 11 | 12 | # Make output directory if it doesn't exist 13 | mkdir -p $OUTPUT_DIR 14 | 15 | # Add parent directory to python path to access lightning_base.py and testing_utils.py 16 | export PYTHONPATH="../":"${PYTHONPATH}" 17 | python finetune.py \ 18 | --data_dir=cnn_tiny/ \ 19 | --model_name_or_path=sshleifer/bart-tiny-random \ 20 | --learning_rate=3e-5 \ 21 | --train_batch_size=2 \ 22 | --eval_batch_size=2 \ 23 | --output_dir=$OUTPUT_DIR \ 24 | --num_train_epochs=1 \ 25 | --gpus=0 \ 26 | --do_train "$@" 27 | 28 | rm -rf cnn_tiny 29 | rm -rf $OUTPUT_DIR 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/wav2vec2/vocab/buckwalter.json: -------------------------------------------------------------------------------- 1 | { 2 | "": 0, 3 | "": 1, 4 | "": 2, 5 | "": 3, 6 | "/": 4, 7 | "'": 5, 8 | "|": 6, 9 | ">": 7, 10 | "&": 8, 11 | "<": 9, 12 | "}": 10, 13 | "A": 11, 14 | "b": 12, 15 | "p": 13, 16 | "t": 14, 17 | "v": 15, 18 | "j": 16, 19 | "H": 17, 20 | "x": 18, 21 | "d": 19, 22 | "*": 20, 23 | "r": 21, 24 | "z": 22, 25 | "s": 23, 26 | "$": 24, 27 | "S": 25, 28 | "D": 26, 29 | "T": 27, 30 | "Z": 28, 31 | "E": 29, 32 | "g": 30, 33 | "_": 31, 34 | "f": 32, 35 | "q": 33, 36 | "k": 34, 37 | "l": 35, 38 | "m": 36, 39 | "n": 37, 40 | "h": 38, 41 | "w": 39, 42 | "Y": 40, 43 | "y": 41, 44 | "F": 42, 45 | "N": 43, 46 | "K": 44, 47 | "a": 45, 48 | "u": 46, 49 | "i": 47, 50 | "~": 48, 51 | "o": 49, 52 | "`": 50, 53 | "{": 51, 54 | "P": 52, 55 | "J": 53, 56 | "V": 54, 57 | "G": 55 58 | } -------------------------------------------------------------------------------- /document_grounded_generation/transformers/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | default_section = FIRSTPARTY 3 | ensure_newline_before_comments = True 4 | force_grid_wrap = 0 5 | include_trailing_comma = True 6 | known_first_party = transformers 7 | known_third_party = 8 | absl 9 | conllu 10 | datasets 11 | elasticsearch 12 | fairseq 13 | faiss-cpu 14 | fastprogress 15 | fire 16 | fugashi 17 | git 18 | h5py 19 | matplotlib 20 | nltk 21 | numpy 22 | packaging 23 | pandas 24 | PIL 25 | psutil 26 | pytest 27 | pytorch_lightning 28 | rouge_score 29 | sacrebleu 30 | seqeval 31 | sklearn 32 | streamlit 33 | tensorboardX 34 | tensorflow 35 | tensorflow_datasets 36 | timeout_decorator 37 | torch 38 | torchaudio 39 | torchtext 40 | torchvision 41 | torch_xla 42 | tqdm 43 | 44 | line_length = 119 45 | lines_after_imports = 2 46 | multi_line_output = 3 47 | use_parentheses = True 48 | 49 | [flake8] 50 | ignore = E203, E501, E741, W503, W605 51 | max-line-length = 119 52 | 
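Both config sections above are picked up automatically when the corresponding tools are run from the repository root; a minimal sketch of the style checks this file backs (the target paths are illustrative, not taken from this repo):

```bash
# isort reads the [isort] section, flake8 the [flake8] section of setup.cfg
pip install isort flake8
isort --check-only src tests examples   # import ordering, line_length = 119
flake8 src tests examples               # lint with the ignore list and max-line-length above
```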
-------------------------------------------------------------------------------- /finetune_gpt2/src/transformers/commands/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import ABC, abstractmethod 16 | from argparse import ArgumentParser 17 | 18 | 19 | class BaseTransformersCLICommand(ABC): 20 | @staticmethod 21 | @abstractmethod 22 | def register_subcommand(parser: ArgumentParser): 23 | raise NotImplementedError() 24 | 25 | @abstractmethod 26 | def run(self): 27 | raise NotImplementedError() 28 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/wav2vec2/vocab/buckwalter.json: -------------------------------------------------------------------------------- 1 | { 2 | "": 0, 3 | "": 1, 4 | "": 2, 5 | "": 3, 6 | "/": 4, 7 | "'": 5, 8 | "|": 6, 9 | ">": 7, 10 | "&": 8, 11 | "<": 9, 12 | "}": 10, 13 | "A": 11, 14 | "b": 12, 15 | "p": 13, 16 | "t": 14, 17 | "v": 15, 18 | "j": 16, 19 | "H": 17, 20 | "x": 18, 21 | "d": 19, 22 | "*": 20, 23 | "r": 21, 24 | "z": 22, 25 | "s": 23, 26 | "$": 24, 27 | "S": 25, 28 | "D": 26, 29 | "T": 27, 30 | "Z": 28, 31 | "E": 29, 32 | "g": 30, 33 | "_": 31, 34 | "f": 32, 35 | "q": 33, 36 | "k": 34, 37 | "l": 35, 38 | "m": 36, 39 | "n": 37, 40 | "h": 38, 41 | "w": 39, 42 | "Y": 40, 43 | "y": 41, 44 | "F": 42, 45 | "N": 43, 46 | "K": 44, 47 | "a": 45, 48 | "u": 46, 49 | "i": 47, 50 | "~": 48, 51 | "o": 49, 52 | "`": 50, 53 | "{": 51, 54 | "P": 52, 55 | "J": 53, 56 | "V": 54, 57 | "G": 55 58 | } -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/test_data/fsmt/build-eval-data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import io 4 | import json 5 | import subprocess 6 | 7 | 8 | pairs = [ 9 | ["en", "ru"], 10 | ["ru", "en"], 11 | ["en", "de"], 12 | ["de", "en"], 13 | ] 14 | 15 | n_objs = 8 16 | 17 | 18 | def get_all_data(pairs, n_objs): 19 | text = {} 20 | for src, tgt in pairs: 21 | pair = f"{src}-{tgt}" 22 | cmd = f"sacrebleu -t wmt19 -l {pair} --echo src".split() 23 | src_lines = subprocess.run(cmd, stdout=subprocess.PIPE).stdout.decode("utf-8").splitlines() 24 | cmd = f"sacrebleu -t wmt19 -l {pair} --echo ref".split() 25 | tgt_lines = subprocess.run(cmd, stdout=subprocess.PIPE).stdout.decode("utf-8").splitlines() 26 | text[pair] = {"src": src_lines[:n_objs], "tgt": tgt_lines[:n_objs]} 27 | return text 28 | 29 | 30 | text = get_all_data(pairs, n_objs) 31 | filename = "./fsmt_val_data.json" 32 | with io.open(filename, "w", encoding="utf-8") as f: 33 | bleu_data = json.dump(text, f, indent=2, ensure_ascii=False) 34 | -------------------------------------------------------------------------------- 
/document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/finetune_bart_tiny.sh: -------------------------------------------------------------------------------- 1 | # Script for verifying that run_bart_sum can be invoked from its directory 2 | 3 | # Get tiny dataset with cnn_dm format (4 examples for train, val, test) 4 | wget https://cdn-datasets.huggingface.co/summarization/cnn_tiny.tgz 5 | tar -xzvf cnn_tiny.tgz 6 | rm cnn_tiny.tgz 7 | 8 | export OUTPUT_DIR_NAME=bart_utest_output 9 | export CURRENT_DIR=${PWD} 10 | export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME} 11 | 12 | # Make output directory if it doesn't exist 13 | mkdir -p $OUTPUT_DIR 14 | 15 | # Add parent directory to python path to access lightning_base.py and testing_utils.py 16 | export PYTHONPATH="../":"${PYTHONPATH}" 17 | python finetune.py \ 18 | --data_dir=cnn_tiny/ \ 19 | --model_name_or_path=sshleifer/bart-tiny-random \ 20 | --learning_rate=3e-5 \ 21 | --train_batch_size=2 \ 22 | --eval_batch_size=2 \ 23 | --output_dir=$OUTPUT_DIR \ 24 | --num_train_epochs=1 \ 25 | --gpus=0 \ 26 | --do_train "$@" 27 | 28 | rm -rf cnn_tiny 29 | rm -rf $OUTPUT_DIR 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/src/transformers/commands/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
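# Illustrative sketch (hedged; the exact registration pattern varies per command): concrete CLI
# commands subclass the BaseTransformersCLICommand ABC defined below. `register_subcommand`
# attaches an argparse sub-parser plus a factory, and `run` performs the actual work:
#
#   class HelloCommand(BaseTransformersCLICommand):
#       @staticmethod
#       def register_subcommand(parser: ArgumentParser):
#           hello_parser = parser.add_parser("hello")
#           hello_parser.set_defaults(func=lambda args: HelloCommand())
#
#       def run(self):
#           print("Hello from a transformers CLI subcommand")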
14 | 15 | from abc import ABC, abstractmethod 16 | from argparse import ArgumentParser 17 | 18 | 19 | class BaseTransformersCLICommand(ABC): 20 | @staticmethod 21 | @abstractmethod 22 | def register_subcommand(parser: ArgumentParser): 23 | raise NotImplementedError() 24 | 25 | @abstractmethod 26 | def run(self): 27 | raise NotImplementedError() 28 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/pytorch-lightning/run_glue.sh: -------------------------------------------------------------------------------- 1 | # Install example requirements 2 | pip install -r ../requirements.txt 3 | 4 | # Download glue data 5 | python3 ../../utils/download_glue_data.py 6 | 7 | export TASK=mrpc 8 | export DATA_DIR=./glue_data/MRPC/ 9 | export MAX_LENGTH=128 10 | export LEARNING_RATE=2e-5 11 | export BERT_MODEL=bert-base-cased 12 | export BATCH_SIZE=32 13 | export NUM_EPOCHS=3 14 | export SEED=2 15 | export OUTPUT_DIR_NAME=mrpc-pl-bert 16 | export CURRENT_DIR=${PWD} 17 | export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME} 18 | 19 | # Make output directory if it doesn't exist 20 | mkdir -p $OUTPUT_DIR 21 | # Add parent directory to python path to access lightning_base.py 22 | export PYTHONPATH="../":"${PYTHONPATH}" 23 | 24 | python3 run_glue.py --gpus 1 --data_dir $DATA_DIR \ 25 | --task $TASK \ 26 | --model_name_or_path $BERT_MODEL \ 27 | --output_dir $OUTPUT_DIR \ 28 | --max_seq_length $MAX_LENGTH \ 29 | --learning_rate $LEARNING_RATE \ 30 | --num_train_epochs $NUM_EPOCHS \ 31 | --train_batch_size $BATCH_SIZE \ 32 | --seed $SEED \ 33 | --do_train \ 34 | --do_predict 35 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/deebert/entropy_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | PATH_TO_DATA=/h/xinji/projects/GLUE 5 | 6 | MODEL_TYPE=bert # bert or roberta 7 | MODEL_SIZE=base # base or large 8 | DATASET=MRPC # SST-2, MRPC, RTE, QNLI, QQP, or MNLI 9 | 10 | MODEL_NAME=${MODEL_TYPE}-${MODEL_SIZE} 11 | if [ $MODEL_TYPE = 'bert' ] 12 | then 13 | MODEL_NAME=${MODEL_NAME}-uncased 14 | fi 15 | 16 | ENTROPIES="0 0.1 0.2 0.3 0.4 0.5 0.6 0.7" 17 | 18 | for ENTROPY in $ENTROPIES; do 19 | python -u run_glue_deebert.py \ 20 | --model_type $MODEL_TYPE \ 21 | --model_name_or_path ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 22 | --task_name $DATASET \ 23 | --do_eval \ 24 | --do_lower_case \ 25 | --data_dir $PATH_TO_DATA/$DATASET \ 26 | --output_dir ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 27 | --plot_data_dir ./results/ \ 28 | --max_seq_length 128 \ 29 | --early_exit_entropy $ENTROPY \ 30 | --eval_highway \ 31 | --overwrite_cache \ 32 | --per_gpu_eval_batch_size=1 33 | done 34 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/test_data/fsmt/build-eval-data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import io 4 | import json 5 | import subprocess 6 | 7 | 8 | pairs = [ 9 | ["en", "ru"], 10 | ["ru", "en"], 11 | ["en", "de"], 12 | ["de", "en"], 13 | ] 14 | 15 | n_objs = 8 16 | 17 | 18 | def get_all_data(pairs, n_objs): 19 | text = {} 20 | for src, tgt in pairs: 21 | pair = f"{src}-{tgt}" 22 | cmd = f"sacrebleu -t wmt19 -l {pair} --echo src".split() 23 | src_lines = 
subprocess.run(cmd, stdout=subprocess.PIPE).stdout.decode("utf-8").splitlines() 24 | cmd = f"sacrebleu -t wmt19 -l {pair} --echo ref".split() 25 | tgt_lines = subprocess.run(cmd, stdout=subprocess.PIPE).stdout.decode("utf-8").splitlines() 26 | text[pair] = {"src": src_lines[:n_objs], "tgt": tgt_lines[:n_objs]} 27 | return text 28 | 29 | 30 | text = get_all_data(pairs, n_objs) 31 | filename = "./fsmt_val_data.json" 32 | with io.open(filename, "w", encoding="utf-8") as f: 33 | bleu_data = json.dump(text, f, indent=2, ensure_ascii=False) 34 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/.github/conda/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "transformers" %} 2 | 3 | package: 4 | name: "{{ name|lower }}" 5 | version: "{{ TRANSFORMERS_VERSION }}" 6 | 7 | source: 8 | path: ../../ 9 | 10 | build: 11 | noarch: python 12 | 13 | requirements: 14 | host: 15 | - python 16 | - pip 17 | - numpy >=1.17 18 | - dataclasses 19 | - packaging 20 | - filelock 21 | - requests 22 | - tqdm >=4.27 23 | - sacremoses 24 | - regex !=2019.12.17 25 | - protobuf 26 | - tokenizers >=0.10.1,<0.11.0 27 | run: 28 | - python 29 | - numpy >=1.17 30 | - dataclasses 31 | - packaging 32 | - filelock 33 | - requests 34 | - tqdm >=4.27 35 | - sacremoses 36 | - regex !=2019.12.17 37 | - protobuf 38 | - tokenizers >=0.10.1,<0.11.0 39 | 40 | test: 41 | imports: 42 | - transformers 43 | 44 | about: 45 | home: https://huggingface.co 46 | license: Apache License 2.0 47 | license_file: LICENSE 48 | summary: "🤗Transformers: State-of-the-art Natural Language Processing for Pytorch and TensorFlow 2.0." 49 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/finetune.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
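# Illustrative invocation (values are placeholders; see the usage note below and the README):
#   ./finetune.sh \
#     --data_dir ./wmt_en_ro \
#     --model_name_or_path t5-small \
#     --output_dir ./seq2seq_finetune_output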
14 | 15 | # the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path 16 | # run ./finetune.sh --help to see all the possible options 17 | python finetune_trainer.py \ 18 | --learning_rate=3e-5 \ 19 | --fp16 \ 20 | --do_train --do_eval --do_predict \ 21 | --evaluation_strategy steps \ 22 | --predict_with_generate \ 23 | --n_val 1000 \ 24 | "$@" 25 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/deebert/entropy_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | PATH_TO_DATA=/h/xinji/projects/GLUE 5 | 6 | MODEL_TYPE=bert # bert or roberta 7 | MODEL_SIZE=base # base or large 8 | DATASET=MRPC # SST-2, MRPC, RTE, QNLI, QQP, or MNLI 9 | 10 | MODEL_NAME=${MODEL_TYPE}-${MODEL_SIZE} 11 | if [ $MODEL_TYPE = 'bert' ] 12 | then 13 | MODEL_NAME=${MODEL_NAME}-uncased 14 | fi 15 | 16 | ENTROPIES="0 0.1 0.2 0.3 0.4 0.5 0.6 0.7" 17 | 18 | for ENTROPY in $ENTROPIES; do 19 | python -u run_glue_deebert.py \ 20 | --model_type $MODEL_TYPE \ 21 | --model_name_or_path ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 22 | --task_name $DATASET \ 23 | --do_eval \ 24 | --do_lower_case \ 25 | --data_dir $PATH_TO_DATA/$DATASET \ 26 | --output_dir ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 27 | --plot_data_dir ./results/ \ 28 | --max_seq_length 128 \ 29 | --early_exit_entropy $ENTROPY \ 30 | --eval_highway \ 31 | --overwrite_cache \ 32 | --per_gpu_eval_batch_size=1 33 | done 34 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/pytorch-lightning/run_glue.sh: -------------------------------------------------------------------------------- 1 | # Install example requirements 2 | pip install -r ../requirements.txt 3 | 4 | # Download glue data 5 | python3 ../../utils/download_glue_data.py 6 | 7 | export TASK=mrpc 8 | export DATA_DIR=./glue_data/MRPC/ 9 | export MAX_LENGTH=128 10 | export LEARNING_RATE=2e-5 11 | export BERT_MODEL=bert-base-cased 12 | export BATCH_SIZE=32 13 | export NUM_EPOCHS=3 14 | export SEED=2 15 | export OUTPUT_DIR_NAME=mrpc-pl-bert 16 | export CURRENT_DIR=${PWD} 17 | export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME} 18 | 19 | # Make output directory if it doesn't exist 20 | mkdir -p $OUTPUT_DIR 21 | # Add parent directory to python path to access lightning_base.py 22 | export PYTHONPATH="../":"${PYTHONPATH}" 23 | 24 | python3 run_glue.py --gpus 1 --data_dir $DATA_DIR \ 25 | --task $TASK \ 26 | --model_name_or_path $BERT_MODEL \ 27 | --output_dir $OUTPUT_DIR \ 28 | --max_seq_length $MAX_LENGTH \ 29 | --learning_rate $LEARNING_RATE \ 30 | --num_train_epochs $NUM_EPOCHS \ 31 | --train_batch_size $BATCH_SIZE \ 32 | --seed $SEED \ 33 | --do_train \ 34 | --do_predict 35 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/deebert/train_deebert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | PATH_TO_DATA=/h/xinji/projects/GLUE 5 | 6 | MODEL_TYPE=bert # bert or roberta 7 | MODEL_SIZE=base # base or large 8 | DATASET=MRPC # SST-2, MRPC, RTE, QNLI, QQP, or MNLI 9 | 10 | MODEL_NAME=${MODEL_TYPE}-${MODEL_SIZE} 11 | EPOCHS=10 12 | if [ $MODEL_TYPE = 'bert' ] 13 | then 14 | 
EPOCHS=3 15 | MODEL_NAME=${MODEL_NAME}-uncased 16 | fi 17 | 18 | 19 | python -u run_glue_deebert.py \ 20 | --model_type $MODEL_TYPE \ 21 | --model_name_or_path $MODEL_NAME \ 22 | --task_name $DATASET \ 23 | --do_train \ 24 | --do_eval \ 25 | --do_lower_case \ 26 | --data_dir $PATH_TO_DATA/$DATASET \ 27 | --max_seq_length 128 \ 28 | --per_gpu_eval_batch_size=1 \ 29 | --per_gpu_train_batch_size=8 \ 30 | --learning_rate 2e-5 \ 31 | --num_train_epochs $EPOCHS \ 32 | --overwrite_output_dir \ 33 | --seed 42 \ 34 | --output_dir ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 35 | --plot_data_dir ./results/ \ 36 | --save_steps 0 \ 37 | --overwrite_cache \ 38 | --eval_after_first_stage 39 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/finetune.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path 16 | # run ./finetune.sh --help to see all the possible options 17 | python finetune_trainer.py \ 18 | --learning_rate=3e-5 \ 19 | --fp16 \ 20 | --do_train --do_eval --do_predict \ 21 | --evaluation_strategy steps \ 22 | --predict_with_generate \ 23 | --n_val 1000 \ 24 | "$@" 25 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/deebert/train_deebert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | PATH_TO_DATA=/h/xinji/projects/GLUE 5 | 6 | MODEL_TYPE=bert # bert or roberta 7 | MODEL_SIZE=base # base or large 8 | DATASET=MRPC # SST-2, MRPC, RTE, QNLI, QQP, or MNLI 9 | 10 | MODEL_NAME=${MODEL_TYPE}-${MODEL_SIZE} 11 | EPOCHS=10 12 | if [ $MODEL_TYPE = 'bert' ] 13 | then 14 | EPOCHS=3 15 | MODEL_NAME=${MODEL_NAME}-uncased 16 | fi 17 | 18 | 19 | python -u run_glue_deebert.py \ 20 | --model_type $MODEL_TYPE \ 21 | --model_name_or_path $MODEL_NAME \ 22 | --task_name $DATASET \ 23 | --do_train \ 24 | --do_eval \ 25 | --do_lower_case \ 26 | --data_dir $PATH_TO_DATA/$DATASET \ 27 | --max_seq_length 128 \ 28 | --per_gpu_eval_batch_size=1 \ 29 | --per_gpu_train_batch_size=8 \ 30 | --learning_rate 2e-5 \ 31 | --num_train_epochs $EPOCHS \ 32 | --overwrite_output_dir \ 33 | --seed 42 \ 34 | --output_dir ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 35 | --plot_data_dir ./results/ \ 36 | --save_steps 0 \ 37 | --overwrite_cache \ 38 | --eval_after_first_stage 39 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/finetune_tpu.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 
The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | export TPU_NUM_CORES=8 16 | 17 | # the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path 18 | # run ./finetune_tpu.sh --help to see all the possible options 19 | python xla_spawn.py --num_cores $TPU_NUM_CORES \ 20 | finetune_trainer.py \ 21 | --learning_rate=3e-5 \ 22 | --do_train --do_eval \ 23 | --evaluation_strategy steps \ 24 | --prediction_loss_only \ 25 | --n_val 1000 \ 26 | "$@" 27 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/tests/deepspeed/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": true, 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 32, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | 11 | "zero_optimization": { 12 | "stage": 2, 13 | "allgather_partitions": true, 14 | "allgather_bucket_size": 2e8, 15 | "overlap_comm": true, 16 | "reduce_scatter": true, 17 | "reduce_bucket_size": 2e8, 18 | "contiguous_gradients": true, 19 | "cpu_offload": true 20 | }, 21 | 22 | "optimizer": { 23 | "type": "AdamW", 24 | "params": { 25 | "lr": 3e-5, 26 | "betas": [0.8, 0.999], 27 | "eps": 1e-8, 28 | "weight_decay": 3e-7 29 | } 30 | }, 31 | 32 | "scheduler": { 33 | "type": "WarmupLR", 34 | "params": { 35 | "warmup_min_lr": 0, 36 | "warmup_max_lr": 3e-5, 37 | "warmup_num_steps": 500 38 | } 39 | }, 40 | 41 | "steps_per_print": 2000, 42 | "wall_clock_breakdown": false 43 | } 44 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docker/transformers-pytorch-tpu/bert-base-cased.jsonnet: -------------------------------------------------------------------------------- 1 | local base = import 'templates/base.libsonnet'; 2 | local tpus = import 'templates/tpus.libsonnet'; 3 | local utils = import "templates/utils.libsonnet"; 4 | local volumes = import "templates/volumes.libsonnet"; 5 | 6 | local bertBaseCased = base.BaseTest { 7 | frameworkPrefix: "hf", 8 | modelName: "bert-base-cased", 9 | mode: "example", 10 | configMaps: [], 11 | 12 | timeout: 3600, # 1 hour, in seconds 13 | 14 | image: std.extVar('image'), 15 | imageTag: std.extVar('image-tag'), 16 | 17 | tpuSettings+: { 18 | softwareVersion: "pytorch-nightly", 19 | }, 20 | accelerator: tpus.v3_8, 21 | 22 | volumeMap+: { 23 | datasets: volumes.PersistentVolumeSpec { 24 | name: "huggingface-cluster-disk", 25 | mountPath: "/datasets", 26 | }, 27 | }, 28 | command: utils.scriptCommand( 29 | ||| 30 | python -m pytest -s transformers/examples/test_xla_examples.py -v 31 | test_exit_code=$? 
32 | echo "\nFinished running commands.\n" 33 | test $test_exit_code -eq 0 34 | ||| 35 | ), 36 | }; 37 | 38 | bertBaseCased.oneshotJob 39 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/finetune_tpu.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | export TPU_NUM_CORES=8 16 | 17 | # the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path 18 | # run ./finetune_tpu.sh --help to see all the possible options 19 | python xla_spawn.py --num_cores $TPU_NUM_CORES \ 20 | finetune_trainer.py \ 21 | --learning_rate=3e-5 \ 22 | --do_train --do_eval \ 23 | --evaluation_strategy steps \ 24 | --prediction_loss_only \ 25 | --n_val 1000 \ 26 | "$@" 27 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/tests/deepspeed/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": true, 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 32, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | 11 | "zero_optimization": { 12 | "stage": 2, 13 | "allgather_partitions": true, 14 | "allgather_bucket_size": 2e8, 15 | "overlap_comm": true, 16 | "reduce_scatter": true, 17 | "reduce_bucket_size": 2e8, 18 | "contiguous_gradients": true, 19 | "cpu_offload": true 20 | }, 21 | 22 | "optimizer": { 23 | "type": "AdamW", 24 | "params": { 25 | "lr": 3e-5, 26 | "betas": [0.8, 0.999], 27 | "eps": 1e-8, 28 | "weight_decay": 3e-7 29 | } 30 | }, 31 | 32 | "scheduler": { 33 | "type": "WarmupLR", 34 | "params": { 35 | "warmup_min_lr": 0, 36 | "warmup_max_lr": 3e-5, 37 | "warmup_num_steps": 500 38 | } 39 | }, 40 | 41 | "steps_per_print": 2000, 42 | "wall_clock_breakdown": false 43 | } 44 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/scripts/fsmt/tests-to-run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
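# Note on the two invocations below: the first sets CUDA_VISIBLE_DEVICES="" so the selected
# FSMT tests run on CPU only, the second repeats them with GPUs visible; RUN_SLOW=1 also
# enables the tests marked with the @slow decorator in both runs.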
15 | 16 | # these scripts need to be run before any changes to FSMT-related code - it should cover all bases 17 | 18 | CUDA_VISIBLE_DEVICES="" RUN_SLOW=1 pytest --disable-warnings tests/test_tokenization_fsmt.py tests/test_configuration_auto.py tests/test_modeling_fsmt.py examples/seq2seq/test_fsmt_bleu_score.py 19 | RUN_SLOW=1 pytest --disable-warnings tests/test_tokenization_fsmt.py tests/test_configuration_auto.py tests/test_modeling_fsmt.py examples/seq2seq/test_fsmt_bleu_score.py 20 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/rag/finetune_rag.sh: -------------------------------------------------------------------------------- 1 | # Add parent directory to python path to access lightning_base.py 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | # A sample finetuning run, you need to specify data_dir, output_dir and model_name_or_path 5 | # run ./examples/rag/finetune_rag.sh --help to see all the possible options 6 | 7 | python examples/rag/finetune_rag.py \ 8 | --data_dir $DATA_DIR \ 9 | --output_dir $OUTPUT_DIR \ 10 | --model_name_or_path $MODEL_NAME_OR_PATH \ 11 | --model_type rag_sequence \ 12 | --fp16 \ 13 | --gpus 8 \ 14 | --profile \ 15 | --do_train \ 16 | --do_predict \ 17 | --n_val -1 \ 18 | --train_batch_size 8 \ 19 | --eval_batch_size 1 \ 20 | --max_source_length 128 \ 21 | --max_target_length 25 \ 22 | --val_max_target_length 25 \ 23 | --test_max_target_length 25 \ 24 | --label_smoothing 0.1 \ 25 | --dropout 0.1 \ 26 | --attention_dropout 0.1 \ 27 | --weight_decay 0.001 \ 28 | --adam_epsilon 1e-08 \ 29 | --max_grad_norm 0.1 \ 30 | --lr_scheduler polynomial \ 31 | --learning_rate 3e-05 \ 32 | --num_train_epochs 100 \ 33 | --warmup_steps 500 \ 34 | --gradient_accumulation_steps 1 \ 35 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/token-classification/run_chunk.sh: -------------------------------------------------------------------------------- 1 | if ! [ -f ./dev.txt ]; then 2 | echo "Downloading CONLL2003 dev dataset...." 3 | curl -L -o ./dev.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/valid.txt' 4 | fi 5 | 6 | if ! [ -f ./test.txt ]; then 7 | echo "Downloading CONLL2003 test dataset...." 8 | curl -L -o ./test.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/test.txt' 9 | fi 10 | 11 | if ! [ -f ./train.txt ]; then 12 | echo "Downloading CONLL2003 train dataset...." 13 | curl -L -o ./train.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/train.txt' 14 | fi 15 | 16 | export MAX_LENGTH=200 17 | export BERT_MODEL=bert-base-uncased 18 | export OUTPUT_DIR=chunker-model 19 | export BATCH_SIZE=32 20 | export NUM_EPOCHS=3 21 | export SAVE_STEPS=750 22 | export SEED=1 23 | 24 | python3 run_ner.py \ 25 | --task_type Chunk \ 26 | --data_dir . \ 27 | --model_name_or_path $BERT_MODEL \ 28 | --output_dir $OUTPUT_DIR \ 29 | --max_seq_length $MAX_LENGTH \ 30 | --num_train_epochs $NUM_EPOCHS \ 31 | --per_gpu_train_batch_size $BATCH_SIZE \ 32 | --save_steps $SAVE_STEPS \ 33 | --seed $SEED \ 34 | --do_train \ 35 | --do_eval \ 36 | --do_predict 37 | 38 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/token-classification/run_pos.sh: -------------------------------------------------------------------------------- 1 | if ! [ -f ./dev.txt ]; then 2 | echo "Download dev dataset...." 
3 | curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu' 4 | fi 5 | 6 | if ! [ -f ./test.txt ]; then 7 | echo "Download test dataset...." 8 | curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu' 9 | fi 10 | 11 | if ! [ -f ./train.txt ]; then 12 | echo "Download train dataset...." 13 | curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu' 14 | fi 15 | 16 | export MAX_LENGTH=200 17 | export BERT_MODEL=bert-base-uncased 18 | export OUTPUT_DIR=postagger-model 19 | export BATCH_SIZE=32 20 | export NUM_EPOCHS=3 21 | export SAVE_STEPS=750 22 | export SEED=1 23 | 24 | python3 run_ner.py \ 25 | --task_type POS \ 26 | --data_dir . \ 27 | --model_name_or_path $BERT_MODEL \ 28 | --output_dir $OUTPUT_DIR \ 29 | --max_seq_length $MAX_LENGTH \ 30 | --num_train_epochs $NUM_EPOCHS \ 31 | --per_gpu_train_batch_size $BATCH_SIZE \ 32 | --save_steps $SAVE_STEPS \ 33 | --seed $SEED \ 34 | --do_train \ 35 | --do_eval \ 36 | --do_predict 37 | 38 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/token-classification/scripts/preprocess.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from transformers import AutoTokenizer 4 | 5 | 6 | dataset = sys.argv[1] 7 | model_name_or_path = sys.argv[2] 8 | max_len = int(sys.argv[3]) 9 | 10 | subword_len_counter = 0 11 | 12 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) 13 | max_len -= tokenizer.num_special_tokens_to_add() 14 | 15 | with open(dataset, "rt") as f_p: 16 | for line in f_p: 17 | line = line.rstrip() 18 | 19 | if not line: 20 | print(line) 21 | subword_len_counter = 0 22 | continue 23 | 24 | token = line.split()[0] 25 | 26 | current_subwords_len = len(tokenizer.tokenize(token)) 27 | 28 | # Token contains strange control characters like \x96 or \x95 29 | # Just filter out the complete line 30 | if current_subwords_len == 0: 31 | continue 32 | 33 | if (subword_len_counter + current_subwords_len) > max_len: 34 | print("") 35 | print(line) 36 | subword_len_counter = current_subwords_len 37 | continue 38 | 39 | subword_len_counter += current_subwords_len 40 | 41 | print(line) 42 | -------------------------------------------------------------------------------- /finetune_gpt2/src/transformers/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
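# Usage sketch (illustrative; assumes the legacy dataset signatures re-exported below):
#
#   from transformers import AutoTokenizer, TextDataset
#
#   tokenizer = AutoTokenizer.from_pretrained("gpt2")
#   train_dataset = TextDataset(tokenizer=tokenizer, file_path="train.txt", block_size=128)
#
# LineByLineTextDataset takes the same arguments but keeps one example per input line.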
18 | 19 | from .glue import GlueDataset, GlueDataTrainingArguments 20 | from .language_modeling import ( 21 | LineByLineTextDataset, 22 | LineByLineWithRefDataset, 23 | LineByLineWithSOPTextDataset, 24 | TextDataset, 25 | TextDatasetForNextSentencePrediction, 26 | ) 27 | from .squad import SquadDataset, SquadDataTrainingArguments 28 | -------------------------------------------------------------------------------- /sorting/run_sort_inftyformer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $1 == 'train' ]]; then 4 | echo 'Run training...' 5 | python3 train.py \ 6 | --cuda \ 7 | --data ../data/ \ 8 | --dataset ../data_sort_8000 \ 9 | --n_layer 3 \ 10 | --d_model 300 \ 11 | --n_head 6 \ 12 | --d_head 50 \ 13 | --d_inner 300 \ 14 | --dropout 0.1 \ 15 | --dropatt 0.0 \ 16 | --optim adam \ 17 | --lr 0.0002 \ 18 | --warmup_step 0 \ 19 | --max_step 20000 \ 20 | --tgt_len 1024 \ 21 | --mem_len 1024 \ 22 | --eval_tgt_len 1024 \ 23 | --batch_size 8 \ 24 | --gpu0_bsz 8 \ 25 | --continuous \ 26 | --long_term_attention \ 27 | --long_term_attention_norm='softmax' \ 28 | --long_term_attention_basis 512 \ 29 | --affines \ 30 | --augment \ 31 | --augment_len 1024 \ 32 | --infinite_memory \ 33 | --mask \ 34 | --mask_type 'cnn' \ 35 | --kl_regularizer \ 36 | --kl_m .000001 \ 37 | --sigma_0 .05 \ 38 | --name infty_former \ 39 | --work_dir ./sort_8000 \ 40 | ${@:2} 41 | echo 'unknown argment 1' 42 | fi 43 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/tests/test_pipelines_text2text_generation.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import unittest 16 | 17 | from .test_pipelines_common import MonoInputPipelineCommonMixin 18 | 19 | 20 | class Text2TextGenerationPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase): 21 | pipeline_task = "text2text-generation" 22 | small_models = ["patrickvonplaten/t5-tiny-random"] # Default model - Models tested without the @slow decorator 23 | large_models = [] # Models tested with the @slow decorator 24 | invalid_inputs = [4, ""] 25 | mandatory_keys = ["generated_text"] 26 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/token-classification/run_pos.sh: -------------------------------------------------------------------------------- 1 | if ! [ -f ./dev.txt ]; then 2 | echo "Download dev dataset...." 3 | curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu' 4 | fi 5 | 6 | if ! [ -f ./test.txt ]; then 7 | echo "Download test dataset...." 8 | curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu' 9 | fi 10 | 11 | if ! 
[ -f ./train.txt ]; then 12 | echo "Download train dataset...." 13 | curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu' 14 | fi 15 | 16 | export MAX_LENGTH=200 17 | export BERT_MODEL=bert-base-uncased 18 | export OUTPUT_DIR=postagger-model 19 | export BATCH_SIZE=32 20 | export NUM_EPOCHS=3 21 | export SAVE_STEPS=750 22 | export SEED=1 23 | 24 | python3 run_ner.py \ 25 | --task_type POS \ 26 | --data_dir . \ 27 | --model_name_or_path $BERT_MODEL \ 28 | --output_dir $OUTPUT_DIR \ 29 | --max_seq_length $MAX_LENGTH \ 30 | --num_train_epochs $NUM_EPOCHS \ 31 | --per_gpu_train_batch_size $BATCH_SIZE \ 32 | --save_steps $SAVE_STEPS \ 33 | --seed $SEED \ 34 | --do_train \ 35 | --do_eval \ 36 | --do_predict 37 | 38 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/rag/finetune_rag.sh: -------------------------------------------------------------------------------- 1 | # Add parent directory to python path to access lightning_base.py 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | # A sample finetuning run, you need to specify data_dir, output_dir and model_name_or_path 5 | # run ./examples/rag/finetune_rag.sh --help to see all the possible options 6 | 7 | python examples/rag/finetune_rag.py \ 8 | --data_dir $DATA_DIR \ 9 | --output_dir $OUTPUT_DIR \ 10 | --model_name_or_path $MODEL_NAME_OR_PATH \ 11 | --model_type rag_sequence \ 12 | --fp16 \ 13 | --gpus 8 \ 14 | --profile \ 15 | --do_train \ 16 | --do_predict \ 17 | --n_val -1 \ 18 | --train_batch_size 8 \ 19 | --eval_batch_size 1 \ 20 | --max_source_length 128 \ 21 | --max_target_length 25 \ 22 | --val_max_target_length 25 \ 23 | --test_max_target_length 25 \ 24 | --label_smoothing 0.1 \ 25 | --dropout 0.1 \ 26 | --attention_dropout 0.1 \ 27 | --weight_decay 0.001 \ 28 | --adam_epsilon 1e-08 \ 29 | --max_grad_norm 0.1 \ 30 | --lr_scheduler polynomial \ 31 | --learning_rate 3e-05 \ 32 | --num_train_epochs 100 \ 33 | --warmup_steps 500 \ 34 | --gradient_accumulation_steps 1 \ 35 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/tests/test_pipelines_feature_extraction.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
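# For reference, the pipeline exercised here can be built directly (illustrative sketch
# using the same tiny checkpoint listed in `small_models` below):
#
#   from transformers import pipeline
#
#   extractor = pipeline("feature-extraction", model="sshleifer/tiny-distilbert-base-cased")
#   features = extractor("Hello world")  # nested lists of per-token hidden states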
14 | 15 | import unittest 16 | 17 | from .test_pipelines_common import MonoInputPipelineCommonMixin 18 | 19 | 20 | class FeatureExtractionPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase): 21 | pipeline_task = "feature-extraction" 22 | small_models = [ 23 | "sshleifer/tiny-distilbert-base-cased" 24 | ] # Default model - Models tested without the @slow decorator 25 | large_models = [None] # Models tested with the @slow decorator 26 | mandatory_keys = {} # Keys which should be in the output 27 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/token-classification/run_chunk.sh: -------------------------------------------------------------------------------- 1 | if ! [ -f ./dev.txt ]; then 2 | echo "Downloading CONLL2003 dev dataset...." 3 | curl -L -o ./dev.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/valid.txt' 4 | fi 5 | 6 | if ! [ -f ./test.txt ]; then 7 | echo "Downloading CONLL2003 test dataset...." 8 | curl -L -o ./test.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/test.txt' 9 | fi 10 | 11 | if ! [ -f ./train.txt ]; then 12 | echo "Downloading CONLL2003 train dataset...." 13 | curl -L -o ./train.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/train.txt' 14 | fi 15 | 16 | export MAX_LENGTH=200 17 | export BERT_MODEL=bert-base-uncased 18 | export OUTPUT_DIR=chunker-model 19 | export BATCH_SIZE=32 20 | export NUM_EPOCHS=3 21 | export SAVE_STEPS=750 22 | export SEED=1 23 | 24 | python3 run_ner.py \ 25 | --task_type Chunk \ 26 | --data_dir . \ 27 | --model_name_or_path $BERT_MODEL \ 28 | --output_dir $OUTPUT_DIR \ 29 | --max_seq_length $MAX_LENGTH \ 30 | --num_train_epochs $NUM_EPOCHS \ 31 | --per_gpu_train_batch_size $BATCH_SIZE \ 32 | --save_steps $SAVE_STEPS \ 33 | --seed $SEED \ 34 | --do_train \ 35 | --do_eval \ 36 | --do_predict 37 | 38 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/token-classification/scripts/preprocess.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from transformers import AutoTokenizer 4 | 5 | 6 | dataset = sys.argv[1] 7 | model_name_or_path = sys.argv[2] 8 | max_len = int(sys.argv[3]) 9 | 10 | subword_len_counter = 0 11 | 12 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) 13 | max_len -= tokenizer.num_special_tokens_to_add() 14 | 15 | with open(dataset, "rt") as f_p: 16 | for line in f_p: 17 | line = line.rstrip() 18 | 19 | if not line: 20 | print(line) 21 | subword_len_counter = 0 22 | continue 23 | 24 | token = line.split()[0] 25 | 26 | current_subwords_len = len(tokenizer.tokenize(token)) 27 | 28 | # Token contains strange control characters like \x96 or \x95 29 | # Just filter out the complete line 30 | if current_subwords_len == 0: 31 | continue 32 | 33 | if (subword_len_counter + current_subwords_len) > max_len: 34 | print("") 35 | print(line) 36 | subword_len_counter = current_subwords_len 37 | continue 38 | 39 | subword_len_counter += current_subwords_len 40 | 41 | print(line) 42 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/src/transformers/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' 
imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from .glue import GlueDataset, GlueDataTrainingArguments 20 | from .language_modeling import ( 21 | LineByLineTextDataset, 22 | LineByLineWithRefDataset, 23 | LineByLineWithSOPTextDataset, 24 | TextDataset, 25 | TextDatasetForNextSentencePrediction, 26 | ) 27 | from .squad import SquadDataset, SquadDataTrainingArguments 28 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/.github/workflows/release-conda.yml: -------------------------------------------------------------------------------- 1 | name: Release - Conda 2 | 3 | on: 4 | push: 5 | tags: 6 | - v* 7 | 8 | env: 9 | ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} 10 | 11 | jobs: 12 | build_and_package: 13 | runs-on: ubuntu-latest 14 | defaults: 15 | run: 16 | shell: bash -l {0} 17 | 18 | steps: 19 | - name: Checkout repository 20 | uses: actions/checkout@v1 21 | 22 | - name: Install miniconda 23 | uses: conda-incubator/setup-miniconda@v2 24 | with: 25 | auto-update-conda: true 26 | auto-activate-base: false 27 | activate-environment: "build-transformers" 28 | channels: huggingface 29 | 30 | - name: Setup conda env 31 | run: | 32 | conda install -c defaults anaconda-client conda-build 33 | 34 | - name: Extract version 35 | run: echo "TRANSFORMERS_VERSION=`python setup.py --version`" >> $GITHUB_ENV 36 | 37 | - name: Build conda packages 38 | run: | 39 | conda info 40 | conda list 41 | conda-build .github/conda 42 | 43 | - name: Upload to Anaconda 44 | run: anaconda upload `conda-build .github/conda --output` --force 45 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | # Research projects 18 | 19 | This folder contains various research projects using 🤗 Transformers. They are not maintained and require a specific 20 | version of 🤗 Transformers that is indicated in the requirements file of each folder. Updating them to the most recent version of the library will require some work. 21 | 22 | To use any of them, just run the command 23 | ``` 24 | pip install -r requirements.txt 25 | ``` 26 | inside the folder of your choice. 27 | 28 | If you need help with any of those, contact the author(s), indicated at the top of the `README` of each folder. 29 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/tests/test_pipelines_sentiment_analysis.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import unittest 16 | 17 | from .test_pipelines_common import MonoInputPipelineCommonMixin 18 | 19 | 20 | class SentimentAnalysisPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase): 21 | pipeline_task = "sentiment-analysis" 22 | small_models = [ 23 | "sshleifer/tiny-distilbert-base-uncased-finetuned-sst-2-english" 24 | ] # Default model - Models tested without the @slow decorator 25 | large_models = [None] # Models tested with the @slow decorator 26 | mandatory_keys = {"label", "score"} # Keys which should be in the output 27 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | # Research projects 18 | 19 | This folder contains various research projects using 🤗 Transformers. They are not maintained and require a specific 20 | version of 🤗 Transformers that is indicated in the requirements file of each folder. Updating them to the most recent version of the library will require some work. 21 | 22 | To use any of them, just run the command 23 | ``` 24 | pip install -r requirements.txt 25 | ``` 26 | inside the folder of your choice. 27 | 28 | If you need help with any of those, contact the author(s), indicated at the top of the `README` of each folder. 29 | -------------------------------------------------------------------------------- /finetune_gpt2/src/transformers/data/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
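# Usage sketch (illustrative; assumes the legacy GLUE processor API re-exported below):
#
#   from transformers import AutoTokenizer, glue_convert_examples_to_features, glue_processors
#
#   tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
#   processor = glue_processors["mrpc"]()
#   examples = processor.get_train_examples("/path/to/glue/MRPC")
#   features = glue_convert_examples_to_features(examples, tokenizer, max_length=128, task="mrpc")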
18 | 19 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels 20 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features 21 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor 22 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels 23 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/pytorch-lightning/run_pos.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if ! [ -f ./dev.txt ]; then 3 | echo "Download dev dataset...." 4 | curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu' 5 | fi 6 | 7 | if ! [ -f ./test.txt ]; then 8 | echo "Download test dataset...." 9 | curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu' 10 | fi 11 | 12 | if ! [ -f ./train.txt ]; then 13 | echo "Download train dataset...." 14 | curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu' 15 | fi 16 | 17 | export MAX_LENGTH=200 18 | export BERT_MODEL=bert-base-uncased 19 | export OUTPUT_DIR=postagger-model 20 | export BATCH_SIZE=32 21 | export NUM_EPOCHS=3 22 | export SAVE_STEPS=750 23 | export SEED=1 24 | 25 | 26 | # Add parent directory to python path to access lightning_base.py 27 | export PYTHONPATH="../":"${PYTHONPATH}" 28 | 29 | python3 run_ner.py --data_dir ./ \ 30 | --task_type POS \ 31 | --model_name_or_path $BERT_MODEL \ 32 | --output_dir $OUTPUT_DIR \ 33 | --max_seq_length $MAX_LENGTH \ 34 | --num_train_epochs $NUM_EPOCHS \ 35 | --train_batch_size $BATCH_SIZE \ 36 | --seed $SEED \ 37 | --gpus 1 \ 38 | --do_train \ 39 | --do_predict 40 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/text-generation/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | ## Language generation 18 | 19 | Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/master/examples/text-generation/run_generation.py). 20 | 21 | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL, XLNet, CTRL. 22 | A similar script is used for our official demo [Write With Transfomer](https://transformer.huggingface.co), where you 23 | can try out the different models available in the library. 24 | 25 | Example usage: 26 | 27 | ```bash 28 | python run_generation.py \ 29 | --model_type=gpt2 \ 30 | --model_name_or_path=gpt2 31 | ``` 32 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/src/transformers/data/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 
9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels 20 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features 21 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor 22 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels 23 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/tests/test_cli.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import unittest 17 | from unittest.mock import patch 18 | 19 | from transformers.testing_utils import CaptureStd 20 | 21 | 22 | class CLITest(unittest.TestCase): 23 | @patch("sys.argv", ["fakeprogrampath", "env"]) 24 | def test_cli_env(self): 25 | # test transformers-cli env 26 | import transformers.commands.transformers_cli 27 | 28 | with CaptureStd() as cs: 29 | transformers.commands.transformers_cli.main() 30 | assert "Python version" in cs.out 31 | assert "Platform" in cs.out 32 | assert "Using distributed or parallel set-up in script?" in cs.out 33 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/minify_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
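`CLITest` above relies on the `CaptureStd` helper from `transformers.testing_utils`; here is a standalone sketch of that helper (the command-line equivalent of the test is simply running `transformers-cli env` and reading the report):

```python
from transformers.testing_utils import CaptureStd

with CaptureStd() as cs:
    print("Python version: 3.7 (example output)")

# cs.out holds everything written to stdout inside the block, which is what
# CLITest inspects after invoking the `env` command.
assert "Python version" in cs.out
```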
15 | 16 | from pathlib import Path 17 | 18 | import fire 19 | 20 | 21 | def minify(src_dir: str, dest_dir: str, n: int): 22 | """Write first n lines of each file f in src_dir to dest_dir/f """ 23 | src_dir = Path(src_dir) 24 | dest_dir = Path(dest_dir) 25 | dest_dir.mkdir(exist_ok=True) 26 | for path in src_dir.iterdir(): 27 | new = [x.rstrip() for x in list(path.open().readlines())][:n] 28 | dest_path = dest_dir.joinpath(path.name) 29 | print(dest_path) 30 | dest_path.open("w").write("\n".join(new)) 31 | 32 | 33 | if __name__ == "__main__": 34 | fire.Fire(minify) 35 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/rouge_cli.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import fire 16 | 17 | from utils import calculate_rouge, save_json 18 | 19 | 20 | def calculate_rouge_path(pred_path, tgt_path, save_path=None, **kwargs): 21 | """Kwargs will be passed to calculate_rouge""" 22 | pred_lns = [x.strip() for x in open(pred_path).readlines()] 23 | tgt_lns = [x.strip() for x in open(tgt_path).readlines()][: len(pred_lns)] 24 | metrics = calculate_rouge(pred_lns, tgt_lns, **kwargs) 25 | if save_path is not None: 26 | save_json(metrics, save_path, indent=None) 27 | return metrics # these print nicely 28 | 29 | 30 | if __name__ == "__main__": 31 | fire.Fire(calculate_rouge_path) 32 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/pytorch-lightning/run_pos.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if ! [ -f ./dev.txt ]; then 3 | echo "Download dev dataset...." 4 | curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu' 5 | fi 6 | 7 | if ! [ -f ./test.txt ]; then 8 | echo "Download test dataset...." 9 | curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu' 10 | fi 11 | 12 | if ! [ -f ./train.txt ]; then 13 | echo "Download train dataset...." 
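`minify_dataset.py` and `rouge_cli.py` above wrap plain functions with python-fire, so they can be driven from the command line or called directly. A direct-call sketch, assuming it runs from `examples/legacy/seq2seq` and that the dataset directory and file names (placeholders) exist:

```python
from minify_dataset import minify
from rouge_cli import calculate_rouge_path

# Keep only the first 64 lines of every file in the dataset directory.
minify("wmt_en_ro", "wmt_en_ro_mini", 64)

# Score generated summaries against references and save the metrics as JSON.
metrics = calculate_rouge_path("preds.txt", "targets.txt", save_path="rouge.json")
print(metrics)
```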
14 | curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu' 15 | fi 16 | 17 | export MAX_LENGTH=200 18 | export BERT_MODEL=bert-base-uncased 19 | export OUTPUT_DIR=postagger-model 20 | export BATCH_SIZE=32 21 | export NUM_EPOCHS=3 22 | export SAVE_STEPS=750 23 | export SEED=1 24 | 25 | 26 | # Add parent directory to python path to access lightning_base.py 27 | export PYTHONPATH="../":"${PYTHONPATH}" 28 | 29 | python3 run_ner.py --data_dir ./ \ 30 | --task_type POS \ 31 | --model_name_or_path $BERT_MODEL \ 32 | --output_dir $OUTPUT_DIR \ 33 | --max_seq_length $MAX_LENGTH \ 34 | --num_train_epochs $NUM_EPOCHS \ 35 | --train_batch_size $BATCH_SIZE \ 36 | --seed $SEED \ 37 | --gpus 1 \ 38 | --do_train \ 39 | --do_predict 40 | -------------------------------------------------------------------------------- /finetune_gpt2/src/transformers/utils/dummy_flax_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 2 | from ..file_utils import requires_flax 3 | 4 | 5 | class FlaxPreTrainedModel: 6 | def __init__(self, *args, **kwargs): 7 | requires_flax(self) 8 | 9 | @classmethod 10 | def from_pretrained(self, *args, **kwargs): 11 | requires_flax(self) 12 | 13 | 14 | FLAX_MODEL_MAPPING = None 15 | 16 | 17 | class FlaxAutoModel: 18 | def __init__(self, *args, **kwargs): 19 | requires_flax(self) 20 | 21 | @classmethod 22 | def from_pretrained(self, *args, **kwargs): 23 | requires_flax(self) 24 | 25 | 26 | class FlaxBertForMaskedLM: 27 | def __init__(self, *args, **kwargs): 28 | requires_flax(self) 29 | 30 | @classmethod 31 | def from_pretrained(self, *args, **kwargs): 32 | requires_flax(self) 33 | 34 | 35 | class FlaxBertModel: 36 | def __init__(self, *args, **kwargs): 37 | requires_flax(self) 38 | 39 | @classmethod 40 | def from_pretrained(self, *args, **kwargs): 41 | requires_flax(self) 42 | 43 | 44 | class FlaxRobertaModel: 45 | def __init__(self, *args, **kwargs): 46 | requires_flax(self) 47 | 48 | @classmethod 49 | def from_pretrained(self, *args, **kwargs): 50 | requires_flax(self) 51 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/text-generation/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | ## Language generation 18 | 19 | Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/master/examples/text-generation/run_generation.py). 20 | 21 | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL, XLNet, CTRL. 22 | A similar script is used for our official demo [Write With Transfomer](https://transformer.huggingface.co), where you 23 | can try out the different models available in the library. 24 | 25 | Example usage: 26 | 27 | ```bash 28 | python run_generation.py \ 29 | --model_type=gpt2 \ 30 | --model_name_or_path=gpt2 31 | ``` 32 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/minify_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from pathlib import Path 17 | 18 | import fire 19 | 20 | 21 | def minify(src_dir: str, dest_dir: str, n: int): 22 | """Write first n lines of each file f in src_dir to dest_dir/f """ 23 | src_dir = Path(src_dir) 24 | dest_dir = Path(dest_dir) 25 | dest_dir.mkdir(exist_ok=True) 26 | for path in src_dir.iterdir(): 27 | new = [x.rstrip() for x in list(path.open().readlines())][:n] 28 | dest_path = dest_dir.joinpath(path.name) 29 | print(dest_path) 30 | dest_path.open("w").write("\n".join(new)) 31 | 32 | 33 | if __name__ == "__main__": 34 | fire.Fire(minify) 35 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/rouge_cli.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import fire 16 | 17 | from utils import calculate_rouge, save_json 18 | 19 | 20 | def calculate_rouge_path(pred_path, tgt_path, save_path=None, **kwargs): 21 | """Kwargs will be passed to calculate_rouge""" 22 | pred_lns = [x.strip() for x in open(pred_path).readlines()] 23 | tgt_lns = [x.strip() for x in open(tgt_path).readlines()][: len(pred_lns)] 24 | metrics = calculate_rouge(pred_lns, tgt_lns, **kwargs) 25 | if save_path is not None: 26 | save_json(metrics, save_path, indent=None) 27 | return metrics # these print nicely 28 | 29 | 30 | if __name__ == "__main__": 31 | fire.Fire(calculate_rouge_path) 32 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/src/transformers/utils/dummy_flax_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 
2 | from ..file_utils import requires_flax 3 | 4 | 5 | class FlaxPreTrainedModel: 6 | def __init__(self, *args, **kwargs): 7 | requires_flax(self) 8 | 9 | @classmethod 10 | def from_pretrained(self, *args, **kwargs): 11 | requires_flax(self) 12 | 13 | 14 | FLAX_MODEL_MAPPING = None 15 | 16 | 17 | class FlaxAutoModel: 18 | def __init__(self, *args, **kwargs): 19 | requires_flax(self) 20 | 21 | @classmethod 22 | def from_pretrained(self, *args, **kwargs): 23 | requires_flax(self) 24 | 25 | 26 | class FlaxBertForMaskedLM: 27 | def __init__(self, *args, **kwargs): 28 | requires_flax(self) 29 | 30 | @classmethod 31 | def from_pretrained(self, *args, **kwargs): 32 | requires_flax(self) 33 | 34 | 35 | class FlaxBertModel: 36 | def __init__(self, *args, **kwargs): 37 | requires_flax(self) 38 | 39 | @classmethod 40 | def from_pretrained(self, *args, **kwargs): 41 | requires_flax(self) 42 | 43 | 44 | class FlaxRobertaModel: 45 | def __init__(self, *args, **kwargs): 46 | requires_flax(self) 47 | 48 | @classmethod 49 | def from_pretrained(self, *args, **kwargs): 50 | requires_flax(self) 51 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/model_cards/README.md: -------------------------------------------------------------------------------- 1 | ## 🔥 Model cards now live inside each huggingface.co model repo 🔥 2 | 3 | 4 | For consistency, ease of use and scalability, `README.md` model cards now live directly inside each model repo on the HuggingFace model hub. 5 | 6 | ### How to update a model card 7 | 8 | You can directly update a model card inside any model repo you have **write access** to, i.e.: 9 | - a model under your username namespace 10 | - a model under any organization you are a part of. 11 | 12 | You can either: 13 | - update it, commit and push using your usual git workflow (command line, GUI, etc.) 14 | - or edit it directly from the website's UI. 15 | 16 | **What if you want to create or update a model card for a model you don't have write access to?** 17 | 18 | In that case, given that we don't have a Pull request system yet on huggingface.co (🤯), 19 | you can open an issue here, post the card's content, and tag the model author(s) and/or the Hugging Face team. 20 | 21 | We might implement a more seamless process at some point, so your early feedback is precious! 22 | Please let us know of any suggestion. 23 | 24 | ### What happened to the model cards here? 25 | 26 | We migrated every model card from the repo to its corresponding huggingface.co model repo. Individual commits were preserved, and they link back to the original commit on GitHub. 27 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/docs/source/main_classes/configuration.rst: -------------------------------------------------------------------------------- 1 | .. 2 | Copyright 2020 The HuggingFace Team. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with 5 | the License. You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on 10 | an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the 11 | specific language governing permissions and limitations under the License. 
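A sketch of the dummy-object pattern in `utils/dummy_flax_objects.py` above: when Flax/JAX are not installed, the placeholder classes raise an informative `ImportError` instead of breaking `import transformers`. Illustrative only, assuming the `is_flax_available` helper from `file_utils`:

```python
from transformers.file_utils import is_flax_available

if is_flax_available():
    from transformers import FlaxBertModel          # the real implementation
else:
    from transformers.utils.dummy_flax_objects import FlaxBertModel
    try:
        FlaxBertModel()
    except ImportError as err:
        print(err)  # points the user at the Flax installation instructions
```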
12 | 13 | Configuration 14 | ----------------------------------------------------------------------------------------------------------------------- 15 | 16 | The base class :class:`~transformers.PretrainedConfig` implements the common methods for loading/saving a configuration 17 | either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded 18 | from HuggingFace's AWS S3 repository). 19 | 20 | 21 | PretrainedConfig 22 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 23 | 24 | .. autoclass:: transformers.PretrainedConfig 25 | :members: 26 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/sentence_splitter.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from filelock import FileLock 17 | 18 | 19 | try: 20 | import nltk 21 | 22 | NLTK_AVAILABLE = True 23 | except (ImportError, ModuleNotFoundError): 24 | NLTK_AVAILABLE = False 25 | 26 | if NLTK_AVAILABLE: 27 | with FileLock(".lock") as lock: 28 | nltk.download("punkt", quiet=True) 29 | 30 | 31 | def add_newline_to_end_of_each_sentence(x: str) -> str: 32 | """This was added to get rougeLsum scores matching published rougeL scores for BART and PEGASUS.""" 33 | re.sub("", "", x) # remove pegasus newline char 34 | assert NLTK_AVAILABLE, "nltk must be installed to separate newlines between sentences. (pip install nltk)" 35 | return "\n".join(nltk.sent_tokenize(x)) 36 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/sentence_splitter.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
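A load/tweak/save round trip for the `PretrainedConfig` API documented in `docs/source/main_classes/configuration.rst` above; `gpt2` is just a convenient public checkpoint and the output directory is a placeholder:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("gpt2")   # downloaded, or read from the local cache
config.n_ctx = 512                            # tweak a field before saving
config.save_pretrained("./my-gpt2-config")    # writes config.json into the directory
reloaded = AutoConfig.from_pretrained("./my-gpt2-config")
assert reloaded.n_ctx == 512
```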
14 | import re 15 | 16 | from filelock import FileLock 17 | 18 | 19 | try: 20 | import nltk 21 | 22 | NLTK_AVAILABLE = True 23 | except (ImportError, ModuleNotFoundError): 24 | NLTK_AVAILABLE = False 25 | 26 | if NLTK_AVAILABLE: 27 | with FileLock(".lock") as lock: 28 | nltk.download("punkt", quiet=True) 29 | 30 | 31 | def add_newline_to_end_of_each_sentence(x: str) -> str: 32 | """This was added to get rougeLsum scores matching published rougeL scores for BART and PEGASUS.""" 33 | re.sub("", "", x) # remove pegasus newline char 34 | assert NLTK_AVAILABLE, "nltk must be installed to separate newlines between sentences. (pip install nltk)" 35 | return "\n".join(nltk.sent_tokenize(x)) 36 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/scripts/pegasus/build_test_sample_spm_no_bos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # this script builds a small sample spm file tests/fixtures/test_sentencepiece_no_bos.model, with features needed by pegasus 17 | 18 | # 1. pip install sentencepiece 19 | # 20 | # 2. wget https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt 21 | 22 | # 3. build 23 | import sentencepiece as spm 24 | 25 | # pegasus: 26 | # 1. no bos 27 | # 2. eos_id is 1 28 | # 3. unk_id is 2 29 | # build a sample spm file accordingly 30 | spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=test_sentencepiece_no_bos --bos_id=-1 --unk_id=2 --eos_id=1 --vocab_size=1000') 31 | 32 | # 4. now update the fixture 33 | # mv test_sentencepiece_no_bos.model ../../tests/fixtures/ 34 | -------------------------------------------------------------------------------- /finetune_gpt2/src/transformers/data/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
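A usage sketch for `add_newline_to_end_of_each_sentence` from `sentence_splitter.py` above, assuming it runs from `examples/legacy/seq2seq` with `nltk` installed (the module downloads the `punkt` tokenizer on import):

```python
from sentence_splitter import add_newline_to_end_of_each_sentence

summary = "The model was fine-tuned on XSum. ROUGE-Lsum is computed per sentence."
print(add_newline_to_end_of_each_sentence(summary))
# The model was fine-tuned on XSum.
# ROUGE-Lsum is computed per sentence.
```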
18 | 19 | from .metrics import glue_compute_metrics, xnli_compute_metrics 20 | from .processors import ( 21 | DataProcessor, 22 | InputExample, 23 | InputFeatures, 24 | SingleSentenceClassificationProcessor, 25 | SquadExample, 26 | SquadFeatures, 27 | SquadV1Processor, 28 | SquadV2Processor, 29 | glue_convert_examples_to_features, 30 | glue_output_modes, 31 | glue_processors, 32 | glue_tasks_num_labels, 33 | squad_convert_examples_to_features, 34 | xnli_output_modes, 35 | xnli_processors, 36 | xnli_tasks_num_labels, 37 | ) 38 | -------------------------------------------------------------------------------- /finetune_gpt2/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ XLM-ProphetNet model configuration """ 16 | 17 | 18 | from ...utils import logging 19 | from ..prophetnet.configuration_prophetnet import ProphetNetConfig 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { 25 | "microsoft/xprophetnet-large-wiki100-cased": "https://huggingface.co/microsoft/xprophetnet-large-wiki100-cased/resolve/main/config.json", 26 | } 27 | 28 | 29 | class XLMProphetNetConfig(ProphetNetConfig): 30 | """ 31 | This class overrides :class:`~transformers.ProphetNetConfig`. Please check the superclass for the appropriate 32 | documentation alongside usage examples. 
33 | """ 34 | 35 | model_type = "xlm-prophetnet" 36 | -------------------------------------------------------------------------------- /sorting/utils/exp_utils.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import os, shutil 3 | 4 | import numpy as np 5 | 6 | import torch 7 | 8 | 9 | def logging(s, log_path, print_=True, log_=True): 10 | if print_: 11 | print(s) 12 | if log_: 13 | with open(log_path, 'a+') as f_log: 14 | f_log.write(s + '\n') 15 | 16 | def get_logger(log_path, **kwargs): 17 | return functools.partial(logging, log_path=log_path, **kwargs) 18 | 19 | def create_exp_dir(dir_path, scripts_to_save=None, debug=False): 20 | if debug: 21 | print('Debug Mode : no experiment dir created') 22 | return functools.partial(logging, log_path=None, log_=False) 23 | 24 | if not os.path.exists(dir_path): 25 | os.makedirs(dir_path) 26 | 27 | print('Experiment dir : {}'.format(dir_path)) 28 | if scripts_to_save is not None: 29 | script_path = os.path.join(dir_path, 'scripts') 30 | if not os.path.exists(script_path): 31 | os.makedirs(script_path) 32 | for script in scripts_to_save: 33 | dst_file = os.path.join(dir_path, 'scripts', os.path.basename(script)) 34 | shutil.copyfile(script, dst_file) 35 | 36 | return get_logger(log_path=os.path.join(dir_path, 'log.txt')) 37 | 38 | def save_checkpoint(model, optimizer, path, epoch): 39 | torch.save(model, os.path.join(path, 'model_{}.pt'.format(epoch))) 40 | torch.save(optimizer.state_dict(), os.path.join(path, 'optimizer_{}.pt'.format(epoch))) 41 | -------------------------------------------------------------------------------- /finetune_gpt2/infinite_memory_transformer_sticky_mem/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "gpt2", 3 | "activation_function": "gelu_new", 4 | "affines": true, 5 | "architectures": [ 6 | "GPT2LMHeadModel" 7 | ], 8 | "attn_drop": 0.1, 9 | "attn_pdrop": 0.1, 10 | "bos_token_id": 50256, 11 | "continuous": true, 12 | "embd_pdrop": 0.1, 13 | "eos_token_id": 50256, 14 | "gradient_checkpointing": false, 15 | "infinite_memory": true, 16 | "initializer_range": 0.02, 17 | "kl_regularizer": true, 18 | "layer_norm_epsilon": 1e-05, 19 | "long_term_attention": true, 20 | "long_term_attention_basis": 512, 21 | "long_term_attention_norm": "softmax", 22 | "mask": true, 23 | "mask_dropout": 0, 24 | "mask_type": "cnn", 25 | "model_type": "gpt2", 26 | "mu_0": -1, 27 | "n_ctx": 1024, 28 | "n_embd": 768, 29 | "n_head": 12, 30 | "n_inner": null, 31 | "n_layer": 12, 32 | "n_positions": 1024, 33 | "n_special": 0, 34 | "predict_special_tokens": true, 35 | "resid_pdrop": 0.1, 36 | "sigma_0": 0.05, 37 | "sticky_memories": true, 38 | "summary_activation": null, 39 | "summary_first_dropout": 0.1, 40 | "summary_proj_to_labels": true, 41 | "summary_type": "cls_index", 42 | "summary_use_proj": true, 43 | "task_specific_params": { 44 | "text-generation": { 45 | "do_sample": true, 46 | "max_length": 50 47 | } 48 | }, 49 | "transformers_version": "4.5.0.dev0", 50 | "use_cache": true, 51 | "vocab_size": 50257 52 | } 53 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/src/transformers/data/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. 
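A sketch for `sorting/utils/exp_utils.py` above: create an experiment directory, snapshot a script into it, and get a logger that both prints and appends to `log.txt`. The import path assumes the `sorting` directory is on `PYTHONPATH`, and the script name is a placeholder:

```python
from utils.exp_utils import create_exp_dir

log = create_exp_dir("experiments/sort-run1", scripts_to_save=["train.py"])
log("starting training")      # printed and appended to experiments/sort-run1/log.txt
log("epoch 1 | loss 2.31")
```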
So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from .metrics import glue_compute_metrics, xnli_compute_metrics 20 | from .processors import ( 21 | DataProcessor, 22 | InputExample, 23 | InputFeatures, 24 | SingleSentenceClassificationProcessor, 25 | SquadExample, 26 | SquadFeatures, 27 | SquadV1Processor, 28 | SquadV2Processor, 29 | glue_convert_examples_to_features, 30 | glue_output_modes, 31 | glue_processors, 32 | glue_tasks_num_labels, 33 | squad_convert_examples_to_features, 34 | xnli_output_modes, 35 | xnli_processors, 36 | xnli_tasks_num_labels, 37 | ) 38 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/performer/README.md: -------------------------------------------------------------------------------- 1 | # Performer fine-tuning 2 | 3 | Example authors: @TevenLeScao, @Patrickvonplaten 4 | 5 | Paper authors: Krzysztof Choromanski, Valerii Likhosherstov, David Dohan, Xingyou Song, Andreea Gane, Tamas Sarlos, Peter Hawkins, Jared Davis, Afroz Mohiuddin, Lukasz Kaiser, David Belanger, Lucy Colwell, Adrian Weller 6 | 7 | ## Requirements 8 | 9 | `datasets`, `flax` and `jax`. `wandb` integration is built-in if you want to use it. 10 | 11 | ## Examples 12 | 13 | `sanity_script.sh` will launch performer fine-tuning from the bert-base-cased checkpoint on the Simple Wikipedia dataset (a small, easy-language English Wikipedia) from `datasets`. 14 | `full_script.sh` will launch performer fine-tuning from the bert-large-cased checkpoint on the English Wikipedia dataset from `datasets`. 15 | 16 | Here are a few key arguments: 17 | - Remove the `--performer` argument to use a standard Bert model. 18 | 19 | - Add `--reinitialize` to start from a blank model rather than a Bert checkpoint. 20 | 21 | - You may change the Bert size by passing a different [checkpoint](https://huggingface.co/transformers/pretrained_models.html) to the `--model_name_or_path` argument. 22 | 23 | - Passing your user name to the `--wandb_user_name` argument will trigger weights and biases logging. 24 | 25 | - You can choose a dataset with `--dataset_name` and `--dataset_config`. Our [viewer](https://huggingface.co/datasets/viewer/) will help you find what you need. -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/rag/finetune_rag_ray.sh: -------------------------------------------------------------------------------- 1 | # Sample script to finetune RAG using Ray for distributed retrieval. 2 | 3 | # Add parent directory to python path to access lightning_base.py 4 | export PYTHONPATH="../":"${PYTHONPATH}" 5 | 6 | # Start a single-node Ray cluster. 
7 | ray start --head 8 | 9 | # A sample finetuning run, you need to specify data_dir, output_dir and model_name_or_path 10 | # run ./examples/rag/finetune_rag_ray.sh --help to see all the possible options 11 | 12 | python examples/rag/finetune_rag.py \ 13 | --data_dir $DATA_DIR \ 14 | --output_dir $OUTPUT_DIR \ 15 | --model_name_or_path $MODEL_NAME_OR_PATH \ 16 | --model_type rag_sequence \ 17 | --fp16 \ 18 | --gpus 8 \ 19 | --profile \ 20 | --do_train \ 21 | --do_predict \ 22 | --n_val -1 \ 23 | --train_batch_size 8 \ 24 | --eval_batch_size 1 \ 25 | --max_source_length 128 \ 26 | --max_target_length 25 \ 27 | --val_max_target_length 25 \ 28 | --test_max_target_length 25 \ 29 | --label_smoothing 0.1 \ 30 | --dropout 0.1 \ 31 | --attention_dropout 0.1 \ 32 | --weight_decay 0.001 \ 33 | --adam_epsilon 1e-08 \ 34 | --max_grad_norm 0.1 \ 35 | --lr_scheduler polynomial \ 36 | --learning_rate 3e-05 \ 37 | --num_train_epochs 100 \ 38 | --warmup_steps 500 \ 39 | --gradient_accumulation_steps 1 \ 40 | --distributed_retriever ray \ 41 | --num_retrieval_workers 4 42 | 43 | # Stop the Ray cluster. 44 | ray stop 45 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ XLM-ProphetNet model configuration """ 16 | 17 | 18 | from ...utils import logging 19 | from ..prophetnet.configuration_prophetnet import ProphetNetConfig 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { 25 | "microsoft/xprophetnet-large-wiki100-cased": "https://huggingface.co/microsoft/xprophetnet-large-wiki100-cased/resolve/main/config.json", 26 | } 27 | 28 | 29 | class XLMProphetNetConfig(ProphetNetConfig): 30 | """ 31 | This class overrides :class:`~transformers.ProphetNetConfig`. Please check the superclass for the appropriate 32 | documentation alongside usage examples. 33 | """ 34 | 35 | model_type = "xlm-prophetnet" 36 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/performer/README.md: -------------------------------------------------------------------------------- 1 | # Performer fine-tuning 2 | 3 | Example authors: @TevenLeScao, @Patrickvonplaten 4 | 5 | Paper authors: Krzysztof Choromanski, Valerii Likhosherstov, David Dohan, Xingyou Song, Andreea Gane, Tamas Sarlos, Peter Hawkins, Jared Davis, Afroz Mohiuddin, Lukasz Kaiser, David Belanger, Lucy Colwell, Adrian Weller 6 | 7 | ## Requirements 8 | 9 | `datasets`, `flax` and `jax`. `wandb` integration is built-in if you want to use it. 
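`XLMProphetNetConfig` above only overrides `model_type`, so every field is inherited from `ProphetNetConfig`. A short sketch, assuming network access for the pretrained configuration:

```python
from transformers import XLMProphetNetConfig

config = XLMProphetNetConfig.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
print(config.model_type)                              # "xlm-prophetnet"
print(config.vocab_size, config.num_encoder_layers)   # inherited ProphetNet fields
```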
10 | 11 | ## Examples 12 | 13 | `sanity_script.sh` will launch performer fine-tuning from the bert-base-cased checkpoint on the Simple Wikipedia dataset (a small, easy-language English Wikipedia) from `datasets`. 14 | `full_script.sh` will launch performer fine-tuning from the bert-large-cased checkpoint on the English Wikipedia dataset from `datasets`. 15 | 16 | Here are a few key arguments: 17 | - Remove the `--performer` argument to use a standard Bert model. 18 | 19 | - Add `--reinitialize` to start from a blank model rather than a Bert checkpoint. 20 | 21 | - You may change the Bert size by passing a different [checkpoint](https://huggingface.co/transformers/pretrained_models.html) to the `--model_name_or_path` argument. 22 | 23 | - Passing your user name to the `--wandb_user_name` argument will trigger weights and biases logging. 24 | 25 | - You can choose a dataset with `--dataset_name` and `--dataset_config`. Our [viewer](https://huggingface.co/datasets/viewer/) will help you find what you need. -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/research_projects/rag/finetune_rag_ray.sh: -------------------------------------------------------------------------------- 1 | # Sample script to finetune RAG using Ray for distributed retrieval. 2 | 3 | # Add parent directory to python path to access lightning_base.py 4 | export PYTHONPATH="../":"${PYTHONPATH}" 5 | 6 | # Start a single-node Ray cluster. 7 | ray start --head 8 | 9 | # A sample finetuning run, you need to specify data_dir, output_dir and model_name_or_path 10 | # run ./examples/rag/finetune_rag_ray.sh --help to see all the possible options 11 | 12 | python examples/rag/finetune_rag.py \ 13 | --data_dir $DATA_DIR \ 14 | --output_dir $OUTPUT_DIR \ 15 | --model_name_or_path $MODEL_NAME_OR_PATH \ 16 | --model_type rag_sequence \ 17 | --fp16 \ 18 | --gpus 8 \ 19 | --profile \ 20 | --do_train \ 21 | --do_predict \ 22 | --n_val -1 \ 23 | --train_batch_size 8 \ 24 | --eval_batch_size 1 \ 25 | --max_source_length 128 \ 26 | --max_target_length 25 \ 27 | --val_max_target_length 25 \ 28 | --test_max_target_length 25 \ 29 | --label_smoothing 0.1 \ 30 | --dropout 0.1 \ 31 | --attention_dropout 0.1 \ 32 | --weight_decay 0.001 \ 33 | --adam_epsilon 1e-08 \ 34 | --max_grad_norm 0.1 \ 35 | --lr_scheduler polynomial \ 36 | --learning_rate 3e-05 \ 37 | --num_train_epochs 100 \ 38 | --warmup_steps 500 \ 39 | --gradient_accumulation_steps 1 \ 40 | --distributed_retriever ray \ 41 | --num_retrieval_workers 4 42 | 43 | # Stop the Ray cluster. 
44 | ray stop 45 | -------------------------------------------------------------------------------- /finetune_gpt2/infinite_memory_transformer/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "gpt2", 3 | "activation_function": "gelu_new", 4 | "affines": true, 5 | "architectures": [ 6 | "GPT2LMHeadModel" 7 | ], 8 | "attn_drop": 0.1, 9 | "attn_pdrop": 0.1, 10 | "bos_token_id": 50256, 11 | "compression_rate": 2, 12 | "compressive": false, 13 | "continuous": true, 14 | "embd_pdrop": 0.1, 15 | "eos_token_id": 50256, 16 | "gradient_checkpointing": false, 17 | "infinite_memory": true, 18 | "initializer_range": 0.02, 19 | "kl_regularizer": true, 20 | "layer_norm_epsilon": 1e-05, 21 | "long_term_attention": true, 22 | "long_term_attention_basis": 512, 23 | "long_term_attention_norm": "softmax", 24 | "mask": true, 25 | "mask_dropout": 0, 26 | "mask_type": "cnn", 27 | "model_type": "gpt2", 28 | "mu_0": -1, 29 | "n_ctx": 1024, 30 | "n_embd": 768, 31 | "n_head": 12, 32 | "n_inner": null, 33 | "n_layer": 12, 34 | "n_positions": 1024, 35 | "n_special": 0, 36 | "predict_special_tokens": true, 37 | "resid_pdrop": 0.1, 38 | "sigma_0": 0.05, 39 | "sticky_memories": false, 40 | "summary_activation": null, 41 | "summary_first_dropout": 0.1, 42 | "summary_proj_to_labels": true, 43 | "summary_type": "cls_index", 44 | "summary_use_proj": true, 45 | "task_specific_params": { 46 | "text-generation": { 47 | "do_sample": true, 48 | "max_length": 50 49 | } 50 | }, 51 | "transformers_version": "4.5.0.dev0", 52 | "use_cache": true, 53 | "vocab_size": 50257 54 | } 55 | 56 | -------------------------------------------------------------------------------- /finetune_gpt2/src/transformers/models/xlm_prophetnet/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
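The `infinite_memory_transformer/config.json` above is a stock GPT-2 configuration extended with long-term-memory fields (`infinite_memory`, `long_term_attention`, `sticky_memories`, ...). A loading sketch; the extra fields appear to be consumed only by the patched GPT-2 code vendored under `finetune_gpt2/src/transformers`, so with an unmodified library they load but have no effect:

```python
from transformers import GPT2Config, GPT2LMHeadModel

config = GPT2Config.from_pretrained("./infinite_memory_transformer")
print(config.long_term_attention, config.sticky_memories)   # True, False

# Initialize from the public gpt2 weights with the memory-augmented configuration.
model = GPT2LMHeadModel.from_pretrained("gpt2", config=config)
```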
18 | 19 | from ...file_utils import is_sentencepiece_available, is_torch_available 20 | from .configuration_xlm_prophetnet import XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMProphetNetConfig 21 | 22 | 23 | if is_sentencepiece_available(): 24 | from .tokenization_xlm_prophetnet import XLMProphetNetTokenizer 25 | 26 | if is_torch_available(): 27 | from .modeling_xlm_prophetnet import ( 28 | XLM_PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST, 29 | XLMProphetNetDecoder, 30 | XLMProphetNetEncoder, 31 | XLMProphetNetForCausalLM, 32 | XLMProphetNetForConditionalGeneration, 33 | XLMProphetNetModel, 34 | ) 35 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/tests/test_activations_tf.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import unittest 16 | 17 | from transformers import is_tf_available 18 | from transformers.testing_utils import require_tf 19 | 20 | 21 | if is_tf_available(): 22 | from transformers.activations_tf import get_tf_activation 23 | 24 | 25 | @require_tf 26 | class TestTFActivations(unittest.TestCase): 27 | def test_get_activation(self): 28 | get_tf_activation("swish") 29 | get_tf_activation("silu") 30 | get_tf_activation("gelu") 31 | get_tf_activation("relu") 32 | get_tf_activation("tanh") 33 | get_tf_activation("gelu_new") 34 | get_tf_activation("gelu_fast") 35 | get_tf_activation("mish") 36 | with self.assertRaises(KeyError): 37 | get_tf_activation("bogus") 38 | with self.assertRaises(KeyError): 39 | get_tf_activation(None) 40 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/.github/workflows/github-torch-hub.yml: -------------------------------------------------------------------------------- 1 | name: Torch hub integration 2 | 3 | on: 4 | push: 5 | branches: 6 | - "*" 7 | 8 | jobs: 9 | torch_hub_integration: 10 | runs-on: ubuntu-latest 11 | env: 12 | # TODO quickfix but may need more investigation 13 | ACTIONS_ALLOW_UNSECURE_COMMANDS: True 14 | steps: 15 | # no checkout necessary here. 
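A sketch of the activation registry exercised by `TestTFActivations` above, assuming TensorFlow is installed:

```python
import tensorflow as tf

from transformers.activations_tf import get_tf_activation

gelu = get_tf_activation("gelu")
print(gelu(tf.constant([-1.0, 0.0, 1.0])).numpy())

try:
    get_tf_activation("bogus")
except KeyError as err:
    print(err)   # unknown names raise KeyError, as the test asserts
```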
16 | - name: Extract branch name 17 | run: echo "::set-env name=BRANCH::${GITHUB_REF#refs/heads/}" 18 | - name: Check branch name 19 | run: echo $BRANCH 20 | - name: Set up Python 21 | uses: actions/setup-python@v1 22 | with: 23 | python-version: 3.7 24 | 25 | - name: Loading cache 26 | uses: actions/cache@v2 27 | id: cache 28 | with: 29 | path: ~/.cache/pip 30 | key: v0-torch_hub-${{ hashFiles('setup.py') }} 31 | 32 | - name: Install dependencies 33 | run: | 34 | pip install --upgrade pip 35 | # install torch-hub specific dependencies 36 | pip install -e git+https://github.com/huggingface/transformers.git#egg=transformers[torchhub] 37 | # no longer needed 38 | pip uninstall -y transformers 39 | 40 | - name: Torch hub list 41 | run: | 42 | python -c "import torch; print(torch.hub.list('huggingface/transformers:$BRANCH'))" 43 | 44 | - name: Torch hub help 45 | run: | 46 | python -c "import torch; print(torch.hub.help('huggingface/transformers:$BRANCH', 'modelForSequenceClassification'))" 47 | -------------------------------------------------------------------------------- /document_grounded_generation/test_special_tokens.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import shutil 3 | import unittest 4 | 5 | from transformers import OpenAIGPTTokenizer, GPT2Tokenizer 6 | from train import ATTR_TO_SPECIAL_TOKEN, SPECIAL_TOKENS 7 | 8 | class TestSpecialTokenTreatment(unittest.TestCase): 9 | 10 | def setUp(self): 11 | self.save_dir = Path('utest_save_dir') 12 | self.save_dir.mkdir(exist_ok=True) 13 | 14 | def tearDown(self): 15 | shutil.rmtree(self.save_dir) 16 | 17 | def test_special_tokens_checkpoint_behavior(self): 18 | toks = [OpenAIGPTTokenizer.from_pretrained('openai-gpt'), GPT2Tokenizer.from_pretrained('gpt2')] 19 | for tok in toks: 20 | self.assertEqual(len(tok.added_tokens_encoder), 0) 21 | tok.add_special_tokens(ATTR_TO_SPECIAL_TOKEN) 22 | self.assertEqual(len(tok.added_tokens_encoder), 5) 23 | # Make sure we never split 24 | self.assertEqual(len(tok.tokenize(" ")), 2) 25 | ids = tok.convert_tokens_to_ids(SPECIAL_TOKENS) 26 | self.assertTrue(all([x > 0 for x in ids]), 27 | f'some tokens failed to tokenize {SPECIAL_TOKENS} -> {ids}') 28 | # Need to mantain indices through save. (this is also tested in pytorch-transformers) 29 | tok.save_pretrained(self.save_dir) 30 | tok_loaded = tok.from_pretrained(str(self.save_dir)) 31 | ids2 = tok_loaded.convert_tokens_to_ids(SPECIAL_TOKENS) 32 | self.assertListEqual(ids, ids2) 33 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/src/transformers/models/xlm_prophetnet/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
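A standalone sketch of the save/reload property checked by `TestSpecialTokenTreatment` above. The token strings here are hypothetical placeholders; the project's actual tokens come from `ATTR_TO_SPECIAL_TOKEN` in `train.py`, which is not shown here:

```python
from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("gpt2")
tok.add_special_tokens({"additional_special_tokens": ["<speaker1>", "<speaker2>"]})  # hypothetical tokens

tokens = ["<speaker1>", "<speaker2>"]
ids = tok.convert_tokens_to_ids(tokens)
assert all(i != tok.unk_token_id for i in ids)        # both resolve to real ids

tok.save_pretrained("./utest_tok_dir")
reloaded = GPT2Tokenizer.from_pretrained("./utest_tok_dir")
assert ids == reloaded.convert_tokens_to_ids(tokens)  # indices survive the round trip
```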
16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_sentencepiece_available, is_torch_available 20 | from .configuration_xlm_prophetnet import XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMProphetNetConfig 21 | 22 | 23 | if is_sentencepiece_available(): 24 | from .tokenization_xlm_prophetnet import XLMProphetNetTokenizer 25 | 26 | if is_torch_available(): 27 | from .modeling_xlm_prophetnet import ( 28 | XLM_PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST, 29 | XLMProphetNetDecoder, 30 | XLMProphetNetEncoder, 31 | XLMProphetNetForCausalLM, 32 | XLMProphetNetForConditionalGeneration, 33 | XLMProphetNetModel, 34 | ) 35 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/convert_model_to_fp16.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from typing import Union 17 | 18 | import fire 19 | import torch 20 | from tqdm import tqdm 21 | 22 | 23 | def convert(src_path: str, map_location: str = "cpu", save_path: Union[str, None] = None) -> None: 24 | """Convert a pytorch_model.bin or model.pt file to torch.float16 for faster downloads, less disk space.""" 25 | state_dict = torch.load(src_path, map_location=map_location) 26 | for k, v in tqdm(state_dict.items()): 27 | if not isinstance(v, torch.Tensor): 28 | raise TypeError("FP16 conversion only works on paths that are saved state dicts, like pytorch_model.bin") 29 | state_dict[k] = v.half() 30 | if save_path is None: # overwrite src_path 31 | save_path = src_path 32 | torch.save(state_dict, save_path) 33 | 34 | 35 | if __name__ == "__main__": 36 | fire.Fire(convert) 37 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/old_test_tatoeba_conversion.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
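A direct-call sketch for `convert_model_to_fp16.py` above (the script exposes the same `convert` function on the command line through python-fire). The checkpoint names are placeholders, and it assumes running from `examples/legacy/seq2seq`:

```python
from convert_model_to_fp16 import convert

# Write an fp16 copy next to the original; omit save_path to overwrite in place.
convert("pytorch_model.bin", save_path="pytorch_model.fp16.bin")
```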
14 | 15 | import os 16 | import tempfile 17 | import unittest 18 | 19 | from transformers.file_utils import cached_property 20 | from transformers.models.marian.convert_marian_tatoeba_to_pytorch import DEFAULT_REPO, TatoebaConverter 21 | from transformers.testing_utils import slow 22 | 23 | 24 | @unittest.skipUnless(os.path.exists(DEFAULT_REPO), "Tatoeba directory does not exist.") 25 | class TatoebaConversionTester(unittest.TestCase): 26 | @cached_property 27 | def resolver(self): 28 | tmp_dir = tempfile.mkdtemp() 29 | return TatoebaConverter(save_dir=tmp_dir) 30 | 31 | @slow 32 | def test_resolver(self): 33 | self.resolver.convert_models(["heb-eng"]) 34 | 35 | @slow 36 | def test_model_card(self): 37 | content, mmeta = self.resolver.write_model_card("opus-mt-he-en", dry_run=True) 38 | assert mmeta["long_pair"] == "heb-eng" 39 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/legacy/seq2seq/train_mbart_cc25_enro.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | python finetune_trainer.py \ 16 | --model_name_or_path=facebook/mbart-large-cc25 \ 17 | --data_dir $ENRO_DIR \ 18 | --output_dir mbart_cc25_enro --overwrite_output_dir \ 19 | --learning_rate=3e-5 \ 20 | --warmup_steps 500 \ 21 | --fp16 \ 22 | --label_smoothing 0.1 \ 23 | --adam_eps 1e-06 \ 24 | --src_lang en_XX --tgt_lang ro_RO \ 25 | --freeze_embeds \ 26 | --per_device_train_batch_size=4 --per_device_eval_batch_size=4 \ 27 | --max_source_length 128 --max_target_length 128 --val_max_target_length 128 --test_max_target_length 128\ 28 | --sortish_sampler \ 29 | --num_train_epochs 6 \ 30 | --save_steps 25000 --eval_steps 25000 --logging_steps 1000 \ 31 | --do_train --do_eval --do_predict \ 32 | --evaluation_strategy steps \ 33 | --predict_with_generate --logging_first_step \ 34 | --task translation \ 35 | "$@" 36 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/convert_model_to_fp16.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | from typing import Union 17 | 18 | import fire 19 | import torch 20 | from tqdm import tqdm 21 | 22 | 23 | def convert(src_path: str, map_location: str = "cpu", save_path: Union[str, None] = None) -> None: 24 | """Convert a pytorch_model.bin or model.pt file to torch.float16 for faster downloads, less disk space.""" 25 | state_dict = torch.load(src_path, map_location=map_location) 26 | for k, v in tqdm(state_dict.items()): 27 | if not isinstance(v, torch.Tensor): 28 | raise TypeError("FP16 conversion only works on paths that are saved state dicts, like pytorch_model.bin") 29 | state_dict[k] = v.half() 30 | if save_path is None: # overwrite src_path 31 | save_path = src_path 32 | torch.save(state_dict, save_path) 33 | 34 | 35 | if __name__ == "__main__": 36 | fire.Fire(convert) 37 | -------------------------------------------------------------------------------- /finetune_gpt2/examples/research_projects/rag/parse_dpr_relevance_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script reads DPR retriever training data and parses each datapoint. We save a line per datapoint. 3 | Each line consists of the query followed by a tab-separated list of Wikipedia page titles constituting 4 | positive contexts for a given query. 5 | """ 6 | 7 | import argparse 8 | import json 9 | 10 | from tqdm import tqdm 11 | 12 | 13 | def main(): 14 | parser = argparse.ArgumentParser() 15 | 16 | # Required parameters 17 | parser.add_argument( 18 | "--src_path", 19 | type=str, 20 | default="biencoder-nq-dev.json", 21 | help="Path to raw DPR training data", 22 | ) 23 | parser.add_argument( 24 | "--evaluation_set", 25 | type=str, 26 | help="where to store parsed evaluation_set file", 27 | ) 28 | parser.add_argument( 29 | "--gold_data_path", 30 | type=str, 31 | help="where to store parsed gold_data_path file", 32 | ) 33 | args = parser.parse_args() 34 | 35 | with open(args.src_path, "r") as src_file, open(args.evaluation_set, "w") as eval_file, open( 36 | args.gold_data_path, "w" 37 | ) as gold_file: 38 | dpr_records = json.load(src_file) 39 | for dpr_record in tqdm(dpr_records): 40 | question = dpr_record["question"] 41 | contexts = [context["title"] for context in dpr_record["positive_ctxs"]] 42 | eval_file.write(question + "\n") 43 | gold_file.write("\t".join(contexts) + "\n") 44 | 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /document_grounded_generation/transformers/examples/legacy/seq2seq/train_mbart_cc25_enro.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 

python finetune_trainer.py \
    --model_name_or_path=facebook/mbart-large-cc25 \
    --data_dir $ENRO_DIR \
    --output_dir mbart_cc25_enro --overwrite_output_dir \
    --learning_rate=3e-5 \
    --warmup_steps 500 \
    --fp16 \
    --label_smoothing 0.1 \
    --adam_eps 1e-06 \
    --src_lang en_XX --tgt_lang ro_RO \
    --freeze_embeds \
    --per_device_train_batch_size=4 --per_device_eval_batch_size=4 \
    --max_source_length 128 --max_target_length 128 --val_max_target_length 128 --test_max_target_length 128 \
    --sortish_sampler \
    --num_train_epochs 6 \
    --save_steps 25000 --eval_steps 25000 --logging_steps 1000 \
    --do_train --do_eval --do_predict \
    --evaluation_strategy steps \
    --predict_with_generate --logging_first_step \
    --task translation \
    "$@"
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/legacy/seq2seq/old_test_tatoeba_conversion.py:
--------------------------------------------------------------------------------
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import tempfile
import unittest

from transformers.file_utils import cached_property
from transformers.models.marian.convert_marian_tatoeba_to_pytorch import DEFAULT_REPO, TatoebaConverter
from transformers.testing_utils import slow


@unittest.skipUnless(os.path.exists(DEFAULT_REPO), "Tatoeba directory does not exist.")
class TatoebaConversionTester(unittest.TestCase):
    @cached_property
    def resolver(self):
        tmp_dir = tempfile.mkdtemp()
        return TatoebaConverter(save_dir=tmp_dir)

    @slow
    def test_resolver(self):
        self.resolver.convert_models(["heb-eng"])

    @slow
    def test_model_card(self):
        content, mmeta = self.resolver.write_model_card("opus-mt-he-en", dry_run=True)
        assert mmeta["long_pair"] == "heb-eng"
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/adversarial/README.md:
--------------------------------------------------------------------------------
## Adversarial evaluation of model performance

Here is an example of evaluating a model with adversarial evaluation of natural language inference, using the Heuristic Analysis for NLI Systems (HANS) dataset ([McCoy et al., 2019](https://arxiv.org/abs/1902.01007)). The example was kindly provided by [Nafise Sadat Moosavi](https://github.com/ns-moosavi).

The HANS dataset can be downloaded from [this location](https://github.com/tommccoy1/hans).
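
For instance, you can clone that repository and point the `HANS_DIR` variable used in the command below at the checkout (the clone location itself is up to you):

```bash
git clone https://github.com/tommccoy1/hans
```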

This is an example of using `run_hans.py`:

```bash
export HANS_DIR=path-to-hans
export MODEL_TYPE=type-of-the-model-e.g.-bert-roberta-xlnet-etc
export MODEL_PATH=path-to-the-model-directory-that-is-trained-on-NLI-e.g.-by-using-run_glue.py

python run_hans.py \
    --task_name hans \
    --model_type $MODEL_TYPE \
    --do_eval \
    --data_dir $HANS_DIR \
    --model_name_or_path $MODEL_PATH \
    --max_seq_length 128 \
    --output_dir $MODEL_PATH
```

This will create the `hans_predictions.txt` file in `MODEL_PATH`, which can then be evaluated using `hans/evaluate_heur_output.py` from the HANS dataset.

The results of a BERT-base model trained on MNLI with batch size 8 and random seed 42, evaluated on the HANS dataset, are as follows:

```
Heuristic entailed results:
lexical_overlap: 0.9702
subsequence: 0.9942
constituent: 0.9962

Heuristic non-entailed results:
lexical_overlap: 0.199
subsequence: 0.0396
constituent: 0.118
```
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/rag/parse_dpr_relevance_data.py:
--------------------------------------------------------------------------------
"""
This script reads DPR retriever training data and parses each datapoint. We save a line per datapoint.
Each line consists of the query followed by a tab-separated list of Wikipedia page titles constituting
positive contexts for a given query.
"""

import argparse
import json

from tqdm import tqdm


def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--src_path",
        type=str,
        default="biencoder-nq-dev.json",
        help="Path to raw DPR training data",
    )
    parser.add_argument(
        "--evaluation_set",
        type=str,
        help="where to store parsed evaluation_set file",
    )
    parser.add_argument(
        "--gold_data_path",
        type=str,
        help="where to store parsed gold_data_path file",
    )
    args = parser.parse_args()

    with open(args.src_path, "r") as src_file, open(args.evaluation_set, "w") as eval_file, open(
        args.gold_data_path, "w"
    ) as gold_file:
        dpr_records = json.load(src_file)
        for dpr_record in tqdm(dpr_records):
            question = dpr_record["question"]
            contexts = [context["title"] for context in dpr_record["positive_ctxs"]]
            eval_file.write(question + "\n")
            gold_file.write("\t".join(contexts) + "\n")


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/finetune_gpt2/utils/get_modified_files.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# this script reports modified .py files under the desired list of top-level sub-dirs passed as a list of arguments, e.g.:
# python ./utils/get_modified_files.py utils src tests examples
#
# it uses git to find the forking point and which files were modified - i.e. files not under git won't be considered
# since the output of this script is fed into Makefile commands it doesn't print a newline after the results

import re
import subprocess
import sys


fork_point_sha = subprocess.check_output("git merge-base master HEAD".split()).decode("utf-8")
modified_files = subprocess.check_output(f"git diff --name-only {fork_point_sha}".split()).decode("utf-8").split()

joined_dirs = "|".join(sys.argv[1:])
regex = re.compile(fr"^({joined_dirs}).*?\.py$")

relevant_modified_files = [x for x in modified_files if regex.match(x)]
print(" ".join(relevant_modified_files), end="")
--------------------------------------------------------------------------------
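
A quick illustration of how `get_modified_files.py` above is meant to be driven (the output line is hypothetical; real output depends on which files your branch modified relative to the `master` fork point):

```bash
# run from the repository root; matching .py paths are printed on one line without a trailing newline
python ./utils/get_modified_files.py utils src tests examples
# hypothetical output: src/transformers/trainer.py examples/seq2seq/finetune.py
```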