├── finetune_gpt2
├── examples
│ ├── benchmarking
│ │ └── requirements.txt
│ ├── research_projects
│ │ ├── bertabs
│ │ │ ├── __init__.py
│ │ │ └── requirements.txt
│ │ ├── deebert
│ │ │ ├── src
│ │ │ │ └── __init__.py
│ │ │ ├── requirements.txt
│ │ │ ├── eval_deebert.sh
│ │ │ ├── entropy_eval.sh
│ │ │ └── train_deebert.sh
│ │ ├── bert-loses-patience
│ │ │ ├── pabee
│ │ │ │ └── __init__.py
│ │ │ └── requirements.txt
│ │ ├── bertology
│ │ │ └── requirements.txt
│ │ ├── adversarial
│ │ │ ├── requirements.txt
│ │ │ └── README.md
│ │ ├── longform-qa
│ │ │ ├── requirements.txt
│ │ │ └── README.md
│ │ ├── mlm_wwm
│ │ │ └── requirements.txt
│ │ ├── rag
│ │ │ ├── __init__.py
│ │ │ ├── requirements.txt
│ │ │ ├── finetune_rag.sh
│ │ │ ├── finetune_rag_ray.sh
│ │ │ └── parse_dpr_relevance_data.py
│ │ ├── pplm
│ │ │ ├── imgs
│ │ │ │ ├── wooly.png
│ │ │ │ └── headfigure.png
│ │ │ ├── requirements.txt
│ │ │ └── pplm_classification_head.py
│ │ ├── wav2vec2
│ │ │ ├── requirements.txt
│ │ │ ├── finetune_base_100.sh
│ │ │ ├── finetune_large_lv60_100.sh
│ │ │ ├── finetune_base_timit_asr.sh
│ │ │ ├── finetune_large_lv60_timit_asr.sh
│ │ │ ├── finetune_wav2vec2_xlsr_turkish.sh
│ │ │ ├── finetune_large_xlsr_53_arabic_speech_corpus.sh
│ │ │ └── vocab
│ │ │ │ └── buckwalter.json
│ │ ├── distillation
│ │ │ ├── requirements.txt
│ │ │ └── training_configs
│ │ │ │ ├── distilgpt2.json
│ │ │ │ ├── distilbert-base-cased.json
│ │ │ │ ├── distilbert-base-uncased.json
│ │ │ │ ├── distilbert-base-multilingual-cased.json
│ │ │ │ └── distilroberta-base.json
│ │ ├── movement-pruning
│ │ │ ├── emmental
│ │ │ │ ├── modules
│ │ │ │ │ └── __init__.py
│ │ │ │ └── __init__.py
│ │ │ └── requirements.txt
│ │ ├── lxmert
│ │ │ └── README.md
│ │ ├── performer
│ │ │ ├── full_script.sh
│ │ │ ├── sanity_script.sh
│ │ │ └── README.md
│ │ ├── seq2seq-distillation
│ │ │ ├── requirements.txt
│ │ │ ├── finetune.sh
│ │ │ ├── finetune_t5.sh
│ │ │ ├── finetune_pegasus_xsum.sh
│ │ │ ├── train_mbart_cc25_enro.sh
│ │ │ ├── dynamic_bs_example.sh
│ │ │ ├── sentence_splitter.py
│ │ │ ├── distil_marian_no_teacher.sh
│ │ │ ├── train_distilbart_cnn.sh
│ │ │ ├── distil_marian_enro_teacher.sh
│ │ │ ├── train_distilbart_xsum.sh
│ │ │ └── finetune_bart_tiny.sh
│ │ ├── mm-imdb
│ │ │ └── README.md
│ │ └── README.md
│ ├── legacy
│ │ ├── seq2seq
│ │ │ ├── test_data
│ │ │ │ ├── test_data
│ │ │ │ ├── wmt_en_ro
│ │ │ │ │ ├── val.len
│ │ │ │ │ └── train.len
│ │ │ │ └── fsmt
│ │ │ │ │ └── build-eval-data.py
│ │ │ ├── __init__.py
│ │ │ ├── requirements.txt
│ │ │ ├── finetune.sh
│ │ │ ├── finetune_tpu.sh
│ │ │ ├── minify_dataset.py
│ │ │ ├── rouge_cli.py
│ │ │ ├── sentence_splitter.py
│ │ │ ├── convert_model_to_fp16.py
│ │ │ ├── old_test_tatoeba_conversion.py
│ │ │ └── train_mbart_cc25_enro.sh
│ │ ├── pytorch-lightning
│ │ │ ├── requirements.txt
│ │ │ ├── run_glue.sh
│ │ │ └── run_pos.sh
│ │ ├── README.md
│ │ └── token-classification
│ │ │ ├── run_chunk.sh
│ │ │ ├── run_pos.sh
│ │ │ └── scripts
│ │ │ └── preprocess.py
│ ├── question-answering
│ │ └── requirements.txt
│ ├── multiple-choice
│ │ └── requirements.txt
│ ├── text-generation
│ │ ├── requirements.txt
│ │ └── README.md
│ ├── token-classification
│ │ ├── requirements.txt
│ │ └── run.sh
│ ├── language-modeling
│ │ └── requirements.txt
│ ├── text-classification
│ │ └── requirements.txt
│ ├── seq2seq
│ │ └── requirements.txt
│ ├── _tests_requirements.txt
│ └── tests
│ │ └── deepspeed
│ │ └── ds_config.json
├── src
│ ├── transformers
│ │ ├── benchmark
│ │ │ └── __init__.py
│ │ ├── models
│ │ │ ├── dialogpt
│ │ │ │ └── __init__.py
│ │ │ └── xlm_prophetnet
│ │ │ │ ├── configuration_xlm_prophetnet.py
│ │ │ │ └── __init__.py
│ │ ├── sagemaker
│ │ │ └── __init__.py
│ │ ├── commands
│ │ │ └── __init__.py
│ │ ├── data
│ │ │ ├── datasets
│ │ │ │ └── __init__.py
│ │ │ ├── processors
│ │ │ │ └── __init__.py
│ │ │ └── __init__.py
│ │ └── utils
│ │ │ └── dummy_flax_objects.py
│ └── transformers.egg-info
│ │ ├── dependency_links.txt
│ │ ├── top_level.txt
│ │ └── entry_points.txt
├── infinite_memory_transformer_sticky_mem
│ └── config.json
├── infinite_memory_transformer
│ └── config.json
└── utils
│ └── get_modified_files.py
├── document_grounded_generation
├── transformers
│ ├── tests
│ │ ├── __init__.py
│ │ ├── test_pipelines_text2text_generation.py
│ │ ├── test_pipelines_feature_extraction.py
│ │ ├── test_pipelines_sentiment_analysis.py
│ │ ├── test_cli.py
│ │ └── test_activations_tf.py
│ ├── MANIFEST.in
│ ├── examples
│ │ ├── benchmarking
│ │ │ └── requirements.txt
│ │ ├── research_projects
│ │ │ ├── bertabs
│ │ │ │ ├── __init__.py
│ │ │ │ └── requirements.txt
│ │ │ ├── deebert
│ │ │ │ ├── src
│ │ │ │ │ └── __init__.py
│ │ │ │ ├── requirements.txt
│ │ │ │ ├── eval_deebert.sh
│ │ │ │ ├── entropy_eval.sh
│ │ │ │ └── train_deebert.sh
│ │ │ ├── bert-loses-patience
│ │ │ │ ├── pabee
│ │ │ │ │ └── __init__.py
│ │ │ │ └── requirements.txt
│ │ │ ├── adversarial
│ │ │ │ └── requirements.txt
│ │ │ ├── bertology
│ │ │ │ └── requirements.txt
│ │ │ ├── longform-qa
│ │ │ │ ├── requirements.txt
│ │ │ │ └── README.md
│ │ │ ├── mlm_wwm
│ │ │ │ └── requirements.txt
│ │ │ ├── rag
│ │ │ │ ├── __init__.py
│ │ │ │ ├── requirements.txt
│ │ │ │ ├── finetune_rag.sh
│ │ │ │ ├── finetune_rag_ray.sh
│ │ │ │ └── parse_dpr_relevance_data.py
│ │ │ ├── wav2vec2
│ │ │ │ ├── requirements.txt
│ │ │ │ ├── finetune_base_100.sh
│ │ │ │ ├── finetune_large_lv60_100.sh
│ │ │ │ ├── finetune_base_timit_asr.sh
│ │ │ │ ├── finetune_large_lv60_timit_asr.sh
│ │ │ │ ├── finetune_wav2vec2_xlsr_turkish.sh
│ │ │ │ ├── finetune_large_xlsr_53_arabic_speech_corpus.sh
│ │ │ │ └── vocab
│ │ │ │ │ └── buckwalter.json
│ │ │ ├── distillation
│ │ │ │ ├── requirements.txt
│ │ │ │ └── training_configs
│ │ │ │ │ ├── distilgpt2.json
│ │ │ │ │ ├── distilbert-base-cased.json
│ │ │ │ │ ├── distilbert-base-uncased.json
│ │ │ │ │ ├── distilbert-base-multilingual-cased.json
│ │ │ │ │ └── distilroberta-base.json
│ │ │ ├── pplm
│ │ │ │ ├── imgs
│ │ │ │ │ ├── wooly.png
│ │ │ │ │ └── headfigure.png
│ │ │ │ ├── requirements.txt
│ │ │ │ └── pplm_classification_head.py
│ │ │ ├── movement-pruning
│ │ │ │ ├── emmental
│ │ │ │ │ ├── modules
│ │ │ │ │ │ └── __init__.py
│ │ │ │ │ └── __init__.py
│ │ │ │ └── requirements.txt
│ │ │ ├── lxmert
│ │ │ │ └── README.md
│ │ │ ├── performer
│ │ │ │ ├── full_script.sh
│ │ │ │ ├── sanity_script.sh
│ │ │ │ └── README.md
│ │ │ ├── seq2seq-distillation
│ │ │ │ ├── requirements.txt
│ │ │ │ ├── finetune.sh
│ │ │ │ ├── finetune_t5.sh
│ │ │ │ ├── finetune_pegasus_xsum.sh
│ │ │ │ ├── train_mbart_cc25_enro.sh
│ │ │ │ ├── dynamic_bs_example.sh
│ │ │ │ ├── sentence_splitter.py
│ │ │ │ ├── distil_marian_no_teacher.sh
│ │ │ │ ├── train_distilbart_cnn.sh
│ │ │ │ ├── distil_marian_enro_teacher.sh
│ │ │ │ ├── train_distilbart_xsum.sh
│ │ │ │ └── finetune_bart_tiny.sh
│ │ │ ├── mm-imdb
│ │ │ │ └── README.md
│ │ │ └── README.md
│ │ ├── legacy
│ │ │ ├── seq2seq
│ │ │ │ ├── test_data
│ │ │ │ │ ├── test_data
│ │ │ │ │ ├── wmt_en_ro
│ │ │ │ │ │ ├── val.len
│ │ │ │ │ │ └── train.len
│ │ │ │ │ └── fsmt
│ │ │ │ │ │ └── build-eval-data.py
│ │ │ │ ├── __init__.py
│ │ │ │ ├── requirements.txt
│ │ │ │ ├── finetune.sh
│ │ │ │ ├── finetune_tpu.sh
│ │ │ │ ├── minify_dataset.py
│ │ │ │ ├── rouge_cli.py
│ │ │ │ ├── sentence_splitter.py
│ │ │ │ ├── convert_model_to_fp16.py
│ │ │ │ ├── train_mbart_cc25_enro.sh
│ │ │ │ └── old_test_tatoeba_conversion.py
│ │ │ ├── pytorch-lightning
│ │ │ │ ├── requirements.txt
│ │ │ │ ├── run_glue.sh
│ │ │ │ └── run_pos.sh
│ │ │ ├── README.md
│ │ │ └── token-classification
│ │ │ │ ├── run_pos.sh
│ │ │ │ ├── run_chunk.sh
│ │ │ │ └── scripts
│ │ │ │ └── preprocess.py
│ │ ├── question-answering
│ │ │ └── requirements.txt
│ │ ├── token-classification
│ │ │ ├── requirements.txt
│ │ │ └── run.sh
│ │ ├── multiple-choice
│ │ │ └── requirements.txt
│ │ ├── text-generation
│ │ │ ├── requirements.txt
│ │ │ └── README.md
│ │ ├── language-modeling
│ │ │ └── requirements.txt
│ │ ├── text-classification
│ │ │ └── requirements.txt
│ │ ├── seq2seq
│ │ │ └── requirements.txt
│ │ ├── _tests_requirements.txt
│ │ └── tests
│ │ │ └── deepspeed
│ │ │ └── ds_config.json
│ ├── src
│ │ └── transformers
│ │ │ ├── benchmark
│ │ │ └── __init__.py
│ │ │ ├── models
│ │ │ ├── dialogpt
│ │ │ │ └── __init__.py
│ │ │ ├── gpt2
│ │ │ │ └── pre_process_wmt19.py
│ │ │ └── xlm_prophetnet
│ │ │ │ ├── configuration_xlm_prophetnet.py
│ │ │ │ └── __init__.py
│ │ │ ├── sagemaker
│ │ │ └── __init__.py
│ │ │ ├── commands
│ │ │ └── __init__.py
│ │ │ ├── data
│ │ │ ├── datasets
│ │ │ │ └── __init__.py
│ │ │ ├── processors
│ │ │ │ └── __init__.py
│ │ │ └── __init__.py
│ │ │ └── utils
│ │ │ └── dummy_flax_objects.py
│ ├── docs
│ │ ├── source
│ │ │ ├── contributing.md
│ │ │ ├── examples.md
│ │ │ ├── notebooks.md
│ │ │ ├── favicon.ico
│ │ │ ├── imgs
│ │ │ │ ├── ppl_full.gif
│ │ │ │ ├── ppl_chunked.gif
│ │ │ │ ├── ppl_sliding.gif
│ │ │ │ ├── local_attention_mask.png
│ │ │ │ ├── transformers_logo_name.png
│ │ │ │ ├── transformers_overview.png
│ │ │ │ ├── warmup_cosine_schedule.png
│ │ │ │ ├── warmup_linear_schedule.png
│ │ │ │ ├── warmup_constant_schedule.png
│ │ │ │ ├── warmup_cosine_hard_restarts_schedule.png
│ │ │ │ └── warmup_cosine_warm_restarts_schedule.png
│ │ │ ├── _static
│ │ │ │ └── css
│ │ │ │ │ ├── Calibre-Light.ttf
│ │ │ │ │ ├── Calibre-Thin.otf
│ │ │ │ │ ├── Calibre-Medium.otf
│ │ │ │ │ ├── Calibre-Regular.otf
│ │ │ │ │ └── code-snippets.css
│ │ │ └── main_classes
│ │ │ │ └── configuration.rst
│ │ └── Makefile
│ ├── .gitattributes
│ ├── pyproject.toml
│ ├── .github
│ │ ├── conda
│ │ │ ├── build.sh
│ │ │ └── meta.yaml
│ │ ├── ISSUE_TEMPLATE
│ │ │ ├── ---new-benchmark.md
│ │ │ ├── --new-model-addition.md
│ │ │ ├── question-help.md
│ │ │ └── feature-request.md
│ │ └── workflows
│ │ │ ├── stale.yml
│ │ │ ├── release-conda.yml
│ │ │ └── github-torch-hub.yml
│ ├── templates
│ │ ├── adding_a_new_model
│ │ │ ├── open_model_proposals
│ │ │ │ └── README.md
│ │ │ ├── tests
│ │ │ │ ├── pt-seq-2-seq-bart-tokenizer.json
│ │ │ │ ├── encoder-bert-tokenizer.json
│ │ │ │ ├── pt-encoder-bert-tokenizer.json
│ │ │ │ ├── standalone.json
│ │ │ │ ├── tf-encoder-bert-tokenizer.json
│ │ │ │ └── tf-seq-2-seq-bart-tokenizer.json
│ │ │ ├── cookiecutter.json
│ │ │ └── cookiecutter-template-{{cookiecutter.modelname}}
│ │ │ │ └── configuration.json
│ │ └── adding_a_new_example_script
│ │ │ └── cookiecutter.json
│ ├── .coveragerc
│ ├── docker
│ │ ├── transformers-pytorch-tpu
│ │ │ ├── docker-entrypoint.sh
│ │ │ ├── dataset.yaml
│ │ │ └── bert-base-cased.jsonnet
│ │ ├── transformers-pytorch-cpu
│ │ │ └── Dockerfile
│ │ ├── transformers-tensorflow-cpu
│ │ │ └── Dockerfile
│ │ ├── transformers-cpu
│ │ │ └── Dockerfile
│ │ ├── transformers-tensorflow-gpu
│ │ │ └── Dockerfile
│ │ ├── transformers-pytorch-gpu
│ │ │ └── Dockerfile
│ │ └── transformers-gpu
│ │ │ └── Dockerfile
│ ├── scripts
│ │ ├── tatoeba
│ │ │ └── upload_models.sh
│ │ ├── fsmt
│ │ │ └── tests-to-run.sh
│ │ └── pegasus
│ │ │ └── build_test_sample_spm_no_bos.py
│ ├── setup.cfg
│ └── model_cards
│ │ └── README.md
├── requirements.txt
└── test_special_tokens.py
├── requirements.txt
└── sorting
├── run_sort_inftyformer.sh
└── utils
└── exp_utils.py
/finetune_gpt2/examples/benchmarking/requirements.txt:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/finetune_gpt2/src/transformers/benchmark/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/finetune_gpt2/src/transformers/models/dialogpt/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/bertabs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/deebert/src/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/finetune_gpt2/src/transformers.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 |
--------------------------------------------------------------------------------
/finetune_gpt2/src/transformers.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | transformers
2 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/legacy/seq2seq/test_data/test_data:
--------------------------------------------------------------------------------
1 | seq2seq/test_data
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/bert-loses-patience/pabee/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/benchmarking/requirements.txt:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/src/transformers/benchmark/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/question-answering/requirements.txt:
--------------------------------------------------------------------------------
1 | datasets >= 1.2.1
2 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/bertabs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/src/transformers/models/dialogpt/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docs/source/contributing.md:
--------------------------------------------------------------------------------
1 | ../../CONTRIBUTING.md
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docs/source/examples.md:
--------------------------------------------------------------------------------
1 | ../../examples/README.md
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docs/source/notebooks.md:
--------------------------------------------------------------------------------
1 | ../../notebooks/README.md
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/deebert/src/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/src/transformers/models/gpt2/pre_process_wmt19.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/bertology/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers == 3.5.1
2 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/deebert/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers == 3.5.1
2 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/multiple-choice/requirements.txt:
--------------------------------------------------------------------------------
1 | sentencepiece != 0.1.92
2 | protobuf
3 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/adversarial/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers == 3.5.1
2 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/bert-loses-patience/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers == 3.5.1
--------------------------------------------------------------------------------
/finetune_gpt2/examples/text-generation/requirements.txt:
--------------------------------------------------------------------------------
1 | sentencepiece != 0.1.92
2 | protobuf
3 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/token-classification/requirements.txt:
--------------------------------------------------------------------------------
1 | seqeval
2 | datasets >= 1.1.3
3 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/.gitattributes:
--------------------------------------------------------------------------------
1 | *.py eol=lf
2 | *.rst eol=lf
3 | *.md eol=lf
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/legacy/seq2seq/test_data/test_data:
--------------------------------------------------------------------------------
1 | seq2seq/test_data
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/question-answering/requirements.txt:
--------------------------------------------------------------------------------
1 | datasets >= 1.2.1
2 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/bert-loses-patience/pabee/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/adversarial/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers == 3.5.1
2 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/bertology/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers == 3.5.1
2 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/deebert/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers == 3.5.1
2 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/token-classification/requirements.txt:
--------------------------------------------------------------------------------
1 | seqeval
2 | datasets >= 1.1.3
3 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/language-modeling/requirements.txt:
--------------------------------------------------------------------------------
1 | datasets >= 1.1.3
2 | sentencepiece != 0.1.92
3 | protobuf
4 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/multiple-choice/requirements.txt:
--------------------------------------------------------------------------------
1 | sentencepiece != 0.1.92
2 | protobuf
3 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/bert-loses-patience/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers == 3.5.1
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/text-generation/requirements.txt:
--------------------------------------------------------------------------------
1 | sentencepiece != 0.1.92
2 | protobuf
3 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.black]
2 | line-length = 119
3 | target-version = ['py35']
4 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/.github/conda/build.sh:
--------------------------------------------------------------------------------
1 | $PYTHON setup.py install # Python command to install the script.
2 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/bertabs/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers == 3.5.1
2 |
3 | # For ROUGE
4 | nltk
5 | py-rouge
6 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/longform-qa/requirements.txt:
--------------------------------------------------------------------------------
1 | datasets >= 1.1.3
2 | faiss-cpu
3 | streamlit
4 | elasticsearch
5 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/mlm_wwm/requirements.txt:
--------------------------------------------------------------------------------
1 | datasets >= 1.1.3
2 | sentencepiece != 0.1.92
3 | protobuf
4 | ltp
5 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/text-classification/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | datasets >= 1.1.3
3 | sentencepiece != 0.1.92
4 | protobuf
5 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/language-modeling/requirements.txt:
--------------------------------------------------------------------------------
1 | datasets >= 1.1.3
2 | sentencepiece != 0.1.92
3 | protobuf
4 |
--------------------------------------------------------------------------------
/finetune_gpt2/src/transformers.egg-info/entry_points.txt:
--------------------------------------------------------------------------------
1 | [console_scripts]
2 | transformers-cli = transformers.commands.transformers_cli:main
3 |
4 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/legacy/seq2seq/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 |
5 | sys.path.insert(1, os.path.dirname(os.path.realpath(__file__)))
6 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/seq2seq/requirements.txt:
--------------------------------------------------------------------------------
1 | datasets >= 1.1.3
2 | sentencepiece != 0.1.92
3 | protobuf
4 | sacrebleu >= 1.4.12
5 | rouge-score
6 | nltk
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/bertabs/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers == 3.5.1
2 |
3 | # For ROUGE
4 | nltk
5 | py-rouge
6 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/rag/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 |
5 | sys.path.insert(1, os.path.dirname(os.path.realpath(__file__)))
6 |
--------------------------------------------------------------------------------
/document_grounded_generation/requirements.txt:
--------------------------------------------------------------------------------
1 | torch
2 | pytorch-ignite
3 | transformers==2.5.1
4 | tensorboardX==1.8
5 | tensorflow # for tensorboardX
6 | spacy
7 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/longform-qa/requirements.txt:
--------------------------------------------------------------------------------
1 | datasets >= 1.1.3
2 | faiss-cpu
3 | streamlit
4 | elasticsearch
5 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/mlm_wwm/requirements.txt:
--------------------------------------------------------------------------------
1 | datasets >= 1.1.3
2 | sentencepiece != 0.1.92
3 | protobuf
4 | ltp
5 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/text-classification/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | datasets >= 1.1.3
3 | sentencepiece != 0.1.92
4 | protobuf
5 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/seq2seq/requirements.txt:
--------------------------------------------------------------------------------
1 | datasets >= 1.1.3
2 | sentencepiece != 0.1.92
3 | protobuf
4 | sacrebleu >= 1.4.12
5 | rouge-score
6 | nltk
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/pplm/imgs/wooly.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/finetune_gpt2/examples/research_projects/pplm/imgs/wooly.png
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/legacy/seq2seq/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 |
5 | sys.path.insert(1, os.path.dirname(os.path.realpath(__file__)))
6 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/wav2vec2/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers
2 | datasets
3 | torch>=1.5.0
4 | torchaudio
5 | jiwer==2.2.0
6 | lang-trans==0.6.0
7 | librosa==0.8.0
8 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docs/source/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/favicon.ico
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/rag/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 |
5 | sys.path.insert(1, os.path.dirname(os.path.realpath(__file__)))
6 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/legacy/seq2seq/test_data/wmt_en_ro/val.len:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/finetune_gpt2/examples/legacy/seq2seq/test_data/wmt_en_ro/val.len
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/pplm/imgs/headfigure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/finetune_gpt2/examples/research_projects/pplm/imgs/headfigure.png
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/rag/requirements.txt:
--------------------------------------------------------------------------------
1 | faiss-cpu >= 1.6.3
2 | datasets >= 1.0.1
3 | psutil >= 5.7.0
4 | torch >= 1.4.0
5 | transformers
6 | pytorch-lightning==1.0.4
7 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/legacy/seq2seq/test_data/wmt_en_ro/train.len:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/finetune_gpt2/examples/legacy/seq2seq/test_data/wmt_en_ro/train.len
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/distillation/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers
2 |
3 | gitpython==3.0.2
4 | tensorboard>=1.14.0
5 | tensorboardX==1.8
6 | psutil==5.6.6
7 | scipy>=1.4.1
8 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docs/source/imgs/ppl_full.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/ppl_full.gif
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docs/source/imgs/ppl_chunked.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/ppl_chunked.gif
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docs/source/imgs/ppl_sliding.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/ppl_sliding.gif
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/wav2vec2/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers
2 | datasets
3 | torch>=1.5.0
4 | torchaudio
5 | jiwer==2.2.0
6 | lang-trans==0.6.0
7 | librosa==0.8.0
8 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/templates/adding_a_new_model/open_model_proposals/README.md:
--------------------------------------------------------------------------------
1 | Currently the following model proposals are available:
2 |
3 | - [BigBird (Google)](./ADD_BIG_BIRD.md)
4 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/distillation/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers
2 |
3 | gitpython==3.0.2
4 | tensorboard>=1.14.0
5 | tensorboardX==1.8
6 | psutil==5.6.6
7 | scipy>=1.4.1
8 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/rag/requirements.txt:
--------------------------------------------------------------------------------
1 | faiss-cpu >= 1.6.3
2 | datasets >= 1.0.1
3 | psutil >= 5.7.0
4 | torch >= 1.4.0
5 | transformers
6 | pytorch-lightning==1.0.4
7 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docs/source/_static/css/Calibre-Light.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/_static/css/Calibre-Light.ttf
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docs/source/_static/css/Calibre-Thin.otf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/_static/css/Calibre-Thin.otf
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docs/source/imgs/local_attention_mask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/local_attention_mask.png
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/movement-pruning/emmental/modules/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from .binarizer import MagnitudeBinarizer, ThresholdBinarizer, TopKBinarizer
3 | from .masked_nn import MaskedLinear
4 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docs/source/_static/css/Calibre-Medium.otf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/_static/css/Calibre-Medium.otf
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docs/source/_static/css/Calibre-Regular.otf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/_static/css/Calibre-Regular.otf
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docs/source/imgs/transformers_logo_name.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/transformers_logo_name.png
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docs/source/imgs/transformers_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/transformers_overview.png
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docs/source/imgs/warmup_cosine_schedule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/warmup_cosine_schedule.png
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docs/source/imgs/warmup_linear_schedule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/warmup_linear_schedule.png
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docs/source/imgs/warmup_constant_schedule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/warmup_constant_schedule.png
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/pplm/imgs/wooly.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/examples/research_projects/pplm/imgs/wooly.png
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/legacy/seq2seq/test_data/wmt_en_ro/val.len:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/examples/legacy/seq2seq/test_data/wmt_en_ro/val.len
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/pplm/imgs/headfigure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/examples/research_projects/pplm/imgs/headfigure.png
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/legacy/seq2seq/test_data/wmt_en_ro/train.len:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/examples/legacy/seq2seq/test_data/wmt_en_ro/train.len
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/movement-pruning/emmental/modules/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from .binarizer import MagnitudeBinarizer, ThresholdBinarizer, TopKBinarizer
3 | from .masked_nn import MaskedLinear
4 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deep-spin/infinite-former/HEAD/document_grounded_generation/transformers/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/lxmert/README.md:
--------------------------------------------------------------------------------
1 | # LXMERT DEMO
2 |
3 | 1. Make a virtualenv: ``virtualenv venv`` and activate it: ``source venv/bin/activate``
4 | 2. Install the requirements: ``pip install -r ./requirements.txt``
5 | 3. Usage is shown in demo.ipynb
6 |
--------------------------------------------------------------------------------
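Note: a minimal end-to-end sketch of the setup steps from the LXMERT README above, assuming a POSIX shell and that requirements.txt and demo.ipynb live in the lxmert directory; the final jupyter command is illustrative and not part of the README:

    virtualenv venv                    # create an isolated environment
    source venv/bin/activate           # activate it
    pip install -r ./requirements.txt  # install the demo requirements
    jupyter notebook demo.ipynb        # open the demo notebook (assumes jupyter is installed)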
/finetune_gpt2/examples/research_projects/movement-pruning/requirements.txt:
--------------------------------------------------------------------------------
1 | torch>=1.4.0
2 | -e git+https://github.com/huggingface/transformers.git@352d5472b0c1dec0f420d606d16747d851b4bda8#egg=transformers
3 | knockknock>=0.1.8.1
4 | h5py>=2.10.0
5 | numpy>=1.18.2
6 | scipy>=1.4.1
7 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/lxmert/README.md:
--------------------------------------------------------------------------------
1 | # LXMERT DEMO
2 |
3 | 1. Make a virtualenv: ``virtualenv venv`` and activate it: ``source venv/bin/activate``
4 | 2. Install the requirements: ``pip install -r ./requirements.txt``
5 | 3. Usage is shown in demo.ipynb
6 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/distillation/training_configs/distilgpt2.json:
--------------------------------------------------------------------------------
1 | {
2 | "initializer_range": 0.02,
3 | "layer_norm_epsilon": 0.00001,
4 | "n_ctx": 1024,
5 | "n_embd": 768,
6 | "n_head": 12,
7 | "n_layer": 6,
8 | "n_positions": 1024,
9 | "vocab_size": 50257
10 | }
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/movement-pruning/requirements.txt:
--------------------------------------------------------------------------------
1 | torch>=1.4.0
2 | -e git+https://github.com/huggingface/transformers.git@352d5472b0c1dec0f420d606d16747d851b4bda8#egg=transformers
3 | knockknock>=0.1.8.1
4 | h5py>=2.10.0
5 | numpy>=1.18.2
6 | scipy>=1.4.1
7 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | source=transformers
3 | omit =
4 | # skip conversion scripts from testing for now
5 | */convert_*
6 | */__main__.py
7 | [report]
8 | exclude_lines =
9 | pragma: no cover
10 | raise
11 | except
12 | register_parameter
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/distillation/training_configs/distilgpt2.json:
--------------------------------------------------------------------------------
1 | {
2 | "initializer_range": 0.02,
3 | "layer_norm_epsilon": 0.00001,
4 | "n_ctx": 1024,
5 | "n_embd": 768,
6 | "n_head": 12,
7 | "n_layer": 6,
8 | "n_positions": 1024,
9 | "vocab_size": 50257
10 | }
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | click==8.0.3
2 | gensim==3.8.3
3 | ignite==1.1.0
4 | matplotlib==3.4.3
5 | numpy==1.21.3
6 | pytorch-ignite==0.4.7
7 | pytorch-lightning==1.6.0
8 | rouge-score==0.0.4
9 | sacrebleu==2.0.0
10 | scikit-learn==1.0.1
11 | scipy==1.7.1
12 | tensorboard==2.9.0
13 | tensorboardX==1.8
14 | torch==1.9.0
15 | tqdm==4.62.3
16 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docker/transformers-pytorch-tpu/docker-entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | source ~/.bashrc
3 | echo "running docker-entrypoint.sh"
4 | conda activate container
5 | echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS
6 | echo "printed TPU info"
7 | export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"
8 | exec "$@"
9 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/legacy/seq2seq/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorboard
2 | scikit-learn
3 | seqeval
4 | psutil
5 | sacrebleu
6 | rouge-score
7 | tensorflow_datasets
8 | matplotlib
9 | git-python==1.0.3
10 | faiss-cpu
11 | streamlit
12 | elasticsearch
13 | nltk
14 | pandas
15 | datasets >= 1.1.3
16 | fire
17 | pytest
18 | conllu
19 | sentencepiece != 0.1.92
20 | protobuf
21 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/_tests_requirements.txt:
--------------------------------------------------------------------------------
1 | tensorboard
2 | scikit-learn
3 | seqeval
4 | psutil
5 | sacrebleu >= 1.4.12
6 | rouge-score
7 | tensorflow_datasets
8 | matplotlib
9 | git-python==1.0.3
10 | faiss-cpu
11 | streamlit
12 | elasticsearch
13 | nltk
14 | pandas
15 | datasets >= 1.1.3
16 | fire
17 | pytest
18 | conllu
19 | sentencepiece != 0.1.92
20 | protobuf
21 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/_tests_requirements.txt:
--------------------------------------------------------------------------------
1 | tensorboard
2 | scikit-learn
3 | seqeval
4 | psutil
5 | sacrebleu >= 1.4.12
6 | rouge-score
7 | tensorflow_datasets
8 | matplotlib
9 | git-python==1.0.3
10 | faiss-cpu
11 | streamlit
12 | elasticsearch
13 | nltk
14 | pandas
15 | datasets >= 1.1.3
16 | fire
17 | pytest
18 | conllu
19 | sentencepiece != 0.1.92
20 | protobuf
21 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/legacy/seq2seq/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorboard
2 | scikit-learn
3 | seqeval
4 | psutil
5 | sacrebleu
6 | rouge-score
7 | tensorflow_datasets
8 | matplotlib
9 | git-python==1.0.3
10 | faiss-cpu
11 | streamlit
12 | elasticsearch
13 | nltk
14 | pandas
15 | datasets >= 1.1.3
16 | fire
17 | pytest
18 | conllu
19 | sentencepiece != 0.1.92
20 | protobuf
21 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/performer/full_script.sh:
--------------------------------------------------------------------------------
1 | TOKENIZERS_PARALLELISM=true python run_mlm_performer.py --output_dir experiments --dataset_name wikipedia --dataset_config_name 20200501.en --model_name_or_path bert-large-cased --tokenizer_name bert-large-cased --do_train --overwrite_output_dir --per_device_train_batch_size 4 --learning_rate 5e-4 --warmup_steps 100 --num_train_epochs 3 --performer
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/performer/sanity_script.sh:
--------------------------------------------------------------------------------
1 | TOKENIZERS_PARALLELISM=true python run_mlm_performer.py --output_dir experiments --dataset_name wikipedia --dataset_config_name 20200501.simple --model_name_or_path bert-base-cased --tokenizer_name bert-base-cased --do_train --overwrite_output_dir --per_device_train_batch_size 4 --learning_rate 5e-4 --warmup_steps 100 --num_train_epochs 3 --performer
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/seq2seq-distillation/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorboard
2 | scikit-learn
3 | psutil
4 | sacrebleu
5 | rouge-score
6 | tensorflow_datasets
7 | pytorch-lightning==1.0.4
8 | matplotlib
9 | git-python==1.0.3
10 | faiss-cpu
11 | streamlit
12 | elasticsearch
13 | nltk
14 | pandas
15 | datasets >= 1.1.3
16 | fire
17 | pytest
18 | conllu
19 | sentencepiece != 0.1.92
20 | protobuf
21 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/scripts/tatoeba/upload_models.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | for FILE in converted/*; do
4 | model_name=`basename $FILE`
5 | transformers-cli repo create $model_name -y
6 | git clone https://huggingface.co/Helsinki-NLP/$model_name
7 | mv $FILE/* $model_name/
8 | cd $model_name
9 | git add . && git commit -m "initial commit"
10 | git push
11 | cd ..
12 | done
13 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/legacy/pytorch-lightning/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorboard
2 | scikit-learn
3 | seqeval
4 | psutil
5 | sacrebleu
6 | rouge-score
7 | tensorflow_datasets
8 | pytorch-lightning==1.0.4
9 | matplotlib
10 | git-python==1.0.3
11 | faiss-cpu
12 | streamlit
13 | elasticsearch
14 | nltk
15 | pandas
16 | datasets >= 1.1.3
17 | fire
18 | pytest
19 | conllu
20 | sentencepiece != 0.1.92
21 | protobuf
22 | ray
23 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/movement-pruning/emmental/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from .configuration_bert_masked import MaskedBertConfig
3 | from .modeling_bert_masked import (
4 | MaskedBertForMultipleChoice,
5 | MaskedBertForQuestionAnswering,
6 | MaskedBertForSequenceClassification,
7 | MaskedBertForTokenClassification,
8 | MaskedBertModel,
9 | )
10 | from .modules import *
11 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/performer/full_script.sh:
--------------------------------------------------------------------------------
1 | TOKENIZERS_PARALLELISM=true python run_mlm_performer.py --output_dir experiments --dataset_name wikipedia --dataset_config_name 20200501.en --model_name_or_path bert-large-cased --tokenizer_name bert-large-cased --do_train --overwrite_output_dir --per_device_train_batch_size 4 --learning_rate 5e-4 --warmup_steps 100 --num_train_epochs 3 --performer
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/performer/sanity_script.sh:
--------------------------------------------------------------------------------
1 | TOKENIZERS_PARALLELISM=true python run_mlm_performer.py --output_dir experiments --dataset_name wikipedia --dataset_config_name 20200501.simple --model_name_or_path bert-base-cased --tokenizer_name bert-base-cased --do_train --overwrite_output_dir --per_device_train_batch_size 4 --learning_rate 5e-4 --warmup_steps 100 --num_train_epochs 3 --performer
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/distillation/training_configs/distilbert-base-cased.json:
--------------------------------------------------------------------------------
1 | {
2 | "activation": "gelu",
3 | "attention_dropout": 0.1,
4 | "dim": 768,
5 | "dropout": 0.1,
6 | "hidden_dim": 3072,
7 | "initializer_range": 0.02,
8 | "max_position_embeddings": 512,
9 | "n_heads": 12,
10 | "n_layers": 6,
11 | "sinusoidal_pos_embds": true,
12 | "tie_weights_": true,
13 | "vocab_size": 28996
14 | }
15 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/distillation/training_configs/distilbert-base-uncased.json:
--------------------------------------------------------------------------------
1 | {
2 | "activation": "gelu",
3 | "attention_dropout": 0.1,
4 | "dim": 768,
5 | "dropout": 0.1,
6 | "hidden_dim": 3072,
7 | "initializer_range": 0.02,
8 | "max_position_embeddings": 512,
9 | "n_heads": 12,
10 | "n_layers": 6,
11 | "sinusoidal_pos_embds": true,
12 | "tie_weights_": true,
13 | "vocab_size": 30522
14 | }
15 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/pplm/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorboard
2 | scikit-learn
3 | seqeval
4 | psutil
5 | sacrebleu
6 | rouge-score
7 | tensorflow_datasets
8 | pytorch-lightning==1.0.4
9 | matplotlib
10 | git-python==1.0.3
11 | faiss-cpu
12 | streamlit
13 | elasticsearch
14 | nltk
15 | pandas
16 | datasets >= 1.1.3
17 | fire
18 | pytest
19 | conllu
20 | sentencepiece != 0.1.92
21 | protobuf
22 | transformers==3.5.1
23 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorboard
2 | scikit-learn
3 | psutil
4 | sacrebleu
5 | rouge-score
6 | tensorflow_datasets
7 | pytorch-lightning==1.0.4
8 | matplotlib
9 | git-python==1.0.3
10 | faiss-cpu
11 | streamlit
12 | elasticsearch
13 | nltk
14 | pandas
15 | datasets >= 1.1.3
16 | fire
17 | pytest
18 | conllu
19 | sentencepiece != 0.1.92
20 | protobuf
21 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docs/source/_static/css/code-snippets.css:
--------------------------------------------------------------------------------
1 |
2 | .highlight .c1, .highlight .sd{
3 | color: #999
4 | }
5 |
6 | .highlight .nn, .highlight .k, .highlight .s1, .highlight .nb, .highlight .bp, .highlight .kc {
7 | color: #FB8D68;
8 | }
9 |
10 | .highlight .kn, .highlight .nv, .highlight .s2, .highlight .ow {
11 | color: #6670FF;
12 | }
13 |
14 | .highlight .gp {
15 | color: #FB8D68;
16 | }
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/movement-pruning/emmental/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from .configuration_bert_masked import MaskedBertConfig
3 | from .modeling_bert_masked import (
4 | MaskedBertForMultipleChoice,
5 | MaskedBertForQuestionAnswering,
6 | MaskedBertForSequenceClassification,
7 | MaskedBertForTokenClassification,
8 | MaskedBertModel,
9 | )
10 | from .modules import *
11 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/templates/adding_a_new_example_script/cookiecutter.json:
--------------------------------------------------------------------------------
1 | {
2 | "example_name": "text classification",
3 | "directory_name": "{{cookiecutter.example_name|lower|replace(' ', '-')}}",
4 | "example_shortcut": "{{cookiecutter.directory_name}}",
5 | "model_class": "AutoModel",
6 | "authors": "The HuggingFace Team",
7 | "can_train_from_scratch": ["True", "False"],
8 | "with_trainer": ["True", "False"]
9 | }
--------------------------------------------------------------------------------
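Note: a hedged sketch of how this example-script template might be rendered with the cookiecutter CLI, assuming cookiecutter is installed and that the template directory also contains the templated files (only cookiecutter.json appears in this listing); cookiecutter prompts for each key defined in the JSON above:

    pip install cookiecutter
    # run from the root of the transformers checkout; answers fill example_name, model_class, etc.
    cookiecutter templates/adding_a_new_example_script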
/document_grounded_generation/transformers/examples/legacy/pytorch-lightning/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorboard
2 | scikit-learn
3 | seqeval
4 | psutil
5 | sacrebleu
6 | rouge-score
7 | tensorflow_datasets
8 | pytorch-lightning==1.0.4
9 | matplotlib
10 | git-python==1.0.3
11 | faiss-cpu
12 | streamlit
13 | elasticsearch
14 | nltk
15 | pandas
16 | datasets >= 1.1.3
17 | fire
18 | pytest
19 | conllu
20 | sentencepiece != 0.1.92
21 | protobuf
22 | ray
23 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/distillation/training_configs/distilbert-base-multilingual-cased.json:
--------------------------------------------------------------------------------
1 | {
2 | "activation": "gelu",
3 | "attention_dropout": 0.1,
4 | "dim": 768,
5 | "dropout": 0.1,
6 | "hidden_dim": 3072,
7 | "initializer_range": 0.02,
8 | "max_position_embeddings": 512,
9 | "n_heads": 12,
10 | "n_layers": 6,
11 | "sinusoidal_pos_embds": true,
12 | "tie_weights_": true,
13 | "vocab_size": 119547
14 | }
15 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/distillation/training_configs/distilbert-base-cased.json:
--------------------------------------------------------------------------------
1 | {
2 | "activation": "gelu",
3 | "attention_dropout": 0.1,
4 | "dim": 768,
5 | "dropout": 0.1,
6 | "hidden_dim": 3072,
7 | "initializer_range": 0.02,
8 | "max_position_embeddings": 512,
9 | "n_heads": 12,
10 | "n_layers": 6,
11 | "sinusoidal_pos_embds": true,
12 | "tie_weights_": true,
13 | "vocab_size": 28996
14 | }
15 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/pplm/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorboard
2 | scikit-learn
3 | seqeval
4 | psutil
5 | sacrebleu
6 | rouge-score
7 | tensorflow_datasets
8 | pytorch-lightning==1.0.4
9 | matplotlib
10 | git-python==1.0.3
11 | faiss-cpu
12 | streamlit
13 | elasticsearch
14 | nltk
15 | pandas
16 | datasets >= 1.1.3
17 | fire
18 | pytest
19 | conllu
20 | sentencepiece != 0.1.92
21 | protobuf
22 | transformers==3.5.1
23 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/seq2seq-distillation/finetune.sh:
--------------------------------------------------------------------------------
1 | # the proper usage is documented in the README; you need to specify data_dir, output_dir and model_name_or_path
2 | # run ./finetune.sh --help to see all the possible options; an illustrative invocation is sketched at the end of this file
3 | python finetune.py \
4 | --learning_rate=3e-5 \
5 | --fp16 \
6 | --gpus 1 \
7 | --do_train \
8 | --do_predict \
9 | --n_val 1000 \
10 | --val_check_interval 0.1 \
11 | "$@"
12 |
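13 | # illustrative invocation (a sketch only; the data dir, output dir and model id below are placeholders,
14 | # not defaults shipped with this script):
15 | #   ./finetune.sh --data_dir ./cnn_dm --output_dir ./ft_out --model_name_or_path facebook/bart-large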
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/distillation/training_configs/distilbert-base-uncased.json:
--------------------------------------------------------------------------------
1 | {
2 | "activation": "gelu",
3 | "attention_dropout": 0.1,
4 | "dim": 768,
5 | "dropout": 0.1,
6 | "hidden_dim": 3072,
7 | "initializer_range": 0.02,
8 | "max_position_embeddings": 512,
9 | "n_heads": 12,
10 | "n_layers": 6,
11 | "sinusoidal_pos_embds": true,
12 | "tie_weights_": true,
13 | "vocab_size": 30522
14 | }
15 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/distillation/training_configs/distilbert-base-multilingual-cased.json:
--------------------------------------------------------------------------------
1 | {
2 | "activation": "gelu",
3 | "attention_dropout": 0.1,
4 | "dim": 768,
5 | "dropout": 0.1,
6 | "hidden_dim": 3072,
7 | "initializer_range": 0.02,
8 | "max_position_embeddings": 512,
9 | "n_heads": 12,
10 | "n_layers": 6,
11 | "sinusoidal_pos_embds": true,
12 | "tie_weights_": true,
13 | "vocab_size": 119547
14 | }
15 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/finetune.sh:
--------------------------------------------------------------------------------
1 | # the proper usage is documented in the README; you need to specify data_dir, output_dir and model_name_or_path
2 | # run ./finetune.sh --help to see all the possible options
3 | python finetune.py \
4 | --learning_rate=3e-5 \
5 | --fp16 \
6 | --gpus 1 \
7 | --do_train \
8 | --do_predict \
9 | --n_val 1000 \
10 | --val_check_interval 0.1 \
11 | "$@"
12 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/seq2seq-distillation/finetune_t5.sh:
--------------------------------------------------------------------------------
1 | # Add parent directory to python path to access lightning_base.py
2 | export PYTHONPATH="../":"${PYTHONPATH}"
3 |
4 | python finetune.py \
5 | --data_dir=$CNN_DIR \
6 | --learning_rate=3e-5 \
7 | --train_batch_size=$BS \
8 | --eval_batch_size=$BS \
9 | --output_dir=$OUTPUT_DIR \
10 | --max_source_length=512 \
11 | --max_target_length=56 \
12 | --val_check_interval=0.1 --n_val=200 \
13 | --do_train --do_predict \
14 | "$@"
15 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json:
--------------------------------------------------------------------------------
1 | {
2 | "modelname": "NewENCDEC",
3 | "uppercase_modelname": "NEW_ENC_DEC",
4 | "lowercase_modelname": "new_enc_dec",
5 | "camelcase_modelname": "NewEncDec",
6 | "authors": "The HuggingFace Team",
7 | "checkpoint_identifier": "new-enc-dec-base",
8 | "tokenizer_type": "Based on BART",
9 | "generate_tensorflow_and_pytorch": "PyTorch",
10 | "is_encoder_decoder_model": "True"
11 | }
12 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/distillation/training_configs/distilroberta-base.json:
--------------------------------------------------------------------------------
1 | {
2 | "vocab_size": 50265,
3 | "hidden_size": 768,
4 | "num_hidden_layers": 6,
5 | "num_attention_heads": 12,
6 | "intermediate_size": 3072,
7 | "hidden_act": "gelu",
8 | "hidden_dropout_prob": 0.1,
9 | "attention_probs_dropout_prob": 0.1,
10 | "max_position_embeddings": 514,
11 | "type_vocab_size": 1,
12 | "initializer_range": 0.02,
13 | "layer_norm_eps": 0.00001
14 | }
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/templates/adding_a_new_model/tests/encoder-bert-tokenizer.json:
--------------------------------------------------------------------------------
1 | {
2 | "modelname": "Template",
3 | "uppercase_modelname": "TEMPLATE",
4 | "lowercase_modelname": "template",
5 | "camelcase_modelname": "Template",
6 | "authors": "The HuggingFace Team",
7 | "checkpoint_identifier": "brand-new-bert-base-cased",
8 | "tokenizer_type": "Based on BERT",
9 | "generate_tensorflow_and_pytorch": "PyTorch & TensorFlow",
10 | "is_encoder_decoder_model": "False"
11 | }
12 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json:
--------------------------------------------------------------------------------
1 | {
2 | "modelname": "TemplatePT",
3 | "uppercase_modelname": "TEMPLATE_PT",
4 | "lowercase_modelname": "template_pt",
5 | "camelcase_modelname": "TemplatePt",
6 | "authors": "The HuggingFace Team",
7 | "checkpoint_identifier": "brand-new-bert-base-cased",
8 | "tokenizer_type": "Based on BERT",
9 | "generate_tensorflow_and_pytorch": "PyTorch",
10 | "is_encoder_decoder_model": "False"
11 | }
12 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/templates/adding_a_new_model/tests/standalone.json:
--------------------------------------------------------------------------------
1 | {
2 | "modelname": "TemplateBI",
3 | "uppercase_modelname": "TEMPLATE_BI",
4 | "lowercase_modelname": "template_bi",
5 | "camelcase_modelname": "TemplateBi",
6 | "authors": "The HuggingFace Team",
7 | "checkpoint_identifier": "bi-brand-new-bert-base-cased",
8 | "tokenizer_type": "Standalone",
9 | "generate_tensorflow_and_pytorch": "PyTorch & TensorFlow",
10 | "is_encoder_decoder_model": "False"
11 | }
12 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json:
--------------------------------------------------------------------------------
1 | {
2 | "modelname": "TemplateTF",
3 | "uppercase_modelname": "TEMPLATE_TF",
4 | "lowercase_modelname": "template_tf",
5 | "camelcase_modelname": "TemplateTf",
6 | "authors": "The HuggingFace Team",
7 | "checkpoint_identifier": "brand-new-bert-base-cased",
8 | "tokenizer_type": "Based on BERT",
9 | "generate_tensorflow_and_pytorch": "TensorFlow",
10 | "is_encoder_decoder_model": "False"
11 | }
12 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json:
--------------------------------------------------------------------------------
1 | {
2 | "modelname": "NewTFENCDEC",
3 | "uppercase_modelname": "NEW_TF_ENC_DEC",
4 | "lowercase_modelname": "new_tf_enc_dec",
5 | "camelcase_modelname": "NewTFEncDec",
6 | "authors": "The HuggingFace Team",
7 | "checkpoint_identifier": "new-tf-enc-dec-base",
8 | "tokenizer_type": "Based on BART",
9 | "generate_tensorflow_and_pytorch": "TensorFlow",
10 | "is_encoder_decoder_model": "True"
11 | }
12 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/finetune_t5.sh:
--------------------------------------------------------------------------------
1 | # Add parent directory to python path to access lightning_base.py
2 | export PYTHONPATH="../":"${PYTHONPATH}"
3 |
4 | python finetune.py \
5 | --data_dir=$CNN_DIR \
6 | --learning_rate=3e-5 \
7 | --train_batch_size=$BS \
8 | --eval_batch_size=$BS \
9 | --output_dir=$OUTPUT_DIR \
10 | --max_source_length=512 \
11 | --max_target_length=56 \
12 | --val_check_interval=0.1 --n_val=200 \
13 | --do_train --do_predict \
14 | "$@"
15 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/distillation/training_configs/distilroberta-base.json:
--------------------------------------------------------------------------------
1 | {
2 | "vocab_size": 50265,
3 | "hidden_size": 768,
4 | "num_hidden_layers": 6,
5 | "num_attention_heads": 12,
6 | "intermediate_size": 3072,
7 | "hidden_act": "gelu",
8 | "hidden_dropout_prob": 0.1,
9 | "attention_probs_dropout_prob": 0.1,
10 | "max_position_embeddings": 514,
11 | "type_vocab_size": 1,
12 | "initializer_range": 0.02,
13 | "layer_norm_eps": 0.00001
14 | }
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/templates/adding_a_new_model/cookiecutter.json:
--------------------------------------------------------------------------------
1 | {
2 | "modelname": "BrandNewBERT",
3 | "uppercase_modelname": "BRAND_NEW_BERT",
4 | "lowercase_modelname": "brand_new_bert",
5 | "camelcase_modelname": "BrandNewBert",
6 | "authors": "The HuggingFace Team",
7 | "checkpoint_identifier": "brand-new-bert-base-cased",
8 | "tokenizer_type": ["Based on BERT", "Based on BART", "Standalone"],
9 | "generate_tensorflow_and_pytorch": ["PyTorch & TensorFlow", "PyTorch", "TensorFlow"],
10 | "is_encoder_decoder_model": ["True", "False"]
11 | }
12 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/.github/ISSUE_TEMPLATE/---new-benchmark.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: "\U0001F5A5 New benchmark"
3 | about: Benchmark a part of this library and share your results
4 | title: "[Benchmark]"
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | # 🖥 Benchmarking `transformers`
11 |
12 | ## Benchmark
13 |
14 | Which part of `transformers` did you benchmark?
15 |
16 | ## Set-up
17 |
18 | What did you run your benchmarks on? Please include details such as the CPU and GPU used; if you used multiple GPUs, which parallelization did you use?
19 |
20 | ## Results
21 |
22 | Put your results here!
23 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/.github/ISSUE_TEMPLATE/--new-model-addition.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: "\U0001F31F New model addition"
3 | about: Submit a proposal/request to implement a new Transformer-based model
4 | title: ''
5 | labels: New model
6 | assignees: ''
7 |
8 | ---
9 |
10 | # 🌟 New model addition
11 |
12 | ## Model description
13 |
14 |
15 |
16 | ## Open source status
17 |
18 | * [ ] the model implementation is available: (give details)
19 | * [ ] the model weights are available: (give details)
20 | * [ ] who are the authors: (mention them, if possible by @gh-username)
21 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/seq2seq-distillation/finetune_pegasus_xsum.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | export PYTHONPATH="../":"${PYTHONPATH}"
3 |
4 | # From appendix C of paper https://arxiv.org/abs/1912.08777
5 | # Set --gradient_accumulation_steps so that effective batch size is 256 (2*128, 4*64, 8*32, 16*16)
6 | python finetune.py \
7 | --learning_rate=1e-4 \
8 | --do_train \
9 | --do_predict \
10 | --n_val 1000 \
11 | --val_check_interval 0.25 \
12 | --max_source_length 512 --max_target_length 56 \
13 | --freeze_embeds --label_smoothing 0.1 --adafactor --task summarization_xsum \
14 | "$@"
15 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/finetune_pegasus_xsum.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | export PYTHONPATH="../":"${PYTHONPATH}"
3 |
4 | # From appendix C of paper https://arxiv.org/abs/1912.08777
5 | # Set --gradient_accumulation_steps so that effective batch size is 256 (2*128, 4*64, 8*32, 16*16)
6 | python finetune.py \
7 | --learning_rate=1e-4 \
8 | --do_train \
9 | --do_predict \
10 | --n_val 1000 \
11 | --val_check_interval 0.25 \
12 | --max_source_length 512 --max_target_length 56 \
13 | --freeze_embeds --label_smoothing 0.1 --adafactor --task summarization_xsum \
14 | "$@"
15 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SOURCEDIR = source
8 | BUILDDIR = _build
9 |
10 | # Put it first so that "make" without argument is like "make help".
11 | help:
12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
13 |
14 | .PHONY: help Makefile
15 |
16 | # Catch-all target: route all unknown targets to Sphinx using the new
17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
18 | %: Makefile
19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration.json:
--------------------------------------------------------------------------------
1 | {
2 | "modelname": "{{cookiecutter.modelname}}",
3 | "uppercase_modelname": "{{cookiecutter.uppercase_modelname}}",
4 | "lowercase_modelname": "{{cookiecutter.lowercase_modelname}}",
5 | "camelcase_modelname": "{{cookiecutter.camelcase_modelname}}",
6 | "authors": "{{cookiecutter.authors}}",
7 | "checkpoint_identifier": "{{cookiecutter.checkpoint_identifier}}",
8 | "tokenizer_type": "{{cookiecutter.tokenizer_type}}",
9 | "generate_tensorflow_and_pytorch": "{{cookiecutter.generate_tensorflow_and_pytorch}}",
10 | "is_encoder_decoder_model": ["True", "False"]
11 | }
12 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/.github/workflows/stale.yml:
--------------------------------------------------------------------------------
1 | name: Stale Bot
2 |
3 | on:
4 | schedule:
5 | - cron: "0 0 * * *"
6 |
7 | jobs:
8 | close_stale_issues:
9 | name: Close Stale Issues
10 | if: github.repository == 'huggingface/transformers'
11 | runs-on: ubuntu-latest
12 | env:
13 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
14 | steps:
15 | - uses: actions/checkout@v2
16 |
17 | - name: Setup Python
18 | uses: actions/setup-python@v1
19 | with:
20 | python-version: 3.7
21 |
22 | - name: Install requirements
23 | run: |
24 | pip install PyGithub
25 | - name: Close stale issues
26 | run: |
27 | python scripts/stale.py
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/longform-qa/README.md:
--------------------------------------------------------------------------------
1 | # Long Form Question Answering
2 |
3 | Author: @yjernite
4 |
5 | This folder contains the code for the Long Form Question Answering [demo](http://35.226.96.115:8080/), as well as methods to train and use a fully end-to-end Long Form Question Answering system using the [🤗transformers](https://github.com/huggingface/transformers) and [🤗datasets](https://github.com/huggingface/datasets) libraries.
6 |
7 | You can use these methods to train your own system by following along with the associated [notebook](https://github.com/huggingface/notebooks/blob/master/longform-qa/Long_Form_Question_Answering_with_ELI5_and_Wikipedia.ipynb) or [blog post](https://yjernite.github.io/lfqa.html).
8 |
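9 | As a minimal sketch of the first step in that notebook, the ELI5 training data can be pulled in with 🤗datasets; the dataset id and split name below come from the linked notebook and are assumptions, not something defined by the code in this folder:
10 |
11 | ```python
12 | # Sketch only: load the ELI5 question/answer pairs used in the notebook.
13 | from datasets import load_dataset
14 |
15 | eli5 = load_dataset("eli5", split="train_eli5")  # assumed dataset id and split name
16 | print(eli5[0]["title"])  # each example carries a question title and long-form answers
17 | ```
18 |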
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/wav2vec2/finetune_base_100.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | python run_asr.py \
3 | --output_dir="./wav2vec2-base-100h" \
4 | --num_train_epochs="30" \
5 | --per_device_train_batch_size="32" \
6 | --per_device_eval_batch_size="32" \
7 | --evaluation_strategy="steps" \
8 | --save_total_limit="3" \
9 | --save_steps="500" \
10 | --eval_steps="100" \
11 | --logging_steps="50" \
12 | --learning_rate="5e-4" \
13 | --warmup_steps="3000" \
14 | --model_name_or_path="facebook/wav2vec2-base" \
15 | --fp16 \
16 | --dataset_name="librispeech_asr" \
17 | --dataset_config_name="clean" \
18 | --train_split_name="train.100" \
19 | --preprocessing_num_workers="32" \
20 | --group_by_length \
21 | --freeze_feature_extractor
22 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/longform-qa/README.md:
--------------------------------------------------------------------------------
1 | # Long Form Question Answering
2 |
3 | Author: @yjernite
4 |
5 | This folder contains the code for the Long Form Question Answering [demo](http://35.226.96.115:8080/), as well as methods to train and use a fully end-to-end Long Form Question Answering system using the [🤗transformers](https://github.com/huggingface/transformers) and [🤗datasets](https://github.com/huggingface/datasets) libraries.
6 |
7 | You can use these methods to train your own system by following along with the associated [notebook](https://github.com/huggingface/notebooks/blob/master/longform-qa/Long_Form_Question_Answering_with_ELI5_and_Wikipedia.ipynb) or [blog post](https://yjernite.github.io/lfqa.html).
8 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/seq2seq-distillation/train_mbart_cc25_enro.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | export PYTHONPATH="../":"${PYTHONPATH}"
3 |
4 | python finetune.py \
5 | --learning_rate=3e-5 \
6 | --fp16 \
7 | --do_train \
8 | --val_check_interval=0.25 \
9 | --adam_eps 1e-06 \
10 | --num_train_epochs 6 --src_lang en_XX --tgt_lang ro_RO \
11 | --data_dir $ENRO_DIR \
12 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \
13 | --train_batch_size=$BS --eval_batch_size=$BS \
14 | --task translation \
15 | --warmup_steps 500 \
16 | --freeze_embeds \
17 | --model_name_or_path=facebook/mbart-large-cc25 \
18 | "$@"
19 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/wav2vec2/finetune_large_lv60_100.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | python run_asr.py \
3 | --output_dir="./wav2vec2-large-lv60-100h" \
4 | --num_train_epochs="30" \
5 | --per_device_train_batch_size="16" \
6 | --per_device_eval_batch_size="16" \
7 | --evaluation_strategy="steps" \
8 | --save_total_limit="3" \
9 | --save_steps="500" \
10 | --eval_steps="100" \
11 | --logging_steps="50" \
12 | --learning_rate="5e-4" \
13 | --warmup_steps="3000" \
14 | --model_name_or_path="facebook/wav2vec2-large-lv60" \
15 | --fp16 \
16 | --dataset_name="librispeech_asr" \
17 | --dataset_config_name="clean" \
18 | --train_split_name="train.100" \
19 | --preprocessing_num_workers="32" \
20 | --group_by_length \
21 | --freeze_feature_extractor
22 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/wav2vec2/finetune_base_100.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | python run_asr.py \
3 | --output_dir="./wav2vec2-base-100h" \
4 | --num_train_epochs="30" \
5 | --per_device_train_batch_size="32" \
6 | --per_device_eval_batch_size="32" \
7 | --evaluation_strategy="steps" \
8 | --save_total_limit="3" \
9 | --save_steps="500" \
10 | --eval_steps="100" \
11 | --logging_steps="50" \
12 | --learning_rate="5e-4" \
13 | --warmup_steps="3000" \
14 | --model_name_or_path="facebook/wav2vec2-base" \
15 | --fp16 \
16 | --dataset_name="librispeech_asr" \
17 | --dataset_config_name="clean" \
18 | --train_split_name="train.100" \
19 | --preprocessing_num_workers="32" \
20 | --group_by_length \
21 | --freeze_feature_extractor
22 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/wav2vec2/finetune_base_timit_asr.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | python run_asr.py \
3 | --output_dir="./wav2vec2-base-timit-asr" \
4 | --num_train_epochs="30" \
5 | --per_device_train_batch_size="20" \
6 | --per_device_eval_batch_size="20" \
7 | --evaluation_strategy="steps" \
8 | --save_steps="500" \
9 | --eval_steps="100" \
10 | --logging_steps="50" \
11 | --learning_rate="5e-4" \
12 | --warmup_steps="3000" \
13 | --model_name_or_path="facebook/wav2vec2-base" \
14 | --fp16 \
15 | --dataset_name="timit_asr" \
16 | --train_split_name="train" \
17 | --validation_split_name="test" \
18 | --orthography="timit" \
19 | --preprocessing_num_workers="$(nproc)" \
20 | --group_by_length \
21 | --freeze_feature_extractor \
22 | --verbose_logging \
23 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/train_mbart_cc25_enro.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | export PYTHONPATH="../":"${PYTHONPATH}"
3 |
4 | python finetune.py \
5 | --learning_rate=3e-5 \
6 | --fp16 \
7 | --do_train \
8 | --val_check_interval=0.25 \
9 | --adam_eps 1e-06 \
10 | --num_train_epochs 6 --src_lang en_XX --tgt_lang ro_RO \
11 | --data_dir $ENRO_DIR \
12 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \
13 | --train_batch_size=$BS --eval_batch_size=$BS \
14 | --task translation \
15 | --warmup_steps 500 \
16 | --freeze_embeds \
17 | --model_name_or_path=facebook/mbart-large-cc25 \
18 | "$@"
19 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/seq2seq-distillation/dynamic_bs_example.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | export PYTHONPATH="../":"${PYTHONPATH}"
3 | export WANDB_PROJECT=dmar
4 | export MAX_LEN=128
5 | export m=sshleifer/student_marian_en_ro_6_1
6 | python finetune.py \
7 | --learning_rate=3e-4 \
8 | --do_train \
9 | --fp16 \
10 | --data_dir wmt_en_ro \
11 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \
12 | --freeze_encoder --freeze_embeds \
13 | --train_batch_size=48 --eval_batch_size=64 \
14 | --tokenizer_name $m --model_name_or_path $m --num_train_epochs=1 \
15 | --warmup_steps 500 --logger_name wandb --gpus 1 \
16 | --fp16_opt_level=O1 --task translation \
17 | "$@"
18 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docker/transformers-pytorch-cpu/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:18.04
2 | LABEL maintainer="Hugging Face"
3 | LABEL repository="transformers"
4 |
5 | RUN apt update && \
6 | apt install -y bash \
7 | build-essential \
8 | git \
9 | curl \
10 | ca-certificates \
11 | python3 \
12 | python3-pip && \
13 | rm -rf /var/lib/apt/lists
14 |
15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \
16 | python3 -m pip install --no-cache-dir \
17 | jupyter \
18 | torch
19 |
20 | WORKDIR /workspace
21 | COPY . transformers/
22 | RUN cd transformers/ && \
23 | python3 -m pip install --no-cache-dir .
24 |
25 | CMD ["/bin/bash"]
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/wav2vec2/finetune_large_lv60_100.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | python run_asr.py \
3 | --output_dir="./wav2vec2-large-lv60-100h" \
4 | --num_train_epochs="30" \
5 | --per_device_train_batch_size="16" \
6 | --per_device_eval_batch_size="16" \
7 | --evaluation_strategy="steps" \
8 | --save_total_limit="3" \
9 | --save_steps="500" \
10 | --eval_steps="100" \
11 | --logging_steps="50" \
12 | --learning_rate="5e-4" \
13 | --warmup_steps="3000" \
14 | --model_name_or_path="facebook/wav2vec2-large-lv60" \
15 | --fp16 \
16 | --dataset_name="librispeech_asr" \
17 | --dataset_config_name="clean" \
18 | --train_split_name="train.100" \
19 | --preprocessing_num_workers="32" \
20 | --group_by_length \
21 | --freeze_feature_extractor
22 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docker/transformers-tensorflow-cpu/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:18.04
2 | LABEL maintainer="Hugging Face"
3 | LABEL repository="transformers"
4 |
5 | RUN apt update && \
6 | apt install -y bash \
7 | build-essential \
8 | git \
9 | curl \
10 | ca-certificates \
11 | python3 \
12 | python3-pip && \
13 | rm -rf /var/lib/apt/lists
14 |
15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \
16 | python3 -m pip install --no-cache-dir \
17 | mkl \
18 | tensorflow-cpu
19 |
20 | WORKDIR /workspace
21 | COPY . transformers/
22 | RUN cd transformers/ && \
23 | python3 -m pip install --no-cache-dir .
24 |
25 | CMD ["/bin/bash"]
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/dynamic_bs_example.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | export PYTHONPATH="../":"${PYTHONPATH}"
3 | export WANDB_PROJECT=dmar
4 | export MAX_LEN=128
5 | export m=sshleifer/student_marian_en_ro_6_1
6 | python finetune.py \
7 | --learning_rate=3e-4 \
8 | --do_train \
9 | --fp16 \
10 | --data_dir wmt_en_ro \
11 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \
12 | --freeze_encoder --freeze_embeds \
13 | --train_batch_size=48 --eval_batch_size=64 \
14 | --tokenizer_name $m --model_name_or_path $m --num_train_epochs=1 \
15 | --warmup_steps 500 --logger_name wandb --gpus 1 \
16 | --fp16_opt_level=O1 --task translation \
17 | "$@"
18 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/wav2vec2/finetune_base_timit_asr.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | python run_asr.py \
3 | --output_dir="./wav2vec2-base-timit-asr" \
4 | --num_train_epochs="30" \
5 | --per_device_train_batch_size="20" \
6 | --per_device_eval_batch_size="20" \
7 | --evaluation_strategy="steps" \
8 | --save_steps="500" \
9 | --eval_steps="100" \
10 | --logging_steps="50" \
11 | --learning_rate="5e-4" \
12 | --warmup_steps="3000" \
13 | --model_name_or_path="facebook/wav2vec2-base" \
14 | --fp16 \
15 | --dataset_name="timit_asr" \
16 | --train_split_name="train" \
17 | --validation_split_name="test" \
18 | --orthography="timit" \
19 | --preprocessing_num_workers="$(nproc)" \
20 | --group_by_length \
21 | --freeze_feature_extractor \
22 | --verbose_logging \
23 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/pplm/pplm_classification_head.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | class ClassificationHead(torch.nn.Module):
5 | """Classification Head for transformer encoders"""
6 |
7 | def __init__(self, class_size, embed_size):
8 | super().__init__()
9 | self.class_size = class_size
10 | self.embed_size = embed_size
11 | # self.mlp1 = torch.nn.Linear(embed_size, embed_size)
12 | # self.mlp2 = (torch.nn.Linear(embed_size, class_size))
13 | self.mlp = torch.nn.Linear(embed_size, class_size)
14 |
15 | def forward(self, hidden_state):
16 | # hidden_state = F.relu(self.mlp1(hidden_state))
17 | # hidden_state = self.mlp2(hidden_state)
18 | logits = self.mlp(hidden_state)
19 | return logits
20 |
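21 | # Illustrative usage (a sketch only; the sizes below are arbitrary, not values used by the PPLM scripts):
22 | #   head = ClassificationHead(class_size=5, embed_size=1024)
23 | #   logits = head(torch.randn(1, 1024))  # -> tensor of shape (1, 5)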
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/seq2seq-distillation/sentence_splitter.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from filelock import FileLock
4 |
5 |
6 | try:
7 | import nltk
8 |
9 | NLTK_AVAILABLE = True
10 | except (ImportError, ModuleNotFoundError):
11 | NLTK_AVAILABLE = False
12 |
13 | if NLTK_AVAILABLE:
14 | with FileLock(".lock") as lock:
15 | nltk.download("punkt", quiet=True)
16 |
17 |
18 | def add_newline_to_end_of_each_sentence(x: str) -> str:
19 | """This was added to get rougeLsum scores matching published rougeL scores for BART and PEGASUS."""
20 |     x = re.sub("<n>", "", x)  # remove pegasus newline char
21 | assert NLTK_AVAILABLE, "nltk must be installed to separate newlines between sentences. (pip install nltk)"
22 | return "\n".join(nltk.sent_tokenize(x))
23 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docker/transformers-cpu/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:18.04
2 | LABEL maintainer="Hugging Face"
3 | LABEL repository="transformers"
4 |
5 | RUN apt update && \
6 | apt install -y bash \
7 | build-essential \
8 | git \
9 | curl \
10 | ca-certificates \
11 | python3 \
12 | python3-pip && \
13 | rm -rf /var/lib/apt/lists
14 |
15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \
16 | python3 -m pip install --no-cache-dir \
17 | jupyter \
18 | tensorflow-cpu \
19 | torch
20 |
21 | WORKDIR /workspace
22 | COPY . transformers/
23 | RUN cd transformers/ && \
24 | python3 -m pip install --no-cache-dir .
25 |
26 | CMD ["/bin/bash"]
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/mm-imdb/README.md:
--------------------------------------------------------------------------------
1 | ## MM-IMDb
2 |
3 | Based on the script [`run_mmimdb.py`](https://github.com/huggingface/transformers/blob/master/examples/contrib/mm-imdb/run_mmimdb.py).
4 |
5 | [MM-IMDb](http://lisi1.unal.edu.co/mmimdb/) is a multimodal dataset with around 26,000 movies, including images, plots and other metadata.
6 |
7 | ### Training on MM-IMDb
8 |
9 | ```
10 | python run_mmimdb.py \
11 | --data_dir /path/to/mmimdb/dataset/ \
12 | --model_type bert \
13 | --model_name_or_path bert-base-uncased \
14 | --output_dir /path/to/save/dir/ \
15 | --do_train \
16 | --do_eval \
17 | --max_seq_len 512 \
18 | --gradient_accumulation_steps 20 \
19 | --num_image_embeds 3 \
20 | --num_train_epochs 100 \
21 | --patience 5
22 | ```
23 |
24 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/seq2seq-distillation/distil_marian_no_teacher.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | export PYTHONPATH="../":"${PYTHONPATH}"
3 | export WANDB_PROJECT=dmar
4 | export MAX_LEN=128
5 | python finetune.py \
6 | --learning_rate=3e-4 \
7 | --do_train \
8 | --do_predict \
9 | --fp16 \
10 | --val_check_interval 0.25 \
11 | --data_dir $ENRO_DIR \
12 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \
13 | --freeze_encoder --freeze_embeds \
14 | --train_batch_size=$BS --eval_batch_size=$BS \
15 | --tokenizer_name $m --model_name_or_path $m \
16 | --warmup_steps 500 --sortish_sampler --logger_name wandb \
17 | --gpus 1 --fp16_opt_level=O1 --task translation --num_sanity_val_steps=0 \
18 | "$@"
19 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docker/transformers-tensorflow-gpu/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
2 | LABEL maintainer="Hugging Face"
3 | LABEL repository="transformers"
4 |
5 | RUN apt update && \
6 | apt install -y bash \
7 | build-essential \
8 | git \
9 | curl \
10 | ca-certificates \
11 | python3 \
12 | python3-pip && \
13 | rm -rf /var/lib/apt/lists
14 |
15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \
16 | python3 -m pip install --no-cache-dir \
17 | mkl \
18 | tensorflow
19 |
20 | WORKDIR /workspace
21 | COPY . transformers/
22 | RUN cd transformers/ && \
23 | python3 -m pip install --no-cache-dir .
24 |
25 | CMD ["/bin/bash"]
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/pplm/pplm_classification_head.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | class ClassificationHead(torch.nn.Module):
5 | """Classification Head for transformer encoders"""
6 |
7 | def __init__(self, class_size, embed_size):
8 | super().__init__()
9 | self.class_size = class_size
10 | self.embed_size = embed_size
11 | # self.mlp1 = torch.nn.Linear(embed_size, embed_size)
12 | # self.mlp2 = (torch.nn.Linear(embed_size, class_size))
13 | self.mlp = torch.nn.Linear(embed_size, class_size)
14 |
15 | def forward(self, hidden_state):
16 | # hidden_state = F.relu(self.mlp1(hidden_state))
17 | # hidden_state = self.mlp2(hidden_state)
18 | logits = self.mlp(hidden_state)
19 | return logits
20 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/sentence_splitter.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from filelock import FileLock
4 |
5 |
6 | try:
7 | import nltk
8 |
9 | NLTK_AVAILABLE = True
10 | except (ImportError, ModuleNotFoundError):
11 | NLTK_AVAILABLE = False
12 |
13 | if NLTK_AVAILABLE:
14 | with FileLock(".lock") as lock:
15 | nltk.download("punkt", quiet=True)
16 |
17 |
18 | def add_newline_to_end_of_each_sentence(x: str) -> str:
19 | """This was added to get rougeLsum scores matching published rougeL scores for BART and PEGASUS."""
20 |     x = re.sub("<n>", "", x)  # remove pegasus newline char
21 | assert NLTK_AVAILABLE, "nltk must be installed to separate newlines between sentences. (pip install nltk)"
22 | return "\n".join(nltk.sent_tokenize(x))
23 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/seq2seq-distillation/train_distilbart_cnn.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | export PYTHONPATH="../":"${PYTHONPATH}"
3 |
4 | export BS=32
5 | export GAS=1
6 |
7 | python finetune.py \
8 | --learning_rate=3e-5 \
9 | --fp16 \
10 | --gpus 1 \
11 | --do_train \
12 | --do_predict \
13 | --val_check_interval 0.25 \
14 | --n_val 500 \
15 | --num_train_epochs 2 \
16 | --freeze_encoder --freeze_embeds --data_dir cnn_dm \
17 | --max_target_length 142 --val_max_target_length=142 \
18 | --train_batch_size=$BS --eval_batch_size=$BS --gradient_accumulation_steps=$GAS \
19 | --model_name_or_path sshleifer/student_cnn_12_6 \
20 | --tokenizer_name facebook/bart-large \
21 | --warmup_steps 500 \
22 | --output_dir distilbart-cnn-12-6 \
23 | "$@"
24 |
25 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/wav2vec2/finetune_large_lv60_timit_asr.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | python run_asr.py \
3 | --output_dir="./wav2vec2-large-lv60-timit-asr" \
4 | --num_train_epochs="30" \
5 | --per_device_train_batch_size="2" \
6 | --per_device_eval_batch_size="2" \
7 | --gradient_accumulation_steps="4" \
8 | --evaluation_strategy="steps" \
9 | --save_steps="500" \
10 | --eval_steps="100" \
11 | --logging_steps="50" \
12 | --learning_rate="5e-4" \
13 | --warmup_steps="3000" \
14 | --model_name_or_path="facebook/wav2vec2-large-lv60" \
15 | --fp16 \
16 | --dataset_name="timit_asr" \
17 | --train_split_name="train" \
18 | --validation_split_name="test" \
19 | --orthography="timit" \
20 | --preprocessing_num_workers="$(nproc)" \
21 | --group_by_length \
22 | --freeze_feature_extractor \
23 | --verbose_logging \
24 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/mm-imdb/README.md:
--------------------------------------------------------------------------------
1 | ## MM-IMDb
2 |
3 | Based on the script [`run_mmimdb.py`](https://github.com/huggingface/transformers/blob/master/examples/contrib/mm-imdb/run_mmimdb.py).
4 |
5 | [MM-IMDb](http://lisi1.unal.edu.co/mmimdb/) is a multimodal dataset with around 26,000 movies, including images, plots and other metadata.
6 |
7 | ### Training on MM-IMDb
8 |
9 | ```
10 | python run_mmimdb.py \
11 | --data_dir /path/to/mmimdb/dataset/ \
12 | --model_type bert \
13 | --model_name_or_path bert-base-uncased \
14 | --output_dir /path/to/save/dir/ \
15 | --do_train \
16 | --do_eval \
17 | --max_seq_len 512 \
18 | --gradient_accumulation_steps 20 \
19 | --num_image_embeds 3 \
20 | --num_train_epochs 100 \
21 | --patience 5
22 | ```
23 |
24 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/distil_marian_no_teacher.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | export PYTHONPATH="../":"${PYTHONPATH}"
3 | export WANDB_PROJECT=dmar
4 | export MAX_LEN=128
5 | python finetune.py \
6 | --learning_rate=3e-4 \
7 | --do_train \
8 | --do_predict \
9 | --fp16 \
10 | --val_check_interval 0.25 \
11 | --data_dir $ENRO_DIR \
12 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \
13 | --freeze_encoder --freeze_embeds \
14 | --train_batch_size=$BS --eval_batch_size=$BS \
15 | --tokenizer_name $m --model_name_or_path $m \
16 | --warmup_steps 500 --sortish_sampler --logger_name wandb \
17 | --gpus 1 --fp16_opt_level=O1 --task translation --num_sanity_val_steps=0 \
18 | "$@"
19 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/wav2vec2/finetune_wav2vec2_xlsr_turkish.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | python run_common_voice.py \
3 | --model_name_or_path="facebook/wav2vec2-large-xlsr-53" \
4 | --dataset_config_name="tr" \
5 | --output_dir=./wav2vec2-large-xlsr-turkish-demo \
6 | --overwrite_output_dir \
7 | --num_train_epochs="5" \
8 | --per_device_train_batch_size="16" \
9 | --evaluation_strategy="steps" \
10 | --learning_rate="3e-4" \
11 | --warmup_steps="500" \
12 | --fp16 \
13 | --freeze_feature_extractor \
14 | --save_steps="400" \
15 | --eval_steps="400" \
16 | --save_total_limit="3" \
17 | --logging_steps="400" \
18 | --group_by_length \
19 | --feat_proj_dropout="0.0" \
20 | --layerdrop="0.1" \
21 | --gradient_checkpointing \
22 | --do_train --do_eval
23 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/train_distilbart_cnn.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | export PYTHONPATH="../":"${PYTHONPATH}"
3 |
4 | export BS=32
5 | export GAS=1
6 |
7 | python finetune.py \
8 | --learning_rate=3e-5 \
9 | --fp16 \
10 | --gpus 1 \
11 | --do_train \
12 | --do_predict \
13 | --val_check_interval 0.25 \
14 | --n_val 500 \
15 | --num_train_epochs 2 \
16 | --freeze_encoder --freeze_embeds --data_dir cnn_dm \
17 | --max_target_length 142 --val_max_target_length=142 \
18 | --train_batch_size=$BS --eval_batch_size=$BS --gradient_accumulation_steps=$GAS \
19 | --model_name_or_path sshleifer/student_cnn_12_6 \
20 | --tokenizer_name facebook/bart-large \
21 | --warmup_steps 500 \
22 | --output_dir distilbart-cnn-12-6 \
23 | "$@"
24 |
25 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/wav2vec2/finetune_large_lv60_timit_asr.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | python run_asr.py \
3 | --output_dir="./wav2vec2-large-lv60-timit-asr" \
4 | --num_train_epochs="30" \
5 | --per_device_train_batch_size="2" \
6 | --per_device_eval_batch_size="2" \
7 | --gradient_accumulation_steps="4" \
8 | --evaluation_strategy="steps" \
9 | --save_steps="500" \
10 | --eval_steps="100" \
11 | --logging_steps="50" \
12 | --learning_rate="5e-4" \
13 | --warmup_steps="3000" \
14 | --model_name_or_path="facebook/wav2vec2-large-lv60" \
15 | --fp16 \
16 | --dataset_name="timit_asr" \
17 | --train_split_name="train" \
18 | --validation_split_name="test" \
19 | --orthography="timit" \
20 | --preprocessing_num_workers="$(nproc)" \
21 | --group_by_length \
22 | --freeze_feature_extractor \
23 | --verbose_logging \
24 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/wav2vec2/finetune_wav2vec2_xlsr_turkish.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | python run_common_voice.py \
3 | --model_name_or_path="facebook/wav2vec2-large-xlsr-53" \
4 | --dataset_config_name="tr" \
5 | --output_dir=./wav2vec2-large-xlsr-turkish-demo \
6 | --overwrite_output_dir \
7 | --num_train_epochs="5" \
8 | --per_device_train_batch_size="16" \
9 | --evaluation_strategy="steps" \
10 | --learning_rate="3e-4" \
11 | --warmup_steps="500" \
12 | --fp16 \
13 | --freeze_feature_extractor \
14 | --save_steps="400" \
15 | --eval_steps="400" \
16 | --save_total_limit="3" \
17 | --logging_steps="400" \
18 | --group_by_length \
19 | --feat_proj_dropout="0.0" \
20 | --layerdrop="0.1" \
21 | --gradient_checkpointing \
22 | --do_train --do_eval
23 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/token-classification/run.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | python3 run_ner.py \
16 | --model_name_or_path bert-base-uncased \
17 | --dataset_name conll2003 \
18 | --output_dir /tmp/test-ner \
19 | --do_train \
20 | --do_eval
21 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/token-classification/run.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | python3 run_ner.py \
16 | --model_name_or_path bert-base-uncased \
17 | --dataset_name conll2003 \
18 | --output_dir /tmp/test-ner \
19 | --do_train \
20 | --do_eval
21 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/.github/ISSUE_TEMPLATE/question-help.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: "❓ Questions & Help"
3 | about: Post your general questions on the Hugging Face forum: https://discuss.huggingface.co/
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | # ❓ Questions & Help
11 |
12 |
16 |
17 | ## Details
18 |
19 |
20 |
21 |
23 |
24 | **A link to original question on the forum**:
25 |
26 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/seq2seq-distillation/distil_marian_enro_teacher.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | export PYTHONPATH="../":"${PYTHONPATH}"
3 | export WANDB_PROJECT=dmar
4 | # export MAX_LEN=128
5 | python distillation.py \
6 | --learning_rate=3e-4 \
7 | --do_train \
8 | --fp16 \
9 | --val_check_interval 0.25 \
10 | --teacher Helsinki-NLP/opus-mt-en-ro \
11 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \
12 | --student_decoder_layers 3 --student_encoder_layers 6 \
13 | --freeze_encoder --freeze_embeds \
14 | --model_name_or_path IGNORED \
15 | --alpha_hid=3. \
16 | --train_batch_size=$BS --eval_batch_size=$BS \
17 | --tokenizer_name Helsinki-NLP/opus-mt-en-ro \
18 | --warmup_steps 500 --logger_name wandb \
19 | --fp16_opt_level O1 --task translation --normalize_hidden --num_sanity_val_steps=0 \
20 | "$@"
21 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/.github/ISSUE_TEMPLATE/feature-request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: "\U0001F680 Feature request"
3 | about: Submit a proposal/request for a new transformers feature
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | # 🚀 Feature request
11 |
12 |
14 |
15 | ## Motivation
16 |
17 |
20 |
21 | ## Your contribution
22 |
23 |
26 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/seq2seq-distillation/train_distilbart_xsum.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | export PYTHONPATH="../":"${PYTHONPATH}"
3 | python distillation.py \
4 | --teacher facebook/bart-large-xsum --data_dir xsum \
5 | --tokenizer_name facebook/bart-large-xsum \
6 | --student_decoder_layers 6 --student_encoder_layers 12 \
7 | --freeze_encoder --freeze_embeds \
8 | --learning_rate=3e-4 \
9 | --do_train \
10 | --do_predict \
11 | --fp16 --fp16_opt_level=O1 \
12 | --val_check_interval 0.1 --n_val 1000 --eval_beams 2 --length_penalty=0.5 \
13 | --max_target_length=60 --val_max_target_length=60 --test_max_target_length=100 \
14 | --model_name_or_path IGNORED \
15 | --alpha_hid=3. \
16 | --train_batch_size=16 --eval_batch_size=16 --gradient_accumulation_steps=2 \
17 | --sortish_sampler \
18 | --num_train_epochs=6 \
19 | --warmup_steps 500 \
20 | --output_dir distilbart_xsum_12_6 \
21 | "$@"
22 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/distil_marian_enro_teacher.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | export PYTHONPATH="../":"${PYTHONPATH}"
3 | export WANDB_PROJECT=dmar
4 | # export MAX_LEN=128
5 | python distillation.py \
6 | --learning_rate=3e-4 \
7 | --do_train \
8 | --fp16 \
9 | --val_check_interval 0.25 \
10 | --teacher Helsinki-NLP/opus-mt-en-ro \
11 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \
12 | --student_decoder_layers 3 --student_encoder_layers 6 \
13 | --freeze_encoder --freeze_embeds \
14 | --model_name_or_path IGNORED \
15 | --alpha_hid=3. \
16 | --train_batch_size=$BS --eval_batch_size=$BS \
17 | --tokenizer_name Helsinki-NLP/opus-mt-en-ro \
18 | --warmup_steps 500 --logger_name wandb \
19 | --fp16_opt_level O1 --task translation --normalize_hidden --num_sanity_val_steps=0 \
20 | "$@"
21 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/wav2vec2/finetune_large_xlsr_53_arabic_speech_corpus.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | python run_asr.py \
3 | --output_dir="./wav2vec2-large-xlsr-53-arabic-speech-corpus" \
4 | --num_train_epochs="50" \
5 | --per_device_train_batch_size="1" \
6 | --per_device_eval_batch_size="1" \
7 | --gradient_accumulation_steps="8" \
8 | --evaluation_strategy="steps" \
9 | --save_steps="500" \
10 | --eval_steps="100" \
11 | --logging_steps="50" \
12 | --learning_rate="5e-4" \
13 | --warmup_steps="3000" \
14 | --model_name_or_path="elgeish/wav2vec2-large-xlsr-53-arabic" \
15 | --fp16 \
16 | --dataset_name="arabic_speech_corpus" \
17 | --train_split_name="train" \
18 | --validation_split_name="test" \
19 | --max_duration_in_seconds="15" \
20 | --orthography="buckwalter" \
21 | --preprocessing_num_workers="$(nproc)" \
22 | --group_by_length \
23 | --freeze_feature_extractor \
24 | --target_feature_extractor_sampling_rate \
25 | --verbose_logging \
26 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/train_distilbart_xsum.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | export PYTHONPATH="../":"${PYTHONPATH}"
3 | python distillation.py \
4 | --teacher facebook/bart-large-xsum --data_dir xsum \
5 | --tokenizer_name facebook/bart-large-xsum \
6 | --student_decoder_layers 6 --student_encoder_layers 12 \
7 | --freeze_encoder --freeze_embeds \
8 | --learning_rate=3e-4 \
9 | --do_train \
10 | --do_predict \
11 | --fp16 --fp16_opt_level=O1 \
12 | --val_check_interval 0.1 --n_val 1000 --eval_beams 2 --length_penalty=0.5 \
13 | --max_target_length=60 --val_max_target_length=60 --test_max_target_length=100 \
14 | --model_name_or_path IGNORED \
15 | --alpha_hid=3. \
16 | --train_batch_size=16 --eval_batch_size=16 --gradient_accumulation_steps=2 \
17 | --sortish_sampler \
18 | --num_train_epochs=6 \
19 | --warmup_steps 500 \
20 | --output_dir distilbart_xsum_12_6 \
21 | "$@"
22 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/deebert/eval_deebert.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | export CUDA_VISIBLE_DEVICES=0
3 |
4 | PATH_TO_DATA=/h/xinji/projects/GLUE
5 |
6 | MODEL_TYPE=bert # bert or roberta
7 | MODEL_SIZE=base # base or large
8 | DATASET=MRPC # SST-2, MRPC, RTE, QNLI, QQP, or MNLI
9 |
10 | MODEL_NAME=${MODEL_TYPE}-${MODEL_SIZE}
11 | if [ $MODEL_TYPE = 'bert' ]
12 | then
13 | MODEL_NAME=${MODEL_NAME}-uncased
14 | fi
15 |
16 |
17 | python -u run_glue_deebert.py \
18 | --model_type $MODEL_TYPE \
19 | --model_name_or_path ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \
20 | --task_name $DATASET \
21 | --do_eval \
22 | --do_lower_case \
23 | --data_dir $PATH_TO_DATA/$DATASET \
24 | --output_dir ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \
25 | --plot_data_dir ./results/ \
26 | --max_seq_length 128 \
27 | --eval_each_highway \
28 | --eval_highway \
29 | --overwrite_cache \
30 | --per_gpu_eval_batch_size=1
31 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/legacy/README.md:
--------------------------------------------------------------------------------
1 |
16 |
17 | # Legacy examples
18 |
19 | This folder contains examples which are not actively maintained (mostly contributed by the community).
20 |
21 | Using these examples together with a recent version of the library usually requires making small (sometimes big) adaptations to get the scripts working.
22 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/wav2vec2/finetune_large_xlsr_53_arabic_speech_corpus.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | python run_asr.py \
3 | --output_dir="./wav2vec2-large-xlsr-53-arabic-speech-corpus" \
4 | --num_train_epochs="50" \
5 | --per_device_train_batch_size="1" \
6 | --per_device_eval_batch_size="1" \
7 | --gradient_accumulation_steps="8" \
8 | --evaluation_strategy="steps" \
9 | --save_steps="500" \
10 | --eval_steps="100" \
11 | --logging_steps="50" \
12 | --learning_rate="5e-4" \
13 | --warmup_steps="3000" \
14 | --model_name_or_path="elgeish/wav2vec2-large-xlsr-53-arabic" \
15 | --fp16 \
16 | --dataset_name="arabic_speech_corpus" \
17 | --train_split_name="train" \
18 | --validation_split_name="test" \
19 | --max_duration_in_seconds="15" \
20 | --orthography="buckwalter" \
21 | --preprocessing_num_workers="$(nproc)" \
22 | --group_by_length \
23 | --freeze_feature_extractor \
24 | --target_feature_extractor_sampling_rate \
25 | --verbose_logging \
26 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/deebert/eval_deebert.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | export CUDA_VISIBLE_DEVICES=0
3 |
4 | PATH_TO_DATA=/h/xinji/projects/GLUE
5 |
6 | MODEL_TYPE=bert # bert or roberta
7 | MODEL_SIZE=base # base or large
8 | DATASET=MRPC # SST-2, MRPC, RTE, QNLI, QQP, or MNLI
9 |
10 | MODEL_NAME=${MODEL_TYPE}-${MODEL_SIZE}
11 | if [ $MODEL_TYPE = 'bert' ]
12 | then
13 | MODEL_NAME=${MODEL_NAME}-uncased
14 | fi
15 |
16 |
17 | python -u run_glue_deebert.py \
18 | --model_type $MODEL_TYPE \
19 | --model_name_or_path ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \
20 | --task_name $DATASET \
21 | --do_eval \
22 | --do_lower_case \
23 | --data_dir $PATH_TO_DATA/$DATASET \
24 | --output_dir ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \
25 | --plot_data_dir ./results/ \
26 | --max_seq_length 128 \
27 | --eval_each_highway \
28 | --eval_highway \
29 | --overwrite_cache \
30 | --per_gpu_eval_batch_size=1
31 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docker/transformers-pytorch-tpu/dataset.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: PersistentVolume
3 | metadata:
4 | name: huggingface-cluster-disk
5 | spec:
6 | storageClassName: ""
7 | capacity:
8 | storage: 500Gi
9 | accessModes:
10 | - ReadOnlyMany
11 | claimRef:
12 | namespace: default
13 | name: huggingface-cluster-disk-claim
14 | gcePersistentDisk:
15 | pdName: huggingface-cluster-disk
16 | fsType: ext4
17 | readOnly: true
18 | ---
19 | apiVersion: v1
20 | kind: PersistentVolumeClaim
21 | metadata:
22 | name: huggingface-cluster-disk-claim
23 | spec:
24 | # Specify "" as the storageClassName so it matches the PersistentVolume's StorageClass.
25 | # A nil storageClassName value uses the default StorageClass. For details, see
26 | # https://kubernetes.io/docs/concepts/storage/persistent-volumes/#class-1
27 | storageClassName: ""
28 | accessModes:
29 | - ReadOnlyMany
30 | resources:
31 | requests:
32 | storage: 1Ki
33 |
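The PersistentVolume above is backed by a pre-existing GCE persistent disk named `huggingface-cluster-disk`, and the PersistentVolumeClaim below it is pinned to that volume via `claimRef`. A minimal sketch of creating both objects and checking the binding (cluster access and the disk itself are assumed to already exist):

```
# Create the PV and PVC, then confirm the claim reports a Bound status.
kubectl apply -f docker/transformers-pytorch-tpu/dataset.yaml
kubectl get pvc huggingface-cluster-disk-claim
```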
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/legacy/README.md:
--------------------------------------------------------------------------------
1 |
16 |
17 | # Legacy examples
18 |
19 | This folder contains examples which are not actively maintained (mostly contributed by the community).
20 |
21 | Using these examples with a recent version of the library usually requires small (and sometimes substantial) adaptations to get the scripts working.
22 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docker/transformers-pytorch-gpu/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04
2 | LABEL maintainer="Hugging Face"
3 | LABEL repository="transformers"
4 |
5 | RUN apt update && \
6 | apt install -y bash \
7 | build-essential \
8 | git \
9 | curl \
10 | ca-certificates \
11 | python3 \
12 | python3-pip && \
13 | rm -rf /var/lib/apt/lists
14 |
15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \
16 | python3 -m pip install --no-cache-dir \
17 | mkl \
18 | torch
19 |
20 | RUN git clone https://github.com/NVIDIA/apex
21 | RUN cd apex && \
22 | python3 setup.py install && \
23 | pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
24 |
25 | WORKDIR /workspace
26 | COPY . transformers/
27 | RUN cd transformers/ && \
28 | python3 -m pip install --no-cache-dir .
29 |
30 | CMD ["/bin/bash"]
31 |
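A sketch of building and running this image from the repository root (the `COPY . transformers/` step needs the repo root as build context; the image tag is arbitrary, and GPU passthrough via `--gpus all` assumes the NVIDIA container toolkit is installed on the host):

```
# Build the CUDA-enabled image and open an interactive shell inside it.
docker build -t transformers-pytorch-gpu -f docker/transformers-pytorch-gpu/Dockerfile .
docker run --gpus all -it transformers-pytorch-gpu
```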
--------------------------------------------------------------------------------
/finetune_gpt2/src/transformers/sagemaker/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this
3 | # module, but to preserve other warnings. So, don't check this module at all.
4 |
5 | # Copyright 2021 The HuggingFace Team. All rights reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | from .trainer_sm import SageMakerTrainer
20 | from .training_args_sm import SageMakerTrainingArguments, is_sagemaker_distributed_available
21 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docker/transformers-gpu/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04
2 | LABEL maintainer="Hugging Face"
3 | LABEL repository="transformers"
4 |
5 | RUN apt update && \
6 | apt install -y bash \
7 | build-essential \
8 | git \
9 | curl \
10 | ca-certificates \
11 | python3 \
12 | python3-pip && \
13 | rm -rf /var/lib/apt/lists
14 |
15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \
16 | python3 -m pip install --no-cache-dir \
17 | jupyter \
18 | tensorflow \
19 | torch
20 |
21 | RUN git clone https://github.com/NVIDIA/apex
22 | RUN cd apex && \
23 | python3 setup.py install && \
24 | pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
25 |
26 | WORKDIR /workspace
27 | COPY . transformers/
28 | RUN cd transformers/ && \
29 | python3 -m pip install --no-cache-dir .
30 |
31 | CMD ["/bin/bash"]
32 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/src/transformers/sagemaker/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this
3 | # module, but to preserve other warnings. So, don't check this module at all.
4 |
5 | # Copyright 2021 The HuggingFace Team. All rights reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | from .trainer_sm import SageMakerTrainer
20 | from .training_args_sm import SageMakerTrainingArguments, is_sagemaker_distributed_available
21 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/seq2seq-distillation/finetune_bart_tiny.sh:
--------------------------------------------------------------------------------
1 | # Script for verifying that run_bart_sum can be invoked from its directory
2 |
3 | # Get tiny dataset with cnn_dm format (4 examples for train, val, test)
4 | wget https://cdn-datasets.huggingface.co/summarization/cnn_tiny.tgz
5 | tar -xzvf cnn_tiny.tgz
6 | rm cnn_tiny.tgz
7 |
8 | export OUTPUT_DIR_NAME=bart_utest_output
9 | export CURRENT_DIR=${PWD}
10 | export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME}
11 |
12 | # Make output directory if it doesn't exist
13 | mkdir -p $OUTPUT_DIR
14 |
15 | # Add parent directory to python path to access lightning_base.py and testing_utils.py
16 | export PYTHONPATH="../":"${PYTHONPATH}"
17 | python finetune.py \
18 | --data_dir=cnn_tiny/ \
19 | --model_name_or_path=sshleifer/bart-tiny-random \
20 | --learning_rate=3e-5 \
21 | --train_batch_size=2 \
22 | --eval_batch_size=2 \
23 | --output_dir=$OUTPUT_DIR \
24 | --num_train_epochs=1 \
25 | --gpus=0 \
26 | --do_train "$@"
27 |
28 | rm -rf cnn_tiny
29 | rm -rf $OUTPUT_DIR
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/wav2vec2/vocab/buckwalter.json:
--------------------------------------------------------------------------------
1 | {
2 | "<pad>": 0,
3 | "<s>": 1,
4 | "</s>": 2,
5 | "<unk>": 3,
6 | "/": 4,
7 | "'": 5,
8 | "|": 6,
9 | ">": 7,
10 | "&": 8,
11 | "<": 9,
12 | "}": 10,
13 | "A": 11,
14 | "b": 12,
15 | "p": 13,
16 | "t": 14,
17 | "v": 15,
18 | "j": 16,
19 | "H": 17,
20 | "x": 18,
21 | "d": 19,
22 | "*": 20,
23 | "r": 21,
24 | "z": 22,
25 | "s": 23,
26 | "$": 24,
27 | "S": 25,
28 | "D": 26,
29 | "T": 27,
30 | "Z": 28,
31 | "E": 29,
32 | "g": 30,
33 | "_": 31,
34 | "f": 32,
35 | "q": 33,
36 | "k": 34,
37 | "l": 35,
38 | "m": 36,
39 | "n": 37,
40 | "h": 38,
41 | "w": 39,
42 | "Y": 40,
43 | "y": 41,
44 | "F": 42,
45 | "N": 43,
46 | "K": 44,
47 | "a": 45,
48 | "u": 46,
49 | "i": 47,
50 | "~": 48,
51 | "o": 49,
52 | "`": 50,
53 | "{": 51,
54 | "P": 52,
55 | "J": 53,
56 | "V": 54,
57 | "G": 55
58 | }
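This vocabulary maps Buckwalter transliteration symbols to CTC label ids for the Arabic fine-tuning script above (`--orthography="buckwalter"`). A small sketch of loading it and inverting the mapping (the relative path assumes the wav2vec2 example directory as the working directory):

```
import json

# Load the Buckwalter symbol -> id table and build the inverse id -> symbol table.
with open("vocab/buckwalter.json", encoding="utf-8") as f:
    symbol_to_id = json.load(f)

id_to_symbol = {i: s for s, i in symbol_to_id.items()}
print(id_to_symbol[11])  # 'A' in Buckwalter notation
```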
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/setup.cfg:
--------------------------------------------------------------------------------
1 | [isort]
2 | default_section = FIRSTPARTY
3 | ensure_newline_before_comments = True
4 | force_grid_wrap = 0
5 | include_trailing_comma = True
6 | known_first_party = transformers
7 | known_third_party =
8 | absl
9 | conllu
10 | datasets
11 | elasticsearch
12 | fairseq
13 | faiss-cpu
14 | fastprogress
15 | fire
16 | fugashi
17 | git
18 | h5py
19 | matplotlib
20 | nltk
21 | numpy
22 | packaging
23 | pandas
24 | PIL
25 | psutil
26 | pytest
27 | pytorch_lightning
28 | rouge_score
29 | sacrebleu
30 | seqeval
31 | sklearn
32 | streamlit
33 | tensorboardX
34 | tensorflow
35 | tensorflow_datasets
36 | timeout_decorator
37 | torch
38 | torchaudio
39 | torchtext
40 | torchvision
41 | torch_xla
42 | tqdm
43 |
44 | line_length = 119
45 | lines_after_imports = 2
46 | multi_line_output = 3
47 | use_parentheses = True
48 |
49 | [flake8]
50 | ignore = E203, E501, E741, W503, W605
51 | max-line-length = 119
52 |
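Both tools read this file automatically when invoked from the repository root, so no extra configuration flags are needed; a sketch of typical check commands (the target directories are illustrative):

```
# isort and flake8 pick up the [isort] and [flake8] sections from setup.cfg.
isort --check-only src tests examples
flake8 src tests examples
```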
--------------------------------------------------------------------------------
/finetune_gpt2/src/transformers/commands/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from abc import ABC, abstractmethod
16 | from argparse import ArgumentParser
17 |
18 |
19 | class BaseTransformersCLICommand(ABC):
20 | @staticmethod
21 | @abstractmethod
22 | def register_subcommand(parser: ArgumentParser):
23 | raise NotImplementedError()
24 |
25 | @abstractmethod
26 | def run(self):
27 | raise NotImplementedError()
28 |
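Concrete commands subclass this ABC, declare their flags in `register_subcommand`, and do the actual work in `run`. A hypothetical, self-contained sketch (not part of the library) showing the pattern:

```
from argparse import ArgumentParser

from transformers.commands import BaseTransformersCLICommand


class HelloCommand(BaseTransformersCLICommand):
    """Toy command used only to illustrate the ABC; not a real CLI command."""

    def __init__(self, name: str):
        self.name = name

    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        # Declare the flags this command understands on the parser it is handed.
        parser.add_argument("--name", type=str, default="world")

    def run(self):
        print(f"Hello, {self.name}!")


if __name__ == "__main__":
    parser = ArgumentParser("hello")
    HelloCommand.register_subcommand(parser)
    args = parser.parse_args()
    HelloCommand(args.name).run()
```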
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/wav2vec2/vocab/buckwalter.json:
--------------------------------------------------------------------------------
1 | {
2 | "<pad>": 0,
3 | "<s>": 1,
4 | "</s>": 2,
5 | "<unk>": 3,
6 | "/": 4,
7 | "'": 5,
8 | "|": 6,
9 | ">": 7,
10 | "&": 8,
11 | "<": 9,
12 | "}": 10,
13 | "A": 11,
14 | "b": 12,
15 | "p": 13,
16 | "t": 14,
17 | "v": 15,
18 | "j": 16,
19 | "H": 17,
20 | "x": 18,
21 | "d": 19,
22 | "*": 20,
23 | "r": 21,
24 | "z": 22,
25 | "s": 23,
26 | "$": 24,
27 | "S": 25,
28 | "D": 26,
29 | "T": 27,
30 | "Z": 28,
31 | "E": 29,
32 | "g": 30,
33 | "_": 31,
34 | "f": 32,
35 | "q": 33,
36 | "k": 34,
37 | "l": 35,
38 | "m": 36,
39 | "n": 37,
40 | "h": 38,
41 | "w": 39,
42 | "Y": 40,
43 | "y": 41,
44 | "F": 42,
45 | "N": 43,
46 | "K": 44,
47 | "a": 45,
48 | "u": 46,
49 | "i": 47,
50 | "~": 48,
51 | "o": 49,
52 | "`": 50,
53 | "{": 51,
54 | "P": 52,
55 | "J": 53,
56 | "V": 54,
57 | "G": 55
58 | }
--------------------------------------------------------------------------------
/finetune_gpt2/examples/legacy/seq2seq/test_data/fsmt/build-eval-data.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import io
4 | import json
5 | import subprocess
6 |
7 |
8 | pairs = [
9 | ["en", "ru"],
10 | ["ru", "en"],
11 | ["en", "de"],
12 | ["de", "en"],
13 | ]
14 |
15 | n_objs = 8
16 |
17 |
18 | def get_all_data(pairs, n_objs):
19 | text = {}
20 | for src, tgt in pairs:
21 | pair = f"{src}-{tgt}"
22 | cmd = f"sacrebleu -t wmt19 -l {pair} --echo src".split()
23 | src_lines = subprocess.run(cmd, stdout=subprocess.PIPE).stdout.decode("utf-8").splitlines()
24 | cmd = f"sacrebleu -t wmt19 -l {pair} --echo ref".split()
25 | tgt_lines = subprocess.run(cmd, stdout=subprocess.PIPE).stdout.decode("utf-8").splitlines()
26 | text[pair] = {"src": src_lines[:n_objs], "tgt": tgt_lines[:n_objs]}
27 | return text
28 |
29 |
30 | text = get_all_data(pairs, n_objs)
31 | filename = "./fsmt_val_data.json"
32 | with io.open(filename, "w", encoding="utf-8") as f:
33 | bleu_data = json.dump(text, f, indent=2, ensure_ascii=False)
34 |
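The script shells out to the sacrebleu CLI to fetch the WMT19 source and reference sides for each pair; the same data can be inspected by hand, e.g. for en-ru (`head -8` mirrors `n_objs` above):

```
# Print the first 8 source and reference lines used for the en-ru eval data.
sacrebleu -t wmt19 -l en-ru --echo src | head -8
sacrebleu -t wmt19 -l en-ru --echo ref | head -8
```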
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/seq2seq-distillation/finetune_bart_tiny.sh:
--------------------------------------------------------------------------------
1 | # Script for verifying that run_bart_sum can be invoked from its directory
2 |
3 | # Get tiny dataset with cnn_dm format (4 examples for train, val, test)
4 | wget https://cdn-datasets.huggingface.co/summarization/cnn_tiny.tgz
5 | tar -xzvf cnn_tiny.tgz
6 | rm cnn_tiny.tgz
7 |
8 | export OUTPUT_DIR_NAME=bart_utest_output
9 | export CURRENT_DIR=${PWD}
10 | export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME}
11 |
12 | # Make output directory if it doesn't exist
13 | mkdir -p $OUTPUT_DIR
14 |
15 | # Add parent directory to python path to access lightning_base.py and testing_utils.py
16 | export PYTHONPATH="../":"${PYTHONPATH}"
17 | python finetune.py \
18 | --data_dir=cnn_tiny/ \
19 | --model_name_or_path=sshleifer/bart-tiny-random \
20 | --learning_rate=3e-5 \
21 | --train_batch_size=2 \
22 | --eval_batch_size=2 \
23 | --output_dir=$OUTPUT_DIR \
24 | --num_train_epochs=1 \
25 | --gpus=0 \
26 | --do_train "$@"
27 |
28 | rm -rf cnn_tiny
29 | rm -rf $OUTPUT_DIR
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/src/transformers/commands/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from abc import ABC, abstractmethod
16 | from argparse import ArgumentParser
17 |
18 |
19 | class BaseTransformersCLICommand(ABC):
20 | @staticmethod
21 | @abstractmethod
22 | def register_subcommand(parser: ArgumentParser):
23 | raise NotImplementedError()
24 |
25 | @abstractmethod
26 | def run(self):
27 | raise NotImplementedError()
28 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/legacy/pytorch-lightning/run_glue.sh:
--------------------------------------------------------------------------------
1 | # Install example requirements
2 | pip install -r ../requirements.txt
3 |
4 | # Download glue data
5 | python3 ../../utils/download_glue_data.py
6 |
7 | export TASK=mrpc
8 | export DATA_DIR=./glue_data/MRPC/
9 | export MAX_LENGTH=128
10 | export LEARNING_RATE=2e-5
11 | export BERT_MODEL=bert-base-cased
12 | export BATCH_SIZE=32
13 | export NUM_EPOCHS=3
14 | export SEED=2
15 | export OUTPUT_DIR_NAME=mrpc-pl-bert
16 | export CURRENT_DIR=${PWD}
17 | export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME}
18 |
19 | # Make output directory if it doesn't exist
20 | mkdir -p $OUTPUT_DIR
21 | # Add parent directory to python path to access lightning_base.py
22 | export PYTHONPATH="../":"${PYTHONPATH}"
23 |
24 | python3 run_glue.py --gpus 1 --data_dir $DATA_DIR \
25 | --task $TASK \
26 | --model_name_or_path $BERT_MODEL \
27 | --output_dir $OUTPUT_DIR \
28 | --max_seq_length $MAX_LENGTH \
29 | --learning_rate $LEARNING_RATE \
30 | --num_train_epochs $NUM_EPOCHS \
31 | --train_batch_size $BATCH_SIZE \
32 | --seed $SEED \
33 | --do_train \
34 | --do_predict
35 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/deebert/entropy_eval.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | export CUDA_VISIBLE_DEVICES=0
3 |
4 | PATH_TO_DATA=/h/xinji/projects/GLUE
5 |
6 | MODEL_TYPE=bert # bert or roberta
7 | MODEL_SIZE=base # base or large
8 | DATASET=MRPC # SST-2, MRPC, RTE, QNLI, QQP, or MNLI
9 |
10 | MODEL_NAME=${MODEL_TYPE}-${MODEL_SIZE}
11 | if [ $MODEL_TYPE = 'bert' ]
12 | then
13 | MODEL_NAME=${MODEL_NAME}-uncased
14 | fi
15 |
16 | ENTROPIES="0 0.1 0.2 0.3 0.4 0.5 0.6 0.7"
17 |
18 | for ENTROPY in $ENTROPIES; do
19 | python -u run_glue_deebert.py \
20 | --model_type $MODEL_TYPE \
21 | --model_name_or_path ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \
22 | --task_name $DATASET \
23 | --do_eval \
24 | --do_lower_case \
25 | --data_dir $PATH_TO_DATA/$DATASET \
26 | --output_dir ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \
27 | --plot_data_dir ./results/ \
28 | --max_seq_length 128 \
29 | --early_exit_entropy $ENTROPY \
30 | --eval_highway \
31 | --overwrite_cache \
32 | --per_gpu_eval_batch_size=1
33 | done
34 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/legacy/seq2seq/test_data/fsmt/build-eval-data.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import io
4 | import json
5 | import subprocess
6 |
7 |
8 | pairs = [
9 | ["en", "ru"],
10 | ["ru", "en"],
11 | ["en", "de"],
12 | ["de", "en"],
13 | ]
14 |
15 | n_objs = 8
16 |
17 |
18 | def get_all_data(pairs, n_objs):
19 | text = {}
20 | for src, tgt in pairs:
21 | pair = f"{src}-{tgt}"
22 | cmd = f"sacrebleu -t wmt19 -l {pair} --echo src".split()
23 | src_lines = subprocess.run(cmd, stdout=subprocess.PIPE).stdout.decode("utf-8").splitlines()
24 | cmd = f"sacrebleu -t wmt19 -l {pair} --echo ref".split()
25 | tgt_lines = subprocess.run(cmd, stdout=subprocess.PIPE).stdout.decode("utf-8").splitlines()
26 | text[pair] = {"src": src_lines[:n_objs], "tgt": tgt_lines[:n_objs]}
27 | return text
28 |
29 |
30 | text = get_all_data(pairs, n_objs)
31 | filename = "./fsmt_val_data.json"
32 | with io.open(filename, "w", encoding="utf-8") as f:
33 | bleu_data = json.dump(text, f, indent=2, ensure_ascii=False)
34 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/.github/conda/meta.yaml:
--------------------------------------------------------------------------------
1 | {% set name = "transformers" %}
2 |
3 | package:
4 | name: "{{ name|lower }}"
5 | version: "{{ TRANSFORMERS_VERSION }}"
6 |
7 | source:
8 | path: ../../
9 |
10 | build:
11 | noarch: python
12 |
13 | requirements:
14 | host:
15 | - python
16 | - pip
17 | - numpy >=1.17
18 | - dataclasses
19 | - packaging
20 | - filelock
21 | - requests
22 | - tqdm >=4.27
23 | - sacremoses
24 | - regex !=2019.12.17
25 | - protobuf
26 | - tokenizers >=0.10.1,<0.11.0
27 | run:
28 | - python
29 | - numpy >=1.17
30 | - dataclasses
31 | - packaging
32 | - filelock
33 | - requests
34 | - tqdm >=4.27
35 | - sacremoses
36 | - regex !=2019.12.17
37 | - protobuf
38 | - tokenizers >=0.10.1,<0.11.0
39 |
40 | test:
41 | imports:
42 | - transformers
43 |
44 | about:
45 | home: https://huggingface.co
46 | license: Apache License 2.0
47 | license_file: LICENSE
48 | summary: "🤗Transformers: State-of-the-art Natural Language Processing for Pytorch and TensorFlow 2.0."
49 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/legacy/seq2seq/finetune.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path
16 | # run ./finetune.sh --help to see all the possible options
17 | python finetune_trainer.py \
18 | --learning_rate=3e-5 \
19 | --fp16 \
20 | --do_train --do_eval --do_predict \
21 | --evaluation_strategy steps \
22 | --predict_with_generate \
23 | --n_val 1000 \
24 | "$@"
25 |
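As the comments note, this wrapper only fixes the common flags and forwards everything else to `finetune_trainer.py`, so the data, output and model arguments must come from the caller. A hypothetical invocation (model name and paths are placeholders; see the seq2seq README for the documented options):

```
# Sketch only: placeholder model and paths.
./finetune.sh \
    --model_name_or_path t5-small \
    --data_dir ./wmt_en_ro \
    --output_dir ./enro_finetune_output
```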
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/deebert/entropy_eval.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | export CUDA_VISIBLE_DEVICES=0
3 |
4 | PATH_TO_DATA=/h/xinji/projects/GLUE
5 |
6 | MODEL_TYPE=bert # bert or roberta
7 | MODEL_SIZE=base # base or large
8 | DATASET=MRPC # SST-2, MRPC, RTE, QNLI, QQP, or MNLI
9 |
10 | MODEL_NAME=${MODEL_TYPE}-${MODEL_SIZE}
11 | if [ $MODEL_TYPE = 'bert' ]
12 | then
13 | MODEL_NAME=${MODEL_NAME}-uncased
14 | fi
15 |
16 | ENTROPIES="0 0.1 0.2 0.3 0.4 0.5 0.6 0.7"
17 |
18 | for ENTROPY in $ENTROPIES; do
19 | python -u run_glue_deebert.py \
20 | --model_type $MODEL_TYPE \
21 | --model_name_or_path ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \
22 | --task_name $DATASET \
23 | --do_eval \
24 | --do_lower_case \
25 | --data_dir $PATH_TO_DATA/$DATASET \
26 | --output_dir ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \
27 | --plot_data_dir ./results/ \
28 | --max_seq_length 128 \
29 | --early_exit_entropy $ENTROPY \
30 | --eval_highway \
31 | --overwrite_cache \
32 | --per_gpu_eval_batch_size=1
33 | done
34 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/legacy/pytorch-lightning/run_glue.sh:
--------------------------------------------------------------------------------
1 | # Install example requirements
2 | pip install -r ../requirements.txt
3 |
4 | # Download glue data
5 | python3 ../../utils/download_glue_data.py
6 |
7 | export TASK=mrpc
8 | export DATA_DIR=./glue_data/MRPC/
9 | export MAX_LENGTH=128
10 | export LEARNING_RATE=2e-5
11 | export BERT_MODEL=bert-base-cased
12 | export BATCH_SIZE=32
13 | export NUM_EPOCHS=3
14 | export SEED=2
15 | export OUTPUT_DIR_NAME=mrpc-pl-bert
16 | export CURRENT_DIR=${PWD}
17 | export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME}
18 |
19 | # Make output directory if it doesn't exist
20 | mkdir -p $OUTPUT_DIR
21 | # Add parent directory to python path to access lightning_base.py
22 | export PYTHONPATH="../":"${PYTHONPATH}"
23 |
24 | python3 run_glue.py --gpus 1 --data_dir $DATA_DIR \
25 | --task $TASK \
26 | --model_name_or_path $BERT_MODEL \
27 | --output_dir $OUTPUT_DIR \
28 | --max_seq_length $MAX_LENGTH \
29 | --learning_rate $LEARNING_RATE \
30 | --num_train_epochs $NUM_EPOCHS \
31 | --train_batch_size $BATCH_SIZE \
32 | --seed $SEED \
33 | --do_train \
34 | --do_predict
35 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/deebert/train_deebert.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | export CUDA_VISIBLE_DEVICES=0
3 |
4 | PATH_TO_DATA=/h/xinji/projects/GLUE
5 |
6 | MODEL_TYPE=bert # bert or roberta
7 | MODEL_SIZE=base # base or large
8 | DATASET=MRPC # SST-2, MRPC, RTE, QNLI, QQP, or MNLI
9 |
10 | MODEL_NAME=${MODEL_TYPE}-${MODEL_SIZE}
11 | EPOCHS=10
12 | if [ $MODEL_TYPE = 'bert' ]
13 | then
14 | EPOCHS=3
15 | MODEL_NAME=${MODEL_NAME}-uncased
16 | fi
17 |
18 |
19 | python -u run_glue_deebert.py \
20 | --model_type $MODEL_TYPE \
21 | --model_name_or_path $MODEL_NAME \
22 | --task_name $DATASET \
23 | --do_train \
24 | --do_eval \
25 | --do_lower_case \
26 | --data_dir $PATH_TO_DATA/$DATASET \
27 | --max_seq_length 128 \
28 | --per_gpu_eval_batch_size=1 \
29 | --per_gpu_train_batch_size=8 \
30 | --learning_rate 2e-5 \
31 | --num_train_epochs $EPOCHS \
32 | --overwrite_output_dir \
33 | --seed 42 \
34 | --output_dir ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \
35 | --plot_data_dir ./results/ \
36 | --save_steps 0 \
37 | --overwrite_cache \
38 | --eval_after_first_stage
39 |
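The three DeeBERT scripts share the same `saved_models/.../two_stage` checkpoint path, so a plausible order (an assumption, not stated in the scripts themselves) is to train first and then run the per-layer evaluation and the entropy sweep against the saved model:

```
# Run from examples/research_projects/deebert/ after editing PATH_TO_DATA.
./train_deebert.sh
./eval_deebert.sh
./entropy_eval.sh
```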
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/legacy/seq2seq/finetune.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path
16 | # run ./finetune.sh --help to see all the possible options
17 | python finetune_trainer.py \
18 | --learning_rate=3e-5 \
19 | --fp16 \
20 | --do_train --do_eval --do_predict \
21 | --evaluation_strategy steps \
22 | --predict_with_generate \
23 | --n_val 1000 \
24 | "$@"
25 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/deebert/train_deebert.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | export CUDA_VISIBLE_DEVICES=0
3 |
4 | PATH_TO_DATA=/h/xinji/projects/GLUE
5 |
6 | MODEL_TYPE=bert # bert or roberta
7 | MODEL_SIZE=base # base or large
8 | DATASET=MRPC # SST-2, MRPC, RTE, QNLI, QQP, or MNLI
9 |
10 | MODEL_NAME=${MODEL_TYPE}-${MODEL_SIZE}
11 | EPOCHS=10
12 | if [ $MODEL_TYPE = 'bert' ]
13 | then
14 | EPOCHS=3
15 | MODEL_NAME=${MODEL_NAME}-uncased
16 | fi
17 |
18 |
19 | python -u run_glue_deebert.py \
20 | --model_type $MODEL_TYPE \
21 | --model_name_or_path $MODEL_NAME \
22 | --task_name $DATASET \
23 | --do_train \
24 | --do_eval \
25 | --do_lower_case \
26 | --data_dir $PATH_TO_DATA/$DATASET \
27 | --max_seq_length 128 \
28 | --per_gpu_eval_batch_size=1 \
29 | --per_gpu_train_batch_size=8 \
30 | --learning_rate 2e-5 \
31 | --num_train_epochs $EPOCHS \
32 | --overwrite_output_dir \
33 | --seed 42 \
34 | --output_dir ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \
35 | --plot_data_dir ./results/ \
36 | --save_steps 0 \
37 | --overwrite_cache \
38 | --eval_after_first_stage
39 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/legacy/seq2seq/finetune_tpu.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | export TPU_NUM_CORES=8
16 |
17 | # the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path
18 | # run ./finetune_tpu.sh --help to see all the possible options
19 | python xla_spawn.py --num_cores $TPU_NUM_CORES \
20 | finetune_trainer.py \
21 | --learning_rate=3e-5 \
22 | --do_train --do_eval \
23 | --evaluation_strategy steps \
24 | --prediction_loss_only \
25 | --n_val 1000 \
26 | "$@"
27 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/tests/deepspeed/ds_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": true,
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 32,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 |
11 | "zero_optimization": {
12 | "stage": 2,
13 | "allgather_partitions": true,
14 | "allgather_bucket_size": 2e8,
15 | "overlap_comm": true,
16 | "reduce_scatter": true,
17 | "reduce_bucket_size": 2e8,
18 | "contiguous_gradients": true,
19 | "cpu_offload": true
20 | },
21 |
22 | "optimizer": {
23 | "type": "AdamW",
24 | "params": {
25 | "lr": 3e-5,
26 | "betas": [0.8, 0.999],
27 | "eps": 1e-8,
28 | "weight_decay": 3e-7
29 | }
30 | },
31 |
32 | "scheduler": {
33 | "type": "WarmupLR",
34 | "params": {
35 | "warmup_min_lr": 0,
36 | "warmup_max_lr": 3e-5,
37 | "warmup_num_steps": 500
38 | }
39 | },
40 |
41 | "steps_per_print": 2000,
42 | "wall_clock_breakdown": false
43 | }
44 |
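This config enables fp16 dynamic loss scaling, ZeRO stage 2 with CPU offload, an AdamW optimizer and a WarmupLR schedule. With a Trainer-based example script it would typically be launched through the `deepspeed` launcher and passed in via `--deepspeed`; a hypothetical invocation (the script choice, data path and model are placeholders):

```
# Sketch only: Trainer-based scripts expose --deepspeed through TrainingArguments.
deepspeed examples/legacy/seq2seq/finetune_trainer.py \
    --deepspeed examples/tests/deepspeed/ds_config.json \
    --model_name_or_path t5-small \
    --data_dir ./wmt_en_ro \
    --output_dir ./ds_output \
    --do_train
```

The learning rate and warmup values in the `optimizer`/`scheduler` blocks should generally be kept in sync with the corresponding command-line flags.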
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docker/transformers-pytorch-tpu/bert-base-cased.jsonnet:
--------------------------------------------------------------------------------
1 | local base = import 'templates/base.libsonnet';
2 | local tpus = import 'templates/tpus.libsonnet';
3 | local utils = import "templates/utils.libsonnet";
4 | local volumes = import "templates/volumes.libsonnet";
5 |
6 | local bertBaseCased = base.BaseTest {
7 | frameworkPrefix: "hf",
8 | modelName: "bert-base-cased",
9 | mode: "example",
10 | configMaps: [],
11 |
12 | timeout: 3600, # 1 hour, in seconds
13 |
14 | image: std.extVar('image'),
15 | imageTag: std.extVar('image-tag'),
16 |
17 | tpuSettings+: {
18 | softwareVersion: "pytorch-nightly",
19 | },
20 | accelerator: tpus.v3_8,
21 |
22 | volumeMap+: {
23 | datasets: volumes.PersistentVolumeSpec {
24 | name: "huggingface-cluster-disk",
25 | mountPath: "/datasets",
26 | },
27 | },
28 | command: utils.scriptCommand(
29 | |||
30 | python -m pytest -s transformers/examples/test_xla_examples.py -v
31 | test_exit_code=$?
32 | echo "\nFinished running commands.\n"
33 | test $test_exit_code -eq 0
34 | |||
35 | ),
36 | };
37 |
38 | bertBaseCased.oneshotJob
39 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/legacy/seq2seq/finetune_tpu.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | export TPU_NUM_CORES=8
16 |
17 | # the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path
18 | # run ./finetune_tpu.sh --help to see all the possible options
19 | python xla_spawn.py --num_cores $TPU_NUM_CORES \
20 | finetune_trainer.py \
21 | --learning_rate=3e-5 \
22 | --do_train --do_eval \
23 | --evaluation_strategy steps \
24 | --prediction_loss_only \
25 | --n_val 1000 \
26 | "$@"
27 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/tests/deepspeed/ds_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": true,
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 32,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 |
11 | "zero_optimization": {
12 | "stage": 2,
13 | "allgather_partitions": true,
14 | "allgather_bucket_size": 2e8,
15 | "overlap_comm": true,
16 | "reduce_scatter": true,
17 | "reduce_bucket_size": 2e8,
18 | "contiguous_gradients": true,
19 | "cpu_offload": true
20 | },
21 |
22 | "optimizer": {
23 | "type": "AdamW",
24 | "params": {
25 | "lr": 3e-5,
26 | "betas": [0.8, 0.999],
27 | "eps": 1e-8,
28 | "weight_decay": 3e-7
29 | }
30 | },
31 |
32 | "scheduler": {
33 | "type": "WarmupLR",
34 | "params": {
35 | "warmup_min_lr": 0,
36 | "warmup_max_lr": 3e-5,
37 | "warmup_num_steps": 500
38 | }
39 | },
40 |
41 | "steps_per_print": 2000,
42 | "wall_clock_breakdown": false
43 | }
44 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/scripts/fsmt/tests-to-run.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Copyright 2020 The HuggingFace Team. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # these scripts need to be run before any changes to FSMT-related code - it should cover all bases
17 |
18 | CUDA_VISIBLE_DEVICES="" RUN_SLOW=1 pytest --disable-warnings tests/test_tokenization_fsmt.py tests/test_configuration_auto.py tests/test_modeling_fsmt.py examples/seq2seq/test_fsmt_bleu_score.py
19 | RUN_SLOW=1 pytest --disable-warnings tests/test_tokenization_fsmt.py tests/test_configuration_auto.py tests/test_modeling_fsmt.py examples/seq2seq/test_fsmt_bleu_score.py
20 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/rag/finetune_rag.sh:
--------------------------------------------------------------------------------
1 | # Add parent directory to python path to access lightning_base.py
2 | export PYTHONPATH="../":"${PYTHONPATH}"
3 |
4 | # A sample finetuning run, you need to specify data_dir, output_dir and model_name_or_path
5 | # run ./examples/rag/finetune_rag.sh --help to see all the possible options
6 |
7 | python examples/rag/finetune_rag.py \
8 | --data_dir $DATA_DIR \
9 | --output_dir $OUTPUT_DIR \
10 | --model_name_or_path $MODEL_NAME_OR_PATH \
11 | --model_type rag_sequence \
12 | --fp16 \
13 | --gpus 8 \
14 | --profile \
15 | --do_train \
16 | --do_predict \
17 | --n_val -1 \
18 | --train_batch_size 8 \
19 | --eval_batch_size 1 \
20 | --max_source_length 128 \
21 | --max_target_length 25 \
22 | --val_max_target_length 25 \
23 | --test_max_target_length 25 \
24 | --label_smoothing 0.1 \
25 | --dropout 0.1 \
26 | --attention_dropout 0.1 \
27 | --weight_decay 0.001 \
28 | --adam_epsilon 1e-08 \
29 | --max_grad_norm 0.1 \
30 | --lr_scheduler polynomial \
31 | --learning_rate 3e-05 \
32 | --num_train_epochs 100 \
33 | --warmup_steps 500 \
34 | --gradient_accumulation_steps 1 \
35 |
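The script reads its data, output and model locations from shell variables; a hypothetical setup (all values are placeholders, and `facebook/rag-sequence-base` is just one possible starting checkpoint):

```
# Placeholders; run from the repository root, as the script's own comment suggests.
export DATA_DIR=./rag_finetune_data
export OUTPUT_DIR=./rag_finetune_output
export MODEL_NAME_OR_PATH=facebook/rag-sequence-base
./examples/rag/finetune_rag.sh
```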
--------------------------------------------------------------------------------
/finetune_gpt2/examples/legacy/token-classification/run_chunk.sh:
--------------------------------------------------------------------------------
1 | if ! [ -f ./dev.txt ]; then
2 | echo "Downloading CONLL2003 dev dataset...."
3 | curl -L -o ./dev.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/valid.txt'
4 | fi
5 |
6 | if ! [ -f ./test.txt ]; then
7 | echo "Downloading CONLL2003 test dataset...."
8 | curl -L -o ./test.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/test.txt'
9 | fi
10 |
11 | if ! [ -f ./train.txt ]; then
12 | echo "Downloading CONLL2003 train dataset...."
13 | curl -L -o ./train.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/train.txt'
14 | fi
15 |
16 | export MAX_LENGTH=200
17 | export BERT_MODEL=bert-base-uncased
18 | export OUTPUT_DIR=chunker-model
19 | export BATCH_SIZE=32
20 | export NUM_EPOCHS=3
21 | export SAVE_STEPS=750
22 | export SEED=1
23 |
24 | python3 run_ner.py \
25 | --task_type Chunk \
26 | --data_dir . \
27 | --model_name_or_path $BERT_MODEL \
28 | --output_dir $OUTPUT_DIR \
29 | --max_seq_length $MAX_LENGTH \
30 | --num_train_epochs $NUM_EPOCHS \
31 | --per_gpu_train_batch_size $BATCH_SIZE \
32 | --save_steps $SAVE_STEPS \
33 | --seed $SEED \
34 | --do_train \
35 | --do_eval \
36 | --do_predict
37 |
38 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/legacy/token-classification/run_pos.sh:
--------------------------------------------------------------------------------
1 | if ! [ -f ./dev.txt ]; then
2 | echo "Download dev dataset...."
3 | curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu'
4 | fi
5 |
6 | if ! [ -f ./test.txt ]; then
7 | echo "Download test dataset...."
8 | curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu'
9 | fi
10 |
11 | if ! [ -f ./train.txt ]; then
12 | echo "Download train dataset...."
13 | curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu'
14 | fi
15 |
16 | export MAX_LENGTH=200
17 | export BERT_MODEL=bert-base-uncased
18 | export OUTPUT_DIR=postagger-model
19 | export BATCH_SIZE=32
20 | export NUM_EPOCHS=3
21 | export SAVE_STEPS=750
22 | export SEED=1
23 |
24 | python3 run_ner.py \
25 | --task_type POS \
26 | --data_dir . \
27 | --model_name_or_path $BERT_MODEL \
28 | --output_dir $OUTPUT_DIR \
29 | --max_seq_length $MAX_LENGTH \
30 | --num_train_epochs $NUM_EPOCHS \
31 | --per_gpu_train_batch_size $BATCH_SIZE \
32 | --save_steps $SAVE_STEPS \
33 | --seed $SEED \
34 | --do_train \
35 | --do_eval \
36 | --do_predict
37 |
38 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/legacy/token-classification/scripts/preprocess.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | from transformers import AutoTokenizer
4 |
5 |
6 | dataset = sys.argv[1]
7 | model_name_or_path = sys.argv[2]
8 | max_len = int(sys.argv[3])
9 |
10 | subword_len_counter = 0
11 |
12 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
13 | max_len -= tokenizer.num_special_tokens_to_add()
14 |
15 | with open(dataset, "rt") as f_p:
16 | for line in f_p:
17 | line = line.rstrip()
18 |
19 | if not line:
20 | print(line)
21 | subword_len_counter = 0
22 | continue
23 |
24 | token = line.split()[0]
25 |
26 | current_subwords_len = len(tokenizer.tokenize(token))
27 |
28 | # Token contains strange control characters like \x96 or \x95
29 | # Just filter out the complete line
30 | if current_subwords_len == 0:
31 | continue
32 |
33 | if (subword_len_counter + current_subwords_len) > max_len:
34 | print("")
35 | print(line)
36 | subword_len_counter = current_subwords_len
37 | continue
38 |
39 | subword_len_counter += current_subwords_len
40 |
41 | print(line)
42 |
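The script takes a CoNLL-style file, a tokenizer name and a maximum length on the command line and writes the re-split sentences to stdout; a usage sketch matching the settings in run_pos.sh above (the output filename is arbitrary):

```
# Re-split sentences so no example exceeds 200 subword tokens for bert-base-uncased.
python3 scripts/preprocess.py dev.txt bert-base-uncased 200 > dev.resplit.txt
```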
--------------------------------------------------------------------------------
/finetune_gpt2/src/transformers/data/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this
3 | # module, but to preserve other warnings. So, don't check this module at all.
4 |
5 | # Copyright 2020 The HuggingFace Team. All rights reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | from .glue import GlueDataset, GlueDataTrainingArguments
20 | from .language_modeling import (
21 | LineByLineTextDataset,
22 | LineByLineWithRefDataset,
23 | LineByLineWithSOPTextDataset,
24 | TextDataset,
25 | TextDatasetForNextSentencePrediction,
26 | )
27 | from .squad import SquadDataset, SquadDataTrainingArguments
28 |
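These classes cover the pre-`datasets`-library data pipeline. A minimal sketch of building a GLUE training set with them, assuming the MRPC TSVs have already been downloaded (e.g. with download_glue_data.py, as in run_glue.sh above); the constructor keywords reflect this version of the library:

```
from transformers import AutoTokenizer
from transformers.data.datasets import GlueDataset, GlueDataTrainingArguments

# Tokenize the MRPC training split into cached InputFeatures.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
args = GlueDataTrainingArguments(task_name="mrpc", data_dir="./glue_data/MRPC", max_seq_length=128)
train_dataset = GlueDataset(args, tokenizer=tokenizer)
print(len(train_dataset), train_dataset[0].input_ids[:10])
```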
--------------------------------------------------------------------------------
/sorting/run_sort_inftyformer.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if [[ $1 == 'train' ]]; then
4 | echo 'Run training...'
5 | python3 train.py \
6 | --cuda \
7 | --data ../data/ \
8 | --dataset ../data_sort_8000 \
9 | --n_layer 3 \
10 | --d_model 300 \
11 | --n_head 6 \
12 | --d_head 50 \
13 | --d_inner 300 \
14 | --dropout 0.1 \
15 | --dropatt 0.0 \
16 | --optim adam \
17 | --lr 0.0002 \
18 | --warmup_step 0 \
19 | --max_step 20000 \
20 | --tgt_len 1024 \
21 | --mem_len 1024 \
22 | --eval_tgt_len 1024 \
23 | --batch_size 8 \
24 | --gpu0_bsz 8 \
25 | --continuous \
26 | --long_term_attention \
27 | --long_term_attention_norm='softmax' \
28 | --long_term_attention_basis 512 \
29 | --affines \
30 | --augment \
31 | --augment_len 1024 \
32 | --infinite_memory \
33 | --mask \
34 | --mask_type 'cnn' \
35 | --kl_regularizer \
36 | --kl_m .000001 \
37 | --sigma_0 .05 \
38 | --name infty_former \
39 | --work_dir ./sort_8000 \
40 | ${@:2}
41 | else
42 | echo 'unknown argument 1'
43 | fi
44 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/tests/test_pipelines_text2text_generation.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import unittest
16 |
17 | from .test_pipelines_common import MonoInputPipelineCommonMixin
18 |
19 |
20 | class Text2TextGenerationPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
21 | pipeline_task = "text2text-generation"
22 | small_models = ["patrickvonplaten/t5-tiny-random"] # Default model - Models tested without the @slow decorator
23 | large_models = [] # Models tested with the @slow decorator
24 | invalid_inputs = [4, ""]
25 | mandatory_keys = ["generated_text"]
26 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/legacy/token-classification/run_pos.sh:
--------------------------------------------------------------------------------
1 | if ! [ -f ./dev.txt ]; then
2 | echo "Download dev dataset...."
3 | curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu'
4 | fi
5 |
6 | if ! [ -f ./test.txt ]; then
7 | echo "Download test dataset...."
8 | curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu'
9 | fi
10 |
11 | if ! [ -f ./train.txt ]; then
12 | echo "Download train dataset...."
13 | curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu'
14 | fi
15 |
16 | export MAX_LENGTH=200
17 | export BERT_MODEL=bert-base-uncased
18 | export OUTPUT_DIR=postagger-model
19 | export BATCH_SIZE=32
20 | export NUM_EPOCHS=3
21 | export SAVE_STEPS=750
22 | export SEED=1
23 |
24 | python3 run_ner.py \
25 | --task_type POS \
26 | --data_dir . \
27 | --model_name_or_path $BERT_MODEL \
28 | --output_dir $OUTPUT_DIR \
29 | --max_seq_length $MAX_LENGTH \
30 | --num_train_epochs $NUM_EPOCHS \
31 | --per_gpu_train_batch_size $BATCH_SIZE \
32 | --save_steps $SAVE_STEPS \
33 | --seed $SEED \
34 | --do_train \
35 | --do_eval \
36 | --do_predict
37 |
38 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/rag/finetune_rag.sh:
--------------------------------------------------------------------------------
1 | # Add parent directory to python path to access lightning_base.py
2 | export PYTHONPATH="../":"${PYTHONPATH}"
3 |
4 | # A sample finetuning run, you need to specify data_dir, output_dir and model_name_or_path
5 | # run ./examples/rag/finetune_rag.sh --help to see all the possible options
6 |
7 | python examples/rag/finetune_rag.py \
8 | --data_dir $DATA_DIR \
9 | --output_dir $OUTPUT_DIR \
10 | --model_name_or_path $MODEL_NAME_OR_PATH \
11 | --model_type rag_sequence \
12 | --fp16 \
13 | --gpus 8 \
14 | --profile \
15 | --do_train \
16 | --do_predict \
17 | --n_val -1 \
18 | --train_batch_size 8 \
19 | --eval_batch_size 1 \
20 | --max_source_length 128 \
21 | --max_target_length 25 \
22 | --val_max_target_length 25 \
23 | --test_max_target_length 25 \
24 | --label_smoothing 0.1 \
25 | --dropout 0.1 \
26 | --attention_dropout 0.1 \
27 | --weight_decay 0.001 \
28 | --adam_epsilon 1e-08 \
29 | --max_grad_norm 0.1 \
30 | --lr_scheduler polynomial \
31 | --learning_rate 3e-05 \
32 | --num_train_epochs 100 \
33 | --warmup_steps 500 \
34 | --gradient_accumulation_steps 1 \
35 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/tests/test_pipelines_feature_extraction.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import unittest
16 |
17 | from .test_pipelines_common import MonoInputPipelineCommonMixin
18 |
19 |
20 | class FeatureExtractionPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
21 | pipeline_task = "feature-extraction"
22 | small_models = [
23 | "sshleifer/tiny-distilbert-base-cased"
24 | ] # Default model - Models tested without the @slow decorator
25 | large_models = [None] # Models tested with the @slow decorator
26 | mandatory_keys = {} # Keys which should be in the output
27 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/legacy/token-classification/run_chunk.sh:
--------------------------------------------------------------------------------
1 | if ! [ -f ./dev.txt ]; then
2 | echo "Downloading CONLL2003 dev dataset...."
3 | curl -L -o ./dev.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/valid.txt'
4 | fi
5 |
6 | if ! [ -f ./test.txt ]; then
7 | echo "Downloading CONLL2003 test dataset...."
8 | curl -L -o ./test.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/test.txt'
9 | fi
10 |
11 | if ! [ -f ./train.txt ]; then
12 | echo "Downloading CONLL2003 train dataset...."
13 | curl -L -o ./train.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/train.txt'
14 | fi
15 |
16 | export MAX_LENGTH=200
17 | export BERT_MODEL=bert-base-uncased
18 | export OUTPUT_DIR=chunker-model
19 | export BATCH_SIZE=32
20 | export NUM_EPOCHS=3
21 | export SAVE_STEPS=750
22 | export SEED=1
23 |
24 | python3 run_ner.py \
25 | --task_type Chunk \
26 | --data_dir . \
27 | --model_name_or_path $BERT_MODEL \
28 | --output_dir $OUTPUT_DIR \
29 | --max_seq_length $MAX_LENGTH \
30 | --num_train_epochs $NUM_EPOCHS \
31 | --per_gpu_train_batch_size $BATCH_SIZE \
32 | --save_steps $SAVE_STEPS \
33 | --seed $SEED \
34 | --do_train \
35 | --do_eval \
36 | --do_predict
37 |
38 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/legacy/token-classification/scripts/preprocess.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | from transformers import AutoTokenizer
4 |
5 |
6 | dataset = sys.argv[1]
7 | model_name_or_path = sys.argv[2]
8 | max_len = int(sys.argv[3])
9 |
10 | subword_len_counter = 0
11 |
12 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
13 | max_len -= tokenizer.num_special_tokens_to_add()
14 |
15 | with open(dataset, "rt") as f_p:
16 | for line in f_p:
17 | line = line.rstrip()
18 |
19 | if not line:
20 | print(line)
21 | subword_len_counter = 0
22 | continue
23 |
24 | token = line.split()[0]
25 |
26 | current_subwords_len = len(tokenizer.tokenize(token))
27 |
28 | # Token contains strange control characters like \x96 or \x95
29 | # Just filter out the complete line
30 | if current_subwords_len == 0:
31 | continue
32 |
33 | if (subword_len_counter + current_subwords_len) > max_len:
34 | print("")
35 | print(line)
36 | subword_len_counter = current_subwords_len
37 | continue
38 |
39 | subword_len_counter += current_subwords_len
40 |
41 | print(line)
42 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/src/transformers/data/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this
3 | # module, but to preserve other warnings. So, don't check this module at all.
4 |
5 | # Copyright 2020 The HuggingFace Team. All rights reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | from .glue import GlueDataset, GlueDataTrainingArguments
20 | from .language_modeling import (
21 | LineByLineTextDataset,
22 | LineByLineWithRefDataset,
23 | LineByLineWithSOPTextDataset,
24 | TextDataset,
25 | TextDatasetForNextSentencePrediction,
26 | )
27 | from .squad import SquadDataset, SquadDataTrainingArguments
28 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/.github/workflows/release-conda.yml:
--------------------------------------------------------------------------------
1 | name: Release - Conda
2 |
3 | on:
4 | push:
5 | tags:
6 | - v*
7 |
8 | env:
9 | ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
10 |
11 | jobs:
12 | build_and_package:
13 | runs-on: ubuntu-latest
14 | defaults:
15 | run:
16 | shell: bash -l {0}
17 |
18 | steps:
19 | - name: Checkout repository
20 | uses: actions/checkout@v1
21 |
22 | - name: Install miniconda
23 | uses: conda-incubator/setup-miniconda@v2
24 | with:
25 | auto-update-conda: true
26 | auto-activate-base: false
27 | activate-environment: "build-transformers"
28 | channels: huggingface
29 |
30 | - name: Setup conda env
31 | run: |
32 | conda install -c defaults anaconda-client conda-build
33 |
34 | - name: Extract version
35 | run: echo "TRANSFORMERS_VERSION=`python setup.py --version`" >> $GITHUB_ENV
36 |
37 | - name: Build conda packages
38 | run: |
39 | conda info
40 | conda list
41 | conda-build .github/conda
42 |
43 | - name: Upload to Anaconda
44 | run: anaconda upload `conda-build .github/conda --output` --force
45 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/README.md:
--------------------------------------------------------------------------------
1 |
16 |
17 | # Research projects
18 |
19 | This folder contains various research projects using 🤗 Transformers. They are not maintained, and each requires the specific
20 | version of 🤗 Transformers indicated in its requirements file. Updating them to the most recent version of the library will require some work.
21 |
22 | To use any of them, just run the command
23 | ```
24 | pip install -r requirements.txt
25 | ```
26 | inside the folder of your choice.
27 |
28 | If you need help with any of those, contact the author(s), indicated at the top of the `README` of each folder.
29 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/tests/test_pipelines_sentiment_analysis.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import unittest
16 |
17 | from .test_pipelines_common import MonoInputPipelineCommonMixin
18 |
19 |
20 | class SentimentAnalysisPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
21 | pipeline_task = "sentiment-analysis"
22 | small_models = [
23 | "sshleifer/tiny-distilbert-base-uncased-finetuned-sst-2-english"
24 | ] # Default model - Models tested without the @slow decorator
25 | large_models = [None] # Models tested with the @slow decorator
26 | mandatory_keys = {"label", "score"} # Keys which should be in the output
27 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/README.md:
--------------------------------------------------------------------------------
1 |
16 |
17 | # Research projects
18 |
19 | This folder contains various research projects using 🤗 Transformers. They are not maintained and require a specific
20 | version of 🤗 Transformers that is indicated in the requirements file of each folder. Updating them to the most recent version of the library will require some work.
21 |
22 | To use any of them, just run the command
23 | ```
24 | pip install -r requirements.txt
25 | ```
26 | inside the folder of your choice.
27 |
28 | If you need help with any of those, contact the author(s), indicated at the top of the `README` of each folder.
29 |
--------------------------------------------------------------------------------
/finetune_gpt2/src/transformers/data/processors/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this
3 | # module, but to preserve other warnings. So, don't check this module at all.
4 |
5 | # Copyright 2020 The HuggingFace Team. All rights reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels
20 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features
21 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor
22 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
23 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/legacy/pytorch-lightning/run_pos.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | if ! [ -f ./dev.txt ]; then
3 | echo "Download dev dataset...."
4 | curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu'
5 | fi
6 |
7 | if ! [ -f ./test.txt ]; then
8 | echo "Download test dataset...."
9 | curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu'
10 | fi
11 |
12 | if ! [ -f ./train.txt ]; then
13 | echo "Download train dataset...."
14 | curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu'
15 | fi
16 |
17 | export MAX_LENGTH=200
18 | export BERT_MODEL=bert-base-uncased
19 | export OUTPUT_DIR=postagger-model
20 | export BATCH_SIZE=32
21 | export NUM_EPOCHS=3
22 | export SAVE_STEPS=750
23 | export SEED=1
24 |
25 |
26 | # Add parent directory to python path to access lightning_base.py
27 | export PYTHONPATH="../":"${PYTHONPATH}"
28 |
29 | python3 run_ner.py --data_dir ./ \
30 | --task_type POS \
31 | --model_name_or_path $BERT_MODEL \
32 | --output_dir $OUTPUT_DIR \
33 | --max_seq_length $MAX_LENGTH \
34 | --num_train_epochs $NUM_EPOCHS \
35 | --train_batch_size $BATCH_SIZE \
36 | --seed $SEED \
37 | --gpus 1 \
38 | --do_train \
39 | --do_predict
40 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/text-generation/README.md:
--------------------------------------------------------------------------------
1 |
16 |
17 | ## Language generation
18 |
19 | Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/master/examples/text-generation/run_generation.py).
20 |
21 | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL, XLNet, CTRL.
22 | A similar script is used for our official demo [Write With Transformer](https://transformer.huggingface.co), where you
23 | can try out the different models available in the library.
24 |
25 | Example usage:
26 |
27 | ```bash
28 | python run_generation.py \
29 | --model_type=gpt2 \
30 | --model_name_or_path=gpt2
31 | ```
32 |
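33 | The same models can also be used programmatically through the `pipeline` API. A minimal sketch (the prompt and generation parameters below are purely illustrative):
34 | 
35 | ```python
36 | from transformers import pipeline
37 | 
38 | # Load GPT-2 for open-ended text generation
39 | generator = pipeline("text-generation", model="gpt2")
40 | 
41 | # Sample a continuation of the prompt
42 | outputs = generator("Once upon a time,", max_length=50, do_sample=True)
43 | print(outputs[0]["generated_text"])
44 | ```
45 | 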
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/src/transformers/data/processors/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this
3 | # module, but to preserve other warnings. So, don't check this module at all.
4 |
5 | # Copyright 2020 The HuggingFace Team. All rights reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels
20 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features
21 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor
22 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
23 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/tests/test_cli.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2019-present, the HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | import unittest
17 | from unittest.mock import patch
18 |
19 | from transformers.testing_utils import CaptureStd
20 |
21 |
22 | class CLITest(unittest.TestCase):
23 | @patch("sys.argv", ["fakeprogrampath", "env"])
24 | def test_cli_env(self):
25 | # test transformers-cli env
26 | import transformers.commands.transformers_cli
27 |
28 | with CaptureStd() as cs:
29 | transformers.commands.transformers_cli.main()
30 | assert "Python version" in cs.out
31 | assert "Platform" in cs.out
32 | assert "Using distributed or parallel set-up in script?" in cs.out
33 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/legacy/seq2seq/minify_dataset.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright 2020 The HuggingFace Team. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | from pathlib import Path
17 |
18 | import fire
19 |
20 |
21 | def minify(src_dir: str, dest_dir: str, n: int):
22 | """Write first n lines of each file f in src_dir to dest_dir/f """
23 | src_dir = Path(src_dir)
24 | dest_dir = Path(dest_dir)
25 | dest_dir.mkdir(exist_ok=True)
26 | for path in src_dir.iterdir():
27 | new = [x.rstrip() for x in list(path.open().readlines())][:n]
28 | dest_path = dest_dir.joinpath(path.name)
29 | print(dest_path)
30 | dest_path.open("w").write("\n".join(new))
31 |
32 |
33 | if __name__ == "__main__":
34 | fire.Fire(minify)
35 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/legacy/seq2seq/rouge_cli.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import fire
16 |
17 | from utils import calculate_rouge, save_json
18 |
19 |
20 | def calculate_rouge_path(pred_path, tgt_path, save_path=None, **kwargs):
21 | """Kwargs will be passed to calculate_rouge"""
22 | pred_lns = [x.strip() for x in open(pred_path).readlines()]
23 | tgt_lns = [x.strip() for x in open(tgt_path).readlines()][: len(pred_lns)]
24 | metrics = calculate_rouge(pred_lns, tgt_lns, **kwargs)
25 | if save_path is not None:
26 | save_json(metrics, save_path, indent=None)
27 | return metrics # these print nicely
28 |
29 |
30 | if __name__ == "__main__":
31 | fire.Fire(calculate_rouge_path)
32 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/legacy/pytorch-lightning/run_pos.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | if ! [ -f ./dev.txt ]; then
3 | echo "Download dev dataset...."
4 | curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu'
5 | fi
6 |
7 | if ! [ -f ./test.txt ]; then
8 | echo "Download test dataset...."
9 | curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu'
10 | fi
11 |
12 | if ! [ -f ./train.txt ]; then
13 | echo "Download train dataset...."
14 | curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu'
15 | fi
16 |
17 | export MAX_LENGTH=200
18 | export BERT_MODEL=bert-base-uncased
19 | export OUTPUT_DIR=postagger-model
20 | export BATCH_SIZE=32
21 | export NUM_EPOCHS=3
22 | export SAVE_STEPS=750
23 | export SEED=1
24 |
25 |
26 | # Add parent directory to python path to access lightning_base.py
27 | export PYTHONPATH="../":"${PYTHONPATH}"
28 |
29 | python3 run_ner.py --data_dir ./ \
30 | --task_type POS \
31 | --model_name_or_path $BERT_MODEL \
32 | --output_dir $OUTPUT_DIR \
33 | --max_seq_length $MAX_LENGTH \
34 | --num_train_epochs $NUM_EPOCHS \
35 | --train_batch_size $BATCH_SIZE \
36 | --seed $SEED \
37 | --gpus 1 \
38 | --do_train \
39 | --do_predict
40 |
--------------------------------------------------------------------------------
/finetune_gpt2/src/transformers/utils/dummy_flax_objects.py:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by the command `make fix-copies`, do not edit.
2 | from ..file_utils import requires_flax
3 |
4 |
5 | class FlaxPreTrainedModel:
6 | def __init__(self, *args, **kwargs):
7 | requires_flax(self)
8 |
9 | @classmethod
10 | def from_pretrained(self, *args, **kwargs):
11 | requires_flax(self)
12 |
13 |
14 | FLAX_MODEL_MAPPING = None
15 |
16 |
17 | class FlaxAutoModel:
18 | def __init__(self, *args, **kwargs):
19 | requires_flax(self)
20 |
21 | @classmethod
22 | def from_pretrained(self, *args, **kwargs):
23 | requires_flax(self)
24 |
25 |
26 | class FlaxBertForMaskedLM:
27 | def __init__(self, *args, **kwargs):
28 | requires_flax(self)
29 |
30 | @classmethod
31 | def from_pretrained(self, *args, **kwargs):
32 | requires_flax(self)
33 |
34 |
35 | class FlaxBertModel:
36 | def __init__(self, *args, **kwargs):
37 | requires_flax(self)
38 |
39 | @classmethod
40 | def from_pretrained(self, *args, **kwargs):
41 | requires_flax(self)
42 |
43 |
44 | class FlaxRobertaModel:
45 | def __init__(self, *args, **kwargs):
46 | requires_flax(self)
47 |
48 | @classmethod
49 | def from_pretrained(self, *args, **kwargs):
50 | requires_flax(self)
51 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/text-generation/README.md:
--------------------------------------------------------------------------------
1 |
16 |
17 | ## Language generation
18 |
19 | Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/master/examples/text-generation/run_generation.py).
20 |
21 | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL, XLNet, CTRL.
22 | A similar script is used for our official demo [Write With Transformer](https://transformer.huggingface.co), where you
23 | can try out the different models available in the library.
24 |
25 | Example usage:
26 |
27 | ```bash
28 | python run_generation.py \
29 | --model_type=gpt2 \
30 | --model_name_or_path=gpt2
31 | ```
32 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/legacy/seq2seq/minify_dataset.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright 2020 The HuggingFace Team. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | from pathlib import Path
17 |
18 | import fire
19 |
20 |
21 | def minify(src_dir: str, dest_dir: str, n: int):
22 | """Write first n lines of each file f in src_dir to dest_dir/f """
23 | src_dir = Path(src_dir)
24 | dest_dir = Path(dest_dir)
25 | dest_dir.mkdir(exist_ok=True)
26 | for path in src_dir.iterdir():
27 | new = [x.rstrip() for x in list(path.open().readlines())][:n]
28 | dest_path = dest_dir.joinpath(path.name)
29 | print(dest_path)
30 | dest_path.open("w").write("\n".join(new))
31 |
32 |
33 | if __name__ == "__main__":
34 | fire.Fire(minify)
35 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/legacy/seq2seq/rouge_cli.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import fire
16 |
17 | from utils import calculate_rouge, save_json
18 |
19 |
20 | def calculate_rouge_path(pred_path, tgt_path, save_path=None, **kwargs):
21 | """Kwargs will be passed to calculate_rouge"""
22 | pred_lns = [x.strip() for x in open(pred_path).readlines()]
23 | tgt_lns = [x.strip() for x in open(tgt_path).readlines()][: len(pred_lns)]
24 | metrics = calculate_rouge(pred_lns, tgt_lns, **kwargs)
25 | if save_path is not None:
26 | save_json(metrics, save_path, indent=None)
27 | return metrics # these print nicely
28 |
29 |
30 | if __name__ == "__main__":
31 | fire.Fire(calculate_rouge_path)
32 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/src/transformers/utils/dummy_flax_objects.py:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by the command `make fix-copies`, do not edit.
2 | from ..file_utils import requires_flax
3 |
4 |
5 | class FlaxPreTrainedModel:
6 | def __init__(self, *args, **kwargs):
7 | requires_flax(self)
8 |
9 | @classmethod
10 | def from_pretrained(self, *args, **kwargs):
11 | requires_flax(self)
12 |
13 |
14 | FLAX_MODEL_MAPPING = None
15 |
16 |
17 | class FlaxAutoModel:
18 | def __init__(self, *args, **kwargs):
19 | requires_flax(self)
20 |
21 | @classmethod
22 | def from_pretrained(self, *args, **kwargs):
23 | requires_flax(self)
24 |
25 |
26 | class FlaxBertForMaskedLM:
27 | def __init__(self, *args, **kwargs):
28 | requires_flax(self)
29 |
30 | @classmethod
31 | def from_pretrained(self, *args, **kwargs):
32 | requires_flax(self)
33 |
34 |
35 | class FlaxBertModel:
36 | def __init__(self, *args, **kwargs):
37 | requires_flax(self)
38 |
39 | @classmethod
40 | def from_pretrained(self, *args, **kwargs):
41 | requires_flax(self)
42 |
43 |
44 | class FlaxRobertaModel:
45 | def __init__(self, *args, **kwargs):
46 | requires_flax(self)
47 |
48 | @classmethod
49 | def from_pretrained(self, *args, **kwargs):
50 | requires_flax(self)
51 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/model_cards/README.md:
--------------------------------------------------------------------------------
1 | ## 🔥 Model cards now live inside each huggingface.co model repo 🔥
2 |
3 |
4 | For consistency, ease of use and scalability, `README.md` model cards now live directly inside each model repo on the HuggingFace model hub.
5 |
6 | ### How to update a model card
7 |
8 | You can directly update a model card inside any model repo you have **write access** to, i.e.:
9 | - a model under your username namespace
10 | - a model under any organization you are a part of.
11 |
12 | You can either:
13 | - update it, commit and push using your usual git workflow (command line, GUI, etc.); a minimal sketch is given at the end of this README
14 | - or edit it directly from the website's UI.
15 |
16 | **What if you want to create or update a model card for a model you don't have write access to?**
17 |
18 | In that case, given that we don't have a Pull request system yet on huggingface.co (🤯),
19 | you can open an issue here, post the card's content, and tag the model author(s) and/or the Hugging Face team.
20 |
21 | We might implement a more seamless process at some point, so your early feedback is precious!
22 | Please let us know if you have any suggestions.
23 |
24 | ### What happened to the model cards here?
25 |
26 | We migrated every model card from the repo to its corresponding huggingface.co model repo. Individual commits were preserved, and they link back to the original commit on GitHub.
27 |
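28 | For the git workflow mentioned above, here is a minimal sketch (the username and model name are placeholders):
29 | 
30 | ```bash
31 | # Clone the model repo from the hub, edit the card, commit and push
32 | git clone https://huggingface.co/<your-username>/<your-model>
33 | cd <your-model>
34 | # ... edit README.md with your favorite editor ...
35 | git add README.md
36 | git commit -m "Update model card"
37 | git push
38 | ```
39 | 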
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/docs/source/main_classes/configuration.rst:
--------------------------------------------------------------------------------
1 | ..
2 | Copyright 2020 The HuggingFace Team. All rights reserved.
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
5 | the License. You may obtain a copy of the License at
6 |
7 | http://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
10 | an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
11 | specific language governing permissions and limitations under the License.
12 |
13 | Configuration
14 | -----------------------------------------------------------------------------------------------------------------------
15 |
16 | The base class :class:`~transformers.PretrainedConfig` implements the common methods for loading/saving a configuration
17 | either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded
18 | from HuggingFace's AWS S3 repository).
19 |
20 |
21 | PretrainedConfig
22 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
23 |
24 | .. autoclass:: transformers.PretrainedConfig
25 | :members:
26 |
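27 | As a minimal sketch of the load/tweak/save round trip (GPT-2 is used here purely as an illustration, and the local
28 | directory name is a placeholder; any model-specific subclass of :class:`~transformers.PretrainedConfig` behaves the
29 | same way):
30 | 
31 | .. code-block:: python
32 | 
33 |     from transformers import GPT2Config
34 | 
35 |     # Download the configuration of a pretrained model from huggingface.co
36 |     config = GPT2Config.from_pretrained("gpt2")
37 | 
38 |     # Tweak an attribute and save the result to a local directory
39 |     config.summary_first_dropout = 0.2
40 |     config.save_pretrained("./my-gpt2-config")
41 | 
42 |     # Reload the configuration from the saved directory
43 |     reloaded_config = GPT2Config.from_pretrained("./my-gpt2-config")
44 | 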
--------------------------------------------------------------------------------
/finetune_gpt2/examples/legacy/seq2seq/sentence_splitter.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 |
16 | from filelock import FileLock
17 |
18 |
19 | try:
20 | import nltk
21 |
22 | NLTK_AVAILABLE = True
23 | except (ImportError, ModuleNotFoundError):
24 | NLTK_AVAILABLE = False
25 |
26 | if NLTK_AVAILABLE:
27 | with FileLock(".lock") as lock:
28 | nltk.download("punkt", quiet=True)
29 |
30 |
31 | def add_newline_to_end_of_each_sentence(x: str) -> str:
32 | """This was added to get rougeLsum scores matching published rougeL scores for BART and PEGASUS."""
33 |     x = re.sub("<n>", "", x)  # remove the Pegasus newline token <n>
34 | assert NLTK_AVAILABLE, "nltk must be installed to separate newlines between sentences. (pip install nltk)"
35 | return "\n".join(nltk.sent_tokenize(x))
36 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/legacy/seq2seq/sentence_splitter.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import re
15 |
16 | from filelock import FileLock
17 |
18 |
19 | try:
20 | import nltk
21 |
22 | NLTK_AVAILABLE = True
23 | except (ImportError, ModuleNotFoundError):
24 | NLTK_AVAILABLE = False
25 |
26 | if NLTK_AVAILABLE:
27 | with FileLock(".lock") as lock:
28 | nltk.download("punkt", quiet=True)
29 |
30 |
31 | def add_newline_to_end_of_each_sentence(x: str) -> str:
32 | """This was added to get rougeLsum scores matching published rougeL scores for BART and PEGASUS."""
33 |     x = re.sub("<n>", "", x)  # remove the Pegasus newline token <n>
34 | assert NLTK_AVAILABLE, "nltk must be installed to separate newlines between sentences. (pip install nltk)"
35 | return "\n".join(nltk.sent_tokenize(x))
36 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/scripts/pegasus/build_test_sample_spm_no_bos.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright 2020 The HuggingFace Team. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # this script builds a small sample spm file tests/fixtures/test_sentencepiece_no_bos.model, with features needed by pegasus
17 |
18 | # 1. pip install sentencepiece
19 | #
20 | # 2. wget https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt
21 |
22 | # 3. build
23 | import sentencepiece as spm
24 |
25 | # pegasus:
26 | # 1. no bos
27 | # 2. eos_id is 1
28 | # 3. unk_id is 2
29 | # build a sample spm file accordingly
30 | spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=test_sentencepiece_no_bos --bos_id=-1 --unk_id=2 --eos_id=1 --vocab_size=1000')
31 |
32 | # 4. now update the fixture
33 | # mv test_sentencepiece_no_bos.model ../../tests/fixtures/
34 |
--------------------------------------------------------------------------------
/finetune_gpt2/src/transformers/data/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this
3 | # module, but to preserve other warnings. So, don't check this module at all.
4 |
5 | # Copyright 2020 The HuggingFace Team. All rights reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | from .metrics import glue_compute_metrics, xnli_compute_metrics
20 | from .processors import (
21 | DataProcessor,
22 | InputExample,
23 | InputFeatures,
24 | SingleSentenceClassificationProcessor,
25 | SquadExample,
26 | SquadFeatures,
27 | SquadV1Processor,
28 | SquadV2Processor,
29 | glue_convert_examples_to_features,
30 | glue_output_modes,
31 | glue_processors,
32 | glue_tasks_num_labels,
33 | squad_convert_examples_to_features,
34 | xnli_output_modes,
35 | xnli_processors,
36 | xnli_tasks_num_labels,
37 | )
38 |
--------------------------------------------------------------------------------
/finetune_gpt2/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """ XLM-ProphetNet model configuration """
16 |
17 |
18 | from ...utils import logging
19 | from ..prophetnet.configuration_prophetnet import ProphetNetConfig
20 |
21 |
22 | logger = logging.get_logger(__name__)
23 |
24 | XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
25 | "microsoft/xprophetnet-large-wiki100-cased": "https://huggingface.co/microsoft/xprophetnet-large-wiki100-cased/resolve/main/config.json",
26 | }
27 |
28 |
29 | class XLMProphetNetConfig(ProphetNetConfig):
30 | """
31 | This class overrides :class:`~transformers.ProphetNetConfig`. Please check the superclass for the appropriate
32 | documentation alongside usage examples.
33 | """
34 |
35 | model_type = "xlm-prophetnet"
36 |
--------------------------------------------------------------------------------
/sorting/utils/exp_utils.py:
--------------------------------------------------------------------------------
1 | import functools
2 | import os, shutil
3 |
4 | import numpy as np
5 |
6 | import torch
7 |
8 |
9 | def logging(s, log_path, print_=True, log_=True):
10 | if print_:
11 | print(s)
12 | if log_:
13 | with open(log_path, 'a+') as f_log:
14 | f_log.write(s + '\n')
15 |
16 | def get_logger(log_path, **kwargs):
17 | return functools.partial(logging, log_path=log_path, **kwargs)
18 |
19 | def create_exp_dir(dir_path, scripts_to_save=None, debug=False):
20 | if debug:
21 | print('Debug Mode : no experiment dir created')
22 | return functools.partial(logging, log_path=None, log_=False)
23 |
24 | if not os.path.exists(dir_path):
25 | os.makedirs(dir_path)
26 |
27 | print('Experiment dir : {}'.format(dir_path))
28 | if scripts_to_save is not None:
29 | script_path = os.path.join(dir_path, 'scripts')
30 | if not os.path.exists(script_path):
31 | os.makedirs(script_path)
32 | for script in scripts_to_save:
33 | dst_file = os.path.join(dir_path, 'scripts', os.path.basename(script))
34 | shutil.copyfile(script, dst_file)
35 |
36 | return get_logger(log_path=os.path.join(dir_path, 'log.txt'))
37 |
38 | def save_checkpoint(model, optimizer, path, epoch):
39 | torch.save(model, os.path.join(path, 'model_{}.pt'.format(epoch)))
40 | torch.save(optimizer.state_dict(), os.path.join(path, 'optimizer_{}.pt'.format(epoch)))
41 |
--------------------------------------------------------------------------------
/finetune_gpt2/infinite_memory_transformer_sticky_mem/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_name_or_path": "gpt2",
3 | "activation_function": "gelu_new",
4 | "affines": true,
5 | "architectures": [
6 | "GPT2LMHeadModel"
7 | ],
8 | "attn_drop": 0.1,
9 | "attn_pdrop": 0.1,
10 | "bos_token_id": 50256,
11 | "continuous": true,
12 | "embd_pdrop": 0.1,
13 | "eos_token_id": 50256,
14 | "gradient_checkpointing": false,
15 | "infinite_memory": true,
16 | "initializer_range": 0.02,
17 | "kl_regularizer": true,
18 | "layer_norm_epsilon": 1e-05,
19 | "long_term_attention": true,
20 | "long_term_attention_basis": 512,
21 | "long_term_attention_norm": "softmax",
22 | "mask": true,
23 | "mask_dropout": 0,
24 | "mask_type": "cnn",
25 | "model_type": "gpt2",
26 | "mu_0": -1,
27 | "n_ctx": 1024,
28 | "n_embd": 768,
29 | "n_head": 12,
30 | "n_inner": null,
31 | "n_layer": 12,
32 | "n_positions": 1024,
33 | "n_special": 0,
34 | "predict_special_tokens": true,
35 | "resid_pdrop": 0.1,
36 | "sigma_0": 0.05,
37 | "sticky_memories": true,
38 | "summary_activation": null,
39 | "summary_first_dropout": 0.1,
40 | "summary_proj_to_labels": true,
41 | "summary_type": "cls_index",
42 | "summary_use_proj": true,
43 | "task_specific_params": {
44 | "text-generation": {
45 | "do_sample": true,
46 | "max_length": 50
47 | }
48 | },
49 | "transformers_version": "4.5.0.dev0",
50 | "use_cache": true,
51 | "vocab_size": 50257
52 | }
53 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/src/transformers/data/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this
3 | # module, but to preserve other warnings. So, don't check this module at all.
4 |
5 | # Copyright 2020 The HuggingFace Team. All rights reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | from .metrics import glue_compute_metrics, xnli_compute_metrics
20 | from .processors import (
21 | DataProcessor,
22 | InputExample,
23 | InputFeatures,
24 | SingleSentenceClassificationProcessor,
25 | SquadExample,
26 | SquadFeatures,
27 | SquadV1Processor,
28 | SquadV2Processor,
29 | glue_convert_examples_to_features,
30 | glue_output_modes,
31 | glue_processors,
32 | glue_tasks_num_labels,
33 | squad_convert_examples_to_features,
34 | xnli_output_modes,
35 | xnli_processors,
36 | xnli_tasks_num_labels,
37 | )
38 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/performer/README.md:
--------------------------------------------------------------------------------
1 | # Performer fine-tuning
2 |
3 | Example authors: @TevenLeScao, @Patrickvonplaten
4 |
5 | Paper authors: Krzysztof Choromanski, Valerii Likhosherstov, David Dohan, Xingyou Song, Andreea Gane, Tamas Sarlos, Peter Hawkins, Jared Davis, Afroz Mohiuddin, Lukasz Kaiser, David Belanger, Lucy Colwell, Adrian Weller
6 |
7 | ## Requirements
8 |
9 | `datasets`, `flax` and `jax`. `wandb` integration is built-in if you want to use it.
10 |
11 | ## Examples
12 |
13 | `sanity_script.sh` will launch performer fine-tuning from the bert-base-cased checkpoint on the Simple Wikipedia dataset (a small, easy-language English Wikipedia) from `datasets`.
14 | `full_script.sh` will launch performer fine-tuning from the bert-large-cased checkpoint on the English Wikipedia dataset from `datasets`.
15 |
16 | Here are a few key arguments:
17 | - Remove the `--performer` argument to use a standard Bert model.
18 |
19 | - Add `--reinitialize` to start from a blank model rather than a Bert checkpoint.
20 |
21 | - You may change the Bert size by passing a different [checkpoint](https://huggingface.co/transformers/pretrained_models.html) to the `--model_name_or_path` argument.
22 |
23 | - Passing your user name to the `--wandb_user_name` argument will trigger Weights & Biases logging.
24 |
25 | - You can choose a dataset with `--dataset_name` and `--dataset_config`. Our [viewer](https://huggingface.co/datasets/viewer/) will help you find what you need.
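26 | 
27 | Putting a few of these arguments together, a hypothetical direct invocation could look like the sketch below (the entry-point name and the dataset config value are assumptions; in practice `sanity_script.sh` / `full_script.sh` wrap the call for you):
28 | 
29 | ```bash
30 | # Hypothetical invocation; see sanity_script.sh / full_script.sh for the real ones
31 | python run_mlm_performer.py \
32 |     --model_name_or_path bert-base-cased \
33 |     --performer \
34 |     --dataset_name wikipedia \
35 |     --dataset_config 20200501.simple \
36 |     --wandb_user_name my_username
37 | ```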
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/rag/finetune_rag_ray.sh:
--------------------------------------------------------------------------------
1 | # Sample script to finetune RAG using Ray for distributed retrieval.
2 |
3 | # Add parent directory to python path to access lightning_base.py
4 | export PYTHONPATH="../":"${PYTHONPATH}"
5 |
6 | # Start a single-node Ray cluster.
7 | ray start --head
8 |
9 | # A sample finetuning run; you need to specify data_dir, output_dir and model_name_or_path
10 | # run ./examples/rag/finetune_rag_ray.sh --help to see all the possible options
11 |
12 | python examples/rag/finetune_rag.py \
13 | --data_dir $DATA_DIR \
14 | --output_dir $OUTPUT_DIR \
15 | --model_name_or_path $MODEL_NAME_OR_PATH \
16 | --model_type rag_sequence \
17 | --fp16 \
18 | --gpus 8 \
19 | --profile \
20 | --do_train \
21 | --do_predict \
22 | --n_val -1 \
23 | --train_batch_size 8 \
24 | --eval_batch_size 1 \
25 | --max_source_length 128 \
26 | --max_target_length 25 \
27 | --val_max_target_length 25 \
28 | --test_max_target_length 25 \
29 | --label_smoothing 0.1 \
30 | --dropout 0.1 \
31 | --attention_dropout 0.1 \
32 | --weight_decay 0.001 \
33 | --adam_epsilon 1e-08 \
34 | --max_grad_norm 0.1 \
35 | --lr_scheduler polynomial \
36 | --learning_rate 3e-05 \
37 | --num_train_epochs 100 \
38 | --warmup_steps 500 \
39 | --gradient_accumulation_steps 1 \
40 | --distributed_retriever ray \
41 | --num_retrieval_workers 4
42 |
43 | # Stop the Ray cluster.
44 | ray stop
45 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """ XLM-ProphetNet model configuration """
16 |
17 |
18 | from ...utils import logging
19 | from ..prophetnet.configuration_prophetnet import ProphetNetConfig
20 |
21 |
22 | logger = logging.get_logger(__name__)
23 |
24 | XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
25 | "microsoft/xprophetnet-large-wiki100-cased": "https://huggingface.co/microsoft/xprophetnet-large-wiki100-cased/resolve/main/config.json",
26 | }
27 |
28 |
29 | class XLMProphetNetConfig(ProphetNetConfig):
30 | """
31 | This class overrides :class:`~transformers.ProphetNetConfig`. Please check the superclass for the appropriate
32 | documentation alongside usage examples.
33 | """
34 |
35 | model_type = "xlm-prophetnet"
36 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/performer/README.md:
--------------------------------------------------------------------------------
1 | # Performer fine-tuning
2 |
3 | Example authors: @TevenLeScao, @Patrickvonplaten
4 |
5 | Paper authors: Krzysztof Choromanski, Valerii Likhosherstov, David Dohan, Xingyou Song, Andreea Gane, Tamas Sarlos, Peter Hawkins, Jared Davis, Afroz Mohiuddin, Lukasz Kaiser, David Belanger, Lucy Colwell, Adrian Weller
6 |
7 | ## Requirements
8 |
9 | `datasets`, `flax` and `jax`. `wandb` integration is built-in if you want to use it.
10 |
11 | ## Examples
12 |
13 | `sanity_script.sh` will launch performer fine-tuning from the bert-base-cased checkpoint on the Simple Wikipedia dataset (a small, easy-language English Wikipedia) from `datasets`.
14 | `full_script.sh` will launch performer fine-tuning from the bert-large-cased checkpoint on the English Wikipedia dataset from `datasets`.
15 |
16 | Here are a few key arguments:
17 | - Remove the `--performer` argument to use a standard Bert model.
18 |
19 | - Add `--reinitialize` to start from a blank model rather than a Bert checkpoint.
20 |
21 | - You may change the Bert size by passing a different [checkpoint](https://huggingface.co/transformers/pretrained_models.html) to the `--model_name_or_path` argument.
22 |
23 | - Passing your user name to the `--wandb_user_name` argument will trigger Weights & Biases logging.
24 |
25 | - You can choose a dataset with `--dataset_name` and `--dataset_config`. Our [viewer](https://huggingface.co/datasets/viewer/) will help you find what you need.
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/examples/research_projects/rag/finetune_rag_ray.sh:
--------------------------------------------------------------------------------
1 | # Sample script to finetune RAG using Ray for distributed retrieval.
2 |
3 | # Add parent directory to python path to access lightning_base.py
4 | export PYTHONPATH="../":"${PYTHONPATH}"
5 |
6 | # Start a single-node Ray cluster.
7 | ray start --head
8 |
9 | # A sample finetuning run; you need to specify data_dir, output_dir and model_name_or_path
10 | # run ./examples/rag/finetune_rag_ray.sh --help to see all the possible options
11 |
12 | python examples/rag/finetune_rag.py \
13 | --data_dir $DATA_DIR \
14 | --output_dir $OUTPUT_DIR \
15 | --model_name_or_path $MODEL_NAME_OR_PATH \
16 | --model_type rag_sequence \
17 | --fp16 \
18 | --gpus 8 \
19 | --profile \
20 | --do_train \
21 | --do_predict \
22 | --n_val -1 \
23 | --train_batch_size 8 \
24 | --eval_batch_size 1 \
25 | --max_source_length 128 \
26 | --max_target_length 25 \
27 | --val_max_target_length 25 \
28 | --test_max_target_length 25 \
29 | --label_smoothing 0.1 \
30 | --dropout 0.1 \
31 | --attention_dropout 0.1 \
32 | --weight_decay 0.001 \
33 | --adam_epsilon 1e-08 \
34 | --max_grad_norm 0.1 \
35 | --lr_scheduler polynomial \
36 | --learning_rate 3e-05 \
37 | --num_train_epochs 100 \
38 | --warmup_steps 500 \
39 | --gradient_accumulation_steps 1 \
40 | --distributed_retriever ray \
41 | --num_retrieval_workers 4
42 |
43 | # Stop the Ray cluster.
44 | ray stop
45 |
--------------------------------------------------------------------------------
/finetune_gpt2/infinite_memory_transformer/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_name_or_path": "gpt2",
3 | "activation_function": "gelu_new",
4 | "affines": true,
5 | "architectures": [
6 | "GPT2LMHeadModel"
7 | ],
8 | "attn_drop": 0.1,
9 | "attn_pdrop": 0.1,
10 | "bos_token_id": 50256,
11 | "compression_rate": 2,
12 | "compressive": false,
13 | "continuous": true,
14 | "embd_pdrop": 0.1,
15 | "eos_token_id": 50256,
16 | "gradient_checkpointing": false,
17 | "infinite_memory": true,
18 | "initializer_range": 0.02,
19 | "kl_regularizer": true,
20 | "layer_norm_epsilon": 1e-05,
21 | "long_term_attention": true,
22 | "long_term_attention_basis": 512,
23 | "long_term_attention_norm": "softmax",
24 | "mask": true,
25 | "mask_dropout": 0,
26 | "mask_type": "cnn",
27 | "model_type": "gpt2",
28 | "mu_0": -1,
29 | "n_ctx": 1024,
30 | "n_embd": 768,
31 | "n_head": 12,
32 | "n_inner": null,
33 | "n_layer": 12,
34 | "n_positions": 1024,
35 | "n_special": 0,
36 | "predict_special_tokens": true,
37 | "resid_pdrop": 0.1,
38 | "sigma_0": 0.05,
39 | "sticky_memories": false,
40 | "summary_activation": null,
41 | "summary_first_dropout": 0.1,
42 | "summary_proj_to_labels": true,
43 | "summary_type": "cls_index",
44 | "summary_use_proj": true,
45 | "task_specific_params": {
46 | "text-generation": {
47 | "do_sample": true,
48 | "max_length": 50
49 | }
50 | },
51 | "transformers_version": "4.5.0.dev0",
52 | "use_cache": true,
53 | "vocab_size": 50257
54 | }
55 |
56 |
--------------------------------------------------------------------------------
/finetune_gpt2/src/transformers/models/xlm_prophetnet/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this
3 | # module, but to preserve other warnings. So, don't check this module at all.
4 |
5 | # Copyright 2020 The HuggingFace Team. All rights reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | from ...file_utils import is_sentencepiece_available, is_torch_available
20 | from .configuration_xlm_prophetnet import XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMProphetNetConfig
21 |
22 |
23 | if is_sentencepiece_available():
24 | from .tokenization_xlm_prophetnet import XLMProphetNetTokenizer
25 |
26 | if is_torch_available():
27 | from .modeling_xlm_prophetnet import (
28 | XLM_PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST,
29 | XLMProphetNetDecoder,
30 | XLMProphetNetEncoder,
31 | XLMProphetNetForCausalLM,
32 | XLMProphetNetForConditionalGeneration,
33 | XLMProphetNetModel,
34 | )
35 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/tests/test_activations_tf.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import unittest
16 |
17 | from transformers import is_tf_available
18 | from transformers.testing_utils import require_tf
19 |
20 |
21 | if is_tf_available():
22 | from transformers.activations_tf import get_tf_activation
23 |
24 |
25 | @require_tf
26 | class TestTFActivations(unittest.TestCase):
27 | def test_get_activation(self):
28 | get_tf_activation("swish")
29 | get_tf_activation("silu")
30 | get_tf_activation("gelu")
31 | get_tf_activation("relu")
32 | get_tf_activation("tanh")
33 | get_tf_activation("gelu_new")
34 | get_tf_activation("gelu_fast")
35 | get_tf_activation("mish")
36 | with self.assertRaises(KeyError):
37 | get_tf_activation("bogus")
38 | with self.assertRaises(KeyError):
39 | get_tf_activation(None)
40 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/.github/workflows/github-torch-hub.yml:
--------------------------------------------------------------------------------
1 | name: Torch hub integration
2 |
3 | on:
4 | push:
5 | branches:
6 | - "*"
7 |
8 | jobs:
9 | torch_hub_integration:
10 | runs-on: ubuntu-latest
11 | env:
12 | # TODO quickfix but may need more investigation
13 | ACTIONS_ALLOW_UNSECURE_COMMANDS: True
14 | steps:
15 | # no checkout necessary here.
16 | - name: Extract branch name
17 | run: echo "::set-env name=BRANCH::${GITHUB_REF#refs/heads/}"
18 | - name: Check branch name
19 | run: echo $BRANCH
20 | - name: Set up Python
21 | uses: actions/setup-python@v1
22 | with:
23 | python-version: 3.7
24 |
25 | - name: Loading cache
26 | uses: actions/cache@v2
27 | id: cache
28 | with:
29 | path: ~/.cache/pip
30 | key: v0-torch_hub-${{ hashFiles('setup.py') }}
31 |
32 | - name: Install dependencies
33 | run: |
34 | pip install --upgrade pip
35 | # install torch-hub specific dependencies
36 | pip install -e git+https://github.com/huggingface/transformers.git#egg=transformers[torchhub]
37 | # no longer needed
38 | pip uninstall -y transformers
39 |
40 | - name: Torch hub list
41 | run: |
42 | python -c "import torch; print(torch.hub.list('huggingface/transformers:$BRANCH'))"
43 |
44 | - name: Torch hub help
45 | run: |
46 | python -c "import torch; print(torch.hub.help('huggingface/transformers:$BRANCH', 'modelForSequenceClassification'))"
47 |
--------------------------------------------------------------------------------
/document_grounded_generation/test_special_tokens.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import shutil
3 | import unittest
4 |
5 | from transformers import OpenAIGPTTokenizer, GPT2Tokenizer
6 | from train import ATTR_TO_SPECIAL_TOKEN, SPECIAL_TOKENS
7 |
8 | class TestSpecialTokenTreatment(unittest.TestCase):
9 |
10 | def setUp(self):
11 | self.save_dir = Path('utest_save_dir')
12 | self.save_dir.mkdir(exist_ok=True)
13 |
14 | def tearDown(self):
15 | shutil.rmtree(self.save_dir)
16 |
17 | def test_special_tokens_checkpoint_behavior(self):
18 | toks = [OpenAIGPTTokenizer.from_pretrained('openai-gpt'), GPT2Tokenizer.from_pretrained('gpt2')]
19 | for tok in toks:
20 | self.assertEqual(len(tok.added_tokens_encoder), 0)
21 | tok.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
22 | self.assertEqual(len(tok.added_tokens_encoder), 5)
23 | # Make sure we never split
24 | self.assertEqual(len(tok.tokenize(" ")), 2)
25 | ids = tok.convert_tokens_to_ids(SPECIAL_TOKENS)
26 | self.assertTrue(all([x > 0 for x in ids]),
27 | f'some tokens failed to tokenize {SPECIAL_TOKENS} -> {ids}')
28 |             # Need to maintain indices through save. (this is also tested in pytorch-transformers)
29 | tok.save_pretrained(self.save_dir)
30 | tok_loaded = tok.from_pretrained(str(self.save_dir))
31 | ids2 = tok_loaded.convert_tokens_to_ids(SPECIAL_TOKENS)
32 | self.assertListEqual(ids, ids2)
33 |
--------------------------------------------------------------------------------
/document_grounded_generation/transformers/src/transformers/models/xlm_prophetnet/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this
3 | # module, but to preserve other warnings. So, don't check this module at all.
4 |
5 | # Copyright 2020 The HuggingFace Team. All rights reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | from ...file_utils import is_sentencepiece_available, is_torch_available
20 | from .configuration_xlm_prophetnet import XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMProphetNetConfig
21 |
22 |
23 | if is_sentencepiece_available():
24 | from .tokenization_xlm_prophetnet import XLMProphetNetTokenizer
25 |
26 | if is_torch_available():
27 | from .modeling_xlm_prophetnet import (
28 | XLM_PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST,
29 | XLMProphetNetDecoder,
30 | XLMProphetNetEncoder,
31 | XLMProphetNetForCausalLM,
32 | XLMProphetNetForConditionalGeneration,
33 | XLMProphetNetModel,
34 | )
35 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/legacy/seq2seq/convert_model_to_fp16.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright 2020 The HuggingFace Team. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | from typing import Union
17 |
18 | import fire
19 | import torch
20 | from tqdm import tqdm
21 |
22 |
23 | def convert(src_path: str, map_location: str = "cpu", save_path: Union[str, None] = None) -> None:
24 | """Convert a pytorch_model.bin or model.pt file to torch.float16 for faster downloads, less disk space."""
25 | state_dict = torch.load(src_path, map_location=map_location)
26 | for k, v in tqdm(state_dict.items()):
27 | if not isinstance(v, torch.Tensor):
28 | raise TypeError("FP16 conversion only works on paths that are saved state dicts, like pytorch_model.bin")
29 | state_dict[k] = v.half()
30 | if save_path is None: # overwrite src_path
31 | save_path = src_path
32 | torch.save(state_dict, save_path)
33 |
34 |
35 | if __name__ == "__main__":
36 | fire.Fire(convert)
37 |
--------------------------------------------------------------------------------
/finetune_gpt2/examples/legacy/seq2seq/old_test_tatoeba_conversion.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 | import tempfile
17 | import unittest
18 |
19 | from transformers.file_utils import cached_property
20 | from transformers.models.marian.convert_marian_tatoeba_to_pytorch import DEFAULT_REPO, TatoebaConverter
21 | from transformers.testing_utils import slow
22 |
23 |
24 | @unittest.skipUnless(os.path.exists(DEFAULT_REPO), "Tatoeba directory does not exist.")
25 | class TatoebaConversionTester(unittest.TestCase):
26 | @cached_property
27 | def resolver(self):
28 | tmp_dir = tempfile.mkdtemp()
29 | return TatoebaConverter(save_dir=tmp_dir)
30 |
31 | @slow
32 | def test_resolver(self):
33 | self.resolver.convert_models(["heb-eng"])
34 |
35 | @slow
36 | def test_model_card(self):
37 | content, mmeta = self.resolver.write_model_card("opus-mt-he-en", dry_run=True)
38 | assert mmeta["long_pair"] == "heb-eng"
39 |
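40 | # Note: both tests are decorated with @slow, so they are skipped by default and only run
41 | # when RUN_SLOW=1 is set in the environment, e.g. RUN_SLOW=1 python -m pytest old_test_tatoeba_conversion.py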
--------------------------------------------------------------------------------
/finetune_gpt2/examples/legacy/seq2seq/train_mbart_cc25_enro.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | python finetune_trainer.py \
16 | --model_name_or_path=facebook/mbart-large-cc25 \
17 | --data_dir $ENRO_DIR \
18 | --output_dir mbart_cc25_enro --overwrite_output_dir \
19 | --learning_rate=3e-5 \
20 | --warmup_steps 500 \
21 | --fp16 \
22 | --label_smoothing 0.1 \
23 | --adam_eps 1e-06 \
24 | --src_lang en_XX --tgt_lang ro_RO \
25 | --freeze_embeds \
26 | --per_device_train_batch_size=4 --per_device_eval_batch_size=4 \
27 |     --max_source_length 128 --max_target_length 128 --val_max_target_length 128 --test_max_target_length 128 \
28 | --sortish_sampler \
29 | --num_train_epochs 6 \
30 | --save_steps 25000 --eval_steps 25000 --logging_steps 1000 \
31 | --do_train --do_eval --do_predict \
32 | --evaluation_strategy steps \
33 | --predict_with_generate --logging_first_step \
34 | --task translation \
35 | "$@"
36 |
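37 | # Usage sketch (assumes $ENRO_DIR points at a prepared WMT English-Romanian dataset directory;
38 | # any extra flags given on the command line are forwarded to finetune_trainer.py via "$@"):
39 | #   ENRO_DIR=path/to/wmt_en_ro bash train_mbart_cc25_enro.sh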
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/rag/parse_dpr_relevance_data.py:
--------------------------------------------------------------------------------
1 | """
2 | This script reads DPR retriever training data and parses each datapoint. For every datapoint it writes
3 | the query to the evaluation set file and a tab-separated list of the Wikipedia page titles of its
4 | positive contexts to the gold data file, one line per datapoint in each file.
5 | """
6 |
7 | import argparse
8 | import json
9 |
10 | from tqdm import tqdm
11 |
12 |
13 | def main():
14 | parser = argparse.ArgumentParser()
15 |
16 | # Required parameters
17 | parser.add_argument(
18 | "--src_path",
19 | type=str,
20 | default="biencoder-nq-dev.json",
21 | help="Path to raw DPR training data",
22 | )
23 | parser.add_argument(
24 | "--evaluation_set",
25 | type=str,
26 | help="where to store parsed evaluation_set file",
27 | )
28 | parser.add_argument(
29 | "--gold_data_path",
30 | type=str,
31 | help="where to store parsed gold_data_path file",
32 | )
33 | args = parser.parse_args()
34 |
35 | with open(args.src_path, "r") as src_file, open(args.evaluation_set, "w") as eval_file, open(
36 | args.gold_data_path, "w"
37 | ) as gold_file:
38 | dpr_records = json.load(src_file)
39 | for dpr_record in tqdm(dpr_records):
40 | question = dpr_record["question"]
41 | contexts = [context["title"] for context in dpr_record["positive_ctxs"]]
42 | eval_file.write(question + "\n")
43 | gold_file.write("\t".join(contexts) + "\n")
44 |
45 |
46 | if __name__ == "__main__":
47 | main()
48 |
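49 | # Usage sketch (output file names are illustrative). The script writes one line per DPR datapoint
50 | # to each output file, so line i of the evaluation set and line i of the gold data file refer to the same question:
51 | #   python parse_dpr_relevance_data.py --src_path biencoder-nq-dev.json --evaluation_set nq_dev.questions --gold_data_path nq_dev.gold_titles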
--------------------------------------------------------------------------------
/finetune_gpt2/examples/research_projects/adversarial/README.md:
--------------------------------------------------------------------------------
1 | ## Adversarial evaluation of model performances
2 |
3 | Here is an example of evaluating a model with adversarial evaluation of natural language inference on the Heuristic Analysis for NLI Systems (HANS) dataset [McCoy et al., 2019](https://arxiv.org/abs/1902.01007). The example was kindly provided by [Nafise Sadat Moosavi](https://github.com/ns-moosavi).
4 |
5 | The HANS dataset can be downloaded from [this location](https://github.com/tommccoy1/hans).
6 |
7 | This is an example of using `run_hans.py`:
8 |
9 | ```bash
10 | export HANS_DIR=path-to-hans
11 | export MODEL_TYPE=type-of-the-model-e.g.-bert-roberta-xlnet-etc
12 | export MODEL_PATH=path-to-the-model-directory-that-is-trained-on-NLI-e.g.-by-using-run_glue.py
13 |
14 | python run_hans.py \
15 | --task_name hans \
16 | --model_type $MODEL_TYPE \
17 | --do_eval \
18 | --data_dir $HANS_DIR \
19 | --model_name_or_path $MODEL_PATH \
20 | --max_seq_length 128 \
21 | --output_dir $MODEL_PATH \
22 | ```
23 |
24 | This will create a `hans_predictions.txt` file in `$MODEL_PATH`, which can then be evaluated with `evaluate_heur_output.py` from the HANS repository (a sketch of this step is shown at the end of this README).
25 |
26 | The results of a BERT-base model trained on MNLI with batch size 8 and random seed 42, evaluated on the HANS dataset, are as follows:
27 |
28 | ```text
29 | Heuristic entailed results:
30 | lexical_overlap: 0.9702
31 | subsequence: 0.9942
32 | constituent: 0.9962
33 |
34 | Heuristic non-entailed results:
35 | lexical_overlap: 0.199
36 | subsequence: 0.0396
37 | constituent: 0.118
38 | ```
39 |
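40 | To reproduce such a breakdown, the predictions file is scored with the evaluation script shipped in the HANS repository. A sketch of that step (assuming the repository is cloned at `$HANS_DIR`; check the arguments of `evaluate_heur_output.py` there before running):
41 | 
42 | ```bash
43 | python $HANS_DIR/evaluate_heur_output.py $MODEL_PATH/hans_predictions.txt
44 | ```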
--------------------------------------------------------------------------------
/finetune_gpt2/utils/get_modified_files.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # This script reports the modified .py files under the top-level sub-dirs passed as arguments, e.g.:
17 | # python ./utils/get_modified_files.py utils src tests examples
18 | #
19 | # It uses git to find the fork point and the files modified since then; files not tracked by git are not considered.
20 | # Since the output of this script is fed into Makefile commands, it doesn't print a trailing newline after the results.
21 |
22 | import re
23 | import subprocess
24 | import sys
25 |
26 |
27 | fork_point_sha = subprocess.check_output("git merge-base master HEAD".split()).decode("utf-8")
28 | modified_files = subprocess.check_output(f"git diff --name-only {fork_point_sha}".split()).decode("utf-8").split()
29 |
30 | joined_dirs = "|".join(sys.argv[1:])
31 | regex = re.compile(fr"^({joined_dirs}).*?\.py$")
32 |
33 | relevant_modified_files = [x for x in modified_files if regex.match(x)]
34 | print(" ".join(relevant_modified_files), end="")
35 |
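36 | # Hypothetical Makefile usage (illustrative only, not a target defined here):
37 | #   modified_py_files := $(shell python utils/get_modified_files.py utils src tests examples)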
--------------------------------------------------------------------------------