├── .gitignore ├── README.md ├── legacy └── v1.0 │ └── README.md ├── multilingual_rouge_scoring ├── README.md ├── __init__.py ├── io.py ├── requirements.txt ├── rouge.py ├── rouge_scorer.py ├── scoring.py ├── setup.cfg ├── setup.py ├── setuptools ├── stemmers.py ├── tokenization_wrapper.py └── tokenizers.py └── seq2seq ├── README.md ├── __init__.py ├── distributed_trainer.sh ├── evaluate.sh ├── extract_data.py ├── job.sh ├── pipeline.py ├── requirements.txt ├── sentence_splitter.py ├── setup.sh ├── trainer.sh ├── transformers ├── .circleci │ ├── config.yml │ └── deploy.sh ├── .coveragerc ├── .github │ ├── ISSUE_TEMPLATE │ │ ├── ---new-benchmark.md │ │ ├── --new-model-addition.md │ │ ├── bug-report.md │ │ ├── feature-request.md │ │ ├── migration.md │ │ └── question-help.md │ ├── PULL_REQUEST_TEMPLATE.md │ ├── conda │ │ ├── build.sh │ │ └── meta.yaml │ ├── stale.yml │ └── workflows │ │ ├── github-torch-hub.yml │ │ ├── model-templates.yml │ │ ├── release-conda.yml │ │ ├── self-push.yml │ │ └── self-scheduled.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── docker │ ├── transformers-cpu │ │ └── Dockerfile │ ├── transformers-gpu │ │ └── Dockerfile │ ├── transformers-pytorch-cpu │ │ └── Dockerfile │ ├── transformers-pytorch-gpu │ │ └── Dockerfile │ ├── transformers-pytorch-tpu │ │ ├── Dockerfile │ │ ├── bert-base-cased.jsonnet │ │ ├── dataset.yaml │ │ └── docker-entrypoint.sh │ ├── transformers-tensorflow-cpu │ │ └── Dockerfile │ └── transformers-tensorflow-gpu │ │ └── Dockerfile ├── docs │ ├── Makefile │ ├── README.md │ └── source │ │ ├── _static │ │ ├── css │ │ │ ├── Calibre-Light.ttf │ │ │ ├── Calibre-Medium.otf │ │ │ ├── Calibre-Regular.otf │ │ │ ├── Calibre-Thin.otf │ │ │ ├── code-snippets.css │ │ │ └── huggingface.css │ │ └── js │ │ │ ├── custom.js │ │ │ └── huggingface_logo.svg │ │ ├── benchmarks.rst │ │ ├── bertology.rst │ │ ├── conf.py │ │ ├── contributing.md │ │ ├── converting_tensorflow_models.rst │ │ ├── custom_datasets.rst │ │ ├── examples.md │ │ ├── favicon.ico │ │ ├── glossary.rst │ │ ├── imgs │ │ ├── local_attention_mask.png │ │ ├── ppl_chunked.gif │ │ ├── ppl_full.gif │ │ ├── ppl_sliding.gif │ │ ├── transformers_logo_name.png │ │ ├── warmup_constant_schedule.png │ │ ├── warmup_cosine_hard_restarts_schedule.png │ │ ├── warmup_cosine_schedule.png │ │ ├── warmup_cosine_warm_restarts_schedule.png │ │ └── warmup_linear_schedule.png │ │ ├── index.rst │ │ ├── installation.md │ │ ├── internal │ │ ├── generation_utils.rst │ │ ├── modeling_utils.rst │ │ ├── pipelines_utils.rst │ │ ├── tokenization_utils.rst │ │ └── trainer_utils.rst │ │ ├── main_classes │ │ ├── callback.rst │ │ ├── configuration.rst │ │ ├── logging.rst │ │ ├── model.rst │ │ ├── optimizer_schedules.rst │ │ ├── output.rst │ │ ├── pipelines.rst │ │ ├── processors.rst │ │ ├── tokenizer.rst │ │ └── trainer.rst │ │ ├── migration.md │ │ ├── model_doc │ │ ├── albert.rst │ │ ├── auto.rst │ │ ├── bart.rst │ │ ├── barthez.rst │ │ ├── bert.rst │ │ ├── bertgeneration.rst │ │ ├── blenderbot.rst │ │ ├── camembert.rst │ │ ├── ctrl.rst │ │ ├── deberta.rst │ │ ├── dialogpt.rst │ │ ├── distilbert.rst │ │ ├── dpr.rst │ │ ├── electra.rst │ │ ├── encoderdecoder.rst │ │ ├── flaubert.rst │ │ ├── fsmt.rst │ │ ├── funnel.rst │ │ ├── gpt.rst │ │ ├── gpt2.rst │ │ ├── layoutlm.rst │ │ ├── longformer.rst │ │ ├── lxmert.rst │ │ ├── marian.rst │ │ ├── mbart.rst │ │ ├── mobilebert.rst │ │ ├── mpnet.rst │ │ ├── mt5.rst │ │ ├── pegasus.rst │ │ ├── prophetnet.rst │ │ ├── rag.rst │ │ ├── 
reformer.rst │ │ ├── retribert.rst │ │ ├── roberta.rst │ │ ├── squeezebert.rst │ │ ├── t5.rst │ │ ├── tapas.rst │ │ ├── transformerxl.rst │ │ ├── xlm.rst │ │ ├── xlmprophetnet.rst │ │ ├── xlmroberta.rst │ │ └── xlnet.rst │ │ ├── model_sharing.rst │ │ ├── model_summary.rst │ │ ├── multilingual.rst │ │ ├── notebooks.md │ │ ├── perplexity.rst │ │ ├── philosophy.rst │ │ ├── preprocessing.rst │ │ ├── pretrained_models.rst │ │ ├── quicktour.rst │ │ ├── serialization.rst │ │ ├── task_summary.rst │ │ ├── testing.rst │ │ ├── tokenizer_summary.rst │ │ └── training.rst ├── examples │ ├── README.md │ ├── _tests_requirements.txt │ ├── benchmarking │ │ ├── README.md │ │ ├── plot_csv_file.py │ │ ├── requirements.txt │ │ ├── run_benchmark.py │ │ └── run_benchmark_tf.py │ ├── conftest.py │ ├── language-modeling │ │ ├── README.md │ │ ├── requirements.txt │ │ ├── run_clm.py │ │ ├── run_mlm.py │ │ ├── run_mlm_flax.py │ │ ├── run_mlm_wwm.py │ │ └── run_plm.py │ ├── legacy │ │ ├── README.md │ │ ├── multiple_choice │ │ │ ├── run_multiple_choice.py │ │ │ └── utils_multiple_choice.py │ │ ├── pytorch-lightning │ │ │ ├── lightning_base.py │ │ │ ├── requirements.txt │ │ │ ├── run_glue.py │ │ │ ├── run_glue.sh │ │ │ ├── run_ner.py │ │ │ ├── run_ner.sh │ │ │ └── run_pos.sh │ │ ├── question-answering │ │ │ ├── run_squad.py │ │ │ └── run_squad_trainer.py │ │ ├── run_camembert.py │ │ ├── run_chinese_ref.py │ │ ├── run_language_modeling.py │ │ ├── run_openai_gpt.py │ │ ├── run_swag.py │ │ ├── run_transfo_xl.py │ │ └── token-classification │ │ │ ├── README.md │ │ │ ├── run.sh │ │ │ ├── run_chunk.sh │ │ │ ├── run_ner.py │ │ │ ├── run_pos.sh │ │ │ ├── scripts │ │ │ └── preprocess.py │ │ │ ├── tasks.py │ │ │ └── utils_ner.py │ ├── multiple-choice │ │ ├── README.md │ │ ├── requirements.txt │ │ ├── run_swag.py │ │ ├── run_tf_multiple_choice.py │ │ └── utils_multiple_choice.py │ ├── question-answering │ │ ├── README.md │ │ ├── requirements.txt │ │ ├── run_qa.py │ │ ├── run_qa_beam_search.py │ │ ├── run_tf_squad.py │ │ ├── squad_v2_local │ │ │ ├── evaluate.py │ │ │ └── squad_v2_local.py │ │ ├── trainer_qa.py │ │ └── utils_qa.py │ ├── research_projects │ │ ├── README.md │ │ ├── adversarial │ │ │ ├── README.md │ │ │ ├── requirements.txt │ │ │ ├── run_hans.py │ │ │ └── utils_hans.py │ │ ├── bert-loses-patience │ │ │ ├── README.md │ │ │ ├── pabee │ │ │ │ ├── __init__.py │ │ │ │ ├── modeling_pabee_albert.py │ │ │ │ └── modeling_pabee_bert.py │ │ │ ├── requirements.txt │ │ │ ├── run_glue_with_pabee.py │ │ │ └── test_run_glue_with_pabee.py │ │ ├── bertabs │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── configuration_bertabs.py │ │ │ ├── convert_bertabs_original_pytorch_checkpoint.py │ │ │ ├── modeling_bertabs.py │ │ │ ├── requirements.txt │ │ │ ├── run_summarization.py │ │ │ ├── test_utils_summarization.py │ │ │ └── utils_summarization.py │ │ ├── bertology │ │ │ ├── requirements.txt │ │ │ ├── run_bertology.py │ │ │ └── run_prune_gpt.py │ │ ├── deebert │ │ │ ├── README.md │ │ │ ├── entropy_eval.sh │ │ │ ├── eval_deebert.sh │ │ │ ├── requirements.txt │ │ │ ├── run_glue_deebert.py │ │ │ ├── src │ │ │ │ ├── __init__.py │ │ │ │ ├── modeling_highway_bert.py │ │ │ │ └── modeling_highway_roberta.py │ │ │ ├── test_glue_deebert.py │ │ │ └── train_deebert.sh │ │ ├── distillation │ │ │ ├── README.md │ │ │ ├── distiller.py │ │ │ ├── grouped_batch_sampler.py │ │ │ ├── lm_seqs_dataset.py │ │ │ ├── requirements.txt │ │ │ ├── run_squad_w_distillation.py │ │ │ ├── scripts │ │ │ │ ├── binarized_data.py │ │ │ │ ├── extract.py │ │ │ │ ├── 
extract_distilbert.py │ │ │ │ └── token_counts.py │ │ │ ├── train.py │ │ │ ├── training_configs │ │ │ │ ├── distilbert-base-cased.json │ │ │ │ ├── distilbert-base-multilingual-cased.json │ │ │ │ ├── distilbert-base-uncased.json │ │ │ │ ├── distilgpt2.json │ │ │ │ └── distilroberta-base.json │ │ │ └── utils.py │ │ ├── longform-qa │ │ │ ├── README.md │ │ │ ├── eli5_app.py │ │ │ ├── eli5_utils.py │ │ │ └── requirements.txt │ │ ├── mm-imdb │ │ │ ├── README.md │ │ │ ├── run_mmimdb.py │ │ │ └── utils_mmimdb.py │ │ ├── movement-pruning │ │ │ ├── README.md │ │ │ ├── Saving_PruneBERT.ipynb │ │ │ ├── bertarize.py │ │ │ ├── counts_parameters.py │ │ │ ├── emmental │ │ │ │ ├── __init__.py │ │ │ │ ├── configuration_bert_masked.py │ │ │ │ ├── modeling_bert_masked.py │ │ │ │ └── modules │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── binarizer.py │ │ │ │ │ └── masked_nn.py │ │ │ ├── lxmert │ │ │ │ ├── README.md │ │ │ │ ├── demo.ipynb │ │ │ │ ├── extracting_data.py │ │ │ │ ├── modeling_frcnn.py │ │ │ │ ├── processing_image.py │ │ │ │ ├── requirements.txt │ │ │ │ ├── utils.py │ │ │ │ └── visualizing_image.py │ │ │ ├── masked_run_glue.py │ │ │ ├── masked_run_squad.py │ │ │ └── requirements.txt │ │ ├── performer │ │ │ ├── README.md │ │ │ ├── full_script.sh │ │ │ ├── modeling_flax_performer.py │ │ │ ├── modeling_flax_performer_utils.py │ │ │ ├── run_mlm_performer.py │ │ │ └── sanity_script.sh │ │ ├── pplm │ │ │ ├── README.md │ │ │ ├── imgs │ │ │ │ ├── headfigure.png │ │ │ │ └── wooly.png │ │ │ ├── pplm_classification_head.py │ │ │ ├── requirements.txt │ │ │ ├── run_pplm.py │ │ │ └── run_pplm_discrim_train.py │ │ ├── rag │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── _test_finetune_rag.py │ │ │ ├── callbacks_rag.py │ │ │ ├── consolidate_rag_checkpoint.py │ │ │ ├── distributed_pytorch_retriever.py │ │ │ ├── distributed_ray_retriever.py │ │ │ ├── eval_rag.py │ │ │ ├── finetune_rag.py │ │ │ ├── finetune_rag.sh │ │ │ ├── finetune_rag_ray.sh │ │ │ ├── lightning_base.py │ │ │ ├── parse_dpr_relevance_data.py │ │ │ ├── requirements.txt │ │ │ ├── test_data │ │ │ │ └── my_knowledge_dataset.csv │ │ │ ├── test_distributed_retriever.py │ │ │ ├── use_own_knowledge_dataset.py │ │ │ └── utils_rag.py │ │ └── seq2seq-distillation │ │ │ ├── README.md │ │ │ ├── _test_bash_script.py │ │ │ ├── _test_make_student.py │ │ │ ├── _test_seq2seq_examples.py │ │ │ ├── _test_seq2seq_examples_multi_gpu.py │ │ │ ├── callbacks.py │ │ │ ├── convert_pl_checkpoint_to_hf.py │ │ │ ├── distil_marian_enro_teacher.sh │ │ │ ├── distil_marian_no_teacher.sh │ │ │ ├── distillation.py │ │ │ ├── dynamic_bs_example.sh │ │ │ ├── finetune.py │ │ │ ├── finetune.sh │ │ │ ├── finetune_bart_tiny.sh │ │ │ ├── finetune_pegasus_xsum.sh │ │ │ ├── finetune_t5.sh │ │ │ ├── lightning_base.py │ │ │ ├── make_student.py │ │ │ ├── precomputed_pseudo_labels.md │ │ │ ├── requirements.txt │ │ │ ├── run_eval.py │ │ │ ├── sentence_splitter.py │ │ │ ├── train_distilbart_cnn.sh │ │ │ ├── train_distilbart_xsum.sh │ │ │ ├── train_mbart_cc25_enro.sh │ │ │ ├── utils copy.py │ │ │ └── utils.py │ ├── seq2seq │ │ ├── README.md │ │ ├── __init__.py │ │ ├── convert_model_to_fp16.py │ │ ├── download_wmt.py │ │ ├── finetune.sh │ │ ├── finetune_tpu.sh │ │ ├── finetune_trainer.py │ │ ├── minify_dataset.py │ │ ├── pack_dataset.py │ │ ├── requirements.txt │ │ ├── romanian_postprocessing.md │ │ ├── rouge_cli.py │ │ ├── run_distributed_eval.py │ │ ├── run_eval.py │ │ ├── run_eval_search.py │ │ ├── save_len_file.py │ │ ├── save_randomly_initialized_model.py │ │ ├── sentence_splitter.py │ │ ├── 
seq2seq_trainer.py │ │ ├── seq2seq_training_args.py │ │ ├── test_calculate_rouge.py │ │ ├── test_data │ │ │ ├── fsmt │ │ │ │ ├── build-eval-data.py │ │ │ │ └── fsmt_val_data.json │ │ │ ├── test_data │ │ │ └── wmt_en_ro │ │ │ │ ├── test.source │ │ │ │ ├── test.target │ │ │ │ ├── train.len │ │ │ │ ├── train.source │ │ │ │ ├── train.target │ │ │ │ ├── val.len │ │ │ │ ├── val.source │ │ │ │ └── val.target │ │ ├── test_datasets.py │ │ ├── test_finetune_trainer.py │ │ ├── test_fsmt_bleu_score.py │ │ ├── test_seq2seq_examples.py │ │ ├── test_seq2seq_examples_multi_gpu.py │ │ ├── test_tatoeba_conversion.py │ │ ├── train_distil_marian_enro.sh │ │ ├── train_distil_marian_enro_tpu.sh │ │ ├── train_distilbart_cnn.sh │ │ ├── train_mbart_cc25_enro.sh │ │ ├── utils.py │ │ └── xla_spawn.py │ ├── test_examples.py │ ├── test_xla_examples.py │ ├── text-classification │ │ ├── README.md │ │ ├── requirements.txt │ │ ├── run_glue.py │ │ ├── run_tf_glue.py │ │ ├── run_tf_text_classification.py │ │ └── run_xnli.py │ ├── text-generation │ │ ├── README.md │ │ ├── requirements.txt │ │ └── run_generation.py │ ├── token-classification │ │ ├── README.md │ │ ├── requirements.txt │ │ ├── run.sh │ │ ├── run_ner.py │ │ └── run_tf_ner.py │ └── xla_spawn.py ├── hubconf.py ├── model_cards │ ├── README.md │ └── google │ │ └── tapas-base │ │ └── README.md ├── notebooks │ ├── 01-training-tokenizers.ipynb │ ├── 02-transformers.ipynb │ ├── 03-pipelines.ipynb │ ├── 04-onnx-export.ipynb │ ├── 05-benchmark.ipynb │ └── README.md ├── pyproject.toml ├── scripts │ ├── fsmt │ │ ├── convert-allenai-wmt16.sh │ │ ├── convert-allenai-wmt19.sh │ │ ├── convert-facebook-wmt19.sh │ │ ├── eval-allenai-wmt16.sh │ │ ├── eval-allenai-wmt19.sh │ │ ├── eval-facebook-wmt19.sh │ │ ├── fsmt-make-super-tiny-model.py │ │ ├── fsmt-make-tiny-model.py │ │ ├── gen-card-allenai-wmt16.py │ │ ├── gen-card-allenai-wmt19.py │ │ ├── gen-card-facebook-wmt19.py │ │ ├── s3-move.sh │ │ └── tests-to-run.sh │ ├── pegasus │ │ └── build_test_sample_spm_no_bos.py │ └── tatoeba │ │ ├── README.md │ │ └── upload_models.sh ├── setup.cfg ├── setup.py ├── src │ └── transformers │ │ ├── __init__.py │ │ ├── activations.py │ │ ├── activations_tf.py │ │ ├── benchmark │ │ ├── __init__.py │ │ ├── benchmark.py │ │ ├── benchmark_args.py │ │ ├── benchmark_args_tf.py │ │ ├── benchmark_args_utils.py │ │ ├── benchmark_tf.py │ │ └── benchmark_utils.py │ │ ├── commands │ │ ├── __init__.py │ │ ├── add_new_model.py │ │ ├── convert.py │ │ ├── download.py │ │ ├── env.py │ │ ├── lfs.py │ │ ├── run.py │ │ ├── serving.py │ │ ├── train.py │ │ ├── transformers_cli.py │ │ └── user.py │ │ ├── configuration_utils.py │ │ ├── convert_graph_to_onnx.py │ │ ├── convert_pytorch_checkpoint_to_tf2.py │ │ ├── convert_slow_tokenizer.py │ │ ├── convert_slow_tokenizers_checkpoints_to_fast.py │ │ ├── convert_tf_hub_seq_to_seq_bert_to_pytorch.py │ │ ├── data │ │ ├── __init__.py │ │ ├── data_collator.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ ├── glue.py │ │ │ ├── language_modeling.py │ │ │ └── squad.py │ │ ├── metrics │ │ │ ├── __init__.py │ │ │ └── squad_metrics.py │ │ ├── processors │ │ │ ├── __init__.py │ │ │ ├── glue.py │ │ │ ├── squad.py │ │ │ ├── utils.py │ │ │ └── xnli.py │ │ └── test_generation_utils.py │ │ ├── dependency_versions_check.py │ │ ├── dependency_versions_table.py │ │ ├── file_utils.py │ │ ├── generation_beam_search.py │ │ ├── generation_logits_process.py │ │ ├── generation_tf_utils.py │ │ ├── generation_utils.py │ │ ├── hf_api.py │ │ ├── hf_argparser.py │ │ ├── integrations.py │ │ ├── 
modelcard.py │ │ ├── modeling_flax_utils.py │ │ ├── modeling_outputs.py │ │ ├── modeling_tf_outputs.py │ │ ├── modeling_tf_pytorch_utils.py │ │ ├── modeling_tf_utils.py │ │ ├── modeling_utils.py │ │ ├── models │ │ ├── __init__.py │ │ ├── albert │ │ │ ├── __init__.py │ │ │ ├── configuration_albert.py │ │ │ ├── convert_albert_original_tf_checkpoint_to_pytorch.py │ │ │ ├── modeling_albert.py │ │ │ ├── modeling_tf_albert.py │ │ │ ├── tokenization_albert.py │ │ │ └── tokenization_albert_fast.py │ │ ├── auto │ │ │ ├── __init__.py │ │ │ ├── configuration_auto.py │ │ │ ├── modeling_auto.py │ │ │ ├── modeling_flax_auto.py │ │ │ ├── modeling_tf_auto.py │ │ │ └── tokenization_auto.py │ │ ├── bart │ │ │ ├── __init__.py │ │ │ ├── configuration_bart.py │ │ │ ├── convert_bart_original_pytorch_checkpoint_to_pytorch.py │ │ │ ├── modeling_bart.py │ │ │ ├── modeling_tf_bart.py │ │ │ ├── tokenization_bart.py │ │ │ └── tokenization_bart_fast.py │ │ ├── barthez │ │ │ ├── __init__.py │ │ │ ├── tokenization_barthez.py │ │ │ └── tokenization_barthez_fast.py │ │ ├── bert │ │ │ ├── __init__.py │ │ │ ├── configuration_bert.py │ │ │ ├── convert_bert_original_tf2_checkpoint_to_pytorch.py │ │ │ ├── convert_bert_original_tf_checkpoint_to_pytorch.py │ │ │ ├── convert_bert_pytorch_checkpoint_to_original_tf.py │ │ │ ├── modeling_bert.py │ │ │ ├── modeling_flax_bert.py │ │ │ ├── modeling_tf_bert.py │ │ │ ├── tokenization_bert.py │ │ │ └── tokenization_bert_fast.py │ │ ├── bert_generation │ │ │ ├── __init__.py │ │ │ ├── configuration_bert_generation.py │ │ │ ├── modeling_bert_generation.py │ │ │ └── tokenization_bert_generation.py │ │ ├── bert_japanese │ │ │ ├── __init__.py │ │ │ └── tokenization_bert_japanese.py │ │ ├── bertweet │ │ │ ├── __init__.py │ │ │ └── tokenization_bertweet.py │ │ ├── blenderbot │ │ │ ├── __init__.py │ │ │ ├── configuration_blenderbot.py │ │ │ ├── convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py │ │ │ ├── modeling_blenderbot.py │ │ │ ├── modeling_tf_blenderbot.py │ │ │ └── tokenization_blenderbot.py │ │ ├── camembert │ │ │ ├── __init__.py │ │ │ ├── configuration_camembert.py │ │ │ ├── modeling_camembert.py │ │ │ ├── modeling_tf_camembert.py │ │ │ ├── tokenization_camembert.py │ │ │ └── tokenization_camembert_fast.py │ │ ├── ctrl │ │ │ ├── __init__.py │ │ │ ├── configuration_ctrl.py │ │ │ ├── modeling_ctrl.py │ │ │ ├── modeling_tf_ctrl.py │ │ │ └── tokenization_ctrl.py │ │ ├── deberta │ │ │ ├── __init__.py │ │ │ ├── configuration_deberta.py │ │ │ ├── modeling_deberta.py │ │ │ └── tokenization_deberta.py │ │ ├── dialogpt │ │ │ ├── __init__.py │ │ │ └── convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py │ │ ├── distilbert │ │ │ ├── __init__.py │ │ │ ├── configuration_distilbert.py │ │ │ ├── modeling_distilbert.py │ │ │ ├── modeling_tf_distilbert.py │ │ │ ├── tokenization_distilbert.py │ │ │ └── tokenization_distilbert_fast.py │ │ ├── dpr │ │ │ ├── __init__.py │ │ │ ├── configuration_dpr.py │ │ │ ├── convert_dpr_original_checkpoint_to_pytorch.py │ │ │ ├── modeling_dpr.py │ │ │ ├── modeling_tf_dpr.py │ │ │ ├── tokenization_dpr.py │ │ │ └── tokenization_dpr_fast.py │ │ ├── electra │ │ │ ├── __init__.py │ │ │ ├── configuration_electra.py │ │ │ ├── convert_electra_original_tf_checkpoint_to_pytorch.py │ │ │ ├── modeling_electra.py │ │ │ ├── modeling_tf_electra.py │ │ │ ├── tokenization_electra.py │ │ │ └── tokenization_electra_fast.py │ │ ├── encoder_decoder │ │ │ ├── __init__.py │ │ │ ├── configuration_encoder_decoder.py │ │ │ └── modeling_encoder_decoder.py │ │ ├── flaubert │ │ │ ├── 
__init__.py │ │ │ ├── configuration_flaubert.py │ │ │ ├── modeling_flaubert.py │ │ │ ├── modeling_tf_flaubert.py │ │ │ └── tokenization_flaubert.py │ │ ├── fsmt │ │ │ ├── __init__.py │ │ │ ├── configuration_fsmt.py │ │ │ ├── convert_fsmt_original_pytorch_checkpoint_to_pytorch.py │ │ │ ├── modeling_fsmt.py │ │ │ └── tokenization_fsmt.py │ │ ├── funnel │ │ │ ├── __init__.py │ │ │ ├── configuration_funnel.py │ │ │ ├── convert_funnel_original_tf_checkpoint_to_pytorch.py │ │ │ ├── modeling_funnel.py │ │ │ ├── modeling_tf_funnel.py │ │ │ ├── tokenization_funnel.py │ │ │ └── tokenization_funnel_fast.py │ │ ├── gpt2 │ │ │ ├── __init__.py │ │ │ ├── configuration_gpt2.py │ │ │ ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py │ │ │ ├── modeling_gpt2.py │ │ │ ├── modeling_tf_gpt2.py │ │ │ ├── tokenization_gpt2.py │ │ │ └── tokenization_gpt2_fast.py │ │ ├── herbert │ │ │ ├── __init__.py │ │ │ ├── tokenization_herbert.py │ │ │ └── tokenization_herbert_fast.py │ │ ├── layoutlm │ │ │ ├── __init__.py │ │ │ ├── configuration_layoutlm.py │ │ │ ├── modeling_layoutlm.py │ │ │ ├── tokenization_layoutlm.py │ │ │ └── tokenization_layoutlm_fast.py │ │ ├── longformer │ │ │ ├── __init__.py │ │ │ ├── configuration_longformer.py │ │ │ ├── convert_longformer_original_pytorch_lightning_to_pytorch.py │ │ │ ├── modeling_longformer.py │ │ │ ├── modeling_tf_longformer.py │ │ │ ├── tokenization_longformer.py │ │ │ └── tokenization_longformer_fast.py │ │ ├── lxmert │ │ │ ├── __init__.py │ │ │ ├── configuration_lxmert.py │ │ │ ├── convert_lxmert_original_tf_checkpoint_to_pytorch.py │ │ │ ├── modeling_lxmert.py │ │ │ ├── modeling_tf_lxmert.py │ │ │ ├── tokenization_lxmert.py │ │ │ └── tokenization_lxmert_fast.py │ │ ├── marian │ │ │ ├── __init__.py │ │ │ ├── configuration_marian.py │ │ │ ├── convert_marian_tatoeba_to_pytorch.py │ │ │ ├── convert_marian_to_pytorch.py │ │ │ ├── modeling_marian.py │ │ │ ├── modeling_tf_marian.py │ │ │ └── tokenization_marian.py │ │ ├── mbart │ │ │ ├── __init__.py │ │ │ ├── configuration_mbart.py │ │ │ ├── convert_mbart_original_checkpoint_to_pytorch.py │ │ │ ├── modeling_mbart.py │ │ │ ├── modeling_tf_mbart.py │ │ │ ├── tokenization_mbart.py │ │ │ └── tokenization_mbart_fast.py │ │ ├── mmbt │ │ │ ├── __init__.py │ │ │ ├── configuration_mmbt.py │ │ │ └── modeling_mmbt.py │ │ ├── mobilebert │ │ │ ├── __init__.py │ │ │ ├── configuration_mobilebert.py │ │ │ ├── convert_mobilebert_original_tf_checkpoint_to_pytorch.py │ │ │ ├── modeling_mobilebert.py │ │ │ ├── modeling_tf_mobilebert.py │ │ │ ├── tokenization_mobilebert.py │ │ │ └── tokenization_mobilebert_fast.py │ │ ├── mpnet │ │ │ ├── __init__.py │ │ │ ├── configuration_mpnet.py │ │ │ ├── modeling_mpnet.py │ │ │ ├── modeling_tf_mpnet.py │ │ │ ├── tokenization_mpnet.py │ │ │ └── tokenization_mpnet_fast.py │ │ ├── mt5 │ │ │ ├── __init__.py │ │ │ ├── configuration_mt5.py │ │ │ ├── modeling_mt5.py │ │ │ └── modeling_tf_mt5.py │ │ ├── openai │ │ │ ├── __init__.py │ │ │ ├── configuration_openai.py │ │ │ ├── convert_openai_original_tf_checkpoint_to_pytorch.py │ │ │ ├── modeling_openai.py │ │ │ ├── modeling_tf_openai.py │ │ │ ├── tokenization_openai.py │ │ │ └── tokenization_openai_fast.py │ │ ├── pegasus │ │ │ ├── __init__.py │ │ │ ├── configuration_pegasus.py │ │ │ ├── convert_pegasus_tf_to_pytorch.py │ │ │ ├── modeling_pegasus.py │ │ │ ├── modeling_tf_pegasus.py │ │ │ ├── tokenization_pegasus.py │ │ │ └── tokenization_pegasus_fast.py │ │ ├── phobert │ │ │ ├── __init__.py │ │ │ └── tokenization_phobert.py │ │ ├── prophetnet │ │ │ ├── __init__.py │ │ │ 
├── configuration_prophetnet.py │ │ │ ├── convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py │ │ │ ├── modeling_prophetnet.py │ │ │ └── tokenization_prophetnet.py │ │ ├── rag │ │ │ ├── __init__.py │ │ │ ├── configuration_rag.py │ │ │ ├── modeling_rag.py │ │ │ ├── retrieval_rag.py │ │ │ └── tokenization_rag.py │ │ ├── reformer │ │ │ ├── __init__.py │ │ │ ├── configuration_reformer.py │ │ │ ├── convert_reformer_trax_checkpoint_to_pytorch.py │ │ │ ├── modeling_reformer.py │ │ │ ├── tokenization_reformer.py │ │ │ └── tokenization_reformer_fast.py │ │ ├── retribert │ │ │ ├── __init__.py │ │ │ ├── configuration_retribert.py │ │ │ ├── modeling_retribert.py │ │ │ ├── tokenization_retribert.py │ │ │ └── tokenization_retribert_fast.py │ │ ├── roberta │ │ │ ├── __init__.py │ │ │ ├── configuration_roberta.py │ │ │ ├── convert_roberta_original_pytorch_checkpoint_to_pytorch.py │ │ │ ├── modeling_flax_roberta.py │ │ │ ├── modeling_roberta.py │ │ │ ├── modeling_tf_roberta.py │ │ │ ├── tokenization_roberta.py │ │ │ └── tokenization_roberta_fast.py │ │ ├── squeezebert │ │ │ ├── __init__.py │ │ │ ├── configuration_squeezebert.py │ │ │ ├── modeling_squeezebert.py │ │ │ ├── tokenization_squeezebert.py │ │ │ └── tokenization_squeezebert_fast.py │ │ ├── t5 │ │ │ ├── __init__.py │ │ │ ├── configuration_t5.py │ │ │ ├── convert_t5_original_tf_checkpoint_to_pytorch.py │ │ │ ├── modeling_t5.py │ │ │ ├── modeling_tf_t5.py │ │ │ ├── tokenization_t5.py │ │ │ └── tokenization_t5_fast.py │ │ ├── tapas │ │ │ ├── __init__.py │ │ │ ├── configuration_tapas.py │ │ │ ├── convert_tapas_original_tf_checkpoint_to_pytorch.py │ │ │ ├── modeling_tapas.py │ │ │ └── tokenization_tapas.py │ │ ├── transfo_xl │ │ │ ├── __init__.py │ │ │ ├── configuration_transfo_xl.py │ │ │ ├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py │ │ │ ├── modeling_tf_transfo_xl.py │ │ │ ├── modeling_tf_transfo_xl_utilities.py │ │ │ ├── modeling_transfo_xl.py │ │ │ ├── modeling_transfo_xl_utilities.py │ │ │ └── tokenization_transfo_xl.py │ │ ├── xlm │ │ │ ├── __init__.py │ │ │ ├── configuration_xlm.py │ │ │ ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py │ │ │ ├── modeling_tf_xlm.py │ │ │ ├── modeling_xlm.py │ │ │ └── tokenization_xlm.py │ │ ├── xlm_prophetnet │ │ │ ├── __init__.py │ │ │ ├── configuration_xlm_prophetnet.py │ │ │ ├── modeling_xlm_prophetnet.py │ │ │ └── tokenization_xlm_prophetnet.py │ │ ├── xlm_roberta │ │ │ ├── __init__.py │ │ │ ├── configuration_xlm_roberta.py │ │ │ ├── modeling_tf_xlm_roberta.py │ │ │ ├── modeling_xlm_roberta.py │ │ │ ├── tokenization_xlm_roberta.py │ │ │ └── tokenization_xlm_roberta_fast.py │ │ └── xlnet │ │ │ ├── __init__.py │ │ │ ├── configuration_xlnet.py │ │ │ ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py │ │ │ ├── modeling_tf_xlnet.py │ │ │ ├── modeling_xlnet.py │ │ │ ├── tokenization_xlnet.py │ │ │ └── tokenization_xlnet_fast.py │ │ ├── optimization.py │ │ ├── optimization_tf.py │ │ ├── pipelines.py │ │ ├── testing_utils.py │ │ ├── tokenization_utils.py │ │ ├── tokenization_utils_base.py │ │ ├── tokenization_utils_fast.py │ │ ├── trainer.py │ │ ├── trainer_callback.py │ │ ├── trainer_pt_utils.py │ │ ├── trainer_seq2seq.py │ │ ├── trainer_tf.py │ │ ├── trainer_utils.py │ │ ├── training_args.py │ │ ├── training_args_seq2seq.py │ │ ├── training_args_tf.py │ │ └── utils │ │ ├── __init__.py │ │ ├── dummy_flax_objects.py │ │ ├── dummy_pt_objects.py │ │ ├── dummy_sentencepiece_objects.py │ │ ├── dummy_tf_objects.py │ │ ├── dummy_tokenizers_objects.py │ │ ├── hp_naming.py │ │ ├── 
logging.py │ │ ├── model_parallel_utils.py │ │ ├── notebook.py │ │ ├── sentencepiece_model_pb2.py │ │ └── versions.py ├── templates │ ├── adding_a_new_example_script │ │ ├── README.md │ │ ├── cookiecutter.json │ │ └── {{cookiecutter.directory_name}} │ │ │ └── run_{{cookiecutter.example_shortcut}}.py │ └── adding_a_new_model │ │ ├── README.md │ │ ├── cookiecutter-template-{{cookiecutter.modelname}} │ │ ├── __init__.py │ │ ├── configuration.json │ │ ├── configuration_{{cookiecutter.lowercase_modelname}}.py │ │ ├── modeling_tf_{{cookiecutter.lowercase_modelname}}.py │ │ ├── modeling_{{cookiecutter.lowercase_modelname}}.py │ │ ├── test_modeling_tf_{{cookiecutter.lowercase_modelname}}.py │ │ ├── test_modeling_{{cookiecutter.lowercase_modelname}}.py │ │ ├── to_replace_{{cookiecutter.lowercase_modelname}}.py │ │ ├── tokenization_fast_{{cookiecutter.lowercase_modelname}}.py │ │ ├── tokenization_{{cookiecutter.lowercase_modelname}}.py │ │ └── {{cookiecutter.lowercase_modelname}}.rst │ │ ├── cookiecutter.json │ │ └── tests │ │ ├── encoder-bert-tokenizer.json │ │ ├── pt-encoder-bert-tokenizer.json │ │ ├── pt-seq-2-seq-bart-tokenizer.json │ │ ├── standalone.json │ │ ├── tf-encoder-bert-tokenizer.json │ │ └── tf-seq-2-seq-bart-tokenizer.json ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── fixtures │ │ └── sample_text_no_unicode.txt │ ├── test_activations.py │ ├── test_activations_tf.py │ ├── test_benchmark.py │ ├── test_benchmark_tf.py │ ├── test_cli.py │ ├── test_configuration_auto.py │ ├── test_configuration_common.py │ ├── test_data_collator.py │ ├── test_doc_samples.py │ ├── test_file_utils.py │ ├── test_flax_auto.py │ ├── test_generation_beam_search.py │ ├── test_generation_logits_process.py │ ├── test_generation_utils.py │ ├── test_hf_api.py │ ├── test_hf_argparser.py │ ├── test_logging.py │ ├── test_model_card.py │ ├── test_model_output.py │ ├── test_modeling_albert.py │ ├── test_modeling_auto.py │ ├── test_modeling_bart.py │ ├── test_modeling_bert.py │ ├── test_modeling_bert_generation.py │ ├── test_modeling_blenderbot.py │ ├── test_modeling_camembert.py │ ├── test_modeling_common.py │ ├── test_modeling_ctrl.py │ ├── test_modeling_deberta.py │ ├── test_modeling_distilbert.py │ ├── test_modeling_dpr.py │ ├── test_modeling_electra.py │ ├── test_modeling_encoder_decoder.py │ ├── test_modeling_flaubert.py │ ├── test_modeling_flax_bert.py │ ├── test_modeling_flax_common.py │ ├── test_modeling_flax_roberta.py │ ├── test_modeling_fsmt.py │ ├── test_modeling_funnel.py │ ├── test_modeling_gpt2.py │ ├── test_modeling_layoutlm.py │ ├── test_modeling_longformer.py │ ├── test_modeling_lxmert.py │ ├── test_modeling_marian.py │ ├── test_modeling_mbart.py │ ├── test_modeling_mobilebert.py │ ├── test_modeling_mpnet.py │ ├── test_modeling_mt5.py │ ├── test_modeling_openai.py │ ├── test_modeling_pegasus.py │ ├── test_modeling_prophetnet.py │ ├── test_modeling_rag.py │ ├── test_modeling_reformer.py │ ├── test_modeling_roberta.py │ ├── test_modeling_squeezebert.py │ ├── test_modeling_t5.py │ ├── test_modeling_tapas.py │ ├── test_modeling_tf_albert.py │ ├── test_modeling_tf_auto.py │ ├── test_modeling_tf_bart.py │ ├── test_modeling_tf_bert.py │ ├── test_modeling_tf_blenderbot.py │ ├── test_modeling_tf_camembert.py │ ├── test_modeling_tf_common.py │ ├── test_modeling_tf_ctrl.py │ ├── test_modeling_tf_distilbert.py │ ├── test_modeling_tf_dpr.py │ ├── test_modeling_tf_electra.py │ ├── test_modeling_tf_flaubert.py │ ├── test_modeling_tf_funnel.py │ ├── test_modeling_tf_gpt2.py │ ├── test_modeling_tf_longformer.py │ 
├── test_modeling_tf_lxmert.py │ ├── test_modeling_tf_marian.py │ ├── test_modeling_tf_mbart.py │ ├── test_modeling_tf_mobilebert.py │ ├── test_modeling_tf_mpnet.py │ ├── test_modeling_tf_mt5.py │ ├── test_modeling_tf_openai.py │ ├── test_modeling_tf_pegasus.py │ ├── test_modeling_tf_pytorch.py │ ├── test_modeling_tf_roberta.py │ ├── test_modeling_tf_t5.py │ ├── test_modeling_tf_transfo_xl.py │ ├── test_modeling_tf_xlm.py │ ├── test_modeling_tf_xlm_roberta.py │ ├── test_modeling_tf_xlnet.py │ ├── test_modeling_transfo_xl.py │ ├── test_modeling_xlm.py │ ├── test_modeling_xlm_prophetnet.py │ ├── test_modeling_xlm_roberta.py │ ├── test_modeling_xlnet.py │ ├── test_onnx.py │ ├── test_optimization.py │ ├── test_optimization_tf.py │ ├── test_pipelines_common.py │ ├── test_pipelines_conversational.py │ ├── test_pipelines_feature_extraction.py │ ├── test_pipelines_fill_mask.py │ ├── test_pipelines_ner.py │ ├── test_pipelines_question_answering.py │ ├── test_pipelines_sentiment_analysis.py │ ├── test_pipelines_summarization.py │ ├── test_pipelines_table_question_answering.py │ ├── test_pipelines_text2text_generation.py │ ├── test_pipelines_text_generation.py │ ├── test_pipelines_translation.py │ ├── test_pipelines_zero_shot.py │ ├── test_retrieval_rag.py │ ├── test_skip_decorators.py │ ├── test_tokenization_albert.py │ ├── test_tokenization_auto.py │ ├── test_tokenization_bart.py │ ├── test_tokenization_barthez.py │ ├── test_tokenization_bert.py │ ├── test_tokenization_bert_generation.py │ ├── test_tokenization_bert_japanese.py │ ├── test_tokenization_bertweet.py │ ├── test_tokenization_blenderbot.py │ ├── test_tokenization_camembert.py │ ├── test_tokenization_common.py │ ├── test_tokenization_ctrl.py │ ├── test_tokenization_deberta.py │ ├── test_tokenization_distilbert.py │ ├── test_tokenization_dpr.py │ ├── test_tokenization_fsmt.py │ ├── test_tokenization_funnel.py │ ├── test_tokenization_gpt2.py │ ├── test_tokenization_herbert.py │ ├── test_tokenization_layoutlm.py │ ├── test_tokenization_lxmert.py │ ├── test_tokenization_marian.py │ ├── test_tokenization_mbart.py │ ├── test_tokenization_mpnet.py │ ├── test_tokenization_openai.py │ ├── test_tokenization_pegasus.py │ ├── test_tokenization_phobert.py │ ├── test_tokenization_prophetnet.py │ ├── test_tokenization_rag.py │ ├── test_tokenization_reformer.py │ ├── test_tokenization_roberta.py │ ├── test_tokenization_squeezebert.py │ ├── test_tokenization_t5.py │ ├── test_tokenization_tapas.py │ ├── test_tokenization_transfo_xl.py │ ├── test_tokenization_utils.py │ ├── test_tokenization_xlm.py │ ├── test_tokenization_xlm_prophetnet.py │ ├── test_tokenization_xlm_roberta.py │ ├── test_tokenization_xlnet.py │ ├── test_trainer.py │ ├── test_trainer_callback.py │ ├── test_trainer_distributed.py │ ├── test_trainer_seq2seq.py │ ├── test_trainer_tpu.py │ ├── test_trainer_utils.py │ ├── test_utils_check_copies.py │ └── test_versions_utils.py ├── utils │ ├── check_copies.py │ ├── check_dummies.py │ ├── check_repo.py │ ├── check_table.py │ ├── download_glue_data.py │ ├── get_modified_files.py │ ├── link_tester.py │ └── style_doc.py └── valohai.yaml └── utils.py /legacy/v1.0/README.md: -------------------------------------------------------------------------------- 1 | # Dataset 2 | Dataset files are in `.jsonl` format i.e. one `JSON` per line. 
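Because the file is line-delimited, records can be parsed one at a time without loading the whole file into memory. The snippet below is a minimal sketch of doing so in Python; the file name `xlsum_sample.jsonl` is only a placeholder for whichever file you extract from the download, and the two keys it prints are described next.

```python
import json

# Minimal sketch: stream one record per line from a .jsonl file.
# "xlsum_sample.jsonl" is a placeholder name, not an actual file
# shipped with the dataset.
with open("xlsum_sample.jsonl", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)  # one JSON object per line
        print(sorted(record.keys()))  # the two keys listed below
        break
```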
Each `JSON` has the following two keys: 3 | 4 | * `summary`: Article Summary 5 | * `text`: Article Text / Content 6 | 7 | [Download](https://docs.google.com/uc?export=download&id=1y0VJ5gTC9ZT42FnJgbrQSWSGT-jTkbfM) the dataset. The dataset stats are as follows: 8 | 9 | Language| ISO 639-1 Code | Total examples 10 | --------|----------------|--------------- 11 | Amharic | am | 5461 12 | Arabic | ar | 40327 13 | Azerbaijani | az | 7332 14 | Bengali | bn | 8226 15 | Burmese | my | 5002 16 | Chinese (Simplified) | zh-CN | 39810 17 | English | en | 301444 18 | French | fr | 9100 19 | Gujarati | gu | 9665 20 | Hausa | ha | 6313 21 | Hindi | hi | 51715 22 | Igbo | ig | 4559 23 | Indonesian | id | 44170 24 | Japanese | ja | 7585 25 | Kirundi | rn | 5558 26 | Korean | ko | 4281 27 | Kyrgyz | ky | 2315 28 | Marathi | mr | 11164 29 | Nepali | np | 5286 30 | Oromo | om | 5738 31 | Pashto | ps | 15274 32 | Persian | fa | 25783 33 | Pidgin | n/a | 9715 34 | Portuguese | pt | 23521 35 | Punjabi | pa | 8678 36 | Russian | ru | 52712 37 | Scottish Gaelic | gd | 1101 38 | Serbian (Cyrillic) | sr | 7317 39 | Serbian (Latin) | sr | 7263 40 | Sinhala | si | 3414 41 | Somali | so | 5636 42 | Spanish | es | 44413 43 | Swahili | sw | 9310 44 | Tamil | ta | 17846 45 | Telugu | te | 11308 46 | Thai | th | 6928 47 | Tigrinya | ti | 4827 48 | Turkish | tr | 29510 49 | Ukrainian | uk | 57952 50 | Urdu | ur | 40714 51 | Uzbek | uz | 4944 52 | Vietnamese | vi | 23468 53 | Welsh | cy | 11596 54 | Yoruba | yo | 6316 55 | -------------------------------------------------------------------------------- /multilingual_rouge_scoring/__init__.py: -------------------------------------------------------------------------------- 1 | import rouge_score as rouge 2 | -------------------------------------------------------------------------------- /multilingual_rouge_scoring/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/otuncelli/turkish-stemmer-python 2 | git+https://github.com/abhik1505040/bengali-stemmer 3 | absl-py 4 | nltk 5 | numpy 6 | six>=1.14 7 | pythainlp 8 | pyonmttok 9 | jieba 10 | fugashi[unidic] -------------------------------------------------------------------------------- /multilingual_rouge_scoring/setup.cfg: -------------------------------------------------------------------------------- 1 | [egg_info] 2 | tag_build = 3 | tag_date = 0 4 | 5 | -------------------------------------------------------------------------------- /multilingual_rouge_scoring/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="rouge_score", 8 | author="Google LLC", 9 | author_email="no-reply@google.com", 10 | description="Pure python implementation of ROUGE-1.5.5.", 11 | long_description=long_description, 12 | long_description_content_type="text/markdown", 13 | packages=['rouge_score'], 14 | package_dir = {'rouge_score':''}, 15 | classifiers=[ 16 | "Programming Language :: Python :: 3", 17 | "License :: OSI Approved :: Apache Software License" 18 | ], 19 | install_requires=[ 20 | "absl-py", 21 | "nltk", 22 | "numpy", 23 | "six>=1.14.0", 24 | ], 25 | python_requires='>=3.6', 26 | ) 27 | -------------------------------------------------------------------------------- /multilingual_rouge_scoring/tokenization_wrapper.py:
-------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | # Lint as: python2, python3 18 | """A library for tokenizing text.""" 19 | 20 | from __future__ import absolute_import 21 | from __future__ import division 22 | from __future__ import print_function 23 | 24 | from rouge_score.tokenizers import BasicTokenizer 25 | 26 | 27 | def tokenize(text, stemmer=None, tokenizer=None): 28 | """Tokenize input text into a list of tokens. 29 | 30 | Args: 31 | text: A text blob to tokenize. 32 | stemmer: An optional stemmer. 33 | tokenizer: An optional tokenizer. 34 | 35 | Returns: 36 | A list of string tokens extracted from input text. 37 | """ 38 | if tokenizer is None: 39 | tokenizer = BasicTokenizer() 40 | 41 | # Convert everything to lowercase. 42 | text = text.lower() 43 | # replace punctuation and tokenize 44 | tokens = tokenizer(text) 45 | 46 | if stemmer: 47 | tokens = [stemmer(x) for x in tokens] 48 | 49 | tokens = [x for x in tokens if x] 50 | return tokens 51 | -------------------------------------------------------------------------------- /seq2seq/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | sys.path.insert(1, os.path.dirname(os.path.realpath(__file__))) 6 | -------------------------------------------------------------------------------- /seq2seq/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # misc. 
settings 4 | export seed=1234 5 | 6 | # model settings 7 | export model_name= 8 | 9 | # input / output settings 10 | export input_dir="XLSum_input/individual/bengali" 11 | export output_dir="XLSum_output/individual/bengali" 12 | 13 | # batch / sequence sizes 14 | export PER_DEVICE_EVAL_BATCH_SIZE=8 15 | export MAX_SOURCE_LENGTH=512 16 | export TEST_MAX_TARGET_LENGTH=84 17 | 18 | # evaluation settings 19 | export rouge_lang="bengali" 20 | export eval_beams=4 21 | export length_penalty=0.6 22 | export no_repeat_ngram_size=2 23 | 24 | # optional_arguments 25 | optional_arguments=( 26 | "--cache_dir cache_dir/" 27 | ) 28 | 29 | # optional for logging 30 | # export WANDB_PROJECT="MT5-Experiments" 31 | # export WANDB_WATCH=false 32 | # export WANDB_MODE="dryrun" 33 | export WANDB_DISABLED=true 34 | 35 | python ./pipeline.py \ 36 | --model_name_or_path $model_name \ 37 | --data_dir $input_dir --output_dir $output_dir \ 38 | --per_device_eval_batch_size=$PER_DEVICE_EVAL_BATCH_SIZE \ 39 | --max_source_length $MAX_SOURCE_LENGTH --test_max_target_length $TEST_MAX_TARGET_LENGTH \ 40 | --rouge_lang $rouge_lang --length_penalty $length_penalty --no_repeat_ngram_size $no_repeat_ngram_size \ 41 | --eval_beams $eval_beams --seed $seed --overwrite_output_dir --predict_with_generate --do_predict \ 42 | $(echo ${optional_arguments[@]}) 43 | -------------------------------------------------------------------------------- /seq2seq/job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=XLSum 3 | #SBATCH --output=xlsum_four_node_eight_gpu.log 4 | #SBATCH --nodes=4 # Number of nodes 5 | #SBATCH --ntasks=4 # Number of MPI ranks 6 | #SBATCH --cpus-per-task=2 # Number of OpenMP threads for each MPI process/rank 7 | #SBATCH --mem=32G # Total memory 8 | #SBATCH --time=0-168:00:00 # Walltime in hh:mm:ss or d-hh:mm:ss 9 | #SBATCH --gres=gpu:p100:2 # no of gpus per node 10 | 11 | 12 | module load gcc/7.3.0 13 | module load openmpi/3.0.0 14 | module load anaconda3/5.1.0 15 | module load cuda/10.2.89 16 | module load cudnn/7.6.5-cuda-10.2.89 17 | module load nccl/2.6.4 18 | module load python/3.7.4 19 | module load git/2.18.0 20 | 21 | # export WANDB_API_KEY="" 22 | export NCCL_DEBUG=INFO 23 | export NPROC_PER_NODE=2 24 | export PARENT=`/bin/hostname -s` 25 | export MPORT=12345 26 | export CHILDREN=`scontrol show hostnames $SLURM_JOB_NODELIST | grep -v $PARENT` 27 | export HOSTLIST="$PARENT $CHILDREN" 28 | export WORLD_SIZE=$SLURM_NTASKS 29 | 30 | # you may need to manually set this to the absolute path 31 | # of this directory, depending on your SLURM configuration 32 | export BASE_DIR=$(pwd) 33 | 34 | # using absolute directories to avoid any unwanted errors 35 | source activate "${BASE_DIR}/env" 36 | srun "${BASE_DIR}/distributed_trainer.sh" 37 | -------------------------------------------------------------------------------- /seq2seq/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | seqeval 4 | psutil 5 | sacrebleu 6 | sentencepiece != 0.1.92 7 | protobuf -------------------------------------------------------------------------------- /seq2seq/sentence_splitter.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | try: 16 | import nltk 17 | 18 | NLTK_AVAILABLE = True 19 | except (ImportError, ModuleNotFoundError): 20 | NLTK_AVAILABLE = False 21 | 22 | 23 | def add_newline_to_end_of_each_sentence(x: str) -> str: 24 | """This was added to get rougeLsum scores matching published rougeL scores for BART and PEGASUS.""" 25 | x = re.sub("<n>", "", x) # remove pegasus newline char 26 | assert NLTK_AVAILABLE, "nltk must be installed to separate newlines between sentences. (pip install nltk)" 27 | return "\n".join(nltk.sent_tokenize(x)) 28 | -------------------------------------------------------------------------------- /seq2seq/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # install transformers and requirements 4 | pip install --upgrade -r requirements.txt 5 | pip install --upgrade transformers/ 6 | 7 | # install rouge module and dependencies 8 | pip install -r ../multilingual_rouge_scoring/requirements.txt 9 | python -m unidic download # for japanese segmentation 10 | pip install --upgrade ../multilingual_rouge_scoring 11 | python -m nltk.downloader punkt -------------------------------------------------------------------------------- /seq2seq/transformers/.circleci/deploy.sh: -------------------------------------------------------------------------------- 1 | cd docs 2 | 3 | function deploy_doc(){ 4 | echo "Creating doc at commit $1 and pushing to folder $2" 5 | git checkout $1 6 | if [ ! -z "$2" ] 7 | then 8 | if [ "$2" == "master" ]; then 9 | echo "Pushing master" 10 | make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir/$2/ 11 | cp -r _build/html/_static .
12 | elif ssh -oStrictHostKeyChecking=no $doc "[ -d $dir/$2 ]"; then 13 | echo "Directory" $2 "already exists" 14 | scp -r -oStrictHostKeyChecking=no _static/* $doc:$dir/$2/_static/ 15 | else 16 | echo "Pushing version" $2 17 | make clean && make html 18 | rm -rf _build/html/_static 19 | cp -r _static _build/html 20 | scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2 21 | fi 22 | else 23 | echo "Pushing stable" 24 | make clean && make html 25 | rm -rf _build/html/_static 26 | cp -r _static _build/html 27 | scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir 28 | fi 29 | } 30 | 31 | # You can find the commit for each tag on https://github.com/huggingface/transformers/tags 32 | deploy_doc "master" master 33 | deploy_doc "b33a385" v1.0.0 34 | deploy_doc "fe02e45" v1.1.0 35 | deploy_doc "89fd345" v1.2.0 36 | deploy_doc "fc9faa8" v2.0.0 37 | deploy_doc "3ddce1d" v2.1.1 38 | deploy_doc "3616209" v2.2.0 39 | deploy_doc "d0f8b9a" v2.3.0 40 | deploy_doc "6664ea9" v2.4.0 41 | deploy_doc "fb560dc" v2.5.0 42 | deploy_doc "b90745c" v2.5.1 43 | deploy_doc "fbc5bf1" v2.6.0 44 | deploy_doc "6f5a12a" v2.7.0 45 | deploy_doc "11c3257" v2.8.0 46 | deploy_doc "e7cfc1a" v2.9.0 47 | deploy_doc "7cb203f" v2.9.1 48 | deploy_doc "10d7239" v2.10.0 49 | deploy_doc "b42586e" v2.11.0 50 | deploy_doc "7fb8bdf" v3.0.2 51 | deploy_doc "4b3ee9c" v3.1.0 52 | deploy_doc "3ebb1b3" v3.2.0 53 | deploy_doc "0613f05" v3.3.1 54 | deploy_doc "eb0e0ce" v3.4.0 55 | deploy_doc "818878d" v3.5.1 56 | deploy_doc "c781171" v4.0.0 57 | deploy_doc "bfa4ccf" # v4.1.1 Latest stable release 58 | -------------------------------------------------------------------------------- /seq2seq/transformers/.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source=transformers 3 | omit = 4 | # skip conversion scripts from testing for now 5 | */convert_* 6 | */__main__.py 7 | [report] 8 | exclude_lines = 9 | pragma: no cover 10 | raise 11 | except 12 | register_parameter -------------------------------------------------------------------------------- /seq2seq/transformers/.github/ISSUE_TEMPLATE/---new-benchmark.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F5A5 New benchmark" 3 | about: Benchmark a part of this library and share your results 4 | title: "[Benchmark]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # 🖥 Benchmarking `transformers` 11 | 12 | ## Benchmark 13 | 14 | Which part of `transformers` did you benchmark? 15 | 16 | ## Set-up 17 | 18 | What did you run your benchmarks on? Please include details, such as: CPU, GPU? If using multiple GPUs, which parallelization did you use? 19 | 20 | ## Results 21 | 22 | Put your results here!
23 | -------------------------------------------------------------------------------- /seq2seq/transformers/.github/ISSUE_TEMPLATE/--new-model-addition.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F31F New model addition" 3 | about: Submit a proposal/request to implement a new Transformer-based model 4 | title: '' 5 | labels: New model 6 | assignees: '' 7 | 8 | --- 9 | 10 | # 🌟 New model addition 11 | 12 | ## Model description 13 | 14 | 15 | 16 | ## Open source status 17 | 18 | * [ ] the model implementation is available: (give details) 19 | * [ ] the model weights are available: (give details) 20 | * [ ] who are the authors: (mention them, if possible by @gh-username) 21 | -------------------------------------------------------------------------------- /seq2seq/transformers/.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F680 Feature request" 3 | about: Submit a proposal/request for a new transformers feature 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # 🚀 Feature request 11 | 12 | 14 | 15 | ## Motivation 16 | 17 | 20 | 21 | ## Your contribution 22 | 23 | 26 | -------------------------------------------------------------------------------- /seq2seq/transformers/.github/ISSUE_TEMPLATE/question-help.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "❓ Questions & Help" 3 | about: Post your general questions on the Hugging Face forum: https://discuss.huggingface.co/ 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # ❓ Questions & Help 11 | 12 | 16 | 17 | ## Details 18 | 19 | 20 | 21 | 23 | 24 | **A link to original question on the forum**: 25 | 26 | -------------------------------------------------------------------------------- /seq2seq/transformers/.github/conda/build.sh: -------------------------------------------------------------------------------- 1 | $PYTHON setup.py install # Python command to install the script. 2 | -------------------------------------------------------------------------------- /seq2seq/transformers/.github/conda/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "transformers" %} 2 | 3 | package: 4 | name: "{{ name|lower }}" 5 | version: "{{ TRANSFORMERS_VERSION }}" 6 | 7 | source: 8 | path: ../../ 9 | 10 | build: 11 | noarch: python 12 | 13 | requirements: 14 | host: 15 | - python 16 | - pip 17 | - numpy 18 | - dataclasses 19 | - packaging 20 | - filelock 21 | - requests 22 | - tqdm >=4.27 23 | - sacremoses 24 | - regex !=2019.12.17 25 | - protobuf 26 | - tokenizers ==0.9.4 27 | run: 28 | - python 29 | - numpy 30 | - dataclasses 31 | - packaging 32 | - filelock 33 | - requests 34 | - tqdm >=4.27 35 | - sacremoses 36 | - regex !=2019.12.17 37 | - protobuf 38 | - tokenizers ==0.9.4 39 | 40 | test: 41 | imports: 42 | - transformers 43 | 44 | about: 45 | home: https://huggingface.co 46 | license: Apache License 2.0 47 | license_file: LICENSE 48 | summary: "🤗Transformers: State-of-the-art Natural Language Processing for Pytorch and TensorFlow 2.0." 
49 | -------------------------------------------------------------------------------- /seq2seq/transformers/.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 60 3 | # Number of days of inactivity before a stale issue is closed 4 | daysUntilClose: 7 5 | # Issues with these labels will never be considered stale 6 | exemptLabels: 7 | - pinned 8 | - security 9 | # Label to use when marking an issue as stale 10 | staleLabel: wontfix 11 | # Comment to post when marking an issue as stale. Set to `false` to disable 12 | markComment: > 13 | This issue has been automatically marked as stale because it has not had 14 | recent activity. It will be closed if no further activity occurs. Thank you 15 | for your contributions. 16 | # Comment to post when closing a stale issue. Set to `false` to disable 17 | closeComment: false -------------------------------------------------------------------------------- /seq2seq/transformers/.github/workflows/github-torch-hub.yml: -------------------------------------------------------------------------------- 1 | name: Torch hub integration 2 | 3 | on: 4 | push: 5 | branches: 6 | - "*" 7 | 8 | jobs: 9 | torch_hub_integration: 10 | runs-on: ubuntu-latest 11 | env: 12 | # TODO quickfix but may need more investigation 13 | ACTIONS_ALLOW_UNSECURE_COMMANDS: True 14 | steps: 15 | # no checkout necessary here. 16 | - name: Extract branch name 17 | run: echo "::set-env name=BRANCH::${GITHUB_REF#refs/heads/}" 18 | - name: Check branch name 19 | run: echo $BRANCH 20 | - name: Set up Python 21 | uses: actions/setup-python@v1 22 | with: 23 | python-version: 3.7 24 | 25 | - name: Loading cache 26 | uses: actions/cache@v2 27 | id: cache 28 | with: 29 | path: ~/.cache/pip 30 | key: v0-torch_hub-${{ hashFiles('setup.py') }} 31 | 32 | - name: Install dependencies 33 | run: | 34 | pip install --upgrade pip 35 | pip install torch 36 | pip install numpy filelock protobuf requests tqdm regex sentencepiece sacremoses tokenizers packaging 37 | 38 | - name: Torch hub list 39 | run: | 40 | python -c "import torch; print(torch.hub.list('huggingface/transformers:$BRANCH'))" 41 | 42 | - name: Torch hub help 43 | run: | 44 | python -c "import torch; print(torch.hub.help('huggingface/transformers:$BRANCH', 'modelForSequenceClassification'))" 45 | -------------------------------------------------------------------------------- /seq2seq/transformers/.github/workflows/release-conda.yml: -------------------------------------------------------------------------------- 1 | name: Release - Conda 2 | 3 | on: 4 | push: 5 | tags: 6 | - v* 7 | 8 | env: 9 | ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} 10 | 11 | jobs: 12 | build_and_package: 13 | runs-on: ubuntu-latest 14 | defaults: 15 | run: 16 | shell: bash -l {0} 17 | 18 | steps: 19 | - name: Checkout repository 20 | uses: actions/checkout@v1 21 | 22 | - name: Install miniconda 23 | uses: conda-incubator/setup-miniconda@v2 24 | with: 25 | auto-update-conda: true 26 | auto-activate-base: false 27 | activate-environment: "build-transformers" 28 | channels: huggingface 29 | 30 | - name: Setup conda env 31 | run: | 32 | conda install -c defaults anaconda-client conda-build 33 | 34 | - name: Extract version 35 | run: echo "TRANSFORMERS_VERSION=`python setup.py --version`" >> $GITHUB_ENV 36 | 37 | - name: Build conda packages 38 | run: | 39 | conda info 40 | conda build .github/conda 41 | 42 | - name: Upload to Anaconda 
43 | run: anaconda upload `conda build .github/conda --output` --force -------------------------------------------------------------------------------- /seq2seq/transformers/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | -------------------------------------------------------------------------------- /seq2seq/transformers/docker/transformers-cpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | jupyter \ 18 | tensorflow-cpu \ 19 | torch 20 | 21 | WORKDIR /workspace 22 | COPY . transformers/ 23 | RUN cd transformers/ && \ 24 | python3 -m pip install --no-cache-dir . 25 | 26 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /seq2seq/transformers/docker/transformers-gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | jupyter \ 18 | tensorflow \ 19 | torch 20 | 21 | RUN git clone https://github.com/NVIDIA/apex 22 | RUN cd apex && \ 23 | python3 setup.py install && \ 24 | pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ 25 | 26 | WORKDIR /workspace 27 | COPY . transformers/ 28 | RUN cd transformers/ && \ 29 | python3 -m pip install --no-cache-dir . 30 | 31 | CMD ["/bin/bash"] 32 | -------------------------------------------------------------------------------- /seq2seq/transformers/docker/transformers-pytorch-cpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | jupyter \ 18 | torch 19 | 20 | WORKDIR /workspace 21 | COPY . transformers/ 22 | RUN cd transformers/ && \ 23 | python3 -m pip install --no-cache-dir . 
24 | 25 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /seq2seq/transformers/docker/transformers-pytorch-gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | mkl \ 18 | torch 19 | 20 | RUN git clone https://github.com/NVIDIA/apex 21 | RUN cd apex && \ 22 | python3 setup.py install && \ 23 | pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ 24 | 25 | WORKDIR /workspace 26 | COPY . transformers/ 27 | RUN cd transformers/ && \ 28 | python3 -m pip install --no-cache-dir . 29 | 30 | CMD ["/bin/bash"] 31 | -------------------------------------------------------------------------------- /seq2seq/transformers/docker/transformers-pytorch-tpu/bert-base-cased.jsonnet: -------------------------------------------------------------------------------- 1 | local base = import 'templates/base.libsonnet'; 2 | local tpus = import 'templates/tpus.libsonnet'; 3 | local utils = import "templates/utils.libsonnet"; 4 | local volumes = import "templates/volumes.libsonnet"; 5 | 6 | local bertBaseCased = base.BaseTest { 7 | frameworkPrefix: "hf", 8 | modelName: "bert-base-cased", 9 | mode: "example", 10 | configMaps: [], 11 | 12 | timeout: 3600, # 1 hour, in seconds 13 | 14 | image: std.extVar('image'), 15 | imageTag: std.extVar('image-tag'), 16 | 17 | tpuSettings+: { 18 | softwareVersion: "pytorch-nightly", 19 | }, 20 | accelerator: tpus.v3_8, 21 | 22 | volumeMap+: { 23 | datasets: volumes.PersistentVolumeSpec { 24 | name: "huggingface-cluster-disk", 25 | mountPath: "/datasets", 26 | }, 27 | }, 28 | command: utils.scriptCommand( 29 | ||| 30 | python -m pytest -s transformers/examples/test_xla_examples.py -v 31 | test_exit_code=$? 32 | echo "\nFinished running commands.\n" 33 | test $test_exit_code -eq 0 34 | ||| 35 | ), 36 | }; 37 | 38 | bertBaseCased.oneshotJob 39 | -------------------------------------------------------------------------------- /seq2seq/transformers/docker/transformers-pytorch-tpu/dataset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: huggingface-cluster-disk 5 | spec: 6 | storageClassName: "" 7 | capacity: 8 | storage: 500Gi 9 | accessModes: 10 | - ReadOnlyMany 11 | claimRef: 12 | namespace: default 13 | name: huggingface-cluster-disk-claim 14 | gcePersistentDisk: 15 | pdName: huggingface-cluster-disk 16 | fsType: ext4 17 | readOnly: true 18 | --- 19 | apiVersion: v1 20 | kind: PersistentVolumeClaim 21 | metadata: 22 | name: huggingface-cluster-disk-claim 23 | spec: 24 | # Specify "" as the storageClassName so it matches the PersistentVolume's StorageClass. 25 | # A nil storageClassName value uses the default StorageClass. 
For details, see 26 | # https://kubernetes.io/docs/concepts/storage/persistent-volumes/#class-1 27 | storageClassName: "" 28 | accessModes: 29 | - ReadOnlyMany 30 | resources: 31 | requests: 32 | storage: 1Ki 33 | -------------------------------------------------------------------------------- /seq2seq/transformers/docker/transformers-pytorch-tpu/docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ~/.bashrc 3 | echo "running docker-entrypoint.sh" 4 | conda activate container 5 | echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS 6 | echo "printed TPU info" 7 | export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}" 8 | exec "$@" 9 | -------------------------------------------------------------------------------- /seq2seq/transformers/docker/transformers-tensorflow-cpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | mkl \ 18 | tensorflow-cpu 19 | 20 | WORKDIR /workspace 21 | COPY . transformers/ 22 | RUN cd transformers/ && \ 23 | python3 -m pip install --no-cache-dir . 24 | 25 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /seq2seq/transformers/docker/transformers-tensorflow-gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 2 | LABEL maintainer="Hugging Face" 3 | LABEL repository="transformers" 4 | 5 | RUN apt update && \ 6 | apt install -y bash \ 7 | build-essential \ 8 | git \ 9 | curl \ 10 | ca-certificates \ 11 | python3 \ 12 | python3-pip && \ 13 | rm -rf /var/lib/apt/lists 14 | 15 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \ 16 | python3 -m pip install --no-cache-dir \ 17 | mkl \ 18 | tensorflow 19 | 20 | WORKDIR /workspace 21 | COPY . transformers/ 22 | RUN cd transformers/ && \ 23 | python3 -m pip install --no-cache-dir . 24 | 25 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /seq2seq/transformers/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
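# For example, with the variables defined above, `make html` simply runs
# `sphinx-build -M html "source" "_build"` through the catch-all rule below.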
18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /seq2seq/transformers/docs/source/_static/css/Calibre-Light.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/docs/source/_static/css/Calibre-Light.ttf -------------------------------------------------------------------------------- /seq2seq/transformers/docs/source/_static/css/Calibre-Medium.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/docs/source/_static/css/Calibre-Medium.otf -------------------------------------------------------------------------------- /seq2seq/transformers/docs/source/_static/css/Calibre-Regular.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/docs/source/_static/css/Calibre-Regular.otf -------------------------------------------------------------------------------- /seq2seq/transformers/docs/source/_static/css/Calibre-Thin.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/docs/source/_static/css/Calibre-Thin.otf -------------------------------------------------------------------------------- /seq2seq/transformers/docs/source/_static/css/code-snippets.css: -------------------------------------------------------------------------------- 1 | 2 | .highlight .c1, .highlight .sd{ 3 | color: #999 4 | } 5 | 6 | .highlight .nn, .highlight .k, .highlight .s1, .highlight .nb, .highlight .bp, .highlight .kc { 7 | color: #FB8D68; 8 | } 9 | 10 | .highlight .kn, .highlight .nv, .highlight .s2, .highlight .ow { 11 | color: #6670FF; 12 | } 13 | 14 | .highlight .gp { 15 | color: #FB8D68; 16 | } -------------------------------------------------------------------------------- /seq2seq/transformers/docs/source/contributing.md: -------------------------------------------------------------------------------- 1 | ../../CONTRIBUTING.md -------------------------------------------------------------------------------- /seq2seq/transformers/docs/source/examples.md: -------------------------------------------------------------------------------- 1 | ../../examples/README.md -------------------------------------------------------------------------------- /seq2seq/transformers/docs/source/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/docs/source/favicon.ico -------------------------------------------------------------------------------- /seq2seq/transformers/docs/source/imgs/local_attention_mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/docs/source/imgs/local_attention_mask.png -------------------------------------------------------------------------------- /seq2seq/transformers/docs/source/imgs/ppl_chunked.gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/docs/source/imgs/ppl_chunked.gif -------------------------------------------------------------------------------- /seq2seq/transformers/docs/source/imgs/ppl_full.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/docs/source/imgs/ppl_full.gif -------------------------------------------------------------------------------- /seq2seq/transformers/docs/source/imgs/ppl_sliding.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/docs/source/imgs/ppl_sliding.gif -------------------------------------------------------------------------------- /seq2seq/transformers/docs/source/imgs/transformers_logo_name.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/docs/source/imgs/transformers_logo_name.png -------------------------------------------------------------------------------- /seq2seq/transformers/docs/source/imgs/warmup_constant_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/docs/source/imgs/warmup_constant_schedule.png -------------------------------------------------------------------------------- /seq2seq/transformers/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png -------------------------------------------------------------------------------- /seq2seq/transformers/docs/source/imgs/warmup_cosine_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/docs/source/imgs/warmup_cosine_schedule.png -------------------------------------------------------------------------------- /seq2seq/transformers/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png -------------------------------------------------------------------------------- /seq2seq/transformers/docs/source/imgs/warmup_linear_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/docs/source/imgs/warmup_linear_schedule.png -------------------------------------------------------------------------------- /seq2seq/transformers/docs/source/main_classes/configuration.rst: 
-------------------------------------------------------------------------------- 1 | .. 2 | Copyright 2020 The HuggingFace Team. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with 5 | the License. You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on 10 | an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the 11 | specific language governing permissions and limitations under the License. 12 | 13 | Configuration 14 | ----------------------------------------------------------------------------------------------------------------------- 15 | 16 | The base class :class:`~transformers.PretrainedConfig` implements the common methods for loading/saving a configuration 17 | either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded 18 | from HuggingFace's AWS S3 repository). 19 | 20 | 21 | PretrainedConfig 22 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 23 | 24 | .. autoclass:: transformers.PretrainedConfig 25 | :members: 26 | -------------------------------------------------------------------------------- /seq2seq/transformers/docs/source/notebooks.md: -------------------------------------------------------------------------------- 1 | ../../notebooks/README.md -------------------------------------------------------------------------------- /seq2seq/transformers/examples/_tests_requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | seqeval 4 | psutil 5 | sacrebleu 6 | rouge-score 7 | tensorflow_datasets 8 | matplotlib 9 | git-python==1.0.3 10 | faiss-cpu 11 | streamlit 12 | elasticsearch 13 | nltk 14 | pandas 15 | datasets >= 1.1.3 16 | fire 17 | pytest 18 | conllu 19 | sentencepiece != 0.1.92 20 | protobuf 21 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/benchmarking/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | # 🤗 Benchmark results 18 | 19 | Here, you can find a list of the different benchmark results created by the community. 20 | 21 | If you would like to list benchmark results on your favorite models of the [model hub](https://huggingface.co/models) here, please open a Pull Request and add it below. 
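For reference, here is a minimal sketch of how entries like those below can be produced with this folder's `run_benchmark.py`. The flag names are taken from `PyTorchBenchmarkArguments` and are assumptions; verify them with `python run_benchmark.py --help` on your installed version:

```bash
# Hypothetical invocation: benchmark inference for bert-base-cased and dump CSVs
python run_benchmark.py \
    --models bert-base-cased \
    --batch_sizes 8 \
    --sequence_lengths 128 512 \
    --inference_time_csv_file inference_time.csv \
    --inference_memory_csv_file inference_memory.csv \
    --env_info_csv_file env.csv \
    --save_to_csv
```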
22 | 23 | | Benchmark description | Results | Environment info | Author | 24 | |:----------|:-------------|:-------------|------:| 25 | | PyTorch Benchmark on inference for `bert-base-cased` |[memory](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/inference_memory.csv) | [env](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/env.csv) | [Patrick von Platen](https://github.com/patrickvonplaten) | 26 | | PyTorch Benchmark on inference for `bert-base-cased` |[time](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/inference_time.csv) | [env](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/env.csv) | [Patrick von Platen](https://github.com/patrickvonplaten) | 27 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/benchmarking/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/examples/benchmarking/requirements.txt -------------------------------------------------------------------------------- /seq2seq/transformers/examples/benchmarking/run_benchmark.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ Benchmarking the library on inference and training """ 17 | 18 | from transformers import HfArgumentParser, PyTorchBenchmark, PyTorchBenchmarkArguments 19 | 20 | 21 | def main(): 22 | parser = HfArgumentParser(PyTorchBenchmarkArguments) 23 | try: 24 | benchmark_args = parser.parse_args_into_dataclasses()[0] 25 | except ValueError as e: 26 | arg_error_msg = "Arg --no_{0} is no longer used, please use --no-{0} instead." 27 | begin_error_msg = " ".join(str(e).split(" ")[:-1]) 28 | full_error_msg = "" 29 | depreciated_args = eval(str(e).split(" ")[-1]) 30 | wrong_args = [] 31 | for arg in depreciated_args: 32 | # arg[2:] removes '--' 33 | if arg[2:] in PyTorchBenchmarkArguments.deprecated_args: 34 | # arg[5:] removes '--no_' 35 | full_error_msg += arg_error_msg.format(arg[5:]) 36 | else: 37 | wrong_args.append(arg) 38 | if len(wrong_args) > 0: 39 | full_error_msg = full_error_msg + begin_error_msg + str(wrong_args) 40 | raise ValueError(full_error_msg) 41 | 42 | benchmark = PyTorchBenchmark(args=benchmark_args) 43 | benchmark.run() 44 | 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved.
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # tests directory-specific settings - this file is run automatically 16 | # by pytest before any tests are run 17 | 18 | import sys 19 | import warnings 20 | from os.path import abspath, dirname, join 21 | 22 | 23 | # allow having multiple repository checkouts and not needing to remember to rerun 24 | # 'pip install -e .[dev]' when switching between checkouts and running tests. 25 | git_repo_path = abspath(join(dirname(dirname(__file__)), "src")) 26 | sys.path.insert(1, git_repo_path) 27 | 28 | # silence FutureWarning warnings in tests since often we can't act on them until 29 | # they become normal warnings - i.e. the tests still need to test the current functionality 30 | warnings.simplefilter(action="ignore", category=FutureWarning) 31 | 32 | 33 | def pytest_addoption(parser): 34 | from transformers.testing_utils import pytest_addoption_shared 35 | 36 | pytest_addoption_shared(parser) 37 | 38 | 39 | def pytest_terminal_summary(terminalreporter): 40 | from transformers.testing_utils import pytest_terminal_summary_main 41 | 42 | make_reports = terminalreporter.config.getoption("--make-reports") 43 | if make_reports: 44 | pytest_terminal_summary_main(terminalreporter, id=make_reports) 45 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/language-modeling/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.1.3 2 | sentencepiece != 0.1.92 3 | protobuf 4 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/legacy/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | # Legacy examples 18 | 19 | This folder contains examples which are not actively maintained (mostly contributed by the community). 20 | 21 | Using these examples together with a recent version of the library usually requires making small (sometimes big) adaptations to get the scripts working.
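A common quick fix is to pin the release an example was written for; a minimal sketch, assuming the target version can be read off the example's own `requirements.txt` (the research projects further below, for instance, pin `transformers == 3.5.1`):

```bash
# Illustrative only: install the older release an unmaintained example targets
pip install transformers==3.5.1
```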
22 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/legacy/pytorch-lightning/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | seqeval 4 | psutil 5 | sacrebleu 6 | rouge-score 7 | tensorflow_datasets 8 | pytorch-lightning==1.0.4 9 | matplotlib 10 | git-python==1.0.3 11 | faiss-cpu 12 | streamlit 13 | elasticsearch 14 | nltk 15 | pandas 16 | datasets >= 1.1.3 17 | fire 18 | pytest 19 | conllu 20 | sentencepiece != 0.1.92 21 | protobuf 22 | ray 23 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/legacy/pytorch-lightning/run_glue.sh: -------------------------------------------------------------------------------- 1 | # Install example requirements 2 | pip install -r ../requirements.txt 3 | 4 | # Download glue data 5 | python3 ../../utils/download_glue_data.py 6 | 7 | export TASK=mrpc 8 | export DATA_DIR=./glue_data/MRPC/ 9 | export MAX_LENGTH=128 10 | export LEARNING_RATE=2e-5 11 | export BERT_MODEL=bert-base-cased 12 | export BATCH_SIZE=32 13 | export NUM_EPOCHS=3 14 | export SEED=2 15 | export OUTPUT_DIR_NAME=mrpc-pl-bert 16 | export CURRENT_DIR=${PWD} 17 | export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME} 18 | 19 | # Make output directory if it doesn't exist 20 | mkdir -p $OUTPUT_DIR 21 | # Add parent directory to python path to access lightning_base.py 22 | export PYTHONPATH="../":"${PYTHONPATH}" 23 | 24 | python3 run_glue.py --gpus 1 --data_dir $DATA_DIR \ 25 | --task $TASK \ 26 | --model_name_or_path $BERT_MODEL \ 27 | --output_dir $OUTPUT_DIR \ 28 | --max_seq_length $MAX_LENGTH \ 29 | --learning_rate $LEARNING_RATE \ 30 | --num_train_epochs $NUM_EPOCHS \ 31 | --train_batch_size $BATCH_SIZE \ 32 | --seed $SEED \ 33 | --do_train \ 34 | --do_predict 35 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/legacy/pytorch-lightning/run_ner.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # for seqeval metrics import 4 | pip install -r ../requirements.txt 5 | 6 | ## The relevant files are currently on a shared Google 7 | ## drive at https://drive.google.com/drive/folders/1kC0I2UGl2ltrluI9NqDjaQJGw5iliw_J 8 | ## Monitor for changes and eventually migrate to nlp dataset 9 | curl -L 'https://drive.google.com/uc?export=download&id=1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P' \ 10 | | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp 11 | curl -L 'https://drive.google.com/uc?export=download&id=1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm' \ 12 | | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp 13 | curl -L 'https://drive.google.com/uc?export=download&id=1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH' \ 14 | | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp 15 | 16 | export MAX_LENGTH=128 17 | export BERT_MODEL=bert-base-multilingual-cased 18 | python3 scripts/preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt 19 | python3 scripts/preprocess.py dev.txt.tmp $BERT_MODEL $MAX_LENGTH > dev.txt 20 | python3 scripts/preprocess.py test.txt.tmp $BERT_MODEL $MAX_LENGTH > test.txt 21 | cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt 22 | export BATCH_SIZE=32 23 | export NUM_EPOCHS=3 24 | export SEED=1 25 | 26 | export OUTPUT_DIR_NAME=germeval-model 27 | export CURRENT_DIR=${PWD} 28 | export 
OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME} 29 | mkdir -p $OUTPUT_DIR 30 | 31 | # Add parent directory to python path to access lightning_base.py 32 | export PYTHONPATH="../":"${PYTHONPATH}" 33 | 34 | python3 run_ner.py --data_dir ./ \ 35 | --labels ./labels.txt \ 36 | --model_name_or_path $BERT_MODEL \ 37 | --output_dir $OUTPUT_DIR \ 38 | --max_seq_length $MAX_LENGTH \ 39 | --num_train_epochs $NUM_EPOCHS \ 40 | --train_batch_size $BATCH_SIZE \ 41 | --seed $SEED \ 42 | --gpus 1 \ 43 | --do_train \ 44 | --do_predict 45 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/legacy/pytorch-lightning/run_pos.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if ! [ -f ./dev.txt ]; then 3 | echo "Download dev dataset...." 4 | curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu' 5 | fi 6 | 7 | if ! [ -f ./test.txt ]; then 8 | echo "Download test dataset...." 9 | curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu' 10 | fi 11 | 12 | if ! [ -f ./train.txt ]; then 13 | echo "Download train dataset...." 14 | curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu' 15 | fi 16 | 17 | export MAX_LENGTH=200 18 | export BERT_MODEL=bert-base-uncased 19 | export OUTPUT_DIR=postagger-model 20 | export BATCH_SIZE=32 21 | export NUM_EPOCHS=3 22 | export SAVE_STEPS=750 23 | export SEED=1 24 | 25 | 26 | # Add parent directory to python path to access lightning_base.py 27 | export PYTHONPATH="../":"${PYTHONPATH}" 28 | 29 | python3 run_ner.py --data_dir ./ \ 30 | --task_type POS \ 31 | --model_name_or_path $BERT_MODEL \ 32 | --output_dir $OUTPUT_DIR \ 33 | --max_seq_length $MAX_LENGTH \ 34 | --num_train_epochs $NUM_EPOCHS \ 35 | --train_batch_size $BATCH_SIZE \ 36 | --seed $SEED \ 37 | --gpus 1 \ 38 | --do_train \ 39 | --do_predict 40 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/legacy/token-classification/run.sh: -------------------------------------------------------------------------------- 1 | ## The relevant files are currently on a shared Google 2 | ## drive at https://drive.google.com/drive/folders/1kC0I2UGl2ltrluI9NqDjaQJGw5iliw_J 3 | ## Monitor for changes and eventually migrate to nlp dataset 4 | curl -L 'https://drive.google.com/uc?export=download&id=1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P' \ 5 | | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp 6 | curl -L 'https://drive.google.com/uc?export=download&id=1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm' \ 7 | | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp 8 | curl -L 'https://drive.google.com/uc?export=download&id=1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH' \ 9 | | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp 10 | 11 | export MAX_LENGTH=128 12 | export BERT_MODEL=bert-base-multilingual-cased 13 | python3 scripts/preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt 14 | python3 scripts/preprocess.py dev.txt.tmp $BERT_MODEL $MAX_LENGTH > dev.txt 15 | python3 scripts/preprocess.py test.txt.tmp $BERT_MODEL $MAX_LENGTH > test.txt 16 | cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt 17 | export OUTPUT_DIR=germeval-model 18 | export BATCH_SIZE=32 19 | export NUM_EPOCHS=3 20 | export SAVE_STEPS=750 21 | export SEED=1 22 | 23 | python3 run_ner.py \ 24 
| --task_type NER \ 25 | --data_dir . \ 26 | --labels ./labels.txt \ 27 | --model_name_or_path $BERT_MODEL \ 28 | --output_dir $OUTPUT_DIR \ 29 | --max_seq_length $MAX_LENGTH \ 30 | --num_train_epochs $NUM_EPOCHS \ 31 | --per_gpu_train_batch_size $BATCH_SIZE \ 32 | --save_steps $SAVE_STEPS \ 33 | --seed $SEED \ 34 | --do_train \ 35 | --do_eval \ 36 | --do_predict 37 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/legacy/token-classification/run_chunk.sh: -------------------------------------------------------------------------------- 1 | if ! [ -f ./dev.txt ]; then 2 | echo "Downloading CONLL2003 dev dataset...." 3 | curl -L -o ./dev.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/valid.txt' 4 | fi 5 | 6 | if ! [ -f ./test.txt ]; then 7 | echo "Downloading CONLL2003 test dataset...." 8 | curl -L -o ./test.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/test.txt' 9 | fi 10 | 11 | if ! [ -f ./train.txt ]; then 12 | echo "Downloading CONLL2003 train dataset...." 13 | curl -L -o ./train.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/train.txt' 14 | fi 15 | 16 | export MAX_LENGTH=200 17 | export BERT_MODEL=bert-base-uncased 18 | export OUTPUT_DIR=chunker-model 19 | export BATCH_SIZE=32 20 | export NUM_EPOCHS=3 21 | export SAVE_STEPS=750 22 | export SEED=1 23 | 24 | python3 run_ner.py \ 25 | --task_type Chunk \ 26 | --data_dir . \ 27 | --model_name_or_path $BERT_MODEL \ 28 | --output_dir $OUTPUT_DIR \ 29 | --max_seq_length $MAX_LENGTH \ 30 | --num_train_epochs $NUM_EPOCHS \ 31 | --per_gpu_train_batch_size $BATCH_SIZE \ 32 | --save_steps $SAVE_STEPS \ 33 | --seed $SEED \ 34 | --do_train \ 35 | --do_eval \ 36 | --do_predict 37 | 38 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/legacy/token-classification/run_pos.sh: -------------------------------------------------------------------------------- 1 | if ! [ -f ./dev.txt ]; then 2 | echo "Download dev dataset...." 3 | curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu' 4 | fi 5 | 6 | if ! [ -f ./test.txt ]; then 7 | echo "Download test dataset...." 8 | curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu' 9 | fi 10 | 11 | if ! [ -f ./train.txt ]; then 12 | echo "Download train dataset...." 13 | curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu' 14 | fi 15 | 16 | export MAX_LENGTH=200 17 | export BERT_MODEL=bert-base-uncased 18 | export OUTPUT_DIR=postagger-model 19 | export BATCH_SIZE=32 20 | export NUM_EPOCHS=3 21 | export SAVE_STEPS=750 22 | export SEED=1 23 | 24 | python3 run_ner.py \ 25 | --task_type POS \ 26 | --data_dir . 
\ 27 | --model_name_or_path $BERT_MODEL \ 28 | --output_dir $OUTPUT_DIR \ 29 | --max_seq_length $MAX_LENGTH \ 30 | --num_train_epochs $NUM_EPOCHS \ 31 | --per_gpu_train_batch_size $BATCH_SIZE \ 32 | --save_steps $SAVE_STEPS \ 33 | --seed $SEED \ 34 | --do_train \ 35 | --do_eval \ 36 | --do_predict 37 | 38 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/legacy/token-classification/scripts/preprocess.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from transformers import AutoTokenizer 4 | 5 | 6 | dataset = sys.argv[1] 7 | model_name_or_path = sys.argv[2] 8 | max_len = int(sys.argv[3]) 9 | 10 | subword_len_counter = 0 11 | 12 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) 13 | max_len -= tokenizer.num_special_tokens_to_add() 14 | 15 | with open(dataset, "rt") as f_p: 16 | for line in f_p: 17 | line = line.rstrip() 18 | 19 | if not line: 20 | print(line) 21 | subword_len_counter = 0 22 | continue 23 | 24 | token = line.split()[0] 25 | 26 | current_subwords_len = len(tokenizer.tokenize(token)) 27 | 28 | # Token contains strange control characters like \x96 or \x95 29 | # Just filter out the complete line 30 | if current_subwords_len == 0: 31 | continue 32 | 33 | if (subword_len_counter + current_subwords_len) > max_len: 34 | print("") 35 | print(line) 36 | subword_len_counter = current_subwords_len 37 | continue 38 | 39 | subword_len_counter += current_subwords_len 40 | 41 | print(line) 42 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/multiple-choice/requirements.txt: -------------------------------------------------------------------------------- 1 | sentencepiece != 0.1.92 2 | protobuf 3 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/question-answering/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.1.3 2 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | # Research projects 18 | 19 | This folder contains various research projects using 🤗 Transformers. They are not maintained and require a specific 20 | version of 🤗 Transformers that is indicated in the requirements file of each folder. Updating them to the most recent version of the library will require some work. 21 | 22 | To use any of them, just run the command 23 | ``` 24 | pip install -r requirements.txt 25 | ``` 26 | inside the folder of your choice. 27 | 28 | If you need help with any of those, contact the author(s), indicated at the top of the `README` of each folder. 29 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/adversarial/README.md: -------------------------------------------------------------------------------- 1 | ## Adversarial evaluation of model performance 2 | 3 | Here is an example of evaluating a model using adversarial evaluation of natural language inference with the Heuristic Analysis for NLI Systems (HANS) dataset [McCoy et al., 2019](https://arxiv.org/abs/1902.01007). The example was graciously provided by [Nafise Sadat Moosavi](https://github.com/ns-moosavi).
4 | 5 | The HANS dataset can be downloaded from [this location](https://github.com/tommccoy1/hans). 6 | 7 | This is an example of using run_hans.py: 8 | 9 | ```bash 10 | export HANS_DIR=path-to-hans 11 | export MODEL_TYPE=type-of-the-model-e.g.-bert-roberta-xlnet-etc 12 | export MODEL_PATH=path-to-the-model-directory-that-is-trained-on-NLI-e.g.-by-using-run_glue.py 13 | 14 | python run_hans.py \ 15 | --task_name hans \ 16 | --model_type $MODEL_TYPE \ 17 | --do_eval \ 18 | --data_dir $HANS_DIR \ 19 | --model_name_or_path $MODEL_PATH \ 20 | --max_seq_length 128 \ 21 | --output_dir $MODEL_PATH 22 | ``` 23 | 24 | This will create the hans_predictions.txt file in MODEL_PATH, which can then be evaluated using hans/evaluate_heur_output.py from the HANS dataset. 25 | 26 | The results of the BERT-base model that is trained on MNLI using batch size 8 and the random seed 42 on the HANS dataset are as follows: 27 | 28 | ```bash 29 | Heuristic entailed results: 30 | lexical_overlap: 0.9702 31 | subsequence: 0.9942 32 | constituent: 0.9962 33 | 34 | Heuristic non-entailed results: 35 | lexical_overlap: 0.199 36 | subsequence: 0.0396 37 | constituent: 0.118 38 | ``` 39 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/adversarial/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 2 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/bert-loses-patience/pabee/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/examples/research_projects/bert-loses-patience/pabee/__init__.py -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/bert-loses-patience/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/bert-loses-patience/test_run_glue_with_pabee.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from unittest.mock import patch 5 | 6 | import run_glue_with_pabee 7 | from transformers.testing_utils import TestCasePlus, require_torch_non_multi_gpu_but_fix_me 8 | 9 | 10 | logging.basicConfig(level=logging.DEBUG) 11 | 12 | logger = logging.getLogger() 13 | 14 | 15 | def get_setup_file(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("-f") 18 | args = parser.parse_args() 19 | return args.f 20 | 21 | 22 | class PabeeTests(TestCasePlus): 23 | @require_torch_non_multi_gpu_but_fix_me 24 | def test_run_glue(self): 25 | stream_handler = logging.StreamHandler(sys.stdout) 26 | logger.addHandler(stream_handler) 27 | 28 | tmp_dir = self.get_auto_remove_tmp_dir() 29 | testargs = f""" 30 | run_glue_with_pabee.py 31 | --model_type albert 32 | --model_name_or_path albert-base-v2 33 | --data_dir ./tests/fixtures/tests_samples/MRPC/ 34 | --output_dir {tmp_dir} 35 | --overwrite_output_dir 36 | --task_name mrpc 37 | --do_train 38 | --do_eval 39 | --per_gpu_train_batch_size=2 40 | --per_gpu_eval_batch_size=1 41 | --learning_rate=2e-5 42 | --max_steps=50 43
--warmup_steps=2 44 | --seed=42 45 | --max_seq_length=128 46 | """.split() 47 | 48 | with patch.object(sys, "argv", testargs): 49 | result = run_glue_with_pabee.main() 50 | for value in result.values(): 51 | self.assertGreaterEqual(value, 0.75) 52 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/bertabs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/examples/research_projects/bertabs/__init__.py -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/bertabs/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 2 | 3 | # For ROUGE 4 | nltk 5 | py-rouge 6 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/bertology/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 2 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/deebert/entropy_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | PATH_TO_DATA=/h/xinji/projects/GLUE 5 | 6 | MODEL_TYPE=bert # bert or roberta 7 | MODEL_SIZE=base # base or large 8 | DATASET=MRPC # SST-2, MRPC, RTE, QNLI, QQP, or MNLI 9 | 10 | MODEL_NAME=${MODEL_TYPE}-${MODEL_SIZE} 11 | if [ $MODEL_TYPE = 'bert' ] 12 | then 13 | MODEL_NAME=${MODEL_NAME}-uncased 14 | fi 15 | 16 | ENTROPIES="0 0.1 0.2 0.3 0.4 0.5 0.6 0.7" 17 | 18 | for ENTROPY in $ENTROPIES; do 19 | python -u run_glue_deebert.py \ 20 | --model_type $MODEL_TYPE \ 21 | --model_name_or_path ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 22 | --task_name $DATASET \ 23 | --do_eval \ 24 | --do_lower_case \ 25 | --data_dir $PATH_TO_DATA/$DATASET \ 26 | --output_dir ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 27 | --plot_data_dir ./results/ \ 28 | --max_seq_length 128 \ 29 | --early_exit_entropy $ENTROPY \ 30 | --eval_highway \ 31 | --overwrite_cache \ 32 | --per_gpu_eval_batch_size=1 33 | done 34 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/deebert/eval_deebert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | PATH_TO_DATA=/h/xinji/projects/GLUE 5 | 6 | MODEL_TYPE=bert # bert or roberta 7 | MODEL_SIZE=base # base or large 8 | DATASET=MRPC # SST-2, MRPC, RTE, QNLI, QQP, or MNLI 9 | 10 | MODEL_NAME=${MODEL_TYPE}-${MODEL_SIZE} 11 | if [ $MODEL_TYPE = 'bert' ] 12 | then 13 | MODEL_NAME=${MODEL_NAME}-uncased 14 | fi 15 | 16 | 17 | python -u run_glue_deebert.py \ 18 | --model_type $MODEL_TYPE \ 19 | --model_name_or_path ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 20 | --task_name $DATASET \ 21 | --do_eval \ 22 | --do_lower_case \ 23 | --data_dir $PATH_TO_DATA/$DATASET \ 24 | --output_dir ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 25 | --plot_data_dir ./results/ \ 26 | --max_seq_length 128 \ 27 | --eval_each_highway \ 28 | --eval_highway \ 29 | --overwrite_cache \ 
30 | --per_gpu_eval_batch_size=1 31 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/deebert/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers == 3.5.1 2 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/deebert/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/examples/research_projects/deebert/src/__init__.py -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/deebert/train_deebert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_VISIBLE_DEVICES=0 3 | 4 | PATH_TO_DATA=/h/xinji/projects/GLUE 5 | 6 | MODEL_TYPE=bert # bert or roberta 7 | MODEL_SIZE=base # base or large 8 | DATASET=MRPC # SST-2, MRPC, RTE, QNLI, QQP, or MNLI 9 | 10 | MODEL_NAME=${MODEL_TYPE}-${MODEL_SIZE} 11 | EPOCHS=10 12 | if [ $MODEL_TYPE = 'bert' ] 13 | then 14 | EPOCHS=3 15 | MODEL_NAME=${MODEL_NAME}-uncased 16 | fi 17 | 18 | 19 | python -u run_glue_deebert.py \ 20 | --model_type $MODEL_TYPE \ 21 | --model_name_or_path $MODEL_NAME \ 22 | --task_name $DATASET \ 23 | --do_train \ 24 | --do_eval \ 25 | --do_lower_case \ 26 | --data_dir $PATH_TO_DATA/$DATASET \ 27 | --max_seq_length 128 \ 28 | --per_gpu_eval_batch_size=1 \ 29 | --per_gpu_train_batch_size=8 \ 30 | --learning_rate 2e-5 \ 31 | --num_train_epochs $EPOCHS \ 32 | --overwrite_output_dir \ 33 | --seed 42 \ 34 | --output_dir ./saved_models/${MODEL_TYPE}-${MODEL_SIZE}/$DATASET/two_stage \ 35 | --plot_data_dir ./results/ \ 36 | --save_steps 0 \ 37 | --overwrite_cache \ 38 | --eval_after_first_stage 39 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/distillation/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | 3 | gitpython==3.0.2 4 | tensorboard>=1.14.0 5 | tensorboardX==1.8 6 | psutil==5.6.6 7 | scipy>=1.4.1 8 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/distillation/training_configs/distilbert-base-cased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 | "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true, 13 | "vocab_size": 28996 14 | } 15 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/distillation/training_configs/distilbert-base-multilingual-cased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 | "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true, 13 | "vocab_size": 119547 14 
| } 15 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/distillation/training_configs/distilbert-base-uncased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 | "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true, 13 | "vocab_size": 30522 14 | } 15 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/distillation/training_configs/distilgpt2.json: -------------------------------------------------------------------------------- 1 | { 2 | "initializer_range": 0.02, 3 | "layer_norm_epsilon": 0.00001, 4 | "n_ctx": 1024, 5 | "n_embd": 768, 6 | "n_head": 12, 7 | "n_layer": 6, 8 | "n_positions": 1024, 9 | "vocab_size": 50257 10 | } -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/distillation/training_configs/distilroberta-base.json: -------------------------------------------------------------------------------- 1 | { 2 | "vocab_size": 50265, 3 | "hidden_size": 768, 4 | "num_hidden_layers": 6, 5 | "num_attention_heads": 12, 6 | "intermediate_size": 3072, 7 | "hidden_act": "gelu", 8 | "hidden_dropout_prob": 0.1, 9 | "attention_probs_dropout_prob": 0.1, 10 | "max_position_embeddings": 514, 11 | "type_vocab_size": 1, 12 | "initializer_range": 0.02, 13 | "layer_norm_eps": 0.00001 14 | } -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/longform-qa/README.md: -------------------------------------------------------------------------------- 1 | # Long Form Question Answering 2 | 3 | Author: @yjernite 4 | 5 | This folder contains the code for the Long Form Question answering [demo](http://35.226.96.115:8080/) as well as methods to train and use a fully end-to-end Long Form Question Answering system using the [🤗transformers](https://github.com/huggingface/transformers) and [🤗datasets](https://github.com/huggingface/datasets) libraries. 6 | 7 | You can use these methods to train your own system by following along the associate [notebook](https://github.com/huggingface/notebooks/blob/master/longform-qa/Long_Form_Question_Answering_with_ELI5_and_Wikipedia.ipynb) or [blog post](https://yjernite.github.io/lfqa.html). 8 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/longform-qa/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.1.3 2 | faiss-cpu 3 | streamlit 4 | elasticsearch 5 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/mm-imdb/README.md: -------------------------------------------------------------------------------- 1 | ## MM-IMDb 2 | 3 | Based on the script [`run_mmimdb.py`](https://github.com/huggingface/transformers/blob/master/examples/contrib/mm-imdb/run_mmimdb.py). 4 | 5 | [MM-IMDb](http://lisi1.unal.edu.co/mmimdb/) is a Multimodal dataset with around 26,000 movies including images, plots and other metadata. 
6 | 7 | ### Training on MM-IMDb 8 | 9 | ``` 10 | python run_mmimdb.py \ 11 | --data_dir /path/to/mmimdb/dataset/ \ 12 | --model_type bert \ 13 | --model_name_or_path bert-base-uncased \ 14 | --output_dir /path/to/save/dir/ \ 15 | --do_train \ 16 | --do_eval \ 17 | --max_seq_len 512 \ 18 | --gradient_accumulation_steps 20 \ 19 | --num_image_embeds 3 \ 20 | --num_train_epochs 100 \ 21 | --patience 5 22 | ``` 23 | 24 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/movement-pruning/emmental/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .configuration_bert_masked import MaskedBertConfig 3 | from .modeling_bert_masked import ( 4 | MaskedBertForMultipleChoice, 5 | MaskedBertForQuestionAnswering, 6 | MaskedBertForSequenceClassification, 7 | MaskedBertForTokenClassification, 8 | MaskedBertModel, 9 | ) 10 | from .modules import * 11 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/movement-pruning/emmental/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .binarizer import MagnitudeBinarizer, ThresholdBinarizer, TopKBinarizer 3 | from .masked_nn import MaskedLinear 4 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/movement-pruning/lxmert/README.md: -------------------------------------------------------------------------------- 1 | # LXMERT DEMO 2 | 3 | 1. make a virtualenv: ``virtualenv venv`` and activate ``source venv/bin/activate`` 4 | 2. install reqs: ``pip install -r ./requirements.txt`` 5 | 3. usage is as shown in demo.ipynb 6 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/movement-pruning/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.4.0 2 | -e git+https://github.com/huggingface/transformers.git@352d5472b0c1dec0f420d606d16747d851b4bda8#egg=transformers 3 | knockknock>=0.1.8.1 4 | h5py>=2.10.0 5 | numpy>=1.18.2 6 | scipy>=1.4.1 7 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/performer/README.md: -------------------------------------------------------------------------------- 1 | # Performer fine-tuning 2 | 3 | Example authors: @TevenLeScao, @Patrickvonplaten 4 | 5 | Paper authors: Krzysztof Choromanski, Valerii Likhosherstov, David Dohan, Xingyou Song, Andreea Gane, Tamas Sarlos, Peter Hawkins, Jared Davis, Afroz Mohiuddin, Lukasz Kaiser, David Belanger, Lucy Colwell, Adrian Weller 6 | 7 | ## Requirements 8 | 9 | `datasets`, `flax` and `jax`. `wandb` integration is built-in if you want to use it. 10 | 11 | ## Examples 12 | 13 | `sanity_script.sh` will launch performer fine-tuning from the bert-base-cased checkpoint on the Simple Wikipedia dataset (a small, easy-language English Wikipedia) from `datasets`. 14 | `full_script.sh` will launch performer fine-tuning from the bert-large-cased checkpoint on the English Wikipedia dataset from `datasets`. 15 | 16 | Here are a few key arguments: 17 | - Remove the `--performer` argument to use a standard Bert model. 18 | 19 | - Add `--reinitialize` to start from a blank model rather than a Bert checkpoint. 
20 | 21 | - You may change the Bert size by passing a different [checkpoint](https://huggingface.co/transformers/pretrained_models.html) to the `--model_name_or_path` argument. 22 | 23 | - Passing your user name to the `--wandb_user_name` argument will trigger weights and biases logging. 24 | 25 | - You can choose a dataset with `--dataset_name` and `--dataset_config`. Our [viewer](https://huggingface.co/datasets/viewer/) will help you find what you need. -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/performer/full_script.sh: -------------------------------------------------------------------------------- 1 | TOKENIZERS_PARALLELISM=true python run_mlm_performer.py --output_dir experiments --dataset_name wikipedia --dataset_config_name 20200501.en --model_name_or_path bert-large-cased --tokenizer_name bert-large-cased --do_train --overwrite_output_dir --per_device_train_batch_size 4 --learning_rate 5e-4 --warmup_steps 100 --num_train_epochs 3 --performer -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/performer/sanity_script.sh: -------------------------------------------------------------------------------- 1 | TOKENIZERS_PARALLELISM=true python run_mlm_performer.py --output_dir experiments --dataset_name wikipedia --dataset_config_name 20200501.simple --model_name_or_path bert-base-cased --tokenizer_name bert-base-cased --do_train --overwrite_output_dir --per_device_train_batch_size 4 --learning_rate 5e-4 --warmup_steps 100 --num_train_epochs 3 --performer -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/pplm/imgs/headfigure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/examples/research_projects/pplm/imgs/headfigure.png -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/pplm/imgs/wooly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/examples/research_projects/pplm/imgs/wooly.png -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/pplm/pplm_classification_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class ClassificationHead(torch.nn.Module): 5 | """Classification Head for transformer encoders""" 6 | 7 | def __init__(self, class_size, embed_size): 8 | super().__init__() 9 | self.class_size = class_size 10 | self.embed_size = embed_size 11 | # self.mlp1 = torch.nn.Linear(embed_size, embed_size) 12 | # self.mlp2 = (torch.nn.Linear(embed_size, class_size)) 13 | self.mlp = torch.nn.Linear(embed_size, class_size) 14 | 15 | def forward(self, hidden_state): 16 | # hidden_state = F.relu(self.mlp1(hidden_state)) 17 | # hidden_state = self.mlp2(hidden_state) 18 | logits = self.mlp(hidden_state) 19 | return logits 20 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/pplm/requirements.txt: 
-------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | seqeval 4 | psutil 5 | sacrebleu 6 | rouge-score 7 | tensorflow_datasets 8 | pytorch-lightning==1.0.4 9 | matplotlib 10 | git-python==1.0.3 11 | faiss-cpu 12 | streamlit 13 | elasticsearch 14 | nltk 15 | pandas 16 | datasets >= 1.1.3 17 | fire 18 | pytest 19 | conllu 20 | sentencepiece != 0.1.92 21 | protobuf 22 | transformers==3.5.1 23 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/rag/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | sys.path.insert(1, os.path.dirname(os.path.realpath(__file__))) 6 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/rag/finetune_rag.sh: -------------------------------------------------------------------------------- 1 | # Add parent directory to python path to access lightning_base.py 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | # A sample finetuning run, you need to specify data_dir, output_dir and model_name_or_path 5 | # run ./examples/rag/finetune_rag.sh --help to see all the possible options 6 | 7 | python examples/rag/finetune_rag.py \ 8 | --data_dir $DATA_DIR \ 9 | --output_dir $OUTPUT_DIR \ 10 | --model_name_or_path $MODEL_NAME_OR_PATH \ 11 | --model_type rag_sequence \ 12 | --fp16 \ 13 | --gpus 8 \ 14 | --profile \ 15 | --do_train \ 16 | --do_predict \ 17 | --n_val -1 \ 18 | --train_batch_size 8 \ 19 | --eval_batch_size 1 \ 20 | --max_source_length 128 \ 21 | --max_target_length 25 \ 22 | --val_max_target_length 25 \ 23 | --test_max_target_length 25 \ 24 | --label_smoothing 0.1 \ 25 | --dropout 0.1 \ 26 | --attention_dropout 0.1 \ 27 | --weight_decay 0.001 \ 28 | --adam_epsilon 1e-08 \ 29 | --max_grad_norm 0.1 \ 30 | --lr_scheduler polynomial \ 31 | --learning_rate 3e-05 \ 32 | --num_train_epochs 100 \ 33 | --warmup_steps 500 \ 34 | --gradient_accumulation_steps 1 \ 35 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/rag/finetune_rag_ray.sh: -------------------------------------------------------------------------------- 1 | # Sample script to finetune RAG using Ray for distributed retrieval. 2 | 3 | # Add parent directory to python path to access lightning_base.py 4 | export PYTHONPATH="../":"${PYTHONPATH}" 5 | 6 | # Start a single-node Ray cluster. 
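# (Assumption: `ray` is installed separately, e.g. `pip install ray`; it is not
# listed in this example's requirements.txt. The head process started below is
# what the retrieval workers attach to when --distributed_retriever ray is passed.)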
7 | ray start --head 8 | 9 | # A sample finetuning run, you need to specify data_dir, output_dir and model_name_or_path 10 | # run ./examples/rag/finetune_rag_ray.sh --help to see all the possible options 11 | 12 | python examples/rag/finetune_rag.py \ 13 | --data_dir $DATA_DIR \ 14 | --output_dir $OUTPUT_DIR \ 15 | --model_name_or_path $MODEL_NAME_OR_PATH \ 16 | --model_type rag_sequence \ 17 | --fp16 \ 18 | --gpus 8 \ 19 | --profile \ 20 | --do_train \ 21 | --do_predict \ 22 | --n_val -1 \ 23 | --train_batch_size 8 \ 24 | --eval_batch_size 1 \ 25 | --max_source_length 128 \ 26 | --max_target_length 25 \ 27 | --val_max_target_length 25 \ 28 | --test_max_target_length 25 \ 29 | --label_smoothing 0.1 \ 30 | --dropout 0.1 \ 31 | --attention_dropout 0.1 \ 32 | --weight_decay 0.001 \ 33 | --adam_epsilon 1e-08 \ 34 | --max_grad_norm 0.1 \ 35 | --lr_scheduler polynomial \ 36 | --learning_rate 3e-05 \ 37 | --num_train_epochs 100 \ 38 | --warmup_steps 500 \ 39 | --gradient_accumulation_steps 1 \ 40 | --distributed_retriever ray \ 41 | --num_retrieval_workers 4 42 | 43 | # Stop the Ray cluster. 44 | ray stop 45 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/rag/parse_dpr_relevance_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script reads DPR retriever training data and parses each datapoint. We save a line per datapoint. 3 | Each line consists of the query followed by a tab-separated list of Wikipedia page titles constituting 4 | positive contexts for a given query. 5 | """ 6 | 7 | import argparse 8 | import json 9 | 10 | from tqdm import tqdm 11 | 12 | 13 | def main(): 14 | parser = argparse.ArgumentParser() 15 | 16 | # Required parameters 17 | parser.add_argument( 18 | "--src_path", 19 | type=str, 20 | default="biencoder-nq-dev.json", 21 | help="Path to raw DPR training data", 22 | ) 23 | parser.add_argument( 24 | "--evaluation_set", 25 | type=str, 26 | help="where to store parsed evaluation_set file", 27 | ) 28 | parser.add_argument( 29 | "--gold_data_path", 30 | type=str, 31 | help="where to store parsed gold_data_path file", 32 | ) 33 | args = parser.parse_args() 34 | 35 | with open(args.src_path, "r") as src_file, open(args.evaluation_set, "w") as eval_file, open( 36 | args.gold_data_path, "w" 37 | ) as gold_file: 38 | dpr_records = json.load(src_file) 39 | for dpr_record in tqdm(dpr_records): 40 | question = dpr_record["question"] 41 | contexts = [context["title"] for context in dpr_record["positive_ctxs"]] 42 | eval_file.write(question + "\n") 43 | gold_file.write("\t".join(contexts) + "\n") 44 | 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/rag/requirements.txt: -------------------------------------------------------------------------------- 1 | faiss-cpu >= 1.6.3 2 | datasets >= 1.0.1 3 | psutil >= 5.7.0 4 | torch >= 1.4.0 5 | transformers 6 | pytorch-lightning==1.0.4 7 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/seq2seq-distillation/distil_marian_enro_teacher.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | export WANDB_PROJECT=dmar 4 | # export MAX_LEN=128 5 | python distillation.py \ 6 | --learning_rate=3e-4 \ 7 | 
--do_train \ 8 | --fp16 \ 9 | --val_check_interval 0.25 \ 10 | --teacher Helsinki-NLP/opus-mt-en-ro \ 11 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \ 12 | --student_decoder_layers 3 --student_encoder_layers 6 \ 13 | --freeze_encoder --freeze_embeds \ 14 | --model_name_or_path IGNORED \ 15 | --alpha_hid=3. \ 16 | --train_batch_size=$BS --eval_batch_size=$BS \ 17 | --tokenizer_name Helsinki-NLP/opus-mt-en-ro \ 18 | --warmup_steps 500 --logger_name wandb \ 19 | --fp16_opt_level O1 --task translation --normalize_hidden --num_sanity_val_steps=0 \ 20 | "$@" 21 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/seq2seq-distillation/distil_marian_no_teacher.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | export WANDB_PROJECT=dmar 4 | python distillation.py \ 5 | --learning_rate=3e-4 \ 6 | --do_train \ 7 | --do_predict \ 8 | --fp16 --no_teacher \ 9 | --val_check_interval 0.25 \ 10 | --data_dir $ENRO_DIR \ 11 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \ 12 | --freeze_encoder --freeze_embeds \ 13 | --train_batch_size=$BS --eval_batch_size=$BS \ 14 | --tokenizer_name $m --model_name_or_path $m \ 15 | --warmup_steps 500 --sortish_sampler --logger_name wandb \ 16 | --gpus 1 --fp16_opt_level=O1 --task translation --num_sanity_val_steps=0 \ 17 | "$@" 18 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/seq2seq-distillation/dynamic_bs_example.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | export WANDB_PROJECT=dmar 4 | export MAX_LEN=128 5 | export m=sshleifer/student_marian_en_ro_6_1 6 | python finetune.py \ 7 | --learning_rate=3e-4 \ 8 | --do_train \ 9 | --fp16 \ 10 | --data_dir wmt_en_ro \ 11 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \ 12 | --freeze_encoder --freeze_embeds \ 13 | --train_batch_size=48 --eval_batch_size=64 \ 14 | --tokenizer_name $m --model_name_or_path $m --num_train_epochs=1 \ 15 | --warmup_steps 500 --logger_name wandb --gpus 1 \ 16 | --fp16_opt_level=O1 --task translation \ 17 | "$@" 18 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/seq2seq-distillation/finetune.sh: -------------------------------------------------------------------------------- 1 | # the proper usage is documented in the README; you need to specify data_dir, output_dir and model_name_or_path 2 | # run ./finetune.sh --help to see all the possible options 3 | python finetune.py \ 4 | --learning_rate=3e-5 \ 5 | --fp16 \ 6 | --gpus 1 \ 7 | --do_train \ 8 | --do_predict \ 9 | --n_val 1000 \ 10 | --val_check_interval 0.1 \ 11 | "$@" 12 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/seq2seq-distillation/finetune_bart_tiny.sh: -------------------------------------------------------------------------------- 1 | # Script for verifying that run_bart_sum can be invoked from its directory 2 | 3 | # Get tiny dataset with cnn_dm format (4 
examples for train, val, test) 4 | wget https://cdn-datasets.huggingface.co/summarization/cnn_tiny.tgz 5 | tar -xzvf cnn_tiny.tgz 6 | rm cnn_tiny.tgz 7 | 8 | export OUTPUT_DIR_NAME=bart_utest_output 9 | export CURRENT_DIR=${PWD} 10 | export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME} 11 | 12 | # Make output directory if it doesn't exist 13 | mkdir -p $OUTPUT_DIR 14 | 15 | # Add parent directory to python path to access lightning_base.py and testing_utils.py 16 | export PYTHONPATH="../":"${PYTHONPATH}" 17 | python finetune.py \ 18 | --data_dir=cnn_tiny/ \ 19 | --model_name_or_path=sshleifer/bart-tiny-random \ 20 | --learning_rate=3e-5 \ 21 | --train_batch_size=2 \ 22 | --eval_batch_size=2 \ 23 | --output_dir=$OUTPUT_DIR \ 24 | --num_train_epochs=1 \ 25 | --gpus=0 \ 26 | --do_train "$@" 27 | 28 | rm -rf cnn_tiny 29 | rm -rf $OUTPUT_DIR 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/seq2seq-distillation/finetune_pegasus_xsum.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | # From appendix C of paper https://arxiv.org/abs/1912.08777 5 | # Set --gradient_accumulation_steps so that effective batch size is 256 (2*128, 4*64, 8*32, 16*16) 6 | python finetune.py \ 7 | --learning_rate=1e-4 \ 8 | --do_train \ 9 | --do_predict \ 10 | --n_val 1000 \ 11 | --val_check_interval 0.25 \ 12 | --max_source_length 512 --max_target_length 56 \ 13 | --freeze_embeds --label_smoothing 0.1 --adafactor --task summarization_xsum \ 14 | "$@" 15 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/seq2seq-distillation/finetune_t5.sh: -------------------------------------------------------------------------------- 1 | # Add parent directory to python path to access lightning_base.py 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | python finetune.py \ 5 | --data_dir=$CNN_DIR \ 6 | --learning_rate=3e-5 \ 7 | --train_batch_size=$BS \ 8 | --eval_batch_size=$BS \ 9 | --output_dir=$OUTPUT_DIR \ 10 | --max_source_length=512 \ 11 | --max_target_length=56 \ 12 | --val_check_interval=0.1 --n_val=200 \ 13 | --do_train --do_predict \ 14 | "$@" 15 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/seq2seq-distillation/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | psutil 4 | sacrebleu 5 | rouge-score 6 | tensorflow_datasets 7 | pytorch-lightning==1.0.4 8 | matplotlib 9 | git-python==1.0.3 10 | faiss-cpu 11 | streamlit 12 | elasticsearch 13 | nltk 14 | pandas 15 | datasets >= 1.1.3 16 | fire 17 | pytest 18 | conllu 19 | sentencepiece != 0.1.92 20 | protobuf 21 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/seq2seq-distillation/sentence_splitter.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from filelock import FileLock 4 | 5 | 6 | try: 7 | import nltk 8 | 9 | NLTK_AVAILABLE = True 10 | except (ImportError, ModuleNotFoundError): 11 | NLTK_AVAILABLE = False 12 | 13 | if NLTK_AVAILABLE: 14 | with FileLock(".lock") as lock: 15 | nltk.download("punkt", quiet=True) 16 | 17 | 18 | def add_newline_to_end_of_each_sentence(x: str) -> str: 19 
| """This was added to get rougeLsum scores matching published rougeL scores for BART and PEGASUS.""" 20 | x = re.sub("<n>", "", x) # remove pegasus newline char 21 | assert NLTK_AVAILABLE, "nltk must be installed to separate newlines between sentences. (pip install nltk)" 22 | return "\n".join(nltk.sent_tokenize(x)) 23 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/seq2seq-distillation/train_distilbart_cnn.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | export BS=32 5 | export GAS=1 6 | 7 | python finetune.py \ 8 | --learning_rate=3e-5 \ 9 | --fp16 \ 10 | --gpus 1 \ 11 | --do_train \ 12 | --do_predict \ 13 | --val_check_interval 0.25 \ 14 | --n_val 500 \ 15 | --num_train_epochs 2 \ 16 | --freeze_encoder --freeze_embeds --data_dir cnn_dm \ 17 | --max_target_length 142 --val_max_target_length=142 \ 18 | --train_batch_size=$BS --eval_batch_size=$BS --gradient_accumulation_steps=$GAS \ 19 | --model_name_or_path sshleifer/student_cnn_12_6 \ 20 | --tokenizer_name facebook/bart-large \ 21 | --warmup_steps 500 \ 22 | --output_dir distilbart-cnn-12-6 \ 23 | "$@" 24 | 25 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/seq2seq-distillation/train_distilbart_xsum.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | python distillation.py \ 4 | --teacher facebook/bart-large-xsum --data_dir xsum \ 5 | --tokenizer_name facebook/bart-large-xsum \ 6 | --student_decoder_layers 6 --student_encoder_layers 12 \ 7 | --freeze_encoder --freeze_embeds \ 8 | --learning_rate=3e-4 \ 9 | --do_train \ 10 | --do_predict \ 11 | --fp16 --fp16_opt_level=O1 \ 12 | --val_check_interval 0.1 --n_val 1000 --eval_beams 2 --length_penalty=0.5 \ 13 | --max_target_length=60 --val_max_target_length=60 --test_max_target_length=100 \ 14 | --model_name_or_path IGNORED \ 15 | --alpha_hid=3. 
\ 16 | --train_batch_size=16 --eval_batch_size=16 --gradient_accumulation_steps=2 \ 17 | --sortish_sampler \ 18 | --num_train_epochs=6 \ 19 | --warmup_steps 500 \ 20 | --output_dir distilbart_xsum_12_6 \ 21 | "$@" 22 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/research_projects/seq2seq-distillation/train_mbart_cc25_enro.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export PYTHONPATH="../":"${PYTHONPATH}" 3 | 4 | python finetune.py \ 5 | --learning_rate=3e-5 \ 6 | --fp16 \ 7 | --do_train \ 8 | --val_check_interval=0.25 \ 9 | --adam_eps 1e-06 \ 10 | --num_train_epochs 6 --src_lang en_XX --tgt_lang ro_RO \ 11 | --data_dir $ENRO_DIR \ 12 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \ 13 | --train_batch_size=$BS --eval_batch_size=$BS \ 14 | --task translation \ 15 | --warmup_steps 500 \ 16 | --freeze_embeds \ 17 | --model_name_or_path=facebook/mbart-large-cc25 \ 18 | "$@" 19 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/seq2seq/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | sys.path.insert(1, os.path.dirname(os.path.realpath(__file__))) 6 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/seq2seq/convert_model_to_fp16.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from typing import Union 17 | 18 | import fire 19 | import torch 20 | from tqdm import tqdm 21 | 22 | 23 | def convert(src_path: str, map_location: str = "cpu", save_path: Union[str, None] = None) -> None: 24 | """Convert a pytorch_model.bin or model.pt file to torch.float16 for faster downloads, less disk space.""" 25 | state_dict = torch.load(src_path, map_location=map_location) 26 | for k, v in tqdm(state_dict.items()): 27 | if not isinstance(v, torch.Tensor): 28 | raise TypeError("FP16 conversion only works on paths that are saved state dicts, like pytorch_model.bin") 29 | state_dict[k] = v.half() 30 | if save_path is None: # overwrite src_path 31 | save_path = src_path 32 | torch.save(state_dict, save_path) 33 | 34 | 35 | if __name__ == "__main__": 36 | fire.Fire(convert) 37 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/seq2seq/finetune.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # the proper usage is documented in the README; you need to specify data_dir, output_dir and model_name_or_path 16 | # run ./finetune.sh --help to see all the possible options 17 | python finetune_trainer.py \ 18 | --learning_rate=3e-5 \ 19 | --fp16 \ 20 | --do_train --do_eval --do_predict \ 21 | --evaluation_strategy steps \ 22 | --predict_with_generate \ 23 | --n_val 1000 \ 24 | "$@" 25 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/seq2seq/finetune_tpu.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | export TPU_NUM_CORES=8 16 | 17 | # the proper usage is documented in the README; you need to specify data_dir, output_dir and model_name_or_path 18 | # run ./finetune_tpu.sh --help to see all the possible options 19 | python xla_spawn.py --num_cores $TPU_NUM_CORES \ 20 | finetune_trainer.py \ 21 | --learning_rate=3e-5 \ 22 | --do_train --do_eval \ 23 | --evaluation_strategy steps \ 24 | --prediction_loss_only \ 25 | --n_val 1000 \ 26 | "$@" 27 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/seq2seq/minify_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
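# A hedged usage sketch: minify() below is exposed through fire.Fire, so the script can
# also be driven from the command line with positional arguments; the directory names in
# this example are hypothetical placeholders, not data shipped with the repo:
#   python minify_dataset.py wmt_en_ro wmt_en_ro_mini 25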
15 | 16 | from pathlib import Path 17 | 18 | import fire 19 | 20 | 21 | def minify(src_dir: str, dest_dir: str, n: int): 22 | """Write first n lines of each file f in src_dir to dest_dir/f """ 23 | src_dir = Path(src_dir) 24 | dest_dir = Path(dest_dir) 25 | dest_dir.mkdir(exist_ok=True) 26 | for path in src_dir.iterdir(): 27 | new = [x.rstrip() for x in list(path.open().readlines())][:n] 28 | dest_path = dest_dir.joinpath(path.name) 29 | print(dest_path) 30 | dest_path.open("w").write("\n".join(new)) 31 | 32 | 33 | if __name__ == "__main__": 34 | fire.Fire(minify) 35 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/seq2seq/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | scikit-learn 3 | seqeval 4 | psutil 5 | sacrebleu 6 | rouge-score 7 | tensorflow_datasets 8 | matplotlib 9 | git-python==1.0.3 10 | faiss-cpu 11 | streamlit 12 | elasticsearch 13 | nltk 14 | pandas 15 | datasets >= 1.1.3 16 | fire 17 | pytest 18 | conllu 19 | sentencepiece != 0.1.92 20 | protobuf 21 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/seq2seq/rouge_cli.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import fire 16 | 17 | from utils import calculate_rouge, save_json 18 | 19 | 20 | def calculate_rouge_path(pred_path, tgt_path, save_path=None, **kwargs): 21 | """Kwargs will be passed to calculate_rouge""" 22 | pred_lns = [x.strip() for x in open(pred_path).readlines()] 23 | tgt_lns = [x.strip() for x in open(tgt_path).readlines()][: len(pred_lns)] 24 | metrics = calculate_rouge(pred_lns, tgt_lns, **kwargs) 25 | if save_path is not None: 26 | save_json(metrics, save_path, indent=None) 27 | return metrics # these print nicely 28 | 29 | 30 | if __name__ == "__main__": 31 | fire.Fire(calculate_rouge_path) 32 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/seq2seq/save_randomly_initialized_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
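# A hedged CLI sketch: fire.Fire at the bottom of this file also exposes
# save_randomly_initialized_version from the shell; this invocation mirrors the
# docstring's programmatic Usage:: example (all names are taken from that example):
#   python save_randomly_initialized_model.py facebook/bart-large-cnn distilbart_random_cnn_6_3 \
#       --encoder_layers=6 --decoder_layers=3 --num_beams=3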
15 | 16 | import fire 17 | 18 | from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer 19 | 20 | 21 | def save_randomly_initialized_version(config_name: str, save_dir: str, **config_kwargs): 22 | """Save a randomly initialized version of a model using a pretrained config. 23 | Args: 24 | config_name: which config to use 25 | save_dir: where to save the resulting model and tokenizer 26 | config_kwargs: Passed to AutoConfig 27 | 28 | Usage:: 29 | save_randomly_initialized_version("facebook/bart-large-cnn", "distilbart_random_cnn_6_3", encoder_layers=6, decoder_layers=3, num_beams=3) 30 | """ 31 | cfg = AutoConfig.from_pretrained(config_name, **config_kwargs) 32 | model = AutoModelForSeq2SeqLM.from_config(cfg) 33 | model.save_pretrained(save_dir) 34 | AutoTokenizer.from_pretrained(config_name).save_pretrained(save_dir) 35 | return model 36 | 37 | 38 | if __name__ == "__main__": 39 | fire.Fire(save_randomly_initialized_version) 40 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/seq2seq/sentence_splitter.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import re 15 | 16 | from filelock import FileLock 17 | 18 | 19 | try: 20 | import nltk 21 | 22 | NLTK_AVAILABLE = True 23 | except (ImportError, ModuleNotFoundError): 24 | NLTK_AVAILABLE = False 25 | 26 | if NLTK_AVAILABLE: 27 | with FileLock(".lock") as lock: 28 | nltk.download("punkt", quiet=True) 29 | 30 | 31 | def add_newline_to_end_of_each_sentence(x: str) -> str: 32 | """This was added to get rougeLsum scores matching published rougeL scores for BART and PEGASUS.""" 33 | x = re.sub("<n>", "", x) # remove pegasus newline char 34 | assert NLTK_AVAILABLE, "nltk must be installed to separate newlines between sentences. 
(pip install nltk)" 35 | return "\n".join(nltk.sent_tokenize(x)) 36 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/seq2seq/test_data/fsmt/build-eval-data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import io 4 | import json 5 | import subprocess 6 | 7 | 8 | pairs = [ 9 | ["en", "ru"], 10 | ["ru", "en"], 11 | ["en", "de"], 12 | ["de", "en"], 13 | ] 14 | 15 | n_objs = 8 16 | 17 | 18 | def get_all_data(pairs, n_objs): 19 | text = {} 20 | for src, tgt in pairs: 21 | pair = f"{src}-{tgt}" 22 | cmd = f"sacrebleu -t wmt19 -l {pair} --echo src".split() 23 | src_lines = subprocess.run(cmd, stdout=subprocess.PIPE).stdout.decode("utf-8").splitlines() 24 | cmd = f"sacrebleu -t wmt19 -l {pair} --echo ref".split() 25 | tgt_lines = subprocess.run(cmd, stdout=subprocess.PIPE).stdout.decode("utf-8").splitlines() 26 | text[pair] = {"src": src_lines[:n_objs], "tgt": tgt_lines[:n_objs]} 27 | return text 28 | 29 | 30 | text = get_all_data(pairs, n_objs) 31 | filename = "./fsmt_val_data.json" 32 | with io.open(filename, "w", encoding="utf-8") as f: 33 | bleu_data = json.dump(text, f, indent=2, ensure_ascii=False) 34 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/seq2seq/test_data/test_data: -------------------------------------------------------------------------------- 1 | seq2seq/test_data -------------------------------------------------------------------------------- /seq2seq/transformers/examples/seq2seq/test_data/wmt_en_ro/train.len: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/examples/seq2seq/test_data/wmt_en_ro/train.len -------------------------------------------------------------------------------- /seq2seq/transformers/examples/seq2seq/test_data/wmt_en_ro/val.len: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/examples/seq2seq/test_data/wmt_en_ro/val.len -------------------------------------------------------------------------------- /seq2seq/transformers/examples/seq2seq/test_tatoeba_conversion.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
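# A hedged note on running this module: the test class below is skipped unless DEFAULT_REPO
# (a local Tatoeba-Challenge checkout expected by convert_marian_tatoeba_to_pytorch) exists,
# and the @slow tests only execute when the RUN_SLOW environment variable is set; an
# illustrative invocation from the transformers root would be:
#   RUN_SLOW=1 pytest examples/seq2seq/test_tatoeba_conversion.py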
14 | 15 | import os 16 | import tempfile 17 | import unittest 18 | 19 | from transformers.file_utils import cached_property 20 | from transformers.models.marian.convert_marian_tatoeba_to_pytorch import DEFAULT_REPO, TatoebaConverter 21 | from transformers.testing_utils import require_torch_non_multi_gpu_but_fix_me, slow 22 | 23 | 24 | @unittest.skipUnless(os.path.exists(DEFAULT_REPO), "Tatoeba directory does not exist.") 25 | class TatoebaConversionTester(unittest.TestCase): 26 | @cached_property 27 | def resolver(self): 28 | tmp_dir = tempfile.mkdtemp() 29 | return TatoebaConverter(save_dir=tmp_dir) 30 | 31 | @slow 32 | @require_torch_non_multi_gpu_but_fix_me 33 | def test_resolver(self): 34 | self.resolver.convert_models(["heb-eng"]) 35 | 36 | @slow 37 | @require_torch_non_multi_gpu_but_fix_me 38 | def test_model_card(self): 39 | content, mmeta = self.resolver.write_model_card("opus-mt-he-en", dry_run=True) 40 | assert mmeta["long_pair"] == "heb-eng" 41 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/seq2seq/train_distil_marian_enro.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | export WANDB_PROJECT=distil-marian 16 | export BS=64 17 | export GAS=1 18 | export m=sshleifer/student_marian_en_ro_6_3 19 | export MAX_LEN=128 20 | python finetune_trainer.py \ 21 | --tokenizer_name $m --model_name_or_path $m \ 22 | --data_dir $ENRO_DIR \ 23 | --output_dir marian_en_ro_6_3 --overwrite_output_dir \ 24 | --learning_rate=3e-4 \ 25 | --warmup_steps 500 --sortish_sampler \ 26 | --fp16 \ 27 | --gradient_accumulation_steps=$GAS \ 28 | --per_device_train_batch_size=$BS --per_device_eval_batch_size=$BS \ 29 | --freeze_encoder --freeze_embeds \ 30 | --num_train_epochs=6 \ 31 | --save_steps 3000 --eval_steps 3000 \ 32 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN \ 33 | --val_max_target_length $MAX_TGT_LEN --test_max_target_length $MAX_TGT_LEN \ 34 | --do_train --do_eval --do_predict \ 35 | --evaluation_strategy steps \ 36 | --predict_with_generate --logging_first_step \ 37 | --task translation --label_smoothing_factor 0.1 \ 38 | "$@" 39 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/seq2seq/train_distil_marian_enro_tpu.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | export WANDB_PROJECT=distil-marian 16 | export BS=64 17 | export m=sshleifer/student_marian_en_ro_6_3 18 | export MAX_LEN=128 19 | export TPU_NUM_CORES=8 20 | 21 | python xla_spawn.py --num_cores $TPU_NUM_CORES \ 22 | finetune_trainer.py \ 23 | --tokenizer_name $m --model_name_or_path $m \ 24 | --data_dir $ENRO_DIR \ 25 | --output_dir marian_en_ro_6_3 --overwrite_output_dir \ 26 | --learning_rate=3e-4 \ 27 | --warmup_steps 500 \ 28 | --per_device_train_batch_size=$BS --per_device_eval_batch_size=$BS \ 29 | --freeze_encoder --freeze_embeds \ 30 | --num_train_epochs=6 \ 31 | --save_steps 500 --eval_steps 500 \ 32 | --logging_first_step --logging_steps 200 \ 33 | --max_source_length $MAX_LEN --max_target_length $MAX_LEN \ 34 | --val_max_target_length $MAX_TGT_LEN --test_max_target_length $MAX_TGT_LEN \ 35 | --do_train --do_eval \ 36 | --evaluation_strategy steps \ 37 | --prediction_loss_only \ 38 | --task translation --label_smoothing_factor 0.1 \ 39 | "$@" 40 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/seq2seq/train_distilbart_cnn.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | export WANDB_PROJECT=distilbart-trainer 16 | export BS=32 17 | export m=sshleifer/student_cnn_12_6 18 | export tok=facebook/bart-large 19 | export MAX_TGT_LEN=142 20 | 21 | python finetune_trainer.py \ 22 | --model_name_or_path $m --tokenizer_name $tok \ 23 | --data_dir cnn_dm \ 24 | --output_dir distilbart-cnn-12-6 --overwrite_output_dir \ 25 | --learning_rate=3e-5 \ 26 | --warmup_steps 500 --sortish_sampler \ 27 | --fp16 \ 28 | --n_val 500 \ 29 | --gradient_accumulation_steps=1 \ 30 | --per_device_train_batch_size=$BS --per_device_eval_batch_size=$BS \ 31 | --freeze_encoder --freeze_embeds \ 32 | --num_train_epochs=2 \ 33 | --save_steps 3000 --eval_steps 3000 \ 34 | --logging_first_step \ 35 | --max_target_length 56 --val_max_target_length $MAX_TGT_LEN --test_max_target_length $MAX_TGT_LEN\ 36 | --do_train --do_eval --do_predict \ 37 | --evaluation_strategy steps \ 38 | --predict_with_generate --sortish_sampler \ 39 | "$@" 40 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/seq2seq/train_mbart_cc25_enro.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. 
All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | python finetune_trainer.py \ 16 | --model_name_or_path=facebook/mbart-large-cc25 \ 17 | --data_dir $ENRO_DIR \ 18 | --output_dir mbart_cc25_enro --overwrite_output_dir \ 19 | --learning_rate=3e-5 \ 20 | --warmup_steps 500 \ 21 | --fp16 \ 22 | --label_smoothing 0.1 \ 23 | --adam_eps 1e-06 \ 24 | --src_lang en_XX --tgt_lang ro_RO \ 25 | --freeze_embeds \ 26 | --per_device_train_batch_size=4 --per_device_eval_batch_size=4 \ 27 | --max_source_length 128 --max_target_length 128 --val_max_target_length 128 --test_max_target_length 128\ 28 | --sortish_sampler \ 29 | --num_train_epochs 6 \ 30 | --save_steps 25000 --eval_steps 25000 --logging_steps 1000 \ 31 | --do_train --do_eval --do_predict \ 32 | --evaluation_strategy steps \ 33 | --predict_with_generate --logging_first_step \ 34 | --task translation \ 35 | "$@" 36 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/text-classification/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets >= 1.1.3 2 | sentencepiece != 0.1.92 3 | protobuf 4 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/text-generation/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | ## Language generation 18 | 19 | Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/master/examples/text-generation/run_generation.py). 20 | 21 | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL, XLNet, CTRL. 22 | A similar script is used for our official demo [Write With Transfomer](https://transformer.huggingface.co), where you 23 | can try out the different models available in the library. 24 | 25 | Example usage: 26 | 27 | ```bash 28 | python run_generation.py \ 29 | --model_type=gpt2 \ 30 | --model_name_or_path=gpt2 31 | ``` 32 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/text-generation/requirements.txt: -------------------------------------------------------------------------------- 1 | sentencepiece != 0.1.92 2 | protobuf 3 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/token-classification/requirements.txt: -------------------------------------------------------------------------------- 1 | seqeval 2 | datasets >= 1.1.3 3 | -------------------------------------------------------------------------------- /seq2seq/transformers/examples/token-classification/run.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | python3 run_ner.py \ 16 | --model_name_or_path bert-base-uncased \ 17 | --dataset_name conll2003 \ 18 | --output_dir /tmp/test-ner \ 19 | --do_train \ 20 | --do_eval 21 | -------------------------------------------------------------------------------- /seq2seq/transformers/model_cards/README.md: -------------------------------------------------------------------------------- 1 | ## 🔥 Model cards now live inside each huggingface.co model repo 🔥 2 | 3 | 4 | For consistency, ease of use and scalability, `README.md` model cards now live directly inside each model repo on the HuggingFace model hub. 5 | 6 | ### How to update a model card 7 | 8 | You can directly update a model card inside any model repo you have **write access** to, i.e.: 9 | - a model under your username namespace 10 | - a model under any organization you are a part of. 11 | 12 | You can either: 13 | - update it, commit and push using your usual git workflow (command line, GUI, etc.) 14 | - or edit it directly from the website's UI. 15 | 16 | **What if you want to create or update a model card for a model you don't have write access to?** 17 | 18 | In that case, given that we don't have a Pull request system yet on huggingface.co (🤯), 19 | you can open an issue here, post the card's content, and tag the model author(s) and/or the Hugging Face team. 20 | 21 | We might implement a more seamless process at some point, so your early feedback is precious! 22 | Please let us know of any suggestion. 23 | 24 | ### What happened to the model cards here? 25 | 26 | We migrated every model card from the repo to its corresponding huggingface.co model repo. Individual commits were preserved, and they link back to the original commit on GitHub. 27 | -------------------------------------------------------------------------------- /seq2seq/transformers/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 119 3 | target-version = ['py35'] 4 | -------------------------------------------------------------------------------- /seq2seq/transformers/scripts/fsmt/tests-to-run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | # these scripts need to be run before any changes to FSMT-related code - they should cover all bases 17 | 18 | CUDA_VISIBLE_DEVICES="" RUN_SLOW=1 pytest --disable-warnings tests/test_tokenization_fsmt.py tests/test_configuration_auto.py tests/test_modeling_fsmt.py examples/seq2seq/test_fsmt_bleu_score.py 19 | RUN_SLOW=1 pytest --disable-warnings tests/test_tokenization_fsmt.py tests/test_configuration_auto.py tests/test_modeling_fsmt.py examples/seq2seq/test_fsmt_bleu_score.py 20 | -------------------------------------------------------------------------------- /seq2seq/transformers/scripts/pegasus/build_test_sample_spm_no_bos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # this script builds a small sample spm file tests/fixtures/test_sentencepiece_no_bos.model, with features needed by pegasus 17 | 18 | # 1. pip install sentencepiece 19 | # 20 | # 2. wget https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt 21 | 22 | # 3. build 23 | import sentencepiece as spm 24 | 25 | # pegasus: 26 | # 1. no bos 27 | # 2. eos_id is 1 28 | # 3. unk_id is 2 29 | # build a sample spm file accordingly 30 | spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=test_sentencepiece_no_bos --bos_id=-1 --unk_id=2 --eos_id=1 --vocab_size=1000') 31 | 32 | # 4. now update the fixture 33 | # mv test_sentencepiece_no_bos.model ../../tests/fixtures/ 34 | -------------------------------------------------------------------------------- /seq2seq/transformers/scripts/tatoeba/upload_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for FILE in converted/*; do 4 | model_name=`basename $FILE` 5 | transformers-cli repo create $model_name -y 6 | git clone https://huggingface.co/Helsinki-NLP/$model_name 7 | mv $FILE/* $model_name/ 8 | cd $model_name 9 | git add . && git commit -m "initial commit" 10 | git push 11 | cd .. 
12 | done 13 | -------------------------------------------------------------------------------- /seq2seq/transformers/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | default_section = FIRSTPARTY 3 | ensure_newline_before_comments = True 4 | force_grid_wrap = 0 5 | include_trailing_comma = True 6 | known_first_party = transformers 7 | known_third_party = 8 | absl 9 | conllu 10 | datasets 11 | elasticsearch 12 | fairseq 13 | faiss-cpu 14 | fastprogress 15 | fire 16 | fugashi 17 | git 18 | h5py 19 | matplotlib 20 | nltk 21 | numpy 22 | packaging 23 | pandas 24 | PIL 25 | psutil 26 | pytest 27 | pytorch_lightning 28 | rouge_score 29 | sacrebleu 30 | seqeval 31 | sklearn 32 | streamlit 33 | tensorboardX 34 | tensorflow 35 | tensorflow_datasets 36 | timeout_decorator 37 | torch 38 | torchtext 39 | torchvision 40 | torch_xla 41 | tqdm 42 | 43 | line_length = 119 44 | lines_after_imports = 2 45 | multi_line_output = 3 46 | use_parentheses = True 47 | 48 | [flake8] 49 | ignore = E203, E501, E741, W503, W605 50 | max-line-length = 119 51 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/benchmark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/src/transformers/benchmark/__init__.py -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/commands/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import ABC, abstractmethod 16 | from argparse import ArgumentParser 17 | 18 | 19 | class BaseTransformersCLICommand(ABC): 20 | @staticmethod 21 | @abstractmethod 22 | def register_subcommand(parser: ArgumentParser): 23 | raise NotImplementedError() 24 | 25 | @abstractmethod 26 | def run(self): 27 | raise NotImplementedError() 28 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/data/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 
9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from .metrics import glue_compute_metrics, xnli_compute_metrics 20 | from .processors import ( 21 | DataProcessor, 22 | InputExample, 23 | InputFeatures, 24 | SingleSentenceClassificationProcessor, 25 | SquadExample, 26 | SquadFeatures, 27 | SquadV1Processor, 28 | SquadV2Processor, 29 | glue_convert_examples_to_features, 30 | glue_output_modes, 31 | glue_processors, 32 | glue_tasks_num_labels, 33 | squad_convert_examples_to_features, 34 | xnli_output_modes, 35 | xnli_processors, 36 | xnli_tasks_num_labels, 37 | ) 38 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from .glue import GlueDataset, GlueDataTrainingArguments 20 | from .language_modeling import ( 21 | LineByLineTextDataset, 22 | LineByLineWithRefDataset, 23 | LineByLineWithSOPTextDataset, 24 | TextDataset, 25 | TextDatasetForNextSentencePrediction, 26 | ) 27 | from .squad import SquadDataset, SquadDataTrainingArguments 28 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/data/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels 20 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features 21 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor 22 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels 23 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/dependency_versions_check.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import sys 15 | 16 | from .dependency_versions_table import deps 17 | from .utils.versions import require_version_core 18 | 19 | 20 | # define which module versions we always want to check at run time 21 | # (usually the ones defined in `install_requires` in setup.py) 22 | # 23 | # order specific notes: 24 | # - tqdm must be checked before tokenizers 25 | 26 | pkgs_to_check_at_runtime = "python tqdm regex sacremoses requests packaging filelock numpy tokenizers".split() 27 | if sys.version_info < (3, 7): 28 | pkgs_to_check_at_runtime.append("dataclasses") 29 | 30 | for pkg in pkgs_to_check_at_runtime: 31 | if pkg in deps: 32 | if pkg == "tokenizers": 33 | # must be loaded here, or else tqdm check may fail 34 | from .file_utils import is_tokenizers_available 35 | 36 | if not is_tokenizers_available(): 37 | continue # not required, check version only if installed 38 | 39 | require_version_core(deps[pkg]) 40 | else: 41 | raise ValueError(f"can't find {pkg} in {deps.keys()}, check dependency_versions_table.py") 42 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/dependency_versions_table.py: -------------------------------------------------------------------------------- 1 | # THIS FILE HAS BEEN AUTOGENERATED. To update: 2 | # 1. modify the `_deps` dict in setup.py 3 | # 2. 
run `make deps_table_update` 4 | deps = { 5 | "black": "black>=20.8b1", 6 | "cookiecutter": "cookiecutter==1.7.2", 7 | "dataclasses": "dataclasses", 8 | "datasets": "datasets", 9 | "faiss-cpu": "faiss-cpu", 10 | "fastapi": "fastapi", 11 | "filelock": "filelock", 12 | "flake8": "flake8>=3.8.3", 13 | "flax": "flax>=0.2.2", 14 | "fugashi": "fugashi>=1.0", 15 | "ipadic": "ipadic>=1.0.0,<2.0", 16 | "isort": "isort>=5.5.4", 17 | "jax": "jax>=0.2.0", 18 | "jaxlib": "jaxlib==0.1.55", 19 | "keras2onnx": "keras2onnx", 20 | "numpy": "numpy", 21 | "onnxconverter-common": "onnxconverter-common", 22 | "onnxruntime-tools": "onnxruntime-tools>=1.4.2", 23 | "onnxruntime": "onnxruntime>=1.4.0", 24 | "packaging": "packaging", 25 | "parameterized": "parameterized", 26 | "protobuf": "protobuf", 27 | "psutil": "psutil", 28 | "pydantic": "pydantic", 29 | "pytest": "pytest", 30 | "pytest-xdist": "pytest-xdist", 31 | "python": "python>=3.6.0", 32 | "recommonmark": "recommonmark", 33 | "regex": "regex!=2019.12.17", 34 | "requests": "requests", 35 | "sacremoses": "sacremoses", 36 | "scikit-learn": "scikit-learn", 37 | "sentencepiece": "sentencepiece==0.1.91", 38 | "sphinx-copybutton": "sphinx-copybutton", 39 | "sphinx-markdown-tables": "sphinx-markdown-tables", 40 | "sphinx-rtd-theme": "sphinx-rtd-theme==0.4.3", 41 | "sphinx": "sphinx==3.2.1", 42 | "starlette": "starlette", 43 | "tensorflow-cpu": "tensorflow-cpu>=2.3", 44 | "tensorflow": "tensorflow>=2.3", 45 | "timeout-decorator": "timeout-decorator", 46 | "tokenizers": "tokenizers==0.9.4", 47 | "torch": "torch>=1.0", 48 | "tqdm": "tqdm>=4.27", 49 | "unidic": "unidic>=1.0.2", 50 | "unidic_lite": "unidic_lite>=1.0.7", 51 | "uvicorn": "uvicorn", 52 | } 53 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/src/transformers/models/__init__.py -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/bart/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
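# A hedged consumer-side sketch: the availability guards below mean the torch and TF
# classes are only importable when the corresponding backend is installed; the
# checkpoint name in this example is illustrative:
#   from transformers import BartTokenizer, BartForConditionalGeneration
#   tok = BartTokenizer.from_pretrained("facebook/bart-large")
#   model = BartForConditionalGeneration.from_pretrained("facebook/bart-large")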
18 | 19 | from ...file_utils import is_tf_available, is_tokenizers_available, is_torch_available 20 | from .configuration_bart import BartConfig 21 | from .tokenization_bart import BartTokenizer 22 | 23 | 24 | if is_tokenizers_available(): 25 | from .tokenization_bart_fast import BartTokenizerFast 26 | 27 | if is_torch_available(): 28 | from .modeling_bart import ( 29 | BART_PRETRAINED_MODEL_ARCHIVE_LIST, 30 | BartForConditionalGeneration, 31 | BartForQuestionAnswering, 32 | BartForSequenceClassification, 33 | BartModel, 34 | BartPretrainedModel, 35 | PretrainedBartModel, 36 | ) 37 | 38 | if is_tf_available(): 39 | from .modeling_tf_bart import TFBartForConditionalGeneration, TFBartModel, TFBartPretrainedModel 40 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/barthez/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_sentencepiece_available, is_tokenizers_available 20 | 21 | 22 | if is_sentencepiece_available(): 23 | from .tokenization_barthez import BarthezTokenizer 24 | 25 | if is_tokenizers_available(): 26 | from .tokenization_barthez_fast import BarthezTokenizerFast 27 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/bert_generation/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from ...file_utils import is_sentencepiece_available, is_torch_available 20 | from .configuration_bert_generation import BertGenerationConfig 21 | 22 | 23 | if is_sentencepiece_available(): 24 | from .tokenization_bert_generation import BertGenerationTokenizer 25 | 26 | if is_torch_available(): 27 | from .modeling_bert_generation import ( 28 | BertGenerationDecoder, 29 | BertGenerationEncoder, 30 | load_tf_weights_in_bert_generation, 31 | ) 32 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/bert_japanese/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from .tokenization_bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer 20 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/bertweet/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from .tokenization_bertweet import BertweetTokenizer 20 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/blenderbot/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 
9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_tf_available, is_torch_available 20 | from .configuration_blenderbot import BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotConfig 21 | from .tokenization_blenderbot import BlenderbotSmallTokenizer, BlenderbotTokenizer 22 | 23 | 24 | if is_torch_available(): 25 | from .modeling_blenderbot import ( 26 | BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST, 27 | BlenderbotForConditionalGeneration, 28 | BlenderbotModel, 29 | ) 30 | 31 | if is_tf_available(): 32 | from .modeling_tf_blenderbot import TFBlenderbotForConditionalGeneration 33 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/camembert/configuration_camembert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ CamemBERT configuration """ 17 | 18 | from ...utils import logging 19 | from ..roberta.configuration_roberta import RobertaConfig 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 25 | "camembert-base": "https://huggingface.co/camembert-base/resolve/main/config.json", 26 | "umberto-commoncrawl-cased-v1": "https://huggingface.co/Musixmatch/umberto-commoncrawl-cased-v1/resolve/main/config.json", 27 | "umberto-wikipedia-uncased-v1": "https://huggingface.co/Musixmatch/umberto-wikipedia-uncased-v1/resolve/main/config.json", 28 | } 29 | 30 | 31 | class CamembertConfig(RobertaConfig): 32 | """ 33 | This class overrides :class:`~transformers.RobertaConfig`. Please check the superclass for the appropriate 34 | documentation alongside usage examples. 35 | """ 36 | 37 | model_type = "camembert" 38 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/ctrl/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 
9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_tf_available, is_torch_available 20 | from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig 21 | from .tokenization_ctrl import CTRLTokenizer 22 | 23 | 24 | if is_torch_available(): 25 | from .modeling_ctrl import ( 26 | CTRL_PRETRAINED_MODEL_ARCHIVE_LIST, 27 | CTRLForSequenceClassification, 28 | CTRLLMHeadModel, 29 | CTRLModel, 30 | CTRLPreTrainedModel, 31 | ) 32 | 33 | if is_tf_available(): 34 | from .modeling_tf_ctrl import ( 35 | TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST, 36 | TFCTRLForSequenceClassification, 37 | TFCTRLLMHeadModel, 38 | TFCTRLModel, 39 | TFCTRLPreTrainedModel, 40 | ) 41 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/deberta/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_torch_available 20 | from .configuration_deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig 21 | from .tokenization_deberta import DebertaTokenizer 22 | 23 | 24 | if is_torch_available(): 25 | from .modeling_deberta import ( 26 | DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, 27 | DebertaForSequenceClassification, 28 | DebertaModel, 29 | DebertaPreTrainedModel, 30 | ) 31 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/dialogpt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/src/transformers/models/dialogpt/__init__.py -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import os 17 | 18 | import torch 19 | 20 | from transformers.file_utils import WEIGHTS_NAME 21 | 22 | 23 | DIALOGPT_MODELS = ["small", "medium", "large"] 24 | 25 | OLD_KEY = "lm_head.decoder.weight" 26 | NEW_KEY = "lm_head.weight" 27 | 28 | 29 | def convert_dialogpt_checkpoint(checkpoint_path: str, pytorch_dump_folder_path: str): 30 | d = torch.load(checkpoint_path) 31 | d[NEW_KEY] = d.pop(OLD_KEY) 32 | os.makedirs(pytorch_dump_folder_path, exist_ok=True) 33 | torch.save(d, os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)) 34 | 35 | 36 | if __name__ == "__main__": 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument("--dialogpt_path", default=".", type=str) 39 | args = parser.parse_args() 40 | for MODEL in DIALOGPT_MODELS: 41 | checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl") 42 | pytorch_dump_folder_path = f"./DialoGPT-{MODEL}" 43 | convert_dialogpt_checkpoint( 44 | checkpoint_path, 45 | pytorch_dump_folder_path, 46 | ) 47 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/encoder_decoder/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_torch_available 20 | from .configuration_encoder_decoder import EncoderDecoderConfig 21 | 22 | 23 | if is_torch_available(): 24 | from .modeling_encoder_decoder import EncoderDecoderModel 25 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/flaubert/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 
9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_tf_available, is_torch_available 20 | from .configuration_flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig 21 | from .tokenization_flaubert import FlaubertTokenizer 22 | 23 | 24 | if is_torch_available(): 25 | from .modeling_flaubert import ( 26 | FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, 27 | FlaubertForMultipleChoice, 28 | FlaubertForQuestionAnswering, 29 | FlaubertForQuestionAnsweringSimple, 30 | FlaubertForSequenceClassification, 31 | FlaubertForTokenClassification, 32 | FlaubertModel, 33 | FlaubertWithLMHeadModel, 34 | ) 35 | 36 | if is_tf_available(): 37 | from .modeling_tf_flaubert import ( 38 | TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, 39 | TFFlaubertForMultipleChoice, 40 | TFFlaubertForQuestionAnsweringSimple, 41 | TFFlaubertForSequenceClassification, 42 | TFFlaubertForTokenClassification, 43 | TFFlaubertModel, 44 | TFFlaubertWithLMHeadModel, 45 | ) 46 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/fsmt/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_torch_available 20 | from .configuration_fsmt import FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP, FSMTConfig 21 | from .tokenization_fsmt import FSMTTokenizer 22 | 23 | 24 | if is_torch_available(): 25 | from .modeling_fsmt import FSMTForConditionalGeneration, FSMTModel, PretrainedFSMTModel 26 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/gpt2/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 
9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_tf_available, is_tokenizers_available, is_torch_available 20 | from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config 21 | from .tokenization_gpt2 import GPT2Tokenizer 22 | 23 | 24 | if is_tokenizers_available(): 25 | from .tokenization_gpt2_fast import GPT2TokenizerFast 26 | 27 | if is_torch_available(): 28 | from .modeling_gpt2 import ( 29 | GPT2_PRETRAINED_MODEL_ARCHIVE_LIST, 30 | GPT2DoubleHeadsModel, 31 | GPT2ForSequenceClassification, 32 | GPT2LMHeadModel, 33 | GPT2Model, 34 | GPT2PreTrainedModel, 35 | load_tf_weights_in_gpt2, 36 | ) 37 | 38 | if is_tf_available(): 39 | from .modeling_tf_gpt2 import ( 40 | TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST, 41 | TFGPT2DoubleHeadsModel, 42 | TFGPT2ForSequenceClassification, 43 | TFGPT2LMHeadModel, 44 | TFGPT2MainLayer, 45 | TFGPT2Model, 46 | TFGPT2PreTrainedModel, 47 | ) 48 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/herbert/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_tokenizers_available 20 | from .tokenization_herbert import HerbertTokenizer 21 | 22 | 23 | if is_tokenizers_available(): 24 | from .tokenization_herbert_fast import HerbertTokenizerFast 25 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/layoutlm/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 
9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_tokenizers_available, is_torch_available 20 | from .configuration_layoutlm import LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP, LayoutLMConfig 21 | from .tokenization_layoutlm import LayoutLMTokenizer 22 | 23 | 24 | if is_tokenizers_available(): 25 | from .tokenization_layoutlm_fast import LayoutLMTokenizerFast 26 | 27 | if is_torch_available(): 28 | from .modeling_layoutlm import ( 29 | LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST, 30 | LayoutLMForMaskedLM, 31 | LayoutLMForTokenClassification, 32 | LayoutLMModel, 33 | ) 34 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/lxmert/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_tf_available, is_tokenizers_available, is_torch_available 20 | from .configuration_lxmert import LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP, LxmertConfig 21 | from .tokenization_lxmert import LxmertTokenizer 22 | 23 | 24 | if is_tokenizers_available(): 25 | from .tokenization_lxmert_fast import LxmertTokenizerFast 26 | 27 | if is_torch_available(): 28 | from .modeling_lxmert import ( 29 | LxmertEncoder, 30 | LxmertForPreTraining, 31 | LxmertForQuestionAnswering, 32 | LxmertModel, 33 | LxmertPreTrainedModel, 34 | LxmertVisualFeatureEncoder, 35 | LxmertXLayer, 36 | ) 37 | 38 | if is_tf_available(): 39 | from .modeling_tf_lxmert import ( 40 | TF_LXMERT_PRETRAINED_MODEL_ARCHIVE_LIST, 41 | TFLxmertForPreTraining, 42 | TFLxmertMainLayer, 43 | TFLxmertModel, 44 | TFLxmertPreTrainedModel, 45 | TFLxmertVisualFeatureEncoder, 46 | ) 47 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/marian/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 
6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_sentencepiece_available, is_tf_available, is_torch_available 20 | from .configuration_marian import MarianConfig 21 | 22 | 23 | if is_sentencepiece_available(): 24 | from .tokenization_marian import MarianTokenizer 25 | 26 | if is_torch_available(): 27 | from .modeling_marian import MarianMTModel 28 | 29 | if is_tf_available(): 30 | from .modeling_tf_marian import TFMarianMTModel 31 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/mbart/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_sentencepiece_available, is_tf_available, is_tokenizers_available, is_torch_available 20 | from .configuration_mbart import MBartConfig 21 | 22 | 23 | if is_sentencepiece_available(): 24 | from .tokenization_mbart import MBartTokenizer 25 | 26 | if is_tokenizers_available(): 27 | from .tokenization_mbart_fast import MBartTokenizerFast 28 | 29 | if is_torch_available(): 30 | from .modeling_mbart import MBartForConditionalGeneration, MBartModel 31 | 32 | if is_tf_available(): 33 | from .modeling_tf_mbart import TFMBartForConditionalGeneration 34 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/mbart/modeling_tf_mbart.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """TF mBART model, originally from fairseq.""" 16 | from ...file_utils import add_start_docstrings 17 | from ...utils import logging 18 | from ..bart.modeling_tf_bart import BART_START_DOCSTRING, TFBartForConditionalGeneration 19 | from .configuration_mbart import MBartConfig 20 | 21 | 22 | _CONFIG_FOR_DOC = "MBartConfig" 23 | 24 | START_DOCSTRING = BART_START_DOCSTRING.replace( 25 | "inherits from :class:`~transformers.TFPreTrainedModel`", 26 | "inherits from :class:`~transformers.TFBartForConditionalGeneration`", 27 | ).replace("BartConfig", _CONFIG_FOR_DOC) 28 | 29 | 30 | logger = logging.get_logger(__name__) 31 | 32 | 33 | @add_start_docstrings("mBART (multilingual BART) model for machine translation", START_DOCSTRING) 34 | class TFMBartForConditionalGeneration(TFBartForConditionalGeneration): 35 | config_class = MBartConfig 36 | # All the code is in src/transformers/models/bart/modeling_tf_bart.py 37 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/mmbt/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_torch_available 20 | from .configuration_mmbt import MMBTConfig 21 | 22 | 23 | if is_torch_available(): 24 | from .modeling_mmbt import MMBTForClassification, MMBTModel, ModalEmbeddings 25 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/mmbt/configuration_mmbt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # Copyright (c) HuggingFace Inc. team. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
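# MMBTConfig (defined below) wraps an existing text-model config rather than
# subclassing it: it copies the wrapped config's __dict__ and adds the modal
# encoder's embedding size on top. A usage sketch; the checkpoint name and
# label count are assumptions:

from transformers import BertConfig, MMBTConfig

text_config = BertConfig.from_pretrained("bert-base-uncased")
# Copy BERT's settings and bolt on the non-text (e.g. image) encoder dimension.
mmbt_config = MMBTConfig(text_config, num_labels=2, modal_hidden_size=2048)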
16 | """ MMBT configuration """ 17 | 18 | from ...utils import logging 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | 24 | class MMBTConfig(object): 25 | """ 26 | This is the configuration class to store the configuration of a :class:`~transformers.MMBTModel`. It is used to 27 | instantiate a MMBT model according to the specified arguments, defining the model architecture. 28 | 29 | Args: 30 | config (:class:`~transformers.PreTrainedConfig`): 31 | Config of the underlying Transformer models. Its values are copied over to use a single config. 32 | num_labels (:obj:`int`, `optional`): 33 | Size of final Linear layer for classification. 34 | modal_hidden_size (:obj:`int`, `optional`, defaults to 2048): 35 | Embedding dimension of the non-text modality encoder. 36 | """ 37 | 38 | def __init__(self, config, num_labels=None, modal_hidden_size=2048): 39 | self.__dict__ = config.__dict__ 40 | self.modal_hidden_size = modal_hidden_size 41 | if num_labels: 42 | self.num_labels = num_labels 43 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/mobilebert/tokenization_mobilebert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # 3 | # Copyright 2020 The HuggingFace Team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """Tokenization classes for MobileBERT.""" 17 | 18 | from ...utils import logging 19 | from ..bert.tokenization_bert import BertTokenizer 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 25 | 26 | PRETRAINED_VOCAB_FILES_MAP = { 27 | "vocab_file": {"mobilebert-uncased": "https://huggingface.co/google/mobilebert-uncased/resolve/main/vocab.txt"} 28 | } 29 | 30 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"mobilebert-uncased": 512} 31 | 32 | 33 | PRETRAINED_INIT_CONFIGURATION = {} 34 | 35 | 36 | class MobileBertTokenizer(BertTokenizer): 37 | r""" 38 | Construct a MobileBERT tokenizer. 39 | 40 | :class:`~transformers.MobileBertTokenizer is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 41 | tokenization: punctuation splitting and wordpiece. 42 | 43 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 44 | parameters. 45 | """ 46 | 47 | vocab_files_names = VOCAB_FILES_NAMES 48 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 49 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 50 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 51 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/mt5/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. 
So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_sentencepiece_available, is_tf_available, is_tokenizers_available, is_torch_available 20 | from .configuration_mt5 import MT5Config 21 | 22 | 23 | if is_sentencepiece_available(): 24 | from ..t5.tokenization_t5 import T5Tokenizer 25 | 26 | MT5Tokenizer = T5Tokenizer 27 | 28 | if is_tokenizers_available(): 29 | from ..t5.tokenization_t5_fast import T5TokenizerFast 30 | 31 | MT5TokenizerFast = T5TokenizerFast 32 | 33 | if is_torch_available(): 34 | from .modeling_mt5 import MT5EncoderModel, MT5ForConditionalGeneration, MT5Model 35 | 36 | if is_tf_available(): 37 | from .modeling_tf_mt5 import TFMT5EncoderModel, TFMT5ForConditionalGeneration, TFMT5Model 38 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/openai/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
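# As elsewhere, the classes below only materialize when their backend is
# installed. A PyTorch sampling sketch for the original GPT; `openai-gpt` is
# the canonical checkpoint, while the prompt and sampling settings are
# illustrative:

from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer

tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
model = OpenAIGPTLMHeadModel.from_pretrained("openai-gpt")
inputs = tokenizer("The weather today is", return_tensors="pt")
# Top-k sampling rather than greedy decoding, for varied continuations.
sample_ids = model.generate(**inputs, do_sample=True, top_k=50, max_length=30)
print(tokenizer.decode(sample_ids[0], skip_special_tokens=True))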
18 | 19 | from ...file_utils import is_tf_available, is_tokenizers_available, is_torch_available 20 | from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig 21 | from .tokenization_openai import OpenAIGPTTokenizer 22 | 23 | 24 | if is_tokenizers_available(): 25 | from .tokenization_openai_fast import OpenAIGPTTokenizerFast 26 | 27 | if is_torch_available(): 28 | from .modeling_openai import ( 29 | OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST, 30 | OpenAIGPTDoubleHeadsModel, 31 | OpenAIGPTForSequenceClassification, 32 | OpenAIGPTLMHeadModel, 33 | OpenAIGPTModel, 34 | OpenAIGPTPreTrainedModel, 35 | load_tf_weights_in_openai_gpt, 36 | ) 37 | 38 | if is_tf_available(): 39 | from .modeling_tf_openai import ( 40 | TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST, 41 | TFOpenAIGPTDoubleHeadsModel, 42 | TFOpenAIGPTForSequenceClassification, 43 | TFOpenAIGPTLMHeadModel, 44 | TFOpenAIGPTMainLayer, 45 | TFOpenAIGPTModel, 46 | TFOpenAIGPTPreTrainedModel, 47 | ) 48 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/pegasus/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_sentencepiece_available, is_tf_available, is_tokenizers_available, is_torch_available 20 | from .configuration_pegasus import PegasusConfig 21 | 22 | 23 | if is_sentencepiece_available(): 24 | from .tokenization_pegasus import PegasusTokenizer 25 | 26 | if is_tokenizers_available(): 27 | from .tokenization_pegasus_fast import PegasusTokenizerFast 28 | 29 | if is_torch_available(): 30 | from .modeling_pegasus import PegasusForConditionalGeneration, PegasusModel 31 | 32 | if is_tf_available(): 33 | from .modeling_tf_pegasus import TFPegasusForConditionalGeneration 34 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/pegasus/modeling_tf_pegasus.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """TF Pegasus model, ported from the fairseq repo.""" 16 | from ...file_utils import add_start_docstrings 17 | from ...utils import logging 18 | from ..bart.modeling_tf_bart import BART_START_DOCSTRING, TFBartForConditionalGeneration 19 | from .configuration_pegasus import PegasusConfig 20 | 21 | 22 | _CONFIG_FOR_DOC = "PegasusConfig" 23 | 24 | START_DOCSTRING = BART_START_DOCSTRING.replace( 25 | "inherits from :class:`~transformers.TFPreTrainedModel`", 26 | "inherits from :class:`~transformers.TFBartForConditionalGeneration`", 27 | ).replace("BartConfig", _CONFIG_FOR_DOC) 28 | 29 | 30 | logger = logging.get_logger(__name__) 31 | 32 | 33 | @add_start_docstrings("Pegasus model for summarization", START_DOCSTRING) 34 | class TFPegasusForConditionalGeneration(TFBartForConditionalGeneration): 35 | _keys_to_ignore_on_load_missing = [ 36 | r"final_logits_bias", 37 | r"model.encoder.embed_positions.weight", 38 | r"model.decoder.embed_positions.weight", 39 | ] 40 | config_class = PegasusConfig 41 | # All the code is in src/transformers/models/bart/modeling_tf_bart.py 42 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/phobert/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from .tokenization_phobert import PhobertTokenizer 20 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/prophetnet/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
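# A pattern worth noting in the two TF modules above (modeling_tf_mbart.py and
# modeling_tf_pegasus.py): they carry no modeling code of their own. Each
# subclasses TFBartForConditionalGeneration, swaps in its own config_class,
# and rewrites the shared docstring. A hypothetical variant built the same
# way; the class name and config choice here are illustrative only:

from transformers import PegasusConfig, TFBartForConditionalGeneration

class TFPegasusLikeVariant(TFBartForConditionalGeneration):
    # All weights, call(), and generate() come from the TF BART implementation;
    # only the config class (and hence the checkpoint wiring) differs.
    config_class = PegasusConfig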
18 | 19 | from ...file_utils import is_torch_available 20 | from .configuration_prophetnet import PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, ProphetNetConfig 21 | from .tokenization_prophetnet import ProphetNetTokenizer 22 | 23 | 24 | if is_torch_available(): 25 | from .modeling_prophetnet import ( 26 | PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST, 27 | ProphetNetDecoder, 28 | ProphetNetEncoder, 29 | ProphetNetForCausalLM, 30 | ProphetNetForConditionalGeneration, 31 | ProphetNetModel, 32 | ProphetNetPreTrainedModel, 33 | ) 34 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/rag/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_torch_available 20 | from .configuration_rag import RagConfig 21 | from .retrieval_rag import RagRetriever 22 | from .tokenization_rag import RagTokenizer 23 | 24 | 25 | if is_torch_available(): 26 | from .modeling_rag import RagModel, RagSequenceForGeneration, RagTokenForGeneration 27 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/reformer/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
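# The Reformer package gates its two tokenizer flavors separately: the slow
# one needs sentencepiece, the fast one needs the Rust tokenizers library.
# A sketch of picking whichever is available at runtime; the alias name is
# an assumption:

from transformers.file_utils import is_sentencepiece_available, is_tokenizers_available

if is_tokenizers_available():
    from transformers import ReformerTokenizerFast as ReformerTok
elif is_sentencepiece_available():
    from transformers import ReformerTokenizer as ReformerTok
else:
    raise ImportError("Reformer tokenization needs either `tokenizers` or `sentencepiece`.")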
18 | 19 | from ...file_utils import is_sentencepiece_available, is_tokenizers_available, is_torch_available 20 | from .configuration_reformer import REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, ReformerConfig 21 | 22 | 23 | if is_sentencepiece_available(): 24 | from .tokenization_reformer import ReformerTokenizer 25 | 26 | if is_tokenizers_available(): 27 | from .tokenization_reformer_fast import ReformerTokenizerFast 28 | 29 | if is_torch_available(): 30 | from .modeling_reformer import ( 31 | REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, 32 | ReformerAttention, 33 | ReformerForMaskedLM, 34 | ReformerForQuestionAnswering, 35 | ReformerForSequenceClassification, 36 | ReformerLayer, 37 | ReformerModel, 38 | ReformerModelWithLMHead, 39 | ) 40 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/retribert/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_tokenizers_available, is_torch_available 20 | from .configuration_retribert import RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RetriBertConfig 21 | from .tokenization_retribert import RetriBertTokenizer 22 | 23 | 24 | if is_tokenizers_available(): 25 | from .tokenization_retribert_fast import RetriBertTokenizerFast 26 | 27 | if is_torch_available(): 28 | from .modeling_retribert import RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST, RetriBertModel, RetriBertPreTrainedModel 29 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/squeezebert/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | from ...file_utils import is_tokenizers_available, is_torch_available 20 | from .configuration_squeezebert import SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, SqueezeBertConfig 21 | from .tokenization_squeezebert import SqueezeBertTokenizer 22 | 23 | 24 | if is_tokenizers_available(): 25 | from .tokenization_squeezebert_fast import SqueezeBertTokenizerFast 26 | 27 | if is_torch_available(): 28 | from .modeling_squeezebert import ( 29 | SQUEEZEBERT_PRETRAINED_MODEL_ARCHIVE_LIST, 30 | SqueezeBertForMaskedLM, 31 | SqueezeBertForMultipleChoice, 32 | SqueezeBertForQuestionAnswering, 33 | SqueezeBertForSequenceClassification, 34 | SqueezeBertForTokenClassification, 35 | SqueezeBertModel, 36 | SqueezeBertModule, 37 | SqueezeBertPreTrainedModel, 38 | ) 39 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/t5/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_sentencepiece_available, is_tf_available, is_tokenizers_available, is_torch_available 20 | from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config 21 | 22 | 23 | if is_sentencepiece_available(): 24 | from .tokenization_t5 import T5Tokenizer 25 | 26 | if is_tokenizers_available(): 27 | from .tokenization_t5_fast import T5TokenizerFast 28 | 29 | if is_torch_available(): 30 | from .modeling_t5 import ( 31 | T5_PRETRAINED_MODEL_ARCHIVE_LIST, 32 | T5EncoderModel, 33 | T5ForConditionalGeneration, 34 | T5Model, 35 | T5PreTrainedModel, 36 | load_tf_weights_in_t5, 37 | ) 38 | 39 | if is_tf_available(): 40 | from .modeling_tf_t5 import ( 41 | TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST, 42 | TFT5EncoderModel, 43 | TFT5ForConditionalGeneration, 44 | TFT5Model, 45 | TFT5PreTrainedModel, 46 | ) 47 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/tapas/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 
9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_torch_available 20 | from .configuration_tapas import TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP, TapasConfig 21 | from .tokenization_tapas import TapasTokenizer 22 | 23 | 24 | if is_torch_available(): 25 | from .modeling_tapas import ( 26 | TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST, 27 | TapasForMaskedLM, 28 | TapasForQuestionAnswering, 29 | TapasForSequenceClassification, 30 | TapasModel, 31 | ) 32 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/transfo_xl/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_tf_available, is_torch_available 20 | from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig 21 | from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer 22 | 23 | 24 | if is_torch_available(): 25 | from .modeling_transfo_xl import ( 26 | TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST, 27 | AdaptiveEmbedding, 28 | TransfoXLForSequenceClassification, 29 | TransfoXLLMHeadModel, 30 | TransfoXLModel, 31 | TransfoXLPreTrainedModel, 32 | load_tf_weights_in_transfo_xl, 33 | ) 34 | 35 | if is_tf_available(): 36 | from .modeling_tf_transfo_xl import ( 37 | TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST, 38 | TFAdaptiveEmbedding, 39 | TFTransfoXLForSequenceClassification, 40 | TFTransfoXLLMHeadModel, 41 | TFTransfoXLMainLayer, 42 | TFTransfoXLModel, 43 | TFTransfoXLPreTrainedModel, 44 | ) 45 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/xlm/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 
9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_tf_available, is_torch_available 20 | from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig 21 | from .tokenization_xlm import XLMTokenizer 22 | 23 | 24 | if is_torch_available(): 25 | from .modeling_xlm import ( 26 | XLM_PRETRAINED_MODEL_ARCHIVE_LIST, 27 | XLMForMultipleChoice, 28 | XLMForQuestionAnswering, 29 | XLMForQuestionAnsweringSimple, 30 | XLMForSequenceClassification, 31 | XLMForTokenClassification, 32 | XLMModel, 33 | XLMPreTrainedModel, 34 | XLMWithLMHeadModel, 35 | ) 36 | 37 | if is_tf_available(): 38 | from .modeling_tf_xlm import ( 39 | TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST, 40 | TFXLMForMultipleChoice, 41 | TFXLMForQuestionAnsweringSimple, 42 | TFXLMForSequenceClassification, 43 | TFXLMForTokenClassification, 44 | TFXLMMainLayer, 45 | TFXLMModel, 46 | TFXLMPreTrainedModel, 47 | TFXLMWithLMHeadModel, 48 | ) 49 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/xlm_prophetnet/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | # Copyright 2020 The HuggingFace Team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | from ...file_utils import is_sentencepiece_available, is_torch_available 20 | from .configuration_xlm_prophetnet import XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMProphetNetConfig 21 | 22 | 23 | if is_sentencepiece_available(): 24 | from .tokenization_xlm_prophetnet import XLMProphetNetTokenizer 25 | 26 | if is_torch_available(): 27 | from .modeling_xlm_prophetnet import ( 28 | XLM_PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST, 29 | XLMProphetNetDecoder, 30 | XLMProphetNetEncoder, 31 | XLMProphetNetForCausalLM, 32 | XLMProphetNetForConditionalGeneration, 33 | XLMProphetNetModel, 34 | ) 35 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ XLM-ProphetNet model configuration """ 16 | 17 | 18 | from ...utils import logging 19 | from ..prophetnet.configuration_prophetnet import ProphetNetConfig 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { 25 | "microsoft/xprophetnet-large-wiki100-cased": "https://huggingface.co/microsoft/xprophetnet-large-wiki100-cased/resolve/main/config.json", 26 | } 27 | 28 | 29 | class XLMProphetNetConfig(ProphetNetConfig): 30 | """ 31 | This class overrides :class:`~transformers.ProphetNetConfig`. Please check the superclass for the appropriate 32 | documentation alongside usage examples. 33 | """ 34 | 35 | model_type = "xlm-prophetnet" 36 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/training_args_seq2seq.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import logging 16 | from dataclasses import dataclass, field 17 | 18 | from .file_utils import add_start_docstrings 19 | from .training_args import TrainingArguments 20 | 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | @dataclass 26 | @add_start_docstrings(TrainingArguments.__doc__) 27 | class Seq2SeqTrainingArguments(TrainingArguments): 28 | """ 29 | sortish_sampler (:obj:`bool`, `optional`, defaults to :obj:`False`): 30 | Whether to use a `sortish sampler` or not. For now this is only possible when the underlying dataset is a 31 | `Seq2SeqDataset`, but it will become generally available in the near future. 32 | 33 | It sorts the inputs according to lengths in order to minimize the padding size, with a bit of randomness for 34 | the training set. 35 | predict_with_generate (:obj:`bool`, `optional`, defaults to :obj:`False`): 36 | Whether to use generate to calculate generative metrics (ROUGE, BLEU).
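        For example, a summarization fine-tune that wants ROUGE computed from real generations
        would typically pass (illustrative values, not defaults):
        ``Seq2SeqTrainingArguments(output_dir="out", predict_with_generate=True, sortish_sampler=True)``.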
37 | """ 38 | 39 | sortish_sampler: bool = field(default=False, metadata={"help": "Whether to use SortishSampler or not."}) 40 | predict_with_generate: bool = field( 41 | default=False, metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."} 42 | ) 43 | -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/src/transformers/utils/__init__.py -------------------------------------------------------------------------------- /seq2seq/transformers/src/transformers/utils/dummy_flax_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 2 | from ..file_utils import requires_flax 3 | 4 | 5 | class FlaxPreTrainedModel: 6 | def __init__(self, *args, **kwargs): 7 | requires_flax(self) 8 | 9 | @classmethod 10 | def from_pretrained(self, *args, **kwargs): 11 | requires_flax(self) 12 | 13 | 14 | FLAX_MODEL_MAPPING = None 15 | 16 | 17 | class FlaxAutoModel: 18 | def __init__(self, *args, **kwargs): 19 | requires_flax(self) 20 | 21 | @classmethod 22 | def from_pretrained(self, *args, **kwargs): 23 | requires_flax(self) 24 | 25 | 26 | class FlaxBertForMaskedLM: 27 | def __init__(self, *args, **kwargs): 28 | requires_flax(self) 29 | 30 | @classmethod 31 | def from_pretrained(self, *args, **kwargs): 32 | requires_flax(self) 33 | 34 | 35 | class FlaxBertModel: 36 | def __init__(self, *args, **kwargs): 37 | requires_flax(self) 38 | 39 | @classmethod 40 | def from_pretrained(self, *args, **kwargs): 41 | requires_flax(self) 42 | 43 | 44 | class FlaxRobertaModel: 45 | def __init__(self, *args, **kwargs): 46 | requires_flax(self) 47 | 48 | @classmethod 49 | def from_pretrained(self, *args, **kwargs): 50 | requires_flax(self) 51 | -------------------------------------------------------------------------------- /seq2seq/transformers/templates/adding_a_new_example_script/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | # How to add a new example script in 🤗 Transformers 18 | 19 | This folder provide a template for adding a new example script implementing a training or inference task with the 20 | models in the 🤗 Transformers library. To use it, you will need to install cookiecutter: 21 | ``` 22 | pip install cookiecutter 23 | ``` 24 | or refer to the installation page of the [cookiecutter documentation](https://cookiecutter.readthedocs.io/). 25 | 26 | You can then run the following command inside the `examples` folder of the transformers repo: 27 | ``` 28 | cookiecutter ../templates/adding_a_new_example_script/ 29 | ``` 30 | and answer the questions asked, which will generate a new folder where you will find a pre-filled template for your 31 | example following the best practices we recommend for them. 32 | 33 | Adjust the way the data is preprocessed, the model is loaded or the Trainer is instantiated then when you're happy, add 34 | a `README.md` in the folder (or complete the existing one if you added a script to an existing folder) telling a user 35 | how to run your script. 36 | 37 | Make a PR to the 🤗 Transformers repo. 
Don't forget to tweet about your new example with a carbon screenshot of how to 38 | run it and tag @huggingface! 39 | -------------------------------------------------------------------------------- /seq2seq/transformers/templates/adding_a_new_example_script/cookiecutter.json: -------------------------------------------------------------------------------- 1 | { 2 | "example_name": "text classification", 3 | "directory_name": "{{cookiecutter.example_name|lower|replace(' ', '-')}}", 4 | "example_shortcut": "{{cookiecutter.directory_name}}", 5 | "model_class": "AutoModel", 6 | "authors": "The HuggingFace Team", 7 | "can_train_from_scratch": ["True", "False"] 8 | } -------------------------------------------------------------------------------- /seq2seq/transformers/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration.json: -------------------------------------------------------------------------------- 1 | { 2 | "modelname": "{{cookiecutter.modelname}}", 3 | "uppercase_modelname": "{{cookiecutter.uppercase_modelname}}", 4 | "lowercase_modelname": "{{cookiecutter.lowercase_modelname}}", 5 | "camelcase_modelname": "{{cookiecutter.camelcase_modelname}}", 6 | "authors": "{{cookiecutter.authors}}", 7 | "checkpoint_identifier": "{{cookiecutter.checkpoint_identifier}}", 8 | "tokenizer_type": "{{cookiecutter.tokenizer_type}}", 9 | "generate_tensorflow_and_pytorch": "{{cookiecutter.generate_tensorflow_and_pytorch}}", 10 | "is_encoder_decoder_model": ["True", "False"] 11 | } 12 | -------------------------------------------------------------------------------- /seq2seq/transformers/templates/adding_a_new_model/cookiecutter.json: -------------------------------------------------------------------------------- 1 | { 2 | "modelname": "BrandNewBERT", 3 | "uppercase_modelname": "BRAND_NEW_BERT", 4 | "lowercase_modelname": "brand_new_bert", 5 | "camelcase_modelname": "BrandNewBert", 6 | "authors": "The HuggingFace Team", 7 | "checkpoint_identifier": "brand-new-bert-base-cased", 8 | "tokenizer_type": ["Based on BERT", "Based on BART", "Standalone"], 9 | "generate_tensorflow_and_pytorch": ["PyTorch & TensorFlow", "PyTorch", "TensorFlow"], 10 | "is_encoder_decoder_model": ["True", "False"] 11 | } 12 | -------------------------------------------------------------------------------- /seq2seq/transformers/templates/adding_a_new_model/tests/encoder-bert-tokenizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "modelname": "Template", 3 | "uppercase_modelname": "TEMPLATE", 4 | "lowercase_modelname": "template", 5 | "camelcase_modelname": "Template", 6 | "authors": "The HuggingFace Team", 7 | "checkpoint_identifier": "brand-new-bert-base-cased", 8 | "tokenizer_type": "Based on BERT", 9 | "generate_tensorflow_and_pytorch": "PyTorch & TensorFlow", 10 | "is_encoder_decoder_model": "False" 11 | } 12 | -------------------------------------------------------------------------------- /seq2seq/transformers/templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "modelname": "TemplatePT", 3 | "uppercase_modelname": "TEMPLATE_PT", 4 | "lowercase_modelname": "template_pt", 5 | "camelcase_modelname": "TemplatePt", 6 | "authors": "The HuggingFace Team", 7 | "checkpoint_identifier": "brand-new-bert-base-cased", 8 | "tokenizer_type": "Based on BERT", 9 | "generate_tensorflow_and_pytorch": "PyTorch", 10 | "is_encoder_decoder_model": "False" 11 
| } 12 | -------------------------------------------------------------------------------- /seq2seq/transformers/templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "modelname": "NewENCDEC", 3 | "uppercase_modelname": "NEW_ENC_DEC", 4 | "lowercase_modelname": "new_enc_dec", 5 | "camelcase_modelname": "NewEncDec", 6 | "authors": "The HuggingFace Team", 7 | "checkpoint_identifier": "new-enc-dec-base", 8 | "tokenizer_type": "Based on BART", 9 | "generate_tensorflow_and_pytorch": "PyTorch", 10 | "is_encoder_decoder_model": "True" 11 | } 12 | -------------------------------------------------------------------------------- /seq2seq/transformers/templates/adding_a_new_model/tests/standalone.json: -------------------------------------------------------------------------------- 1 | { 2 | "modelname": "TemplateBI", 3 | "uppercase_modelname": "TEMPLATE_BI", 4 | "lowercase_modelname": "template_bi", 5 | "camelcase_modelname": "TemplateBi", 6 | "authors": "The HuggingFace Team", 7 | "checkpoint_identifier": "bi-brand-new-bert-base-cased", 8 | "tokenizer_type": "Standalone", 9 | "generate_tensorflow_and_pytorch": "PyTorch & TensorFlow", 10 | "is_encoder_decoder_model": "False" 11 | } 12 | -------------------------------------------------------------------------------- /seq2seq/transformers/templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "modelname": "TemplateTF", 3 | "uppercase_modelname": "TEMPLATE_TF", 4 | "lowercase_modelname": "template_tf", 5 | "camelcase_modelname": "TemplateTf", 6 | "authors": "The HuggingFace Team", 7 | "checkpoint_identifier": "brand-new-bert-base-cased", 8 | "tokenizer_type": "Based on BERT", 9 | "generate_tensorflow_and_pytorch": "TensorFlow", 10 | "is_encoder_decoder_model": "False" 11 | } 12 | -------------------------------------------------------------------------------- /seq2seq/transformers/templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "modelname": "NewTFENCDEC", 3 | "uppercase_modelname": "NEW_TF_ENC_DEC", 4 | "lowercase_modelname": "new_tf_enc_dec", 5 | "camelcase_modelname": "NewTFEncDec", 6 | "authors": "The HuggingFace Team", 7 | "checkpoint_identifier": "new-tf-enc-dec-base", 8 | "tokenizer_type": "Based on BART", 9 | "generate_tensorflow_and_pytorch": "TensorFlow", 10 | "is_encoder_decoder_model": "True" 11 | } 12 | -------------------------------------------------------------------------------- /seq2seq/transformers/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csebuetnlp/xl-sum/afcd803c8d7c98f3be60e9a7ce57fcbc3e729e8c/seq2seq/transformers/tests/__init__.py -------------------------------------------------------------------------------- /seq2seq/transformers/tests/test_activations.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import unittest 16 | 17 | from transformers import is_torch_available 18 | from transformers.testing_utils import require_torch 19 | 20 | 21 | if is_torch_available(): 22 | import torch 23 | 24 | from transformers.activations import _gelu_python, gelu_new, get_activation 25 | 26 | 27 | @require_torch 28 | class TestActivations(unittest.TestCase): 29 | def test_gelu_versions(self): 30 | x = torch.Tensor([-100, -1, -0.1, 0, 0.1, 1.0, 100]) 31 | torch_builtin = get_activation("gelu") 32 | self.assertTrue(torch.eq(_gelu_python(x), torch_builtin(x)).all().item()) 33 | self.assertFalse(torch.eq(_gelu_python(x), gelu_new(x)).all().item()) 34 | 35 | def test_get_activation(self): 36 | get_activation("swish") 37 | get_activation("silu") 38 | get_activation("relu") 39 | get_activation("tanh") 40 | get_activation("gelu_new") 41 | get_activation("gelu_fast") 42 | with self.assertRaises(KeyError): 43 | get_activation("bogus") 44 | with self.assertRaises(KeyError): 45 | get_activation(None) 46 | -------------------------------------------------------------------------------- /seq2seq/transformers/tests/test_activations_tf.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import unittest 16 | 17 | from transformers import is_tf_available 18 | from transformers.testing_utils import require_tf 19 | 20 | 21 | if is_tf_available(): 22 | from transformers.activations_tf import get_tf_activation 23 | 24 | 25 | @require_tf 26 | class TestTFActivations(unittest.TestCase): 27 | def test_get_activation(self): 28 | get_tf_activation("swish") 29 | get_tf_activation("silu") 30 | get_tf_activation("gelu") 31 | get_tf_activation("relu") 32 | get_tf_activation("tanh") 33 | get_tf_activation("gelu_new") 34 | get_tf_activation("gelu_fast") 35 | get_tf_activation("mish") 36 | with self.assertRaises(KeyError): 37 | get_tf_activation("bogus") 38 | with self.assertRaises(KeyError): 39 | get_tf_activation(None) 40 | -------------------------------------------------------------------------------- /seq2seq/transformers/tests/test_cli.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import unittest 17 | from unittest.mock import patch 18 | 19 | from transformers.testing_utils import CaptureStd 20 | 21 | 22 | class CLITest(unittest.TestCase): 23 | @patch("sys.argv", ["fakeprogrampath", "env"]) 24 | def test_cli_env(self): 25 | # test transformers-cli env 26 | import transformers.commands.transformers_cli 27 | 28 | with CaptureStd() as cs: 29 | transformers.commands.transformers_cli.main() 30 | assert "Python version" in cs.out 31 | assert "Platform" in cs.out 32 | assert "Using distributed or parallel set-up in script?" in cs.out 33 | -------------------------------------------------------------------------------- /seq2seq/transformers/tests/test_pipelines_feature_extraction.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import unittest 16 | 17 | from .test_pipelines_common import MonoInputPipelineCommonMixin 18 | 19 | 20 | class FeatureExtractionPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase): 21 | pipeline_task = "feature-extraction" 22 | small_models = [ 23 | "sshleifer/tiny-distilbert-base-cased" 24 | ] # Default model - Models tested without the @slow decorator 25 | large_models = [None] # Models tested with the @slow decorator 26 | mandatory_keys = {} # Keys which should be in the output 27 | -------------------------------------------------------------------------------- /seq2seq/transformers/tests/test_pipelines_sentiment_analysis.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
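# Like the other pipeline test modules in this directory, this file plugs a tiny
# checkpoint into the shared MonoInputPipelineCommonMixin harness, which runs the
# generic single-input checks for the `pipeline_task` declared below. Roughly, the
# behaviour under test is the following sketch (scores depend on the checkpoint):
#
#     from transformers import pipeline
#
#     nlp = pipeline("sentiment-analysis", model="sshleifer/tiny-distilbert-base-uncased-finetuned-sst-2-english")
#     nlp("I love this!")  # -> [{"label": ..., "score": ...}], i.e. the mandatory keys below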
14 | 15 | import unittest 16 | 17 | from .test_pipelines_common import MonoInputPipelineCommonMixin 18 | 19 | 20 | class SentimentAnalysisPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase): 21 | pipeline_task = "sentiment-analysis" 22 | small_models = [ 23 | "sshleifer/tiny-distilbert-base-uncased-finetuned-sst-2-english" 24 | ] # Default model - Models tested without the @slow decorator 25 | large_models = [None] # Models tested with the @slow decorator 26 | mandatory_keys = {"label", "score"} # Keys which should be in the output 27 | -------------------------------------------------------------------------------- /seq2seq/transformers/tests/test_pipelines_text2text_generation.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import unittest 16 | 17 | from .test_pipelines_common import MonoInputPipelineCommonMixin 18 | 19 | 20 | class Text2TextGenerationPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase): 21 | pipeline_task = "text2text-generation" 22 | small_models = ["patrickvonplaten/t5-tiny-random"] # Default model - Models tested without the @slow decorator 23 | large_models = [] # Models tested with the @slow decorator 24 | invalid_inputs = [4, ""] 25 | mandatory_keys = ["generated_text"] 26 | -------------------------------------------------------------------------------- /seq2seq/transformers/tests/test_pipelines_text_generation.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
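# The text-generation pipeline is nondeterministic, so the assertions that follow pin
# down the output structure rather than the generated text. As a sketch of the two
# call signatures exercised below:
#
#     from transformers import pipeline
#
#     nlp = pipeline("text-generation", model="sshleifer/tiny-ctrl")
#     nlp("This is a test")                      # -> [{"generated_text": "..."}]
#     nlp(["This is a test", "Another test"])    # -> one [{"generated_text": "..."}] list per prompt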
14 | 15 | import unittest 16 | 17 | from transformers import pipeline 18 | 19 | from .test_pipelines_common import MonoInputPipelineCommonMixin 20 | 21 | 22 | class TextGenerationPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase): 23 | pipeline_task = "text-generation" 24 | pipeline_running_kwargs = {"prefix": "This is "} 25 | small_models = ["sshleifer/tiny-ctrl"] # Models tested without the @slow decorator 26 | large_models = [] # Models tested with the @slow decorator 27 | 28 | def test_simple_generation(self): 29 | nlp = pipeline(task="text-generation", model=self.small_models[0]) 30 | # text-generation is non-deterministic by nature, we can't fully test the output 31 | 32 | outputs = nlp("This is a test") 33 | 34 | self.assertEqual(len(outputs), 1) 35 | self.assertEqual(list(outputs[0].keys()), ["generated_text"]) 36 | self.assertEqual(type(outputs[0]["generated_text"]), str) 37 | 38 | outputs = nlp(["This is a test", "This is a second test"]) 39 | self.assertEqual(len(outputs[0]), 1) 40 | self.assertEqual(list(outputs[0][0].keys()), ["generated_text"]) 41 | self.assertEqual(type(outputs[0][0]["generated_text"]), str) 42 | self.assertEqual(list(outputs[1][0].keys()), ["generated_text"]) 43 | self.assertEqual(type(outputs[1][0]["generated_text"]), str) 44 | -------------------------------------------------------------------------------- /seq2seq/transformers/tests/test_tokenization_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
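# DistilBERT reuses BERT's WordPiece vocabulary and special tokens, so this module
# subclasses the full BERT tokenization test suite and only re-checks the sequence
# builders. The layouts asserted below are the standard BERT ones:
#
#     single sequence:   [CLS] text [SEP]
#     pair of sequences: [CLS] text [SEP] text_2 [SEP]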
15 | 16 | 17 | from transformers import DistilBertTokenizer, DistilBertTokenizerFast 18 | from transformers.testing_utils import require_tokenizers, slow 19 | 20 | from .test_tokenization_bert import BertTokenizationTest 21 | 22 | 23 | @require_tokenizers 24 | class DistilBertTokenizationTest(BertTokenizationTest): 25 | 26 | tokenizer_class = DistilBertTokenizer 27 | rust_tokenizer_class = DistilBertTokenizerFast 28 | test_rust_tokenizer = True 29 | 30 | @slow 31 | def test_sequence_builders(self): 32 | tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") 33 | 34 | text = tokenizer.encode("sequence builders", add_special_tokens=False) 35 | text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) 36 | 37 | encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) 38 | encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) 39 | 40 | assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] 41 | assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [ 42 | tokenizer.sep_token_id 43 | ] 44 | -------------------------------------------------------------------------------- /seq2seq/transformers/utils/get_modified_files.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # this script reports modified .py files under the desired list of top-level sub-dirs passed as a list of arguments, e.g.: 17 | # python ./utils/get_modified_files.py utils src tests examples 18 | # 19 | # it uses git to find the forking point and which files were modified - i.e. files not under git won't be considered 20 | # since the output of this script is fed into Makefile commands it doesn't print a newline after the results 21 | 22 | import re 23 | import subprocess 24 | import sys 25 | 26 | 27 | fork_point_sha = subprocess.check_output("git merge-base --fork-point master".split()).decode("utf-8") 28 | modified_files = subprocess.check_output(f"git diff --name-only {fork_point_sha}".split()).decode("utf-8").split() 29 | 30 | joined_dirs = "|".join(sys.argv[1:]) 31 | regex = re.compile(fr"^({joined_dirs}).*?\.py$") 32 | 33 | relevant_modified_files = [x for x in modified_files if regex.match(x)] 34 | print(" ".join(relevant_modified_files), end="") 35 | --------------------------------------------------------------------------------
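# A quick, self-contained sanity check of the path-filtering logic in
# `get_modified_files.py` above. The paths are purely illustrative, and no git
# repository is needed since the `git` calls are replaced by a hard-coded list.
import re

joined_dirs = "|".join(["utils", "src", "tests", "examples"])
regex = re.compile(rf"^({joined_dirs}).*?\.py$")

modified_files = [
    "src/transformers/file_utils.py",  # kept: under src/ and ends in .py
    "docs/source/index.rst",           # dropped: wrong top-level dir and extension
    "tests/test_cli.py",               # kept
]
print(" ".join(x for x in modified_files if regex.match(x)), end="")
# prints: src/transformers/file_utils.py tests/test_cli.py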