├── .gitignore ├── README.md ├── few_shot_ppl.py ├── mlm ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── NOTICE ├── README.md ├── main.py ├── mlm-scoring.png ├── obtain_ppl_for_mlm.sh ├── scripts │ ├── librispeech-download-text.sh │ └── librispeech-score.sh ├── setup.py ├── src │ ├── main.py │ ├── mlm.egg-info │ │ ├── PKG-INFO │ │ ├── SOURCES.txt │ │ ├── dependency_links.txt │ │ ├── entry_points.txt │ │ ├── not-zip-safe │ │ ├── requires.txt │ │ └── top_level.txt │ └── mlm │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── batchify.cpython-36.pyc │ │ ├── loaders.cpython-36.pyc │ │ └── scorers.cpython-36.pyc │ │ ├── batchify.py │ │ ├── cmds.py │ │ ├── loaders.py │ │ ├── models │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── bert.cpython-36.pyc │ │ │ └── gpt2.cpython-36.pyc │ │ ├── bert.py │ │ └── gpt2.py │ │ └── scorers.py └── tests │ ├── test_cmds.py │ ├── test_loaders.py │ ├── test_models.py │ └── test_scorers.py ├── obtain_ppl_for_clm.sh ├── plot ├── HKUST.jpg ├── method_illustration.png └── pytorch-logo-dark.png ├── run_few_shot.sh ├── run_language_modelling_clm.py └── transformers ├── __init__.py ├── __pycache__ ├── __init__.cpython-37.pyc ├── activations.cpython-37.pyc ├── configuration_albert.cpython-37.pyc ├── configuration_auto.cpython-37.pyc ├── configuration_bart.cpython-37.pyc ├── configuration_bert.cpython-37.pyc ├── configuration_bert_generation.cpython-37.pyc ├── configuration_blenderbot.cpython-37.pyc ├── configuration_camembert.cpython-37.pyc ├── configuration_ctrl.cpython-37.pyc ├── configuration_deberta.cpython-37.pyc ├── configuration_distilbert.cpython-37.pyc ├── configuration_dpr.cpython-37.pyc ├── configuration_electra.cpython-37.pyc ├── configuration_encoder_decoder.cpython-37.pyc ├── configuration_flaubert.cpython-37.pyc ├── configuration_fsmt.cpython-37.pyc ├── configuration_funnel.cpython-37.pyc ├── configuration_gpt2.cpython-37.pyc ├── configuration_layoutlm.cpython-37.pyc ├── configuration_longformer.cpython-37.pyc ├── configuration_lxmert.cpython-37.pyc ├── configuration_marian.cpython-37.pyc ├── configuration_mbart.cpython-37.pyc ├── configuration_mmbt.cpython-37.pyc ├── configuration_mobilebert.cpython-37.pyc ├── configuration_openai.cpython-37.pyc ├── configuration_pegasus.cpython-37.pyc ├── configuration_prophetnet.cpython-37.pyc ├── configuration_rag.cpython-37.pyc ├── configuration_reformer.cpython-37.pyc ├── configuration_retribert.cpython-37.pyc ├── configuration_roberta.cpython-37.pyc ├── configuration_squeezebert.cpython-37.pyc ├── configuration_t5.cpython-37.pyc ├── configuration_transfo_xl.cpython-37.pyc ├── configuration_utils.cpython-37.pyc ├── configuration_xlm.cpython-37.pyc ├── configuration_xlm_prophetnet.cpython-37.pyc ├── configuration_xlm_roberta.cpython-37.pyc ├── configuration_xlnet.cpython-37.pyc ├── convert_slow_tokenizer.cpython-37.pyc ├── file_utils.cpython-37.pyc ├── generation_beam_search.cpython-37.pyc ├── generation_logits_process.cpython-37.pyc ├── generation_utils.cpython-37.pyc ├── hf_argparser.cpython-37.pyc ├── integrations.cpython-37.pyc ├── modelcard.cpython-37.pyc ├── modeling_albert.cpython-37.pyc ├── modeling_auto.cpython-37.pyc ├── modeling_bart.cpython-37.pyc ├── modeling_bert.cpython-37.pyc ├── modeling_bert_generation.cpython-37.pyc ├── modeling_blenderbot.cpython-37.pyc ├── modeling_camembert.cpython-37.pyc ├── modeling_ctrl.cpython-37.pyc ├── modeling_deberta.cpython-37.pyc ├── modeling_distilbert.cpython-37.pyc ├── 
modeling_dpr.cpython-37.pyc ├── modeling_electra.cpython-37.pyc ├── modeling_encoder_decoder.cpython-37.pyc ├── modeling_flaubert.cpython-37.pyc ├── modeling_fsmt.cpython-37.pyc ├── modeling_funnel.cpython-37.pyc ├── modeling_gpt2.cpython-37.pyc ├── modeling_layoutlm.cpython-37.pyc ├── modeling_longformer.cpython-37.pyc ├── modeling_lxmert.cpython-37.pyc ├── modeling_marian.cpython-37.pyc ├── modeling_mbart.cpython-37.pyc ├── modeling_mmbt.cpython-37.pyc ├── modeling_mobilebert.cpython-37.pyc ├── modeling_openai.cpython-37.pyc ├── modeling_outputs.cpython-37.pyc ├── modeling_pegasus.cpython-37.pyc ├── modeling_prophetnet.cpython-37.pyc ├── modeling_rag.cpython-37.pyc ├── modeling_reformer.cpython-37.pyc ├── modeling_retribert.cpython-37.pyc ├── modeling_roberta.cpython-37.pyc ├── modeling_squeezebert.cpython-37.pyc ├── modeling_t5.cpython-37.pyc ├── modeling_tf_pytorch_utils.cpython-37.pyc ├── modeling_transfo_xl.cpython-37.pyc ├── modeling_transfo_xl_utilities.cpython-37.pyc ├── modeling_utils.cpython-37.pyc ├── modeling_xlm.cpython-37.pyc ├── modeling_xlm_prophetnet.cpython-37.pyc ├── modeling_xlm_roberta.cpython-37.pyc ├── modeling_xlnet.cpython-37.pyc ├── optimization.cpython-37.pyc ├── pipelines.cpython-37.pyc ├── retrieval_rag.cpython-37.pyc ├── tokenization_albert.cpython-37.pyc ├── tokenization_albert_fast.cpython-37.pyc ├── tokenization_auto.cpython-37.pyc ├── tokenization_bart.cpython-37.pyc ├── tokenization_bart_fast.cpython-37.pyc ├── tokenization_bert.cpython-37.pyc ├── tokenization_bert_fast.cpython-37.pyc ├── tokenization_bert_generation.cpython-37.pyc ├── tokenization_bert_japanese.cpython-37.pyc ├── tokenization_bertweet.cpython-37.pyc ├── tokenization_blenderbot.cpython-37.pyc ├── tokenization_camembert.cpython-37.pyc ├── tokenization_camembert_fast.cpython-37.pyc ├── tokenization_ctrl.cpython-37.pyc ├── tokenization_deberta.cpython-37.pyc ├── tokenization_distilbert.cpython-37.pyc ├── tokenization_distilbert_fast.cpython-37.pyc ├── tokenization_dpr.cpython-37.pyc ├── tokenization_dpr_fast.cpython-37.pyc ├── tokenization_electra.cpython-37.pyc ├── tokenization_electra_fast.cpython-37.pyc ├── tokenization_flaubert.cpython-37.pyc ├── tokenization_fsmt.cpython-37.pyc ├── tokenization_funnel.cpython-37.pyc ├── tokenization_funnel_fast.cpython-37.pyc ├── tokenization_gpt2.cpython-37.pyc ├── tokenization_gpt2_fast.cpython-37.pyc ├── tokenization_herbert.cpython-37.pyc ├── tokenization_herbert_fast.cpython-37.pyc ├── tokenization_layoutlm.cpython-37.pyc ├── tokenization_layoutlm_fast.cpython-37.pyc ├── tokenization_longformer.cpython-37.pyc ├── tokenization_longformer_fast.cpython-37.pyc ├── tokenization_lxmert.cpython-37.pyc ├── tokenization_lxmert_fast.cpython-37.pyc ├── tokenization_marian.cpython-37.pyc ├── tokenization_mbart.cpython-37.pyc ├── tokenization_mbart_fast.cpython-37.pyc ├── tokenization_mobilebert.cpython-37.pyc ├── tokenization_mobilebert_fast.cpython-37.pyc ├── tokenization_openai.cpython-37.pyc ├── tokenization_openai_fast.cpython-37.pyc ├── tokenization_pegasus.cpython-37.pyc ├── tokenization_pegasus_fast.cpython-37.pyc ├── tokenization_phobert.cpython-37.pyc ├── tokenization_prophetnet.cpython-37.pyc ├── tokenization_rag.cpython-37.pyc ├── tokenization_reformer.cpython-37.pyc ├── tokenization_reformer_fast.cpython-37.pyc ├── tokenization_retribert.cpython-37.pyc ├── tokenization_retribert_fast.cpython-37.pyc ├── tokenization_roberta.cpython-37.pyc ├── tokenization_roberta_fast.cpython-37.pyc ├── tokenization_squeezebert.cpython-37.pyc ├── 
tokenization_squeezebert_fast.cpython-37.pyc ├── tokenization_t5.cpython-37.pyc ├── tokenization_t5_fast.cpython-37.pyc ├── tokenization_transfo_xl.cpython-37.pyc ├── tokenization_utils.cpython-37.pyc ├── tokenization_utils_base.cpython-37.pyc ├── tokenization_utils_fast.cpython-37.pyc ├── tokenization_xlm.cpython-37.pyc ├── tokenization_xlm_prophetnet.cpython-37.pyc ├── tokenization_xlm_roberta.cpython-37.pyc ├── tokenization_xlm_roberta_fast.cpython-37.pyc ├── tokenization_xlnet.cpython-37.pyc ├── tokenization_xlnet_fast.cpython-37.pyc ├── trainer.cpython-37.pyc ├── trainer_callback.cpython-37.pyc ├── trainer_pt_utils.cpython-37.pyc ├── trainer_utils.cpython-37.pyc ├── training_args.cpython-37.pyc └── training_args_tf.cpython-37.pyc ├── activations.py ├── activations_tf.py ├── benchmark ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── benchmark.cpython-37.pyc │ ├── benchmark_args.cpython-37.pyc │ ├── benchmark_args_utils.cpython-37.pyc │ └── benchmark_utils.cpython-37.pyc ├── benchmark.py ├── benchmark_args.py ├── benchmark_args_tf.py ├── benchmark_args_utils.py ├── benchmark_tf.py └── benchmark_utils.py ├── commands ├── __init__.py ├── add_new_model.py ├── convert.py ├── download.py ├── env.py ├── run.py ├── serving.py ├── train.py ├── transformers_cli.py └── user.py ├── configuration_albert.py ├── configuration_auto.py ├── configuration_bart.py ├── configuration_bert.py ├── configuration_bert_generation.py ├── configuration_blenderbot.py ├── configuration_camembert.py ├── configuration_ctrl.py ├── configuration_deberta.py ├── configuration_distilbert.py ├── configuration_dpr.py ├── configuration_electra.py ├── configuration_encoder_decoder.py ├── configuration_flaubert.py ├── configuration_fsmt.py ├── configuration_funnel.py ├── configuration_gpt2.py ├── configuration_layoutlm.py ├── configuration_longformer.py ├── configuration_lxmert.py ├── configuration_marian.py ├── configuration_mbart.py ├── configuration_mmbt.py ├── configuration_mobilebert.py ├── configuration_openai.py ├── configuration_pegasus.py ├── configuration_prophetnet.py ├── configuration_rag.py ├── configuration_reformer.py ├── configuration_retribert.py ├── configuration_roberta.py ├── configuration_squeezebert.py ├── configuration_t5.py ├── configuration_transfo_xl.py ├── configuration_utils.py ├── configuration_xlm.py ├── configuration_xlm_prophetnet.py ├── configuration_xlm_roberta.py ├── configuration_xlnet.py ├── convert_albert_original_tf_checkpoint_to_pytorch.py ├── convert_bart_original_pytorch_checkpoint_to_pytorch.py ├── convert_bert_original_tf2_checkpoint_to_pytorch.py ├── convert_bert_original_tf_checkpoint_to_pytorch.py ├── convert_bert_pytorch_checkpoint_to_original_tf.py ├── convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py ├── convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py ├── convert_dpr_original_checkpoint_to_pytorch.py ├── convert_electra_original_tf_checkpoint_to_pytorch.py ├── convert_fsmt_original_pytorch_checkpoint_to_pytorch.py ├── convert_funnel_original_tf_checkpoint_to_pytorch.py ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py ├── convert_graph_to_onnx.py ├── convert_longformer_original_pytorch_lightning_to_pytorch.py ├── convert_lxmert_original_tf_checkpoint_to_pytorch.py ├── convert_marian_tatoeba_to_pytorch.py ├── convert_marian_to_pytorch.py ├── convert_mbart_original_checkpoint_to_pytorch.py ├── convert_mobilebert_original_tf_checkpoint_to_pytorch.py ├── convert_openai_original_tf_checkpoint_to_pytorch.py ├── 
convert_pegasus_tf_to_pytorch.py ├── convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py ├── convert_pytorch_checkpoint_to_tf2.py ├── convert_reformer_trax_checkpoint_to_pytorch.py ├── convert_roberta_original_pytorch_checkpoint_to_pytorch.py ├── convert_slow_tokenizer.py ├── convert_slow_tokenizers_checkpoints_to_fast.py ├── convert_t5_original_tf_checkpoint_to_pytorch.py ├── convert_tf_hub_seq_to_seq_bert_to_pytorch.py ├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py ├── data ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ └── data_collator.cpython-37.pyc ├── data_collator.py ├── datasets │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── glue.cpython-37.pyc │ │ ├── language_modeling.cpython-37.pyc │ │ ├── lm.cpython-37.pyc │ │ └── squad.cpython-37.pyc │ ├── glue.py │ ├── language_modeling.py │ ├── lm.py │ └── squad.py ├── metrics │ ├── __init__.py │ ├── __pycache__ │ │ └── __init__.cpython-37.pyc │ └── squad_metrics.py ├── processors │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── factcheck.cpython-37.pyc │ │ ├── glue.cpython-37.pyc │ │ ├── squad.cpython-37.pyc │ │ ├── utils.cpython-37.pyc │ │ └── xnli.cpython-37.pyc │ ├── factcheck.py │ ├── factcheck_old.py │ ├── glue.py │ ├── squad.py │ ├── utils.py │ └── xnli.py └── test_generation_utils.py ├── file_utils.py ├── generation_beam_search.py ├── generation_logits_process.py ├── generation_tf_utils.py ├── generation_utils.py ├── hf_api.py ├── hf_argparser.py ├── integrations.py ├── modelcard.py ├── modeling_albert.py ├── modeling_auto.py ├── modeling_bart.py ├── modeling_bert.py ├── modeling_bert_generation.py ├── modeling_blenderbot.py ├── modeling_camembert.py ├── modeling_ctrl.py ├── modeling_deberta.py ├── modeling_distilbert.py ├── modeling_dpr.py ├── modeling_electra.py ├── modeling_encoder_decoder.py ├── modeling_flaubert.py ├── modeling_flax_auto.py ├── modeling_flax_bert.py ├── modeling_flax_roberta.py ├── modeling_flax_utils.py ├── modeling_fsmt.py ├── modeling_funnel.py ├── modeling_gpt2.py ├── modeling_layoutlm.py ├── modeling_longformer.py ├── modeling_lxmert.py ├── modeling_marian.py ├── modeling_mbart.py ├── modeling_mmbt.py ├── modeling_mobilebert.py ├── modeling_openai.py ├── modeling_outputs.py ├── modeling_pegasus.py ├── modeling_prophetnet.py ├── modeling_rag.py ├── modeling_reformer.py ├── modeling_retribert.py ├── modeling_roberta.py ├── modeling_squeezebert.py ├── modeling_t5.py ├── modeling_tf_albert.py ├── modeling_tf_auto.py ├── modeling_tf_bart.py ├── modeling_tf_bert.py ├── modeling_tf_blenderbot.py ├── modeling_tf_camembert.py ├── modeling_tf_ctrl.py ├── modeling_tf_distilbert.py ├── modeling_tf_dpr.py ├── modeling_tf_electra.py ├── modeling_tf_flaubert.py ├── modeling_tf_funnel.py ├── modeling_tf_gpt2.py ├── modeling_tf_longformer.py ├── modeling_tf_lxmert.py ├── modeling_tf_marian.py ├── modeling_tf_mbart.py ├── modeling_tf_mobilebert.py ├── modeling_tf_openai.py ├── modeling_tf_outputs.py ├── modeling_tf_pegasus.py ├── modeling_tf_pytorch_utils.py ├── modeling_tf_roberta.py ├── modeling_tf_t5.py ├── modeling_tf_transfo_xl.py ├── modeling_tf_transfo_xl_utilities.py ├── modeling_tf_utils.py ├── modeling_tf_xlm.py ├── modeling_tf_xlm_roberta.py ├── modeling_tf_xlnet.py ├── modeling_transfo_xl.py ├── modeling_transfo_xl_utilities.py ├── modeling_utils.py ├── modeling_xlm.py ├── modeling_xlm_prophetnet.py ├── 
modeling_xlm_roberta.py ├── modeling_xlnet.py ├── optimization.py ├── optimization_tf.py ├── pipelines.py ├── retrieval_rag.py ├── testing_utils.py ├── tokenization_albert.py ├── tokenization_albert_fast.py ├── tokenization_auto.py ├── tokenization_bart.py ├── tokenization_bart_fast.py ├── tokenization_bert.py ├── tokenization_bert_fast.py ├── tokenization_bert_generation.py ├── tokenization_bert_japanese.py ├── tokenization_bertweet.py ├── tokenization_blenderbot.py ├── tokenization_camembert.py ├── tokenization_camembert_fast.py ├── tokenization_ctrl.py ├── tokenization_deberta.py ├── tokenization_distilbert.py ├── tokenization_distilbert_fast.py ├── tokenization_dpr.py ├── tokenization_dpr_fast.py ├── tokenization_electra.py ├── tokenization_electra_fast.py ├── tokenization_flaubert.py ├── tokenization_fsmt.py ├── tokenization_funnel.py ├── tokenization_funnel_fast.py ├── tokenization_gpt2.py ├── tokenization_gpt2_fast.py ├── tokenization_herbert.py ├── tokenization_herbert_fast.py ├── tokenization_layoutlm.py ├── tokenization_layoutlm_fast.py ├── tokenization_longformer.py ├── tokenization_longformer_fast.py ├── tokenization_lxmert.py ├── tokenization_lxmert_fast.py ├── tokenization_marian.py ├── tokenization_mbart.py ├── tokenization_mbart_fast.py ├── tokenization_mobilebert.py ├── tokenization_mobilebert_fast.py ├── tokenization_openai.py ├── tokenization_openai_fast.py ├── tokenization_pegasus.py ├── tokenization_pegasus_fast.py ├── tokenization_phobert.py ├── tokenization_prophetnet.py ├── tokenization_rag.py ├── tokenization_reformer.py ├── tokenization_reformer_fast.py ├── tokenization_retribert.py ├── tokenization_retribert_fast.py ├── tokenization_roberta.py ├── tokenization_roberta_fast.py ├── tokenization_squeezebert.py ├── tokenization_squeezebert_fast.py ├── tokenization_t5.py ├── tokenization_t5_fast.py ├── tokenization_transfo_xl.py ├── tokenization_utils.py ├── tokenization_utils_base.py ├── tokenization_utils_fast.py ├── tokenization_xlm.py ├── tokenization_xlm_prophetnet.py ├── tokenization_xlm_roberta.py ├── tokenization_xlm_roberta_fast.py ├── tokenization_xlnet.py ├── tokenization_xlnet_fast.py ├── trainer.py ├── trainer_callback.py ├── trainer_pt_utils.py ├── trainer_tf.py ├── trainer_utils.py ├── training_args.py ├── training_args_tf.py └── utils ├── __init__.py ├── __pycache__ ├── __init__.cpython-37.pyc ├── dummy_flax_objects.cpython-37.pyc ├── dummy_tf_objects.cpython-37.pyc ├── logging.cpython-37.pyc └── sentencepiece_model_pb2.cpython-37.pyc ├── dummy_flax_objects.py ├── dummy_pt_objects.py ├── dummy_sentencepiece_objects.py ├── dummy_tf_objects.py ├── dummy_tokenizers_objects.py ├── hp_naming.py ├── logging.py ├── notebook.py └── sentencepiece_model_pb2.py /.gitignore: -------------------------------------------------------------------------------- 1 | ppl_results/ 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Perplexity-FactChecking 2 | 3 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 4 | 5 | This repository contains the code for our paper: 6 | **Towards Few-Shot Fact-Checking via Perplexity**. *Nayeon Lee\*, Yejin Bang\*, [Andrea Madotto](https://andreamad8.github.io/)*, Madian Khabsa, Pascale Fung, *NAACL2021* [[PDF]](https://www.aclweb.org/anthology/2021.naacl-main.158.pdf) 7 | 8 | 9 | 10 | ## How to run 11 | 12 | #### 1.
Dataset Preparation 13 | 14 | To download the test set with evidence used for the experiments described in the paper, please fill in the request form: https://forms.gle/5key5cTqCu5ZLTnr7 15 | Details of the test set can be found in the paper. 16 | 17 | After downloading, please place the data files under the 'data/' directory. 18 | 19 | #### 2. Obtain Evidence-conditioned Perplexity 20 | Running the scripts below saves files with perplexity scores to the 'ppl_results/' directory. 21 | 22 | **a. Causal Language Model** 23 | 24 | ``` 25 | bash obtain_ppl_for_clm.sh 26 | ``` 27 | 28 | **b. Masked Language Model** 29 | 30 | ``` 31 | bash mlm/obtain_ppl_for_mlm.sh 32 | ``` 33 | 34 | 35 | #### 3. Hyper-parameter search (for the optimal threshold) and performance evaluation 36 | 37 | ``` 38 | bash run_few_shot.sh 39 | ``` 40 | 41 | ## Citation 42 | 43 | If you find this paper and code useful, please cite our paper: 44 | 45 | ``` 46 | @inproceedings{lee-etal-2021-towards, 47 | title = "Towards Few-shot Fact-Checking via Perplexity", 48 | author = "Lee, Nayeon and 49 | Bang, Yejin and 50 | Madotto, Andrea and Khabsa, Madian and 51 | Fung, Pascale", 52 | booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies", 53 | month = jun, 54 | year = "2021", 55 | address = "Online", 56 | publisher = "Association for Computational Linguistics", 57 | url = "https://www.aclweb.org/anthology/2021.naacl-main.158", 58 | pages = "1971--1981" 59 | } 60 | ``` 61 | 62 | ## Acknowledgement 63 | This repository is implemented using the [**Huggingface**](https://github.com/huggingface/transformers) codebase. 64 | For MLM, we utilize code from the [**MLM-scoring GitHub repository**](https://github.com/awslabs/mlm-scoring). 65 | 66 | -------------------------------------------------------------------------------- /mlm/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. -------------------------------------------------------------------------------- /mlm/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can.
Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | 61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. -------------------------------------------------------------------------------- /mlm/NOTICE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-------------------------------------------------------------------------------- /mlm/main.py: -------------------------------------------------------------------------------- 1 | # import mlm 2 | # from mlm.scorers import MLMScorer, MLMScorerPT, LMScorer 3 | # from mlm.models import get_pretrained 4 | # import mxnet as mx 5 | # ctxs = [mx.cpu()] # or, e.g., [mx.gpu(0), mx.gpu(1)] 6 | 7 | # model, vocab, tokenizer = get_pretrained(ctxs, 'bert-base-en-uncased') # bert-large-en-uncased 8 | # scorer = MLMScorer(model, vocab, tokenizer, ctxs) 9 | 10 | # print(scorer.score_sentences(["Hello world!"])) 11 | 12 | # # for line in all_lines: 13 | # # print(scorer.score_sentences(["Hello world!"])) 14 | # # # >> [-12.410664200782776] 15 | # # # print(scorer.score_sentences(["Hello world!"], per_token=True)) 16 | from mlm.scorers import MLMScorer, MLMScorerPT, LMScorer 17 | from mlm.models import get_pretrained 18 | import mxnet as mx 19 | import numpy as np 20 | import argparse 21 | import jsonlines 22 | from tqdm import tqdm 23 | 24 | def fever_data_cleaning(sent): 25 | sent = sent.replace('-LRB-', '(') 26 | sent = sent.replace('-RRB-', ')') 27 | sent = sent.replace('-LSB-', '[') 28 | sent = sent.replace('-RSB-', ']') 29 | return sent 30 | 31 | def prepare_data(file_path): 32 | test_set = [] 33 | with jsonlines.open(file_path) as reader: 34 | objs = [obj for obj in reader] 35 | 36 | for obj in objs: 37 | claim = obj['claim'].lower().strip() 38 | evs_line = fever_data_cleaning(obj['evidences'][0][0]).lower().strip() 39 | test_sent = " ".join([evs_line, claim]) 40 | test_set.append(test_sent) 41 | return test_set 42 | 43 | models_mapping = { 44 | 'bert-large': 'bert-large-en-uncased', 45 | 'bert-base':'bert-base-en-uncased' 46 | } 47 | if __name__ == "__main__": 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument( 50 | "--train_data_file", default=None, type=str, required=False, help="" 51 | ) 52 | parser.add_argument( 53 | "--output_eval_file", default=None, type=str, required=False, help="" 54 | ) 55 | parser.add_argument( 56 | "--model_name", default=None, type=str, required=False, help="" 57 | ) 58 | args = parser.parse_args() 59 | 60 | modelName = models_mapping[args.model_name] 61 | ctxs = [mx.cpu()] # or, e.g., [mx.gpu(0), mx.gpu(1)] 62 | model, vocab, tokenizer = get_pretrained(ctxs, modelName) # bert-base-en-uncased bert-large-en-uncased 63 | scorer = MLMScorer(model, vocab, tokenizer, ctxs) 64 | 65 | ppl_results=[] 66 | file_path = args.train_data_file 67 | 68 | print("Evaluating ", file_path) 69 | 70 | with jsonlines.open(file_path) as reader: 71 | objs = [obj for obj in reader] 72 | 73 | for i in tqdm(range(len(objs))): 74 | obj = objs[i] 75 | claim = fever_data_cleaning(obj['claim'].lower().strip()) 76 | evs_line = fever_data_cleaning(obj['evidences'][0][0]).lower().strip() 77 | test_sent = " ".join([evs_line, claim]) 78 | ppl = {'perplexity': scorer.score_sentences([test_sent])[0]} 79 | ppl_results.append(ppl) 80 | 81 | with jsonlines.open(args.output_eval_file.replace(".npy", ".jsonl"), 'a') as writer: 82 | writer.write(ppl) 83 | # ppl_results = scorer.score_sentences(test_lines) 84 | np.save(args.output_eval_file, ppl_results) 85 | -------------------------------------------------------------------------------- /mlm/mlm-scoring.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/mlm/mlm-scoring.png 
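Note on the output format of `mlm/main.py` above: each claim is scored once, and the resulting `{'perplexity': ...}` records are both appended to a `.jsonl` file and dumped as a `.npy` array at the end. Below is a minimal sketch of how those saved scores could be loaded back for inspection; the result path is only an illustrative placeholder following the naming convention in the `obtain_ppl_for_mlm.sh` driver script that follows, not a file shipped with the repository.

```python
import numpy as np
import jsonlines

# Hypothetical result file, named as in obtain_ppl_for_mlm.sh: <model>.<exp-name>.npy
result_path = "ppl_results/bert-base.output-name.npy"

# np.save() stored a Python list of dicts, so the array holds objects and
# must be loaded with allow_pickle=True.
records = np.load(result_path, allow_pickle=True)
scores = [r['perplexity'] for r in records]
print(f"{len(scores)} claims scored, mean score: {np.mean(scores):.3f}")

# The same records were also appended line by line to a .jsonl file.
with jsonlines.open(result_path.replace(".npy", ".jsonl")) as reader:
    jsonl_scores = [obj['perplexity'] for obj in reader]
assert len(jsonl_scores) == len(scores)
```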
-------------------------------------------------------------------------------- /mlm/obtain_ppl_for_mlm.sh: -------------------------------------------------------------------------------- 1 | TEST_DATA_PATH=/path/to/test/data.jsonl # e.g. covid_scientific.jsonl 2 | EXP_NAME=output-name 3 | 4 | LM_MODEL_TYPE=bert-base # bert-large 5 | python main.py \ 6 | --train_data_file=$TEST_DATA_PATH \ 7 | --output_eval_file=/path-to-project/ppl_results/$LM_MODEL_TYPE.$EXP_NAME.npy \ 8 | --model_name=$LM_MODEL_TYPE -------------------------------------------------------------------------------- /mlm/scripts/librispeech-download-text.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | set -x 5 | 6 | target_dir=$1 7 | 8 | mkdir -p ${1} 9 | wget -O ${1}/librispeech-lm-norm.txt.gz http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz 10 | gunzip ${1}/librispeech-lm-norm.txt.gz 11 | # To avoid tripping up cased models 12 | tr '[:upper:]' '[:lower:]' < ${1}/librispeech-lm-norm.txt > ${1}/librispeech-lm-norm.lower.txt 13 | # Split to a number that's divisible by 10, and 4/8/16 GPUs ;) 14 | split --numeric-suffixes --suffix-length 2 --number l/80 ${1}/librispeech-lm-norm.lower.txt ${1}/part. 15 | # Clean up 16 | rm ${1}/librispeech-lm-norm.txt ${1}/librispeech-lm-norm.lower.txt 17 | echo "There should be 80 parts in ${1}; I found $(ls -1q ${1}/part.* | wc -l)." 18 | -------------------------------------------------------------------------------- /mlm/scripts/librispeech-score.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # set -e 4 | set -x 5 | 6 | source_dir=$1 7 | target_dir=$2 8 | start=$3 9 | end=$4 10 | gpus=$5 11 | split_size=$6 12 | model=$7 13 | 14 | if [ "$8" != "" ]; then 15 | model_weights_arg="--weights $8" 16 | else 17 | model_weights_arg="" 18 | fi 19 | 20 | ### TODO: Scale better so that split sizes are not absurdly low 21 | 22 | for x in `seq -w ${start} ${end}` 23 | do 24 | mkdir -p ${target_dir} 25 | mlm score ${model_weights_arg} \ 26 | --mode ref \ 27 | --model ${model} \ 28 | --gpus ${gpus} \ 29 | --split-size ${split_size} \ 30 | ${source_dir}/part.${x} \ 31 | > ${target_dir}/part.${x}.ref.scores \ 32 | 2> >(tee ${target_dir}/part.${x}.ref.log >&2) 33 | done 34 | -------------------------------------------------------------------------------- /mlm/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from setuptools import find_packages, setup 4 | 5 | setup( 6 | name='mlm', 7 | version='0.1', 8 | description="Masked Language Model Scoring", 9 | author='Julian Salazar', 10 | packages=find_packages('src'), 11 | package_dir={'': 'src'}, 12 | entry_points = { 13 | 'console_scripts': ['mlm=mlm.cmds:main'], 14 | }, 15 | 16 | install_requires=[ 17 | 'gluonnlp~=0.8.3', 18 | 'regex', 19 | 'sacrebleu', 20 | 'mosestokenizer', 21 | 'transformers~=3.3.1' 22 | ], 23 | 24 | extras_require={ 25 | 'dev': [ 26 | 'pylint', 27 | 'pytest', 28 | 'pytest-cov', 29 | 'mypy' 30 | ] 31 | }, 32 | 33 | # Needed for static type checking 34 | # https://mypy.readthedocs.io/en/latest/installed_packages.html 35 | zip_safe=False 36 | ) 37 | -------------------------------------------------------------------------------- /mlm/src/main.py: -------------------------------------------------------------------------------- 1 | from mlm.scorers import MLMScorer, MLMScorerPT, LMScorer 2 | from mlm.models 
import get_pretrained 3 | import mxnet as mx 4 | ctxs = [mx.cpu()] # or, e.g., [mx.gpu(0), mx.gpu(1)] 5 | 6 | # MXNet MLMs (use names from mlm.models.SUPPORTED_MLMS) 7 | model, vocab, tokenizer = get_pretrained(ctxs, 'bert-base-en-cased') 8 | scorer = MLMScorer(model, vocab, tokenizer, ctxs) 9 | print(scorer.score_sentences(["Hello world!"])) 10 | # >> [-12.410664200782776] 11 | print(scorer.score_sentences(["Hello world!"], per_token=True)) 12 | 13 | -------------------------------------------------------------------------------- /mlm/src/mlm.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: mlm 3 | Version: 0.1 4 | Summary: Masked Language Model Scoring 5 | Home-page: UNKNOWN 6 | Author: Julian Salazar 7 | License: UNKNOWN 8 | Description: UNKNOWN 9 | Platform: UNKNOWN 10 | Provides-Extra: dev 11 | -------------------------------------------------------------------------------- /mlm/src/mlm.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | README.md 2 | setup.py 3 | src/mlm/__init__.py 4 | src/mlm/batchify.py 5 | src/mlm/cmds.py 6 | src/mlm/loaders.py 7 | src/mlm/scorers.py 8 | src/mlm.egg-info/PKG-INFO 9 | src/mlm.egg-info/SOURCES.txt 10 | src/mlm.egg-info/dependency_links.txt 11 | src/mlm.egg-info/entry_points.txt 12 | src/mlm.egg-info/not-zip-safe 13 | src/mlm.egg-info/requires.txt 14 | src/mlm.egg-info/top_level.txt 15 | src/mlm/models/__init__.py 16 | src/mlm/models/bert.py 17 | src/mlm/models/gpt2.py -------------------------------------------------------------------------------- /mlm/src/mlm.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mlm/src/mlm.egg-info/entry_points.txt: -------------------------------------------------------------------------------- 1 | [console_scripts] 2 | mlm = mlm.cmds:main 3 | 4 | -------------------------------------------------------------------------------- /mlm/src/mlm.egg-info/not-zip-safe: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mlm/src/mlm.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | gluonnlp~=0.8.3 2 | regex 3 | sacrebleu 4 | mosestokenizer 5 | transformers~=3.3.1 6 | 7 | [dev] 8 | pylint 9 | pytest 10 | pytest-cov 11 | mypy 12 | -------------------------------------------------------------------------------- /mlm/src/mlm.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | mlm 2 | -------------------------------------------------------------------------------- /mlm/src/mlm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/mlm/src/mlm/__init__.py -------------------------------------------------------------------------------- /mlm/src/mlm/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/mlm/src/mlm/__pycache__/__init__.cpython-36.pyc 
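As a counterpart to the MXNet-based example in `mlm/src/main.py` above, the bundled mlm-scoring package also exposes a PyTorch-backed scorer, `MLMScorerPT`, which is exercised in `mlm/tests/test_scorers.py` below. A minimal sketch, assuming the package has been installed (e.g. `pip install -e mlm/`) and reusing the model name from those tests:

```python
import mxnet as mx

from mlm.models import get_pretrained
from mlm.scorers import MLMScorerPT

# Contexts are still MXNet objects, even for the PyTorch-backed scorer.
ctxs = [mx.cpu()]

# 'bert-base-cased' is a Hugging Face checkpoint name, as used in tests/test_scorers.py.
model, vocab, tokenizer = get_pretrained(ctxs, 'bert-base-cased')
scorer = MLMScorerPT(model, vocab, tokenizer, ctxs)

# Same API as MLMScorer: one pseudo-log-likelihood score per sentence.
print(scorer.score_sentences(["Hello world!"]))
print(scorer.score_sentences(["Hello world!"], per_token=True))
```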
-------------------------------------------------------------------------------- /mlm/src/mlm/__pycache__/batchify.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/mlm/src/mlm/__pycache__/batchify.cpython-36.pyc -------------------------------------------------------------------------------- /mlm/src/mlm/__pycache__/loaders.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/mlm/src/mlm/__pycache__/loaders.cpython-36.pyc -------------------------------------------------------------------------------- /mlm/src/mlm/__pycache__/scorers.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/mlm/src/mlm/__pycache__/scorers.cpython-36.pyc -------------------------------------------------------------------------------- /mlm/src/mlm/models/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/mlm/src/mlm/models/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /mlm/src/mlm/models/__pycache__/bert.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/mlm/src/mlm/models/__pycache__/bert.cpython-36.pyc -------------------------------------------------------------------------------- /mlm/src/mlm/models/__pycache__/gpt2.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/mlm/src/mlm/models/__pycache__/gpt2.cpython-36.pyc -------------------------------------------------------------------------------- /mlm/tests/test_cmds.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import random 4 | 5 | import mxnet as mx 6 | import numpy as np 7 | 8 | from mlm.cmds import setup_ctxs 9 | 10 | 11 | def test_setup_ctxs(): 12 | 13 | # CPU 14 | ctxs = setup_ctxs('-1') 15 | assert len(ctxs) == 1 16 | assert ctxs[0] == mx.cpu() 17 | # Test randomness 18 | assert random.randint(0, 1000000) == 885440 19 | assert np.random.randint(0, 1000000) == 985772 20 | assert mx.random.randint(0, 1000000, ctx=ctxs[0])[0] == 656751 21 | 22 | # GPU 23 | ctxs = setup_ctxs('0,2') 24 | assert len(ctxs) == 2 25 | assert ctxs[0] == mx.gpu(0) 26 | assert ctxs[1] == mx.gpu(2) 27 | # Test randomness 28 | for ctx in ctxs: 29 | assert mx.random.randint(0, 1000000, ctx=ctx)[0] == 248005 30 | -------------------------------------------------------------------------------- /mlm/tests/test_scorers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import mxnet as mx 4 | from mxnet.gluon.data import Dataset 5 | 6 | from mlm.loaders import Corpus 7 | from mlm.models import get_pretrained 8 | from mlm.scorers import LMScorer, MLMScorer, 
MLMScorerPT 9 | 10 | 11 | # The ASR case, where we append . as an EOS 12 | 13 | def _get_scorer_and_corpus_eos(): 14 | ctxs = [mx.cpu()] 15 | model, vocab, tokenizer = get_pretrained(ctxs, 'bert-base-en-uncased') 16 | scorer_mx = MLMScorer(model, vocab, tokenizer, ctxs, eos=True, wwm=False) 17 | model, vocab, tokenizer = get_pretrained(ctxs, 'bert-base-uncased') 18 | scorer_pt = MLMScorerPT(model, vocab, tokenizer, ctxs, eos=True, wwm=False) 19 | corpus = Corpus.from_dict({'utt': {'ref': "I am Sam"}}) 20 | return scorer_mx, scorer_pt, corpus 21 | 22 | 23 | def test_mlmscorer_corpus_to_dataset(): 24 | scorer_mx, scorer_pt, corpus = _get_scorer_and_corpus_eos() 25 | dataset = scorer_mx.corpus_to_dataset(corpus) 26 | assert isinstance(dataset, Dataset) 27 | # Our three tokens, plus the EOS 28 | assert len(dataset) == 4 29 | 30 | 31 | def test_mlmscorer_score_eos(): 32 | scorer_mx, scorer_pt, corpus = _get_scorer_and_corpus_eos() 33 | scores, _ = scorer_mx.score(corpus) 34 | assert len(scores) == 1 35 | assert pytest.approx(scores[0], abs=0.0001) == -13.3065947 36 | scores, _ = scorer_pt.score(corpus) 37 | assert len(scores) == 1 38 | assert pytest.approx(scores[0], abs=0.0001) == -13.3065947 39 | 40 | 41 | # The general case 42 | 43 | def test_mlmscorer_score_sentences(): 44 | 45 | TEST_CASES = ( 46 | # README examples 47 | ('bert-base-en-cased', MLMScorer, (None, -6.126666069030762, -5.50140380859375, -0.7823182344436646, None)), 48 | ('bert-base-cased', MLMScorerPT, (None, -6.126738548278809, -5.501765727996826, -0.782496988773346, None)), 49 | ('gpt2-117m-en-cased', LMScorer, (-8.293947219848633, -6.387561798095703, -1.3138668537139893)), 50 | # etc. 51 | ('albert-base-v2', MLMScorerPT, (None, -16.480087280273438, -12.897505760192871, -4.277405738830566, None)), 52 | ('distilbert-base-cased', MLMScorerPT, (None, -5.1874895095825195, -6.390861511230469, -3.8225560188293457, None)), 53 | ) 54 | 55 | for name, scorer_cls, expected_scores in TEST_CASES: 56 | model, vocab, tokenizer = get_pretrained([mx.cpu()], name) 57 | scorer = scorer_cls(model, vocab, tokenizer, [mx.cpu()]) 58 | scores = scorer.score_sentences(["Hello world!"], per_token=True)[0] 59 | expected_total = 0 60 | for score, expected_score in zip(scores, expected_scores): 61 | if score is None and expected_score is None: 62 | continue 63 | assert pytest.approx(score, abs=0.0001) == expected_score 64 | expected_total += expected_score 65 | score_total = scorer.score_sentences(["Hello world!"], per_token=False)[0] 66 | assert pytest.approx(score_total, abs=0.0001) == expected_total 67 | -------------------------------------------------------------------------------- /obtain_ppl_for_clm.sh: -------------------------------------------------------------------------------- 1 | # covid myth 2 | COVID_MYTH_PATH='data/covid_scientific.jsonl' 3 | COVID_MYTH_EXP_NAME=covid_scientific 4 | 5 | # covid politifact 6 | COVID_POLITIFACT_W_JUSTIFICATION_PATH='data/covid_social.jsonl' 7 | COVID_POLITIFACT_EXP_NAME=covid_politifact_justification 8 | 9 | # fever 10 | FEVER_TEST_PATH='data/fever_test.jsonl' 11 | FEVER_TEXT_EXP_NAME=fever_test 12 | 13 | PATHS=( $COVID_MYTH_PATH $COVID_POLITIFACT_W_JUSTIFICATION_PATH $FEVER_TEST_PATH ) 14 | NAMES=( $COVID_MYTH_EXP_NAME $COVID_POLITIFACT_EXP_NAME $FEVER_TEXT_EXP_NAME ) 15 | 16 | LM_MODEL_TYPE=gpt2 # Some options: gpt2 gpt2-xl gpt2 gpt2-medium gpt2-large gpt2-xl 17 | 18 | mkdir -p ppl_results 19 | 20 | for i in 0 #1 2 21 | do 22 | INPUT_FILE_NAME=${PATHS[$i]} 23 | EXP_NAME=${NAMES[$i]} 24 | 
CUDA_VISIBLE_DEVICES=0 python run_language_modelling_clm.py \ 25 | --model_name_or_path $LM_MODEL_TYPE \ 26 | --data_file_path $INPUT_FILE_NAME \ 27 | --do_eval \ 28 | --per_gpu_train_batch_size 1 \ 29 | --per_device_train_batch_size 1 \ 30 | --block_size 128 \ 31 | --per_gpu_eval_batch_size 1 \ 32 | --per_device_eval_batch_size 1 \ 33 | --result_path ppl_results/$LM_MODEL_TYPE.$EXP_NAME.npy \ 34 | --overwrite_output_dir 35 | done -------------------------------------------------------------------------------- /plot/HKUST.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/plot/HKUST.jpg -------------------------------------------------------------------------------- /plot/method_illustration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/plot/method_illustration.png -------------------------------------------------------------------------------- /plot/pytorch-logo-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/plot/pytorch-logo-dark.png -------------------------------------------------------------------------------- /run_few_shot.sh: -------------------------------------------------------------------------------- 1 | # =============================================================================================================== 2 | # EXP 1: COVID SCIENTIFIC 3 | # =============================================================================================================== 4 | COVID_SCIENTIFIC_PATH=/data/covid_scientific.jsonl 5 | EXP_NAME=covid_scientific 6 | 7 | for K in 50 10 2 8 | do 9 | for LM_MODEL_TYPE in gpt2 #gpt2-medium gpt2-large gpt2-xl bert-base 10 | do 11 | python few_shot_ppl.py \ 12 | --test_data_file $COVID_SCIENTIFIC_PATH \ 13 | --test_result_path /ppl_results/$LM_MODEL_TYPE.$EXP_NAME.npy \ 14 | --k $K \ 15 | --covid_data \ 16 | --exp_name $EXP_NAME 17 | done 18 | done 19 | 20 | # =============================================================================================================== 21 | # EXP 2: COVID SOCIAL 22 | # =============================================================================================================== 23 | COVID_POLITIFACT_PATH=/data/covid_social.jsonl 24 | EXP_NAME=covid_politifact_justification 25 | 26 | for K in 50 10 2 27 | do 28 | for LM_MODEL_TYPE in gpt2 #gpt2-medium gpt2-large gpt2-xl bert-base 29 | do 30 | python few_shot_ppl.py \ 31 | --test_data_file $COVID_POLITIFACT_PATH \ 32 | --test_result_path /ppl_results/$LM_MODEL_TYPE.$EXP_NAME.npy \ 33 | --k $K \ 34 | --covid_data \ 35 | --exp_name $EXP_NAME 36 | done 37 | done 38 | 39 | # =============================================================================================================== 40 | # EXP 3: FEVER 41 | # =============================================================================================================== 42 | FEVER_TRAIN_PATH=/data/fever_train.jsonl 43 | TRAIN_EXP_NAME=fever_train_small 44 | 45 | FEVER_TEST_PATH=/data/fever_test.jsonl 46 | TEST_EXP_NAME=fever_test 47 | 48 | EXP_NAME_FOR_SAVE=fever 49 | 50 | for K in 50 10 2 51 | do 52 | for LM_MODEL_TYPE in gpt2 #gpt2-medium gpt2-large gpt2-xl bert-base 53 | do 54 | 
python few_shot_ppl.py \ 55 | --train_data_file $FEVER_TRAIN_PATH \ 56 | --train_result_path /ppl_results/$LM_MODEL_TYPE.$TRAIN_EXP_NAME.npy \ 57 | --test_data_file $FEVER_TEST_PATH \ 58 | --test_result_path /ppl_results/$LM_MODEL_TYPE.$TEST_EXP_NAME.npy \ 59 | --k $K \ 60 | --exp_name $EXP_NAME_FOR_SAVE 61 | done 62 | done 63 | 64 | -------------------------------------------------------------------------------- /transformers/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/activations.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/activations.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_albert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_albert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_auto.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_auto.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_bart.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_bart.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_bert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_bert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_bert_generation.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_bert_generation.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_blenderbot.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_blenderbot.cpython-37.pyc -------------------------------------------------------------------------------- 
/transformers/__pycache__/configuration_camembert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_camembert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_ctrl.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_ctrl.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_deberta.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_deberta.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_distilbert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_distilbert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_dpr.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_dpr.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_electra.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_electra.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_encoder_decoder.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_encoder_decoder.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_flaubert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_flaubert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_fsmt.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_fsmt.cpython-37.pyc 
-------------------------------------------------------------------------------- /transformers/__pycache__/configuration_funnel.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_funnel.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_gpt2.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_gpt2.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_layoutlm.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_layoutlm.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_longformer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_longformer.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_lxmert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_lxmert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_marian.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_marian.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_mbart.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_mbart.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_mmbt.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_mmbt.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_mobilebert.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_mobilebert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_openai.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_openai.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_pegasus.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_pegasus.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_prophetnet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_prophetnet.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_rag.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_rag.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_reformer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_reformer.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_retribert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_retribert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_roberta.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_roberta.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_squeezebert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_squeezebert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_t5.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_t5.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_transfo_xl.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_transfo_xl.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_utils.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_xlm.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_xlm.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_xlm_prophetnet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_xlm_prophetnet.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_xlm_roberta.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_xlm_roberta.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_xlnet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_xlnet.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/convert_slow_tokenizer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/convert_slow_tokenizer.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/file_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/file_utils.cpython-37.pyc -------------------------------------------------------------------------------- 
/transformers/__pycache__/generation_beam_search.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/generation_beam_search.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/generation_logits_process.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/generation_logits_process.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/generation_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/generation_utils.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/hf_argparser.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/hf_argparser.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/integrations.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/integrations.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modelcard.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modelcard.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_albert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_albert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_auto.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_auto.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_bart.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_bart.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_bert.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_bert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_bert_generation.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_bert_generation.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_blenderbot.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_blenderbot.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_camembert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_camembert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_ctrl.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_ctrl.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_deberta.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_deberta.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_distilbert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_distilbert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_dpr.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_dpr.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_electra.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_electra.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_encoder_decoder.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_encoder_decoder.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_flaubert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_flaubert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_fsmt.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_fsmt.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_funnel.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_funnel.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_gpt2.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_gpt2.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_layoutlm.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_layoutlm.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_longformer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_longformer.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_lxmert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_lxmert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_marian.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_marian.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_mbart.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_mbart.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_mmbt.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_mmbt.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_mobilebert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_mobilebert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_openai.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_openai.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_outputs.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_outputs.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_pegasus.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_pegasus.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_prophetnet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_prophetnet.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_rag.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_rag.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_reformer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_reformer.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_retribert.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_retribert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_roberta.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_roberta.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_squeezebert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_squeezebert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_t5.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_t5.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_tf_pytorch_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_tf_pytorch_utils.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_transfo_xl.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_transfo_xl.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_transfo_xl_utilities.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_transfo_xl_utilities.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_utils.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_xlm.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_xlm.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_xlm_prophetnet.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_xlm_prophetnet.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_xlm_roberta.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_xlm_roberta.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_xlnet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_xlnet.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/optimization.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/optimization.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/pipelines.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/pipelines.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/retrieval_rag.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/retrieval_rag.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_albert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_albert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_albert_fast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_albert_fast.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_auto.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_auto.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_bart.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_bart.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_bart_fast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_bart_fast.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_bert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_bert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_bert_fast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_bert_fast.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_bert_generation.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_bert_generation.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_bert_japanese.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_bert_japanese.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_bertweet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_bertweet.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_blenderbot.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_blenderbot.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_camembert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_camembert.cpython-37.pyc -------------------------------------------------------------------------------- 
/transformers/__pycache__/tokenization_camembert_fast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_camembert_fast.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_ctrl.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_ctrl.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_deberta.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_deberta.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_distilbert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_distilbert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_distilbert_fast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_distilbert_fast.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_dpr.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_dpr.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_dpr_fast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_dpr_fast.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_electra.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_electra.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_electra_fast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_electra_fast.cpython-37.pyc 
-------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_flaubert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_flaubert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_fsmt.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_fsmt.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_funnel.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_funnel.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_funnel_fast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_funnel_fast.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_gpt2.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_gpt2.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_gpt2_fast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_gpt2_fast.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_herbert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_herbert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_herbert_fast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_herbert_fast.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_layoutlm.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_layoutlm.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_layoutlm_fast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_layoutlm_fast.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_longformer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_longformer.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_longformer_fast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_longformer_fast.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_lxmert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_lxmert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_lxmert_fast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_lxmert_fast.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_marian.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_marian.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_mbart.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_mbart.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_mbart_fast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_mbart_fast.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_mobilebert.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_mobilebert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_mobilebert_fast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_mobilebert_fast.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_openai.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_openai.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_openai_fast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_openai_fast.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_pegasus.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_pegasus.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_pegasus_fast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_pegasus_fast.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_phobert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_phobert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_prophetnet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_prophetnet.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_rag.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_rag.cpython-37.pyc -------------------------------------------------------------------------------- 
/transformers/__pycache__/tokenization_reformer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_reformer.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_reformer_fast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_reformer_fast.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_retribert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_retribert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_retribert_fast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_retribert_fast.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_roberta.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_roberta.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_roberta_fast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_roberta_fast.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_squeezebert.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_squeezebert.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_squeezebert_fast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_squeezebert_fast.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_t5.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_t5.cpython-37.pyc 
-------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_t5_fast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_t5_fast.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_transfo_xl.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_transfo_xl.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_utils.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_utils_base.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_utils_base.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_utils_fast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_utils_fast.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_xlm.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_xlm.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_xlm_prophetnet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_xlm_prophetnet.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_xlm_roberta.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_xlm_roberta.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_xlm_roberta_fast.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_xlm_roberta_fast.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_xlnet.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_xlnet.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_xlnet_fast.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_xlnet_fast.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/trainer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/trainer.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/trainer_callback.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/trainer_callback.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/trainer_pt_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/trainer_pt_utils.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/trainer_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/trainer_utils.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/training_args.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/training_args.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/training_args_tf.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/training_args_tf.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/activations.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from packaging import 
version

from .utils import logging


logger = logging.get_logger(__name__)


def _gelu_python(x):
    """
    Original Implementation of the GELU activation function in Google BERT repo when initially created. For
    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in
    torch.nn.functional Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def gelu_new(x):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
    the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
    """
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))


if version.parse(torch.__version__) < version.parse("1.4"):
    gelu = _gelu_python
else:
    gelu = F.gelu


def gelu_fast(x):
    return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x)))


def _silu_python(x):
    """
    See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
    Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
    Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
    Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
    later.
    """
    return x * torch.sigmoid(x)


if version.parse(torch.__version__) < version.parse("1.7"):
    silu = _silu_python
else:
    silu = F.silu


def mish(x):
    return x * torch.tanh(torch.nn.functional.softplus(x))


def linear_act(x):
    return x


ACT2FN = {
    "relu": F.relu,
    "silu": silu,
    "swish": silu,
    "gelu": gelu,
    "tanh": torch.tanh,
    "gelu_new": gelu_new,
    "gelu_fast": gelu_fast,
    "mish": mish,
    "linear": linear_act,
    "sigmoid": torch.sigmoid,
}


def get_activation(activation_string):
    if activation_string in ACT2FN:
        return ACT2FN[activation_string]
    else:
        raise KeyError("function {} not found in ACT2FN mapping {}".format(activation_string, list(ACT2FN.keys())))
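The `ACT2FN` table above is the registry the PyTorch-side models use to map the activation name carried in a config (for example `hidden_act`) to a callable, and `get_activation` is the lookup that fails loudly on unknown names. A minimal usage sketch, assuming the vendored package above is importable as `transformers` from the repository root:

```python
import torch

from transformers.activations import get_activation

# Resolve the activation named in a model config (e.g. config.hidden_act).
act = get_activation("gelu_new")
x = torch.linspace(-3.0, 3.0, steps=7)
print(act(x))  # tanh-approximated GELU applied elementwise

# Unknown strings raise a KeyError listing the registered names.
try:
    get_activation("not_an_activation")
except KeyError as err:
    print(err)
```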
-------------------------------------------------------------------------------- /transformers/activations_tf.py: --------------------------------------------------------------------------------

import math

import tensorflow as tf


def gelu(x):
    """
    Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
    initially created. For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) Also see
    https://arxiv.org/abs/1606.08415
    """
    x = tf.convert_to_tensor(x)
    cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))

    return x * cdf


def gelu_new(x):
    """
    Gaussian Error Linear Unit. This is a smoother version of the GELU. Original paper: https://arxiv.org/abs/1606.08415

    Args:
        x: float Tensor to perform activation

    Returns:
        `x` with the GELU activation applied.
    """
    x = tf.convert_to_tensor(x)
    pi = tf.cast(math.pi, x.dtype)
    coeff = tf.cast(0.044715, x.dtype)
    cdf = 0.5 * (1.0 + tf.tanh(tf.sqrt(2.0 / pi) * (x + coeff * tf.pow(x, 3))))

    return x * cdf


def mish(x):
    x = tf.convert_to_tensor(x)

    return x * tf.tanh(tf.math.softplus(x))


def gelu_fast(x):
    x = tf.convert_to_tensor(x)
    coeff1 = tf.cast(0.044715, x.dtype)
    coeff2 = tf.cast(0.7978845608, x.dtype)  # sqrt(2 / pi)

    return 0.5 * x * (1.0 + tf.tanh(x * coeff2 * (1.0 + coeff1 * x * x)))


ACT2FN = {
    "gelu": tf.keras.layers.Activation(gelu),
    "relu": tf.keras.activations.relu,
    "swish": tf.keras.activations.swish,
    "silu": tf.keras.activations.swish,
    "gelu_new": tf.keras.layers.Activation(gelu_new),
    "mish": tf.keras.layers.Activation(mish),
    "tanh": tf.keras.activations.tanh,
    "gelu_fast": tf.keras.layers.Activation(gelu_fast),
}


def get_tf_activation(activation_string):
    if activation_string in ACT2FN:
        return ACT2FN[activation_string]
    else:
        raise KeyError("function {} not found in ACT2FN mapping {}".format(activation_string, list(ACT2FN.keys())))
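Since `gelu_fast` is a tanh-based approximation of the exact erf-based `gelu`, the two curves should agree closely over typical activation ranges, which makes a quick numerical comparison a cheap sanity check on the constants. A small sketch, assuming TensorFlow 2 is installed and the vendored copy above is importable as `transformers.activations_tf`:

```python
import numpy as np
import tensorflow as tf

from transformers.activations_tf import get_tf_activation

x = tf.constant(np.linspace(-4.0, 4.0, 101), dtype=tf.float32)
exact = get_tf_activation("gelu")(x)        # erf-based GELU
approx = get_tf_activation("gelu_fast")(x)  # tanh approximation

# Maximum pointwise gap; expected to be on the order of 1e-3 or smaller.
print(float(tf.reduce_max(tf.abs(exact - approx))))
```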
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/benchmark/__pycache__/benchmark_utils.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/commands/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from argparse import ArgumentParser 3 | 4 | 5 | class BaseTransformersCLICommand(ABC): 6 | @staticmethod 7 | @abstractmethod 8 | def register_subcommand(parser: ArgumentParser): 9 | raise NotImplementedError() 10 | 11 | @abstractmethod 12 | def run(self): 13 | raise NotImplementedError() 14 | -------------------------------------------------------------------------------- /transformers/commands/download.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | from transformers.commands import BaseTransformersCLICommand 4 | 5 | 6 | def download_command_factory(args): 7 | return DownloadCommand(args.model, args.cache_dir, args.force) 8 | 9 | 10 | class DownloadCommand(BaseTransformersCLICommand): 11 | @staticmethod 12 | def register_subcommand(parser: ArgumentParser): 13 | download_parser = parser.add_parser("download") 14 | download_parser.add_argument( 15 | "--cache-dir", type=str, default=None, help="Path to location to store the models" 16 | ) 17 | download_parser.add_argument( 18 | "--force", action="store_true", help="Force the model to be download even if already in cache-dir" 19 | ) 20 | download_parser.add_argument("model", type=str, help="Name of the model to download") 21 | download_parser.set_defaults(func=download_command_factory) 22 | 23 | def __init__(self, model: str, cache: str, force: bool): 24 | self._model = model 25 | self._cache = cache 26 | self._force = force 27 | 28 | def run(self): 29 | from transformers import AutoModel, AutoTokenizer 30 | 31 | AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 32 | AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 33 | -------------------------------------------------------------------------------- /transformers/commands/env.py: -------------------------------------------------------------------------------- 1 | import platform 2 | from argparse import ArgumentParser 3 | 4 | from transformers import __version__ as version 5 | from transformers import is_tf_available, is_torch_available 6 | from transformers.commands import BaseTransformersCLICommand 7 | 8 | 9 | def info_command_factory(_): 10 | return EnvironmentCommand() 11 | 12 | 13 | class EnvironmentCommand(BaseTransformersCLICommand): 14 | @staticmethod 15 | def register_subcommand(parser: ArgumentParser): 16 | download_parser = parser.add_parser("env") 17 | download_parser.set_defaults(func=info_command_factory) 18 | 19 | def run(self): 20 | pt_version = "not installed" 21 | pt_cuda_available = "NA" 22 | if is_torch_available(): 23 | import torch 24 | 25 | pt_version = torch.__version__ 26 | pt_cuda_available = torch.cuda.is_available() 27 | 28 | tf_version = "not installed" 29 | tf_cuda_available = "NA" 30 | if is_tf_available(): 31 | import tensorflow as tf 32 | 33 | tf_version = tf.__version__ 34 | try: 35 | # deprecated in v2.1 36 | tf_cuda_available = tf.test.is_gpu_available() 37 | except AttributeError: 38 | # returns list 
of devices, convert to bool 39 | tf_cuda_available = bool(tf.config.list_physical_devices("GPU")) 40 | 41 | info = { 42 | "`transformers` version": version, 43 | "Platform": platform.platform(), 44 | "Python version": platform.python_version(), 45 | "PyTorch version (GPU?)": "{} ({})".format(pt_version, pt_cuda_available), 46 | "Tensorflow version (GPU?)": "{} ({})".format(tf_version, tf_cuda_available), 47 | "Using GPU in script?": "", 48 | "Using distributed or parallel set-up in script?": "", 49 | } 50 | 51 | print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n") 52 | print(self.format_dict(info)) 53 | 54 | return info 55 | 56 | @staticmethod 57 | def format_dict(d): 58 | return "\n".join(["- {}: {}".format(prop, val) for prop, val in d.items()]) + "\n" 59 | -------------------------------------------------------------------------------- /transformers/commands/transformers_cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from argparse import ArgumentParser 3 | 4 | from transformers.commands.add_new_model import AddNewModelCommand 5 | from transformers.commands.convert import ConvertCommand 6 | from transformers.commands.download import DownloadCommand 7 | from transformers.commands.env import EnvironmentCommand 8 | from transformers.commands.run import RunCommand 9 | from transformers.commands.serving import ServeCommand 10 | from transformers.commands.user import UserCommands 11 | 12 | 13 | def main(): 14 | parser = ArgumentParser("Transformers CLI tool", usage="transformers-cli <command> [<args>]") 15 | commands_parser = parser.add_subparsers(help="transformers-cli command helpers") 16 | 17 | # Register commands 18 | ConvertCommand.register_subcommand(commands_parser) 19 | DownloadCommand.register_subcommand(commands_parser) 20 | EnvironmentCommand.register_subcommand(commands_parser) 21 | RunCommand.register_subcommand(commands_parser) 22 | ServeCommand.register_subcommand(commands_parser) 23 | UserCommands.register_subcommand(commands_parser) 24 | AddNewModelCommand.register_subcommand(commands_parser) 25 | 26 | # Let's go 27 | args = parser.parse_args() 28 | 29 | if not hasattr(args, "func"): 30 | parser.print_help() 31 | exit(1) 32 | 33 | # Run 34 | service = args.func(args) 35 | service.run() 36 | 37 | 38 | if __name__ == "__main__": 39 | main() 40 | -------------------------------------------------------------------------------- /transformers/configuration_camembert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
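# Illustrative usage sketch (the checkpoint name is taken from the archive map defined
# further down in this file; everything else is standard config loading). CamembertConfig
# reuses the RobertaConfig defaults, so it can be loaded like any other configuration:
#
#   from transformers import CamembertConfig
#   config = CamembertConfig.from_pretrained("camembert-base")
#   assert config.model_type == "camembert"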
16 | """ CamemBERT configuration """ 17 | 18 | from .configuration_roberta import RobertaConfig 19 | from .utils import logging 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 25 | "camembert-base": "https://huggingface.co/camembert-base/resolve/main/config.json", 26 | "umberto-commoncrawl-cased-v1": "https://huggingface.co/Musixmatch/umberto-commoncrawl-cased-v1/resolve/main/config.json", 27 | "umberto-wikipedia-uncased-v1": "https://huggingface.co/Musixmatch/umberto-wikipedia-uncased-v1/resolve/main/config.json", 28 | } 29 | 30 | 31 | class CamembertConfig(RobertaConfig): 32 | """ 33 | This class overrides :class:`~transformers.RobertaConfig`. Please check the superclass for the appropriate 34 | documentation alongside usage examples. 35 | """ 36 | 37 | model_type = "camembert" 38 | -------------------------------------------------------------------------------- /transformers/configuration_mmbt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # Copyright (c) HuggingFace Inc. team. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ MMBT configuration """ 17 | 18 | from .utils import logging 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | 24 | class MMBTConfig(object): 25 | """ 26 | This is the configuration class to store the configuration of a :class:`~transformers.MMBTModel`. It is used to 27 | instantiate a MMBT model according to the specified arguments, defining the model architecture. 28 | 29 | Args: 30 | config (:class:`~transformers.PreTrainedConfig`): 31 | Config of the underlying Transformer models. Its values are copied over to use a single config. 32 | num_labels (:obj:`int`, `optional`): 33 | Size of final Linear layer for classification. 34 | modal_hidden_size (:obj:`int`, `optional`, defaults to 2048): 35 | Embedding dimension of the non-text modality encoder. 36 | """ 37 | 38 | def __init__(self, config, num_labels=None, modal_hidden_size=2048): 39 | self.__dict__ = config.__dict__ 40 | self.modal_hidden_size = modal_hidden_size 41 | if num_labels: 42 | self.num_labels = num_labels 43 | -------------------------------------------------------------------------------- /transformers/configuration_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ RoBERTa configuration """ 17 | 18 | from .configuration_bert import BertConfig 19 | from .utils import logging 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 25 | "roberta-base": "https://huggingface.co/roberta-base/resolve/main/config.json", 26 | "roberta-large": "https://huggingface.co/roberta-large/resolve/main/config.json", 27 | "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/config.json", 28 | "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/config.json", 29 | "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/config.json", 30 | "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/config.json", 31 | } 32 | 33 | 34 | class RobertaConfig(BertConfig): 35 | r""" 36 | This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel` or a 37 | :class:`~transformers.TFRobertaModel`. It is used to instantiate a RoBERTa model according to the specified 38 | arguments, defining the model architecture. 39 | 40 | 41 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model 42 | outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. 43 | 44 | The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`. It reuses the 45 | same defaults. Please check the parent class for more information. 46 | 47 | Examples:: 48 | 49 | >>> from transformers import RobertaConfig, RobertaModel 50 | 51 | >>> # Initializing a RoBERTa configuration 52 | >>> configuration = RobertaConfig() 53 | 54 | >>> # Initializing a model from the configuration 55 | >>> model = RobertaModel(configuration) 56 | 57 | >>> # Accessing the model configuration 58 | >>> configuration = model.config 59 | """ 60 | model_type = "roberta" 61 | 62 | def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs): 63 | """Constructs RobertaConfig.""" 64 | super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) 65 | -------------------------------------------------------------------------------- /transformers/configuration_xlm_prophetnet.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ XLM-ProphetNet model configuration """ 16 | 17 | 18 | from .configuration_prophetnet import ProphetNetConfig 19 | from .utils import logging 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { 25 | "microsoft/xprophetnet-large-wiki100-cased": "https://huggingface.co/microsoft/xprophetnet-large-wiki100-cased/resolve/main/config.json", 26 | } 27 | 28 | 29 | class XLMProphetNetConfig(ProphetNetConfig): 30 | """ 31 | This class overrides :class:`~transformers.ProphetNetConfig`. Please check the superclass for the appropriate 32 | documentation alongside usage examples. 33 | """ 34 | 35 | model_type = "xlm-prophetnet" 36 | -------------------------------------------------------------------------------- /transformers/configuration_xlm_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ XLM-RoBERTa configuration """ 17 | 18 | from .configuration_roberta import RobertaConfig 19 | from .utils import logging 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 25 | "xlm-roberta-base": "https://huggingface.co/xlm-roberta-base/resolve/main/config.json", 26 | "xlm-roberta-large": "https://huggingface.co/xlm-roberta-large/resolve/main/config.json", 27 | "xlm-roberta-large-finetuned-conll02-dutch": "https://huggingface.co/xlm-roberta-large-finetuned-conll02-dutch/resolve/main/config.json", 28 | "xlm-roberta-large-finetuned-conll02-spanish": "https://huggingface.co/xlm-roberta-large-finetuned-conll02-spanish/resolve/main/config.json", 29 | "xlm-roberta-large-finetuned-conll03-english": "https://huggingface.co/xlm-roberta-large-finetuned-conll03-english/resolve/main/config.json", 30 | "xlm-roberta-large-finetuned-conll03-german": "https://huggingface.co/xlm-roberta-large-finetuned-conll03-german/resolve/main/config.json", 31 | } 32 | 33 | 34 | class XLMRobertaConfig(RobertaConfig): 35 | """ 36 | This class overrides :class:`~transformers.RobertaConfig`. Please check the superclass for the appropriate 37 | documentation alongside usage examples. 38 | """ 39 | 40 | model_type = "xlm-roberta" 41 | -------------------------------------------------------------------------------- /transformers/convert_albert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert ALBERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | import torch 21 | 22 | from transformers import AlbertConfig, AlbertForPreTraining, load_tf_weights_in_albert 23 | from transformers.utils import logging 24 | 25 | 26 | logging.set_verbosity_info() 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = AlbertConfig.from_json_file(albert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = AlbertForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_albert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--albert_config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained ALBERT model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /transformers/convert_bert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
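# Usage sketch for this conversion script (paths are placeholders; the flags are the ones
# registered by the argument parser defined at the bottom of the file):
#
#   python convert_bert_original_tf_checkpoint_to_pytorch.py \
#       --tf_checkpoint_path ./bert_model.ckpt \
#       --bert_config_file ./bert_config.json \
#       --pytorch_dump_path ./pytorch_model.bin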
15 | """Convert BERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | import torch 21 | 22 | from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert 23 | from transformers.utils import logging 24 | 25 | 26 | logging.set_verbosity_info() 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = BertConfig.from_json_file(bert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = BertForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_bert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--bert_config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained BERT model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /transformers/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | 6 | from transformers.file_utils import WEIGHTS_NAME 7 | 8 | 9 | DIALOGPT_MODELS = ["small", "medium", "large"] 10 | 11 | OLD_KEY = "lm_head.decoder.weight" 12 | NEW_KEY = "lm_head.weight" 13 | 14 | 15 | def convert_dialogpt_checkpoint(checkpoint_path: str, pytorch_dump_folder_path: str): 16 | d = torch.load(checkpoint_path) 17 | d[NEW_KEY] = d.pop(OLD_KEY) 18 | os.makedirs(pytorch_dump_folder_path, exist_ok=True) 19 | torch.save(d, os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--dialogpt_path", default=".", type=str) 25 | args = parser.parse_args() 26 | for MODEL in DIALOGPT_MODELS: 27 | checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl") 28 | pytorch_dump_folder_path = f"./DialoGPT-{MODEL}" 29 | convert_dialogpt_checkpoint( 30 | checkpoint_path, 31 | pytorch_dump_folder_path, 32 | ) 33 | -------------------------------------------------------------------------------- /transformers/convert_electra_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert ELECTRA checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | import torch 21 | 22 | from transformers import ElectraConfig, ElectraForMaskedLM, ElectraForPreTraining, load_tf_weights_in_electra 23 | from transformers.utils import logging 24 | 25 | 26 | logging.set_verbosity_info() 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, discriminator_or_generator): 30 | # Initialise PyTorch model 31 | config = ElectraConfig.from_json_file(config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | 34 | if discriminator_or_generator == "discriminator": 35 | model = ElectraForPreTraining(config) 36 | elif discriminator_or_generator == "generator": 37 | model = ElectraForMaskedLM(config) 38 | else: 39 | raise ValueError("The discriminator_or_generator argument should be either 'discriminator' or 'generator'") 40 | 41 | # Load weights from tf checkpoint 42 | load_tf_weights_in_electra( 43 | model, config, tf_checkpoint_path, discriminator_or_generator=discriminator_or_generator 44 | ) 45 | 46 | # Save pytorch-model 47 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 48 | torch.save(model.state_dict(), pytorch_dump_path) 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | # Required parameters 54 | parser.add_argument( 55 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 56 | ) 57 | parser.add_argument( 58 | "--config_file", 59 | default=None, 60 | type=str, 61 | required=True, 62 | help="The config json file corresponding to the pre-trained model. \n" 63 | "This specifies the model architecture.", 64 | ) 65 | parser.add_argument( 66 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 67 | ) 68 | parser.add_argument( 69 | "--discriminator_or_generator", 70 | default=None, 71 | type=str, 72 | required=True, 73 | help="Whether to export the generator or the discriminator. Should be a string, either 'discriminator' or " 74 | "'generator'.", 75 | ) 76 | args = parser.parse_args() 77 | convert_tf_checkpoint_to_pytorch( 78 | args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path, args.discriminator_or_generator 79 | ) 80 | -------------------------------------------------------------------------------- /transformers/convert_funnel_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert Funnel checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import FunnelConfig, FunnelForPreTraining, load_tf_weights_in_funnel 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = FunnelConfig.from_json_file(config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = FunnelForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_funnel(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
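# Usage sketch (paths are placeholders). --gpt2_config_file may be omitted, in which case a
# default GPT2Config() is constructed; the output folder receives the converted weights file
# plus a config.json:
#
#   python convert_gpt2_original_tf_checkpoint_to_pytorch.py \
#       --gpt2_checkpoint_path ./gpt2/model.ckpt \
#       --pytorch_dump_folder_path ./gpt2-pytorch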
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | import torch 21 | 22 | from transformers import CONFIG_NAME, WEIGHTS_NAME, GPT2Config, GPT2Model, load_tf_weights_in_gpt2 23 | from transformers.utils import logging 24 | 25 | 26 | logging.set_verbosity_info() 27 | 28 | 29 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): 30 | # Construct model 31 | if gpt2_config_file == "": 32 | config = GPT2Config() 33 | else: 34 | config = GPT2Config.from_json_file(gpt2_config_file) 35 | model = GPT2Model(config) 36 | 37 | # Load weights from numpy 38 | load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) 39 | 40 | # Save pytorch-model 41 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 42 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 43 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 44 | torch.save(model.state_dict(), pytorch_weights_dump_path) 45 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 46 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 47 | f.write(config.to_json_string()) 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | # Required parameters 53 | parser.add_argument( 54 | "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 55 | ) 56 | parser.add_argument( 57 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 58 | ) 59 | parser.add_argument( 60 | "--gpt2_config_file", 61 | default="", 62 | type=str, 63 | help="An optional config json file corresponding to the pre-trained OpenAI model. \n" 64 | "This specifies the model architecture.", 65 | ) 66 | args = parser.parse_args() 67 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path) 68 | -------------------------------------------------------------------------------- /transformers/convert_longformer_original_pytorch_lightning_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
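# Illustrative call mirroring the CLI entry point below (checkpoint and output paths are
# placeholders): the script loads a PyTorch Lightning QA checkpoint on top of a pretrained
# Longformer and re-saves it as a regular Transformers model directory:
#
#   convert_longformer_qa_checkpoint_to_pytorch(
#       "longformer-base-4096",
#       "./checkpoints/longformer_qa.ckpt",
#       "./longformer-qa-pytorch",
#   )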
15 | """Convert RoBERTa checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | import pytorch_lightning as pl 21 | import torch 22 | 23 | from transformers.modeling_longformer import LongformerForQuestionAnswering, LongformerModel 24 | 25 | 26 | class LightningModel(pl.LightningModule): 27 | def __init__(self, model): 28 | super().__init__() 29 | self.model = model 30 | self.num_labels = 2 31 | self.qa_outputs = torch.nn.Linear(self.model.config.hidden_size, self.num_labels) 32 | 33 | # implement only because lightning requires to do so 34 | def forward(self): 35 | pass 36 | 37 | 38 | def convert_longformer_qa_checkpoint_to_pytorch( 39 | longformer_model: str, longformer_question_answering_ckpt_path: str, pytorch_dump_folder_path: str 40 | ): 41 | 42 | # load longformer model from model identifier 43 | longformer = LongformerModel.from_pretrained(longformer_model) 44 | lightning_model = LightningModel(longformer) 45 | 46 | ckpt = torch.load(longformer_question_answering_ckpt_path, map_location=torch.device("cpu")) 47 | lightning_model.load_state_dict(ckpt["state_dict"]) 48 | 49 | # init longformer question answering model 50 | longformer_for_qa = LongformerForQuestionAnswering.from_pretrained(longformer_model) 51 | 52 | # transfer weights 53 | longformer_for_qa.longformer.load_state_dict(lightning_model.model.state_dict()) 54 | longformer_for_qa.qa_outputs.load_state_dict(lightning_model.qa_outputs.state_dict()) 55 | longformer_for_qa.eval() 56 | 57 | # save model 58 | longformer_for_qa.save_pretrained(pytorch_dump_folder_path) 59 | 60 | print("Conversion successful. Model saved under {}".format(pytorch_dump_folder_path)) 61 | 62 | 63 | if __name__ == "__main__": 64 | parser = argparse.ArgumentParser() 65 | # Required parameters 66 | parser.add_argument( 67 | "--longformer_model", 68 | default=None, 69 | type=str, 70 | required=True, 71 | help="model identifier of longformer. Should be either `longformer-base-4096` or `longformer-large-4096`.", 72 | ) 73 | parser.add_argument( 74 | "--longformer_question_answering_ckpt_path", 75 | default=None, 76 | type=str, 77 | required=True, 78 | help="Path the official PyTorch Lightning Checkpoint.", 79 | ) 80 | parser.add_argument( 81 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 82 | ) 83 | args = parser.parse_args() 84 | convert_longformer_qa_checkpoint_to_pytorch( 85 | args.longformer_model, args.longformer_question_answering_ckpt_path, args.pytorch_dump_folder_path 86 | ) 87 | -------------------------------------------------------------------------------- /transformers/convert_lxmert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert LXMERT checkpoint.""" 16 | 17 | 18 | import argparse 19 | import logging 20 | 21 | import torch 22 | 23 | from transformers import LxmertConfig, LxmertForPreTraining, load_tf_weights_in_lxmert 24 | 25 | 26 | logging.basicConfig(level=logging.INFO) 27 | 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = LxmertConfig.from_json_file(config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = LxmertForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_lxmert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | # Required parameters 46 | parser.add_argument( 47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /transformers/convert_mbart_original_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | 5 | from transformers import BartForConditionalGeneration, MBartConfig 6 | 7 | from .convert_bart_original_pytorch_checkpoint_to_pytorch import remove_ignore_keys_ 8 | 9 | 10 | def convert_fairseq_mbart_checkpoint_from_disk(checkpoint_path, hf_config_path="facebook/mbart-large-en-ro"): 11 | state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] 12 | remove_ignore_keys_(state_dict) 13 | vocab_size = state_dict["encoder.embed_tokens.weight"].shape[0] 14 | mbart_config = MBartConfig.from_pretrained(hf_config_path, vocab_size=vocab_size) 15 | state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] 16 | model = BartForConditionalGeneration(mbart_config) 17 | model.model.load_state_dict(state_dict) 18 | return model 19 | 20 | 21 | if __name__ == "__main__": 22 | parser = argparse.ArgumentParser() 23 | # Required parameters 24 | parser.add_argument( 25 | "fairseq_path", type=str, help="bart.large, bart.large.cnn or a path to a model.pt on local filesystem." 
26 | ) 27 | parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") 28 | parser.add_argument( 29 | "--hf_config", 30 | default="facebook/mbart-large-cc25", 31 | type=str, 32 | help="Which huggingface architecture to use: bart-large-xsum", 33 | ) 34 | args = parser.parse_args() 35 | model = convert_fairseq_mbart_checkpoint_from_disk(args.fairseq_path, hf_config_path=args.hf_config) 36 | model.save_pretrained(args.pytorch_dump_folder_path) 37 | -------------------------------------------------------------------------------- /transformers/convert_mobilebert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | 5 | from transformers import MobileBertConfig, MobileBertForPreTraining, load_tf_weights_in_mobilebert 6 | from transformers.utils import logging 7 | 8 | 9 | logging.set_verbosity_info() 10 | 11 | 12 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, mobilebert_config_file, pytorch_dump_path): 13 | # Initialise PyTorch model 14 | config = MobileBertConfig.from_json_file(mobilebert_config_file) 15 | print("Building PyTorch model from configuration: {}".format(str(config))) 16 | model = MobileBertForPreTraining(config) 17 | # Load weights from tf checkpoint 18 | model = load_tf_weights_in_mobilebert(model, config, tf_checkpoint_path) 19 | # Save pytorch-model 20 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 21 | torch.save(model.state_dict(), pytorch_dump_path) 22 | 23 | 24 | if __name__ == "__main__": 25 | parser = argparse.ArgumentParser() 26 | # Required parameters 27 | parser.add_argument( 28 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 29 | ) 30 | parser.add_argument( 31 | "--mobilebert_config_file", 32 | default=None, 33 | type=str, 34 | required=True, 35 | help="The config json file corresponding to the pre-trained MobileBERT model. \n" 36 | "This specifies the model architecture.", 37 | ) 38 | parser.add_argument( 39 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 40 | ) 41 | args = parser.parse_args() 42 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.mobilebert_config_file, args.pytorch_dump_path) 43 | -------------------------------------------------------------------------------- /transformers/convert_openai_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | import torch 21 | 22 | from transformers import CONFIG_NAME, WEIGHTS_NAME, OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt 23 | from transformers.utils import logging 24 | 25 | 26 | logging.set_verbosity_info() 27 | 28 | 29 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): 30 | # Construct model 31 | if openai_config_file == "": 32 | config = OpenAIGPTConfig() 33 | else: 34 | config = OpenAIGPTConfig.from_json_file(openai_config_file) 35 | model = OpenAIGPTModel(config) 36 | 37 | # Load weights from numpy 38 | load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path) 39 | 40 | # Save pytorch-model 41 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 42 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 43 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 44 | torch.save(model.state_dict(), pytorch_weights_dump_path) 45 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 46 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 47 | f.write(config.to_json_string()) 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | # Required parameters 53 | parser.add_argument( 54 | "--openai_checkpoint_folder_path", 55 | default=None, 56 | type=str, 57 | required=True, 58 | help="Path to the TensorFlow checkpoint path.", 59 | ) 60 | parser.add_argument( 61 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 62 | ) 63 | parser.add_argument( 64 | "--openai_config_file", 65 | default="", 66 | type=str, 67 | help="An optional config json file corresponding to the pre-trained OpenAI model. \n" 68 | "This specifies the model architecture.", 69 | ) 70 | args = parser.parse_args() 71 | convert_openai_checkpoint_to_pytorch( 72 | args.openai_checkpoint_folder_path, args.openai_config_file, args.pytorch_dump_folder_path 73 | ) 74 | -------------------------------------------------------------------------------- /transformers/convert_t5_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The T5 authors and HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
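# Usage sketch (paths are placeholders). Unlike several of the other conversion scripts,
# this one saves via model.save_pretrained, so --pytorch_dump_path is treated as an output
# directory rather than a single weights file:
#
#   python convert_t5_original_tf_checkpoint_to_pytorch.py \
#       --tf_checkpoint_path ./t5_checkpoint \
#       --config_file ./t5_checkpoint/config.json \
#       --pytorch_dump_path ./t5-pytorch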
15 | """Convert T5 checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | from transformers import T5Config, T5Model, load_tf_weights_in_t5 21 | from transformers.utils import logging 22 | 23 | 24 | logging.set_verbosity_info() 25 | 26 | 27 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): 28 | # Initialise PyTorch model 29 | config = T5Config.from_json_file(config_file) 30 | print("Building PyTorch model from configuration: {}".format(str(config))) 31 | model = T5Model(config) 32 | 33 | # Load weights from tf checkpoint 34 | load_tf_weights_in_t5(model, config, tf_checkpoint_path) 35 | 36 | # Save pytorch-model 37 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 38 | model.save_pretrained(pytorch_dump_path) 39 | 40 | 41 | if __name__ == "__main__": 42 | parser = argparse.ArgumentParser() 43 | # Required parameters 44 | parser.add_argument( 45 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 46 | ) 47 | parser.add_argument( 48 | "--config_file", 49 | default=None, 50 | type=str, 51 | required=True, 52 | help="The config json file corresponding to the pre-trained T5 model. \n" 53 | "This specifies the model architecture.", 54 | ) 55 | parser.add_argument( 56 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 57 | ) 58 | args = parser.parse_args() 59 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) 60 | -------------------------------------------------------------------------------- /transformers/convert_tf_hub_seq_to_seq_bert_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
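# Usage sketch (the hub checkpoint path is a placeholder). --is_encoder and
# --is_encoder_named_decoder are optional flags, and --vocab_size defaults to 50358:
#
#   python convert_tf_hub_seq_to_seq_bert_to_pytorch.py \
#       --tf_hub_path ./bertseq2seq_checkpoint \
#       --pytorch_dump_path ./bert2bert-pytorch \
#       --is_encoder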
15 | """Convert Seq2Seq TF Hub checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | from transformers import ( 21 | BertConfig, 22 | BertGenerationConfig, 23 | BertGenerationDecoder, 24 | BertGenerationEncoder, 25 | load_tf_weights_in_bert_generation, 26 | logging, 27 | ) 28 | 29 | 30 | logging.set_verbosity_info() 31 | 32 | 33 | def convert_tf_checkpoint_to_pytorch(tf_hub_path, pytorch_dump_path, is_encoder_named_decoder, vocab_size, is_encoder): 34 | # Initialise PyTorch model 35 | bert_config = BertConfig.from_pretrained( 36 | "bert-large-cased", 37 | vocab_size=vocab_size, 38 | max_position_embeddings=512, 39 | is_decoder=True, 40 | add_cross_attention=True, 41 | ) 42 | bert_config_dict = bert_config.to_dict() 43 | del bert_config_dict["type_vocab_size"] 44 | config = BertGenerationConfig(**bert_config_dict) 45 | if is_encoder: 46 | model = BertGenerationEncoder(config) 47 | else: 48 | model = BertGenerationDecoder(config) 49 | print("Building PyTorch model from configuration: {}".format(str(config))) 50 | 51 | # Load weights from tf checkpoint 52 | load_tf_weights_in_bert_generation( 53 | model, 54 | tf_hub_path, 55 | model_class="bert", 56 | is_encoder_named_decoder=is_encoder_named_decoder, 57 | is_encoder=is_encoder, 58 | ) 59 | 60 | # Save pytorch-model 61 | print("Save PyTorch model and config to {}".format(pytorch_dump_path)) 62 | model.save_pretrained(pytorch_dump_path) 63 | 64 | 65 | if __name__ == "__main__": 66 | parser = argparse.ArgumentParser() 67 | # Required parameters 68 | parser.add_argument( 69 | "--tf_hub_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 70 | ) 71 | parser.add_argument( 72 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 73 | ) 74 | parser.add_argument( 75 | "--is_encoder_named_decoder", 76 | action="store_true", 77 | help="If decoder has to be renamed to encoder in PyTorch model.", 78 | ) 79 | parser.add_argument("--is_encoder", action="store_true", help="If model is an encoder.") 80 | parser.add_argument("--vocab_size", default=50358, type=int, help="Vocab size of model") 81 | args = parser.parse_args() 82 | convert_tf_checkpoint_to_pytorch( 83 | args.tf_hub_path, 84 | args.pytorch_dump_path, 85 | args.is_encoder_named_decoder, 86 | args.vocab_size, 87 | is_encoder=args.is_encoder, 88 | ) 89 | -------------------------------------------------------------------------------- /transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Convert XLM checkpoint.""" 16 | 17 | 18 | import argparse 19 | import json 20 | 21 | import numpy 22 | import torch 23 | 24 | from transformers import CONFIG_NAME, WEIGHTS_NAME 25 | from transformers.tokenization_xlm import VOCAB_FILES_NAMES 26 | from transformers.utils import logging 27 | 28 | 29 | logging.set_verbosity_info() 30 | 31 | 32 | def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path): 33 | # Load checkpoint 34 | chkpt = torch.load(xlm_checkpoint_path, map_location="cpu") 35 | 36 | state_dict = chkpt["model"] 37 | 38 | # We have the base model one level deeper than the original XLM repository 39 | two_levels_state_dict = {} 40 | for k, v in state_dict.items(): 41 | if "pred_layer" in k: 42 | two_levels_state_dict[k] = v 43 | else: 44 | two_levels_state_dict["transformer." + k] = v 45 | 46 | config = chkpt["params"] 47 | config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray))) 48 | 49 | vocab = chkpt["dico_word2id"] 50 | vocab = dict((s + "</w>" if s.find("@@") == -1 and i > 13 else s.replace("@@", ""), i) for s, i in vocab.items()) 51 | 52 | # Save pytorch-model 53 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME 54 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME 55 | pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["vocab_file"] 56 | 57 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 58 | torch.save(two_levels_state_dict, pytorch_weights_dump_path) 59 | 60 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 61 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 62 | f.write(json.dumps(config, indent=2) + "\n") 63 | 64 | print("Save vocab file to {}".format(pytorch_vocab_dump_path)) 65 | with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f: 66 | f.write(json.dumps(vocab, indent=2) + "\n") 67 | 68 | 69 | if __name__ == "__main__": 70 | parser = argparse.ArgumentParser() 71 | # Required parameters 72 | parser.add_argument( 73 | "--xlm_checkpoint_path", default=None, type=str, required=True, help="Path to the official PyTorch dump." 74 | ) 75 | parser.add_argument( 76 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 77 | ) 78 | args = parser.parse_args() 79 | convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path) 80 | -------------------------------------------------------------------------------- /transformers/data/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all.
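# Illustrative import sketch: besides the stock GLUE/XNLI/SQuAD utilities, this module
# re-exports the repository's fact-checking helpers (see the imports below and
# data/processors/factcheck.py for their definitions):
#
#   from transformers.data import factcheck_processors, factcheck_compute_metrics
#   from transformers.data import factcheck_convert_examples_to_features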
4 | 5 | from .metrics import glue_compute_metrics, xnli_compute_metrics 6 | from .processors import ( 7 | DataProcessor, 8 | InputExample, 9 | InputFeatures, 10 | SingleSentenceClassificationProcessor, 11 | SquadExample, 12 | SquadFeatures, 13 | SquadV1Processor, 14 | SquadV2Processor, 15 | glue_convert_examples_to_features, 16 | glue_output_modes, 17 | glue_processors, 18 | glue_tasks_num_labels, 19 | squad_convert_examples_to_features, 20 | xnli_output_modes, 21 | xnli_processors, 22 | xnli_tasks_num_labels, 23 | factcheck_convert_examples_to_features, 24 | # factcheck_output_modes, 25 | factcheck_processors, 26 | ) 27 | 28 | from .metrics import glue_compute_metrics, xnli_compute_metrics, factcheck_compute_metrics 29 | -------------------------------------------------------------------------------- /transformers/data/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/data/__pycache__/data_collator.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/__pycache__/data_collator.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 
4 | 5 | from .glue import GlueDataset, GlueDataTrainingArguments 6 | from .language_modeling import ( 7 | LineByLineTextDataset, 8 | LineByLineWithRefDataset, 9 | LineByLineWithSOPTextDataset, 10 | TextDataset, 11 | TextDatasetForNextSentencePrediction, 12 | ) 13 | from .squad import SquadDataset, SquadDataTrainingArguments 14 | -------------------------------------------------------------------------------- /transformers/data/datasets/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/datasets/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/data/datasets/__pycache__/glue.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/datasets/__pycache__/glue.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/data/datasets/__pycache__/language_modeling.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/datasets/__pycache__/language_modeling.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/data/datasets/__pycache__/lm.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/datasets/__pycache__/lm.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/data/datasets/__pycache__/squad.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/datasets/__pycache__/squad.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/data/metrics/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/metrics/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/data/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 
4 | 5 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels 6 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features 7 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor 8 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels 9 | from .factcheck import factcheck_convert_examples_to_features, factcheck_processors, DatasetForClassification 10 | -------------------------------------------------------------------------------- /transformers/data/processors/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/processors/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/data/processors/__pycache__/factcheck.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/processors/__pycache__/factcheck.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/data/processors/__pycache__/glue.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/processors/__pycache__/glue.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/data/processors/__pycache__/squad.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/processors/__pycache__/squad.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/data/processors/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/processors/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/data/processors/__pycache__/xnli.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/processors/__pycache__/xnli.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/data/test_generation_utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import unittest 3 | 4 | import timeout_decorator 5 | 6 | from transformers import is_torch_available 7 | from transformers.file_utils import cached_property 8 | from transformers.testing_utils import require_torch 9 | 10 | 11 | if is_torch_available(): 12 | import torch 13 | 14 | from transformers import MarianConfig, MarianMTModel 15 | 16 | 17 | @require_torch 18 | class 
GenerationUtilsTest(unittest.TestCase): 19 | @cached_property 20 | def config(self): 21 | config = MarianConfig.from_pretrained("sshleifer/tiny-marian-en-de") 22 | return config 23 | 24 | @cached_property 25 | def model(self): 26 | return MarianMTModel(self.config) 27 | 28 | def test_postprocess_next_token_scores(self): 29 | config = self.config 30 | model = self.model 31 | # Initialize an input id tensor with batch size 8 and sequence length 12 32 | input_ids = torch.arange(0, 96, 1).view((8, 12)) 33 | eos = config.eos_token_id 34 | bad_words_ids_test_cases = [[[299]], [[23, 24], [54]], [[config.eos_token_id]], []] 35 | masked_scores = [ 36 | [(0, 299), (1, 299), (2, 299), (3, 299), (4, 299), (5, 299), (6, 299), (7, 299)], 37 | [(1, 24), (0, 54), (1, 54), (2, 54), (3, 54), (4, 54), (5, 54), (6, 54), (7, 54)], 38 | [(0, eos), (1, eos), (2, eos), (3, eos), (4, eos), (5, eos), (6, eos), (7, eos)], 39 | [], 40 | ] 41 | 42 | for test_case_index, bad_words_ids in enumerate(bad_words_ids_test_cases): 43 | # Initialize a scores tensor with batch size 8 and vocabulary size 300 44 | scores = torch.rand((8, 300)) 45 | output = model.postprocess_next_token_scores( 46 | scores, 47 | input_ids, 48 | 0, 49 | bad_words_ids, 50 | 13, 51 | 15, 52 | config.max_length, 53 | config.eos_token_id, 54 | config.repetition_penalty, 55 | 32, 56 | 5, 57 | ) 58 | for masked_score in masked_scores[test_case_index]: 59 | self.assertTrue(output[masked_score[0], masked_score[1]] == -float("inf")) 60 | 61 | @timeout_decorator.timeout(10) 62 | def test_postprocess_next_token_scores_large_bad_words_list(self): 63 | 64 | config = self.config 65 | model = self.model 66 | # Initialize an input id tensor with batch size 8 and sequence length 12 67 | input_ids = torch.arange(0, 96, 1).view((8, 12)) 68 | 69 | bad_words_ids = [] 70 | for _ in range(100): 71 | length_bad_word = random.randint(1, 4) 72 | bad_words_ids.append(random.sample(range(1, 300), length_bad_word)) 73 | 74 | scores = torch.rand((8, 300)) 75 | _ = model.postprocess_next_token_scores( 76 | scores, 77 | input_ids, 78 | 0, 79 | bad_words_ids, 80 | 13, 81 | 15, 82 | config.max_length, 83 | config.eos_token_id, 84 | config.repetition_penalty, 85 | 32, 86 | 5, 87 | ) 88 | -------------------------------------------------------------------------------- /transformers/modeling_blenderbot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | # 5 | # This source code is licensed under the MIT license found in the; 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # LICENSE file in the root directory of this source tree. 
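# A minimal sketch, assuming a toy vocabulary, of the only behaviour the class below adds
# on top of BartForConditionalGeneration: adjust_logits_during_generation pushes the BOS
# column to the most negative fp16 value so BOS is never generated, and on the final
# decoding step it additionally forces EOS.
import torch

toy_logits = torch.zeros(2, 8)                                    # batch of 2, toy vocab of 8
bos_token_id = 0
toy_logits[:, bos_token_id] = -torch.finfo(torch.float16).max    # "near -inf" in fp16
assert float(toy_logits[:, bos_token_id].max()) < -6.0e4          # BOS can no longer win argmax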
17 | """"BlenderbotForConditionalGeneration which inherits from BART""" 18 | 19 | import torch 20 | 21 | from .configuration_blenderbot import BlenderbotConfig 22 | from .file_utils import add_start_docstrings 23 | from .modeling_bart import BartForConditionalGeneration 24 | 25 | 26 | BLENDER_START_DOCSTRING = r""" 27 | 28 | This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic 29 | methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, 30 | pruning heads etc.) 31 | 32 | This model is also a PyTorch `torch.nn.Module `__ 33 | subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to 34 | general usage and behavior. 35 | 36 | """ 37 | 38 | BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST = ["facebook/blenderbot-3B", "facebook/blenderbot-90M"] 39 | 40 | 41 | @add_start_docstrings( 42 | "The BART Model with a language modeling head. Can be used for summarization.", BLENDER_START_DOCSTRING 43 | ) 44 | class BlenderbotForConditionalGeneration(BartForConditionalGeneration): 45 | """ 46 | This class overrides :class:`~transformers.BartForConditionalGeneration`. Please check the superclass for the 47 | appropriate documentation alongside usage examples. 48 | """ 49 | 50 | config_class = BlenderbotConfig 51 | 52 | def adjust_logits_during_generation(self, logits, cur_len, max_length): 53 | logits[:, self.config.bos_token_id] = -torch.finfo(torch.float16).max # near infinity fp16 54 | if cur_len == max_length - 1 and self.config.eos_token_id is not None: 55 | self._force_token_id_to_be_generated(logits, self.config.eos_token_id) 56 | return logits 57 | -------------------------------------------------------------------------------- /transformers/modeling_marian.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 Marian Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """PyTorch MarianMTModel model, ported from the Marian C++ repo.""" 16 | 17 | 18 | from .configuration_marian import MarianConfig 19 | from .modeling_bart import BartForConditionalGeneration 20 | 21 | 22 | # See all Marian models at https://huggingface.co/models?search=Helsinki-NLP 23 | 24 | 25 | class MarianMTModel(BartForConditionalGeneration): 26 | r""" 27 | Pytorch version of marian-nmt's transformer.h (c++). Designed for the OPUS-NMT translation checkpoints. Available 28 | models are listed `here `__. 29 | 30 | This class overrides :class:`~transformers.BartForConditionalGeneration`. Please check the superclass for the 31 | appropriate documentation alongside usage examples. 
32 | 33 | Examples:: 34 | 35 | >>> from transformers import MarianTokenizer, MarianMTModel 36 | >>> from typing import List 37 | >>> src = 'fr' # source language 38 | >>> trg = 'en' # target language 39 | >>> sample_text = "où est l'arrêt de bus ?" 40 | >>> mname = f'Helsinki-NLP/opus-mt-{src}-{trg}' 41 | 42 | >>> model = MarianMTModel.from_pretrained(mname) 43 | >>> tok = MarianTokenizer.from_pretrained(mname) 44 | >>> batch = tok.prepare_seq2seq_batch(src_texts=[sample_text]) # don't need tgt_text for inference 45 | >>> gen = model.generate(**batch) # for forward pass: model(**batch) 46 | >>> words: List[str] = tok.batch_decode(gen, skip_special_tokens=True) # returns "Where is the bus stop ?" 47 | 48 | """ 49 | config_class = MarianConfig 50 | authorized_missing_keys = [ 51 | "model.encoder.embed_positions.weight", 52 | "model.decoder.embed_positions.weight", 53 | ] 54 | keys_to_never_save = [ 55 | "model.encoder.embed_positions.weight", 56 | "model.decoder.embed_positions.weight", 57 | ] 58 | 59 | def adjust_logits_during_generation(self, logits, cur_len, max_length): 60 | logits[:, self.config.pad_token_id] = float("-inf") # never predict pad token. 61 | if cur_len == max_length - 1 and self.config.eos_token_id is not None: 62 | self._force_token_id_to_be_generated(logits, self.config.eos_token_id) 63 | return logits 64 | -------------------------------------------------------------------------------- /transformers/modeling_mbart.py: -------------------------------------------------------------------------------- 1 | from .configuration_mbart import MBartConfig 2 | from .modeling_bart import BartForConditionalGeneration 3 | 4 | 5 | _CONFIG_FOR_DOC = "MBartConfig" 6 | _TOKENIZER_FOR_DOC = "MBartTokenizer" 7 | 8 | MBART_PRETRAINED_MODEL_ARCHIVE_LIST = [ 9 | "facebook/mbart-large-cc25", 10 | "facebook/mbart-large-en-ro", 11 | # See all multilingual BART models at https://huggingface.co/models?filter=mbart 12 | ] 13 | 14 | 15 | class MBartForConditionalGeneration(BartForConditionalGeneration): 16 | r""" 17 | This class overrides :class:`~transformers.BartForConditionalGeneration`. Please check the superclass for the 18 | appropriate documentation alongside usage examples. 19 | 20 | Examples:: 21 | >>> from transformers import MBartForConditionalGeneration, MBartTokenizer 22 | >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro") 23 | >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro") 24 | >>> article = "UN Chief Says There Is No Military Solution in Syria" 25 | >>> batch = tokenizer.prepare_seq2seq_batch(src_texts=[article]) 26 | >>> translated_tokens = model.generate(**batch) 27 | >>> translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0] 28 | >>> assert translation == "Şeful ONU declară că nu există o soluţie militară în Siria" 29 | """ 30 | model_type = "mbart" 31 | config_class = MBartConfig 32 | authorized_missing_keys = [ 33 | "model.encoder.embed_positions.weight", 34 | "model.decoder.embed_positions.weight", 35 | ] 36 | keys_to_never_save = [ 37 | "model.encoder.embed_positions.weight", 38 | "model.decoder.embed_positions.weight", 39 | ] 40 | -------------------------------------------------------------------------------- /transformers/modeling_pegasus.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 Google and The HuggingFace Inc. team. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """PyTorch Pegasus model, ported from https://github.com/google-research/pegasus""" 16 | 17 | 18 | from .configuration_pegasus import PegasusConfig 19 | from .file_utils import add_start_docstrings 20 | from .modeling_bart import BART_START_DOCSTRING, BartForConditionalGeneration 21 | 22 | 23 | @add_start_docstrings("The Pegasus Model for summarization ", BART_START_DOCSTRING) 24 | class PegasusForConditionalGeneration(BartForConditionalGeneration): 25 | r""" 26 | Pytorch version of google's pegasus model for summarization. Available models are listed `here 27 | `__. 28 | 29 | This class overrides :class:`~transformers.BartForConditionalGeneration`. Please check the superclass for the 30 | appropriate documentation alongside usage examples. 31 | 32 | Examples:: 33 | 34 | >>> from transformers import PegasusTokenizer, PegasusForConditionalGeneration 35 | >>> from typing import List 36 | >>> PGE_ARTICLE = "PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow." 37 | >>> mname = "google/pegasus-xsum" 38 | 39 | >>> model = PegasusForConditionalGeneration.from_pretrained(mname) 40 | >>> tok = PegasusTokenizer.from_pretrained(mname) 41 | >>> batch = tok.prepare_seq2seq_batch(src_texts=[PGE_ARTICLE]) # don't need tgt_text for inference 42 | >>> gen = model.generate(**batch) # for forward pass: model(**batch) 43 | >>> summary: List[str] = tok.batch_decode(gen, skip_special_tokens=True) 44 | >>> assert summary == "California's largest electricity provider has turned off power to tens of thousands of customers." 45 | 46 | """ 47 | # All the code is in src/transformers/modeling_bart.py 48 | config_class = PegasusConfig 49 | authorized_missing_keys = [ 50 | r"final_logits_bias", 51 | r"encoder\.version", 52 | r"decoder\.version", 53 | "model.encoder.embed_positions", 54 | "model.decoder.embed_positions", 55 | ] 56 | keys_to_never_save = [ 57 | "model.encoder.embed_positions.weight", 58 | "model.decoder.embed_positions.weight", 59 | ] 60 | -------------------------------------------------------------------------------- /transformers/modeling_tf_blenderbot.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """TF BlenderBot model, ported from the fairseq repo.""" 16 | from .configuration_blenderbot import BlenderbotConfig 17 | from .file_utils import add_start_docstrings, is_tf_available 18 | from .modeling_tf_bart import BART_START_DOCSTRING, LARGE_NEGATIVE, TFBartForConditionalGeneration 19 | from .utils import logging 20 | 21 | 22 | if is_tf_available(): 23 | import tensorflow as tf 24 | 25 | 26 | _CONFIG_FOR_DOC = "BlenderbotConfig" 27 | 28 | START_DOCSTRING = BART_START_DOCSTRING.replace( 29 | "inherits from :class:`~transformers.TFPreTrainedModel`", 30 | "inherits from :class:`~transformers.TFBartForConditionalGeneration`", 31 | ).replace("BartConfig", _CONFIG_FOR_DOC) 32 | 33 | 34 | logger = logging.get_logger(__name__) 35 | 36 | 37 | @add_start_docstrings("Blenderbot model for open domain dialogue", START_DOCSTRING) 38 | class TFBlenderbotForConditionalGeneration(TFBartForConditionalGeneration): 39 | config_class = BlenderbotConfig 40 | 41 | def adjust_logits_during_generation(self, logits, cur_len, max_length): 42 | """Never predict pad_token_id. Predict when max_length is reached.""" 43 | vocab_range = tf.constant(range(self.config.vocab_size)) 44 | logits = tf.where(vocab_range == self.config.pad_token_id, LARGE_NEGATIVE, logits) 45 | if cur_len == max_length - 1: 46 | logits = tf.where(vocab_range != self.config.eos_token_id, LARGE_NEGATIVE, logits) 47 | return logits 48 | -------------------------------------------------------------------------------- /transformers/modeling_tf_marian.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
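# A minimal sketch, assuming a toy vocabulary and using -1e8 as a stand-in for the
# LARGE_NEGATIVE constant imported above, of the tf.where masking that the TF Blenderbot
# model above and the TF Marian model below apply in adjust_logits_during_generation:
# the pad column is overwritten with a large negative value so it is never generated.
import tensorflow as tf

LARGE_NEGATIVE_SKETCH = -1e8
pad_token_id = 1
toy_logits = tf.zeros((2, 8))                                 # batch of 2, toy vocab of 8
vocab_range = tf.constant(range(8))
masked = tf.where(vocab_range == pad_token_id, LARGE_NEGATIVE_SKETCH, toy_logits)
# masked[:, 1] is now -1e8 while every other column is left untouched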
15 | """TF Marian model, ported from the fairseq repo.""" 16 | 17 | from .configuration_marian import MarianConfig 18 | from .file_utils import add_start_docstrings, is_tf_available 19 | from .modeling_tf_bart import BART_START_DOCSTRING, LARGE_NEGATIVE, TFBartForConditionalGeneration 20 | from .utils import logging 21 | 22 | 23 | if is_tf_available(): 24 | import tensorflow as tf 25 | 26 | 27 | _CONFIG_FOR_DOC = "MarianConfig" 28 | 29 | START_DOCSTRING = BART_START_DOCSTRING.replace( 30 | "inherits from :class:`~transformers.TFPreTrainedModel`", 31 | "inherits from :class:`~transformers.TFBartForConditionalGeneration`", 32 | ).replace("BartConfig", _CONFIG_FOR_DOC) 33 | 34 | 35 | logger = logging.get_logger(__name__) 36 | 37 | 38 | @add_start_docstrings("Marian model for machine translation", START_DOCSTRING) 39 | class TFMarianMTModel(TFBartForConditionalGeneration): 40 | authorized_missing_keys = [ 41 | r"model.encoder.embed_positions.weight", 42 | r"model.decoder.embed_positions.weight", 43 | ] 44 | config_class = MarianConfig 45 | 46 | def adjust_logits_during_generation(self, logits, cur_len, max_length): 47 | """Never predict pad_token_id. Predict when max_length is reached.""" 48 | vocab_range = tf.constant(range(self.config.vocab_size)) 49 | logits = tf.where(vocab_range == self.config.pad_token_id, LARGE_NEGATIVE, logits) 50 | if cur_len == max_length - 1: 51 | logits = tf.where(vocab_range != self.config.eos_token_id, LARGE_NEGATIVE, logits) 52 | return logits 53 | -------------------------------------------------------------------------------- /transformers/modeling_tf_mbart.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """TF mBART model, originally from fairseq.""" 16 | from .configuration_mbart import MBartConfig 17 | from .file_utils import add_start_docstrings 18 | from .modeling_tf_bart import BART_START_DOCSTRING, TFBartForConditionalGeneration 19 | from .utils import logging 20 | 21 | 22 | _CONFIG_FOR_DOC = "MBartConfig" 23 | 24 | START_DOCSTRING = BART_START_DOCSTRING.replace( 25 | "inherits from :class:`~transformers.TFPreTrainedModel`", 26 | "inherits from :class:`~transformers.TFBartForConditionalGeneration`", 27 | ).replace("BartConfig", _CONFIG_FOR_DOC) 28 | 29 | 30 | logger = logging.get_logger(__name__) 31 | 32 | 33 | @add_start_docstrings("mBART (multilingual BART) model for machine translation", START_DOCSTRING) 34 | class TFMBartForConditionalGeneration(TFBartForConditionalGeneration): 35 | config_class = MBartConfig 36 | # All the code is in src/transformers/modeling_tf_bart.py 37 | -------------------------------------------------------------------------------- /transformers/modeling_tf_pegasus.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """TF Pegasus model, ported from the fairseq repo.""" 16 | from .configuration_pegasus import PegasusConfig 17 | from .file_utils import add_start_docstrings 18 | from .modeling_tf_bart import BART_START_DOCSTRING, TFBartForConditionalGeneration 19 | from .utils import logging 20 | 21 | 22 | _CONFIG_FOR_DOC = "PegasusConfig" 23 | 24 | START_DOCSTRING = BART_START_DOCSTRING.replace( 25 | "inherits from :class:`~transformers.TFPreTrainedModel`", 26 | "inherits from :class:`~transformers.TFBartForConditionalGeneration`", 27 | ).replace("BartConfig", _CONFIG_FOR_DOC) 28 | 29 | 30 | logger = logging.get_logger(__name__) 31 | 32 | 33 | @add_start_docstrings("Pegasus model for summarization", START_DOCSTRING) 34 | class TFPegasusForConditionalGeneration(TFBartForConditionalGeneration): 35 | authorized_missing_keys = [ 36 | r"final_logits_bias", 37 | r"model.encoder.embed_positions.weight", 38 | r"model.decoder.embed_positions.weight", 39 | ] 40 | config_class = PegasusConfig 41 | # All the code is in src/transformers/modeling_tf_bart.py 42 | -------------------------------------------------------------------------------- /transformers/tokenization_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for DistilBERT.""" 16 | 17 | from .tokenization_bert import BertTokenizer 18 | from .utils import logging 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 24 | 25 | PRETRAINED_VOCAB_FILES_MAP = { 26 | "vocab_file": { 27 | "distilbert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", 28 | "distilbert-base-uncased-distilled-squad": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt", 29 | "distilbert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt", 30 | "distilbert-base-cased-distilled-squad": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt", 31 | "distilbert-base-german-cased": "https://huggingface.co/distilbert-base-german-cased/resolve/main/vocab.txt", 32 | "distilbert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt", 33 | } 34 | } 35 | 36 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 37 | "distilbert-base-uncased": 512, 38 | "distilbert-base-uncased-distilled-squad": 512, 39 | "distilbert-base-cased": 512, 40 | "distilbert-base-cased-distilled-squad": 512, 41 | "distilbert-base-german-cased": 512, 42 | "distilbert-base-multilingual-cased": 512, 43 | } 44 | 45 | 46 | PRETRAINED_INIT_CONFIGURATION = { 47 | "distilbert-base-uncased": {"do_lower_case": True}, 48 | "distilbert-base-uncased-distilled-squad": {"do_lower_case": True}, 49 | "distilbert-base-cased": {"do_lower_case": False}, 50 | "distilbert-base-cased-distilled-squad": {"do_lower_case": False}, 51 | "distilbert-base-german-cased": {"do_lower_case": False}, 52 | "distilbert-base-multilingual-cased": {"do_lower_case": False}, 53 | } 54 | 55 | 56 | class DistilBertTokenizer(BertTokenizer): 57 | r""" 58 | Construct a DistilBERT tokenizer. 59 | 60 | :class:`~transformers.DistilBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 61 | tokenization: punctuation splitting and wordpiece. 62 | 63 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 64 | parameters. 65 | """ 66 | 67 | vocab_files_names = VOCAB_FILES_NAMES 68 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 69 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 70 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 71 | model_input_names = ["attention_mask"] 72 | -------------------------------------------------------------------------------- /transformers/tokenization_electra.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from .tokenization_bert import BertTokenizer 17 | 18 | 19 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 20 | 21 | PRETRAINED_VOCAB_FILES_MAP = { 22 | "vocab_file": { 23 | "google/electra-small-generator": "https://huggingface.co/google/electra-small-generator/resolve/main/vocab.txt", 24 | "google/electra-base-generator": "https://huggingface.co/google/electra-base-generator/resolve/main/vocab.txt", 25 | "google/electra-large-generator": "https://huggingface.co/google/electra-large-generator/resolve/main/vocab.txt", 26 | "google/electra-small-discriminator": "https://huggingface.co/google/electra-small-discriminator/resolve/main/vocab.txt", 27 | "google/electra-base-discriminator": "https://huggingface.co/google/electra-base-discriminator/resolve/main/vocab.txt", 28 | "google/electra-large-discriminator": "https://huggingface.co/google/electra-large-discriminator/resolve/main/vocab.txt", 29 | } 30 | } 31 | 32 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 33 | "google/electra-small-generator": 512, 34 | "google/electra-base-generator": 512, 35 | "google/electra-large-generator": 512, 36 | "google/electra-small-discriminator": 512, 37 | "google/electra-base-discriminator": 512, 38 | "google/electra-large-discriminator": 512, 39 | } 40 | 41 | 42 | PRETRAINED_INIT_CONFIGURATION = { 43 | "google/electra-small-generator": {"do_lower_case": True}, 44 | "google/electra-base-generator": {"do_lower_case": True}, 45 | "google/electra-large-generator": {"do_lower_case": True}, 46 | "google/electra-small-discriminator": {"do_lower_case": True}, 47 | "google/electra-base-discriminator": {"do_lower_case": True}, 48 | "google/electra-large-discriminator": {"do_lower_case": True}, 49 | } 50 | 51 | 52 | class ElectraTokenizer(BertTokenizer): 53 | r""" 54 | Construct an ELECTRA tokenizer. 55 | 56 | :class:`~transformers.ElectraTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 57 | tokenization: punctuation splitting and wordpiece. 58 | 59 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 60 | parameters. 61 | """ 62 | 63 | vocab_files_names = VOCAB_FILES_NAMES 64 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 65 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 66 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 67 | -------------------------------------------------------------------------------- /transformers/tokenization_herbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google AI Language Team Authors, Allegro.pl, Facebook Inc. and the HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from .tokenization_bert import BasicTokenizer 17 | from .tokenization_xlm import XLMTokenizer 18 | from .utils import logging 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | VOCAB_FILES_NAMES = { 24 | "vocab_file": "vocab.json", 25 | "merges_file": "merges.txt", 26 | } 27 | 28 | PRETRAINED_VOCAB_FILES_MAP = { 29 | "vocab_file": {"allegro/herbert-base-cased": "https://cdn.huggingface.co/allegro/herbert-base-cased/vocab.json"}, 30 | "merges_file": {"allegro/herbert-base-cased": "https://cdn.huggingface.co/allegro/herbert-base-cased/merges.txt"}, 31 | } 32 | 33 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"allegro/herbert-base-cased": 514} 34 | PRETRAINED_INIT_CONFIGURATION = {} 35 | 36 | 37 | class HerbertTokenizer(XLMTokenizer): 38 | """ 39 | Construct a BPE tokenizer for HerBERT. 40 | 41 | Peculiarities: 42 | 43 | - uses BERT's pre-tokenizer: BaseTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of a 44 | punctuation character will be treated separately. 45 | 46 | - Such pretokenized input is BPE subtokenized 47 | 48 | This tokenizer inherits from :class:`~transformers.XLMTokenizer` which contains most of the methods. Users should 49 | refer to the superclass for more information regarding methods. 50 | """ 51 | 52 | vocab_files_names = VOCAB_FILES_NAMES 53 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 54 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 55 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 56 | 57 | def __init__(self, **kwargs): 58 | 59 | kwargs["cls_token"] = "" 60 | kwargs["unk_token"] = "" 61 | kwargs["pad_token"] = "" 62 | kwargs["mask_token"] = "" 63 | kwargs["sep_token"] = "" 64 | kwargs["do_lowercase_and_remove_accent"] = False 65 | kwargs["additional_special_tokens"] = [] 66 | 67 | super().__init__(**kwargs) 68 | self.bert_pre_tokenizer = BasicTokenizer( 69 | do_lower_case=False, never_split=self.all_special_tokens, tokenize_chinese_chars=False, strip_accents=False 70 | ) 71 | 72 | def _tokenize(self, text): 73 | 74 | pre_tokens = self.bert_pre_tokenizer.tokenize(text) 75 | 76 | split_tokens = [] 77 | for token in pre_tokens: 78 | if token: 79 | split_tokens.extend([t for t in self.bpe(token).split(" ")]) 80 | 81 | return split_tokens 82 | -------------------------------------------------------------------------------- /transformers/tokenization_layoutlm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Tokenization class for model LayoutLM.""" 16 | 17 | 18 | from .tokenization_bert import BertTokenizer 19 | from .utils import logging 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 25 | 26 | PRETRAINED_VOCAB_FILES_MAP = { 27 | "vocab_file": { 28 | "microsoft/layoutlm-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", 29 | "microsoft/layoutlm-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt", 30 | } 31 | } 32 | 33 | 34 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 35 | "microsoft/layoutlm-base-uncased": 512, 36 | "microsoft/layoutlm-large-uncased": 512, 37 | } 38 | 39 | 40 | PRETRAINED_INIT_CONFIGURATION = { 41 | "microsoft/layoutlm-base-uncased": {"do_lower_case": True}, 42 | "microsoft/layoutlm-large-uncased": {"do_lower_case": True}, 43 | } 44 | 45 | 46 | class LayoutLMTokenizer(BertTokenizer): 47 | r""" 48 | Constructs a LayoutLM tokenizer. 49 | 50 | :class:`~transformers.LayoutLMTokenizer is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 51 | tokenization: punctuation splitting + wordpiece. 52 | 53 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 54 | parameters. 55 | """ 56 | 57 | vocab_files_names = VOCAB_FILES_NAMES 58 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 59 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 60 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 61 | -------------------------------------------------------------------------------- /transformers/tokenization_layoutlm_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
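# A minimal usage sketch (assuming the listed checkpoints are reachable on the Hugging
# Face hub) for the BERT-derived tokenizers in this section, e.g. the LayoutLMTokenizer
# defined above: they are loaded and used exactly like BertTokenizer.
from .tokenization_layoutlm import LayoutLMTokenizer

layoutlm_tok = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
wordpieces = layoutlm_tok.tokenize("perplexity-based fact checking")   # WordPiece pieces
input_ids = layoutlm_tok.encode("perplexity-based fact checking")      # adds [CLS]/[SEP] ids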
15 | """ Tokenization class for model LayoutLM.""" 16 | 17 | 18 | from .tokenization_bert_fast import BertTokenizerFast 19 | from .tokenization_layoutlm import LayoutLMTokenizer 20 | from .utils import logging 21 | 22 | 23 | logger = logging.get_logger(__name__) 24 | 25 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} 26 | 27 | PRETRAINED_VOCAB_FILES_MAP = { 28 | "vocab_file": { 29 | "microsoft/layoutlm-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", 30 | "microsoft/layoutlm-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt", 31 | }, 32 | "tokenizer_file": { 33 | "microsoft/layoutlm-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", 34 | "microsoft/layoutlm-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/tokenizer.json", 35 | }, 36 | } 37 | 38 | 39 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 40 | "microsoft/layoutlm-base-uncased": 512, 41 | "microsoft/layoutlm-large-uncased": 512, 42 | } 43 | 44 | 45 | PRETRAINED_INIT_CONFIGURATION = { 46 | "microsoft/layoutlm-base-uncased": {"do_lower_case": True}, 47 | "microsoft/layoutlm-large-uncased": {"do_lower_case": True}, 48 | } 49 | 50 | 51 | class LayoutLMTokenizerFast(BertTokenizerFast): 52 | r""" 53 | Constructs a "Fast" LayoutLMTokenizer. 54 | 55 | :class:`~transformers.LayoutLMTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs 56 | end-to-end tokenization: punctuation splitting + wordpiece. 57 | 58 | Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning 59 | parameters. 60 | """ 61 | 62 | vocab_files_names = VOCAB_FILES_NAMES 63 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 64 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 65 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 66 | slow_tokenizer_class = LayoutLMTokenizer 67 | -------------------------------------------------------------------------------- /transformers/tokenization_longformer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | from .tokenization_roberta import RobertaTokenizer 17 | from .utils import logging 18 | 19 | 20 | logger = logging.get_logger(__name__) 21 | 22 | 23 | # vocab and merges same as roberta 24 | vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json" 25 | merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt" 26 | _all_longformer_models = [ 27 | "allenai/longformer-base-4096", 28 | "allenai/longformer-large-4096", 29 | "allenai/longformer-large-4096-finetuned-triviaqa", 30 | "allenai/longformer-base-4096-extra.pos.embd.only", 31 | "allenai/longformer-large-4096-extra.pos.embd.only", 32 | ] 33 | 34 | 35 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 36 | "allenai/longformer-base-4096": 4096, 37 | "allenai/longformer-large-4096": 4096, 38 | "allenai/longformer-large-4096-finetuned-triviaqa": 4096, 39 | "allenai/longformer-base-4096-extra.pos.embd.only": 4096, 40 | "allenai/longformer-large-4096-extra.pos.embd.only": 4096, 41 | } 42 | 43 | 44 | class LongformerTokenizer(RobertaTokenizer): 45 | r""" 46 | Construct a Longformer tokenizer. 47 | 48 | :class:`~transformers.LongformerTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to the 49 | superclass for usage examples and documentation concerning parameters. 50 | """ 51 | # merges and vocab same as Roberta 52 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 53 | pretrained_vocab_files_map = { 54 | "vocab_file": {m: vocab_url for m in _all_longformer_models}, 55 | "merges_file": {m: merges_url for m in _all_longformer_models}, 56 | } 57 | -------------------------------------------------------------------------------- /transformers/tokenization_longformer_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | from .tokenization_longformer import LongformerTokenizer 17 | from .tokenization_roberta_fast import RobertaTokenizerFast 18 | from .utils import logging 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | 24 | # vocab and merges same as roberta 25 | vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json" 26 | merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt" 27 | tokenizer_url = "https://huggingface.co/roberta-large/resolve/main/tokenizer.json" 28 | _all_longformer_models = [ 29 | "allenai/longformer-base-4096", 30 | "allenai/longformer-large-4096", 31 | "allenai/longformer-large-4096-finetuned-triviaqa", 32 | "allenai/longformer-base-4096-extra.pos.embd.only", 33 | "allenai/longformer-large-4096-extra.pos.embd.only", 34 | ] 35 | 36 | 37 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 38 | "allenai/longformer-base-4096": 4096, 39 | "allenai/longformer-large-4096": 4096, 40 | "allenai/longformer-large-4096-finetuned-triviaqa": 4096, 41 | "allenai/longformer-base-4096-extra.pos.embd.only": 4096, 42 | "allenai/longformer-large-4096-extra.pos.embd.only": 4096, 43 | } 44 | 45 | 46 | class LongformerTokenizerFast(RobertaTokenizerFast): 47 | r""" 48 | Construct a "fast" Longformer tokenizer (backed by HuggingFace's `tokenizers` library). 49 | 50 | :class:`~transformers.LongformerTokenizerFast` is identical to :class:`~transformers.RobertaTokenizerFast`. Refer 51 | to the superclass for usage examples and documentation concerning parameters. 52 | """ 53 | # merges and vocab same as Roberta 54 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 55 | pretrained_vocab_files_map = { 56 | "vocab_file": {m: vocab_url for m in _all_longformer_models}, 57 | "merges_file": {m: merges_url for m in _all_longformer_models}, 58 | "tokenizer_file": {m: tokenizer_url for m in _all_longformer_models}, 59 | } 60 | slow_tokenizer_class = LongformerTokenizer 61 | -------------------------------------------------------------------------------- /transformers/tokenization_lxmert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from .tokenization_bert import BertTokenizer 17 | 18 | 19 | #################################################### 20 | # Mapping from the keyword arguments names of Tokenizer `__init__` 21 | # to file names for serializing Tokenizer instances 22 | #################################################### 23 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 24 | 25 | #################################################### 26 | # Mapping from the keyword arguments names of Tokenizer `__init__` 27 | # to pretrained vocabulary URL for all the model shortcut names. 
28 | #################################################### 29 | PRETRAINED_VOCAB_FILES_MAP = { 30 | "vocab_file": { 31 | "unc-nlp/lxmert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", 32 | } 33 | } 34 | 35 | #################################################### 36 | # Mapping from model shortcut names to max length of inputs 37 | #################################################### 38 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 39 | "unc-nlp/lxmert-base-uncased": 512, 40 | } 41 | #################################################### 42 | # Mapping from model shortcut names to a dictionary of additional 43 | # keyword arguments for Tokenizer `__init__`. 44 | # To be used for checkpoint specific configurations. 45 | #################################################### 46 | PRETRAINED_INIT_CONFIGURATION = { 47 | "unc-nlp/lxmert-base-uncased": {"do_lower_case": True}, 48 | } 49 | 50 | 51 | class LxmertTokenizer(BertTokenizer): 52 | r""" 53 | Construct an LXMERT tokenizer. 54 | 55 | :class:`~transformers.LxmertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 56 | tokenization: punctuation splitting and wordpiece. 57 | 58 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 59 | parameters. 60 | """ 61 | 62 | vocab_files_names = VOCAB_FILES_NAMES 63 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 64 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 65 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 66 | -------------------------------------------------------------------------------- /transformers/tokenization_lxmert_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from .tokenization_bert_fast import BertTokenizerFast 17 | from .tokenization_lxmert import LxmertTokenizer 18 | 19 | 20 | #################################################### 21 | # Mapping from the keyword arguments names of Tokenizer `__init__` 22 | # to file names for serializing Tokenizer instances 23 | #################################################### 24 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} 25 | 26 | #################################################### 27 | # Mapping from the keyword arguments names of Tokenizer `__init__` 28 | # to pretrained vocabulary URL for all the model shortcut names. 
29 | #################################################### 30 | PRETRAINED_VOCAB_FILES_MAP = { 31 | "vocab_file": { 32 | "unc-nlp/lxmert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", 33 | }, 34 | "tokenizer_file": { 35 | "unc-nlp/lxmert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", 36 | }, 37 | } 38 | 39 | #################################################### 40 | # Mapping from model shortcut names to max length of inputs 41 | #################################################### 42 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 43 | "unc-nlp/lxmert-base-uncased": 512, 44 | } 45 | #################################################### 46 | # Mapping from model shortcut names to a dictionary of additional 47 | # keyword arguments for Tokenizer `__init__`. 48 | # To be used for checkpoint specific configurations. 49 | #################################################### 50 | PRETRAINED_INIT_CONFIGURATION = { 51 | "unc-nlp/lxmert-base-uncased": {"do_lower_case": True}, 52 | } 53 | 54 | 55 | class LxmertTokenizerFast(BertTokenizerFast): 56 | r""" 57 | Construct a "fast" LXMERT tokenizer (backed by HuggingFace's `tokenizers` library). 58 | 59 | :class:`~transformers.LxmertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs 60 | end-to-end tokenization: punctuation splitting and wordpiece. 61 | 62 | Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning 63 | parameters. 64 | """ 65 | vocab_files_names = VOCAB_FILES_NAMES 66 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 67 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 68 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 69 | slow_tokenizer_class = LxmertTokenizer 70 | -------------------------------------------------------------------------------- /transformers/tokenization_mobilebert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Tokenization classes for MobileBERT.""" 15 | 16 | from .tokenization_bert import BertTokenizer 17 | from .utils import logging 18 | 19 | 20 | logger = logging.get_logger(__name__) 21 | 22 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 23 | 24 | PRETRAINED_VOCAB_FILES_MAP = { 25 | "vocab_file": {"mobilebert-uncased": "https://huggingface.co/google/mobilebert-uncased/resolve/main/vocab.txt"} 26 | } 27 | 28 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"mobilebert-uncased": 512} 29 | 30 | 31 | PRETRAINED_INIT_CONFIGURATION = {} 32 | 33 | 34 | class MobileBertTokenizer(BertTokenizer): 35 | r""" 36 | Construct a MobileBERT tokenizer. 37 | 38 | :class:`~transformers.MobileBertTokenizer is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 39 | tokenization: punctuation splitting and wordpiece. 
40 | 41 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 42 | parameters. 43 | """ 44 | 45 | vocab_files_names = VOCAB_FILES_NAMES 46 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 47 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 48 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 49 | -------------------------------------------------------------------------------- /transformers/tokenization_mobilebert_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Tokenization classes for MobileBERT.""" 15 | 16 | from .tokenization_bert_fast import BertTokenizerFast 17 | from .tokenization_mobilebert import MobileBertTokenizer 18 | from .utils import logging 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} 24 | 25 | PRETRAINED_VOCAB_FILES_MAP = { 26 | "vocab_file": {"mobilebert-uncased": "https://huggingface.co/google/mobilebert-uncased/resolve/main/vocab.txt"}, 27 | "tokenizer_file": { 28 | "mobilebert-uncased": "https://huggingface.co/google/mobilebert-uncased/resolve/main/tokenizer.json" 29 | }, 30 | } 31 | 32 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"mobilebert-uncased": 512} 33 | 34 | 35 | PRETRAINED_INIT_CONFIGURATION = {} 36 | 37 | 38 | class MobileBertTokenizerFast(BertTokenizerFast): 39 | r""" 40 | Construct a "fast" MobileBERT tokenizer (backed by HuggingFace's `tokenizers` library). 41 | 42 | :class:`~transformers.MobileBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs 43 | end-to-end tokenization: punctuation splitting and wordpiece. 44 | 45 | Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning 46 | parameters. 47 | """ 48 | 49 | vocab_files_names = VOCAB_FILES_NAMES 50 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 51 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 52 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 53 | slow_tokenizer_class = MobileBertTokenizer 54 | -------------------------------------------------------------------------------- /transformers/tokenization_openai_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Fast Tokenization classes for OpenAI GPT.""" 16 | 17 | 18 | from typing import Optional, Tuple 19 | 20 | from .tokenization_openai import OpenAIGPTTokenizer 21 | from .tokenization_utils_fast import PreTrainedTokenizerFast 22 | from .utils import logging 23 | 24 | 25 | logger = logging.get_logger(__name__) 26 | 27 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} 28 | 29 | PRETRAINED_VOCAB_FILES_MAP = { 30 | "vocab_file": {"openai-gpt": "https://huggingface.co/openai-gpt/resolve/main/vocab.json"}, 31 | "merges_file": {"openai-gpt": "https://huggingface.co/openai-gpt/resolve/main/merges.txt"}, 32 | "tokenizer_file": {"openai-gpt": "https://huggingface.co/openai-gpt/resolve/main/tokenizer.json"}, 33 | } 34 | 35 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 36 | "openai-gpt": 512, 37 | } 38 | 39 | 40 | class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast): 41 | """ 42 | Construct a "fast" GPT Tokenizer (backed by HuggingFace's `tokenizers` library). Based on Byte-Pair-Encoding with 43 | the following peculiarities: 44 | 45 | - lower case all inputs 46 | - uses BERT's BasicTokenizer for pre-BPE tokenization 47 | 48 | This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main 49 | methods. Users should refer to this superclass for more information regarding those methods. 50 | 51 | Args: 52 | vocab_file (:obj:`str`): 53 | Path to the vocabulary file. 54 | merges_file (:obj:`str`): 55 | Path to the merges file. 56 | unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): 57 | The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this 58 | token instead. 59 | """ 60 | 61 | vocab_files_names = VOCAB_FILES_NAMES 62 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 63 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 64 | model_input_names = ["attention_mask"] 65 | slow_tokenizer_class = OpenAIGPTTokenizer 66 | 67 | def __init__(self, vocab_file, merges_file, tokenizer_file=None, unk_token="", **kwargs): 68 | super().__init__(vocab_file, merges_file, tokenizer_file=tokenizer_file, unk_token=unk_token, **kwargs) 69 | 70 | @property 71 | def do_lower_case(self): 72 | return True 73 | 74 | def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: 75 | files = self._tokenizer.model.save(save_directory, name=filename_prefix) 76 | return tuple(files) 77 | -------------------------------------------------------------------------------- /transformers/tokenization_retribert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for RetriBERT.""" 16 | 17 | from .tokenization_bert import BertTokenizer 18 | from .utils import logging 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 24 | 25 | PRETRAINED_VOCAB_FILES_MAP = { 26 | "vocab_file": { 27 | "yjernite/retribert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", 28 | } 29 | } 30 | 31 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 32 | "yjernite/retribert-base-uncased": 512, 33 | } 34 | 35 | 36 | PRETRAINED_INIT_CONFIGURATION = { 37 | "yjernite/retribert-base-uncased": {"do_lower_case": True}, 38 | } 39 | 40 | 41 | class RetriBertTokenizer(BertTokenizer): 42 | r""" 43 | Constructs a RetriBERT tokenizer. 44 | 45 | :class:`~transformers.RetriBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 46 | tokenization: punctuation splitting and wordpiece. 47 | 48 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 49 | parameters. 50 | """ 51 | 52 | vocab_files_names = VOCAB_FILES_NAMES 53 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 54 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 55 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 56 | model_input_names = ["attention_mask"] 57 | -------------------------------------------------------------------------------- /transformers/tokenization_retribert_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes for RetriBERT.""" 16 | 17 | from .tokenization_bert_fast import BertTokenizerFast 18 | from .tokenization_retribert import RetriBertTokenizer 19 | from .utils import logging 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} 25 | 26 | PRETRAINED_VOCAB_FILES_MAP = { 27 | "vocab_file": { 28 | "yjernite/retribert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", 29 | }, 30 | "tokenizer_file": { 31 | "yjernite/retribert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json", 32 | }, 33 | } 34 | 35 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 36 | "yjernite/retribert-base-uncased": 512, 37 | } 38 | 39 | 40 | PRETRAINED_INIT_CONFIGURATION = { 41 | "yjernite/retribert-base-uncased": {"do_lower_case": True}, 42 | } 43 | 44 | 45 | class RetriBertTokenizerFast(BertTokenizerFast): 46 | r""" 47 | Construct a "fast" RetriBERT tokenizer (backed by HuggingFace's `tokenizers` library). 48 | 49 | :class:`~transformers.RetriBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs 50 | end-to-end tokenization: punctuation splitting and wordpiece. 51 | 52 | Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning 53 | parameters. 54 | """ 55 | 56 | vocab_files_names = VOCAB_FILES_NAMES 57 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 58 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 59 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 60 | slow_tokenizer_class = RetriBertTokenizer 61 | model_input_names = ["attention_mask"] 62 | -------------------------------------------------------------------------------- /transformers/tokenization_squeezebert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The SqueezeBert authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes for SqueezeBERT.""" 16 | 17 | from .tokenization_bert import BertTokenizer 18 | from .utils import logging 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} 24 | 25 | PRETRAINED_VOCAB_FILES_MAP = { 26 | "vocab_file": { 27 | "squeezebert/squeezebert-uncased": "https://huggingface.co/squeezebert/squeezebert-uncased/resolve/main/vocab.txt", 28 | "squeezebert/squeezebert-mnli": "https://huggingface.co/squeezebert/squeezebert-mnli/resolve/main/vocab.txt", 29 | "squeezebert/squeezebert-mnli-headless": "https://huggingface.co/squeezebert/squeezebert-mnli-headless/resolve/main/vocab.txt", 30 | } 31 | } 32 | 33 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 34 | "squeezebert/squeezebert-uncased": 512, 35 | "squeezebert/squeezebert-mnli": 512, 36 | "squeezebert/squeezebert-mnli-headless": 512, 37 | } 38 | 39 | 40 | PRETRAINED_INIT_CONFIGURATION = { 41 | "squeezebert/squeezebert-uncased": {"do_lower_case": True}, 42 | "squeezebert/squeezebert-mnli": {"do_lower_case": True}, 43 | "squeezebert/squeezebert-mnli-headless": {"do_lower_case": True}, 44 | } 45 | 46 | 47 | class SqueezeBertTokenizer(BertTokenizer): 48 | r""" 49 | Constructs a SqueezeBert tokenizer. 50 | 51 | :class:`~transformers.SqueezeBertTokenizer is identical to :class:`~transformers.BertTokenizer` and runs end-to-end 52 | tokenization: punctuation splitting + wordpiece. 53 | 54 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning 55 | parameters. 56 | """ 57 | 58 | vocab_files_names = VOCAB_FILES_NAMES 59 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 60 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 61 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 62 | -------------------------------------------------------------------------------- /transformers/tokenization_squeezebert_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The SqueezeBert authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes for SqueezeBERT.""" 16 | 17 | from .tokenization_bert_fast import BertTokenizerFast 18 | from .tokenization_squeezebert import SqueezeBertTokenizer 19 | from .utils import logging 20 | 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} 25 | 26 | PRETRAINED_VOCAB_FILES_MAP = { 27 | "vocab_file": { 28 | "squeezebert/squeezebert-uncased": "https://huggingface.co/squeezebert/squeezebert-uncased/resolve/main/vocab.txt", 29 | "squeezebert/squeezebert-mnli": "https://huggingface.co/squeezebert/squeezebert-mnli/resolve/main/vocab.txt", 30 | "squeezebert/squeezebert-mnli-headless": "https://huggingface.co/squeezebert/squeezebert-mnli-headless/resolve/main/vocab.txt", 31 | }, 32 | "tokenizer_file": { 33 | "squeezebert/squeezebert-uncased": "https://huggingface.co/squeezebert/squeezebert-uncased/resolve/main/tokenizer.json", 34 | "squeezebert/squeezebert-mnli": "https://huggingface.co/squeezebert/squeezebert-mnli/resolve/main/tokenizer.json", 35 | "squeezebert/squeezebert-mnli-headless": "https://huggingface.co/squeezebert/squeezebert-mnli-headless/resolve/main/tokenizer.json", 36 | }, 37 | } 38 | 39 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 40 | "squeezebert/squeezebert-uncased": 512, 41 | "squeezebert/squeezebert-mnli": 512, 42 | "squeezebert/squeezebert-mnli-headless": 512, 43 | } 44 | 45 | 46 | PRETRAINED_INIT_CONFIGURATION = { 47 | "squeezebert/squeezebert-uncased": {"do_lower_case": True}, 48 | "squeezebert/squeezebert-mnli": {"do_lower_case": True}, 49 | "squeezebert/squeezebert-mnli-headless": {"do_lower_case": True}, 50 | } 51 | 52 | 53 | class SqueezeBertTokenizerFast(BertTokenizerFast): 54 | r""" 55 | Constructs a "Fast" SqueezeBert tokenizer (backed by HuggingFace's `tokenizers` library). 56 | 57 | :class:`~transformers.SqueezeBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs 58 | end-to-end tokenization: punctuation splitting + wordpiece. 59 | 60 | Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning 61 | parameters. 
62 | """ 63 | 64 | vocab_files_names = VOCAB_FILES_NAMES 65 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 66 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 67 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION 68 | slow_tokenizer_class = SqueezeBertTokenizer 69 | -------------------------------------------------------------------------------- /transformers/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/utils/__init__.py -------------------------------------------------------------------------------- /transformers/utils/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/utils/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/utils/__pycache__/dummy_flax_objects.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/utils/__pycache__/dummy_flax_objects.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/utils/__pycache__/dummy_tf_objects.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/utils/__pycache__/dummy_tf_objects.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/utils/__pycache__/logging.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/utils/__pycache__/logging.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/utils/__pycache__/sentencepiece_model_pb2.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/utils/__pycache__/sentencepiece_model_pb2.cpython-37.pyc -------------------------------------------------------------------------------- /transformers/utils/dummy_flax_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 
2 | from ..file_utils import requires_flax 3 | 4 | 5 | class FlaxBertModel: 6 | def __init__(self, *args, **kwargs): 7 | requires_flax(self) 8 | 9 | @classmethod 10 | def from_pretrained(self, *args, **kwargs): 11 | requires_flax(self) 12 | 13 | 14 | class FlaxRobertaModel: 15 | def __init__(self, *args, **kwargs): 16 | requires_flax(self) 17 | 18 | @classmethod 19 | def from_pretrained(self, *args, **kwargs): 20 | requires_flax(self) 21 | -------------------------------------------------------------------------------- /transformers/utils/dummy_sentencepiece_objects.py: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by the command `make fix-copies`, do not edit. 2 | from ..file_utils import requires_sentencepiece 3 | 4 | 5 | class AlbertTokenizer: 6 | def __init__(self, *args, **kwargs): 7 | requires_sentencepiece(self) 8 | 9 | @classmethod 10 | def from_pretrained(self, *args, **kwargs): 11 | requires_sentencepiece(self) 12 | 13 | 14 | class BertGenerationTokenizer: 15 | def __init__(self, *args, **kwargs): 16 | requires_sentencepiece(self) 17 | 18 | @classmethod 19 | def from_pretrained(self, *args, **kwargs): 20 | requires_sentencepiece(self) 21 | 22 | 23 | class CamembertTokenizer: 24 | def __init__(self, *args, **kwargs): 25 | requires_sentencepiece(self) 26 | 27 | @classmethod 28 | def from_pretrained(self, *args, **kwargs): 29 | requires_sentencepiece(self) 30 | 31 | 32 | class MarianTokenizer: 33 | def __init__(self, *args, **kwargs): 34 | requires_sentencepiece(self) 35 | 36 | @classmethod 37 | def from_pretrained(self, *args, **kwargs): 38 | requires_sentencepiece(self) 39 | 40 | 41 | class MBartTokenizer: 42 | def __init__(self, *args, **kwargs): 43 | requires_sentencepiece(self) 44 | 45 | @classmethod 46 | def from_pretrained(self, *args, **kwargs): 47 | requires_sentencepiece(self) 48 | 49 | 50 | class PegasusTokenizer: 51 | def __init__(self, *args, **kwargs): 52 | requires_sentencepiece(self) 53 | 54 | @classmethod 55 | def from_pretrained(self, *args, **kwargs): 56 | requires_sentencepiece(self) 57 | 58 | 59 | class ReformerTokenizer: 60 | def __init__(self, *args, **kwargs): 61 | requires_sentencepiece(self) 62 | 63 | @classmethod 64 | def from_pretrained(self, *args, **kwargs): 65 | requires_sentencepiece(self) 66 | 67 | 68 | class T5Tokenizer: 69 | def __init__(self, *args, **kwargs): 70 | requires_sentencepiece(self) 71 | 72 | @classmethod 73 | def from_pretrained(self, *args, **kwargs): 74 | requires_sentencepiece(self) 75 | 76 | 77 | class XLMProphetNetTokenizer: 78 | def __init__(self, *args, **kwargs): 79 | requires_sentencepiece(self) 80 | 81 | @classmethod 82 | def from_pretrained(self, *args, **kwargs): 83 | requires_sentencepiece(self) 84 | 85 | 86 | class XLMRobertaTokenizer: 87 | def __init__(self, *args, **kwargs): 88 | requires_sentencepiece(self) 89 | 90 | @classmethod 91 | def from_pretrained(self, *args, **kwargs): 92 | requires_sentencepiece(self) 93 | 94 | 95 | class XLNetTokenizer: 96 | def __init__(self, *args, **kwargs): 97 | requires_sentencepiece(self) 98 | 99 | @classmethod 100 | def from_pretrained(self, *args, **kwargs): 101 | requires_sentencepiece(self) 102 | --------------------------------------------------------------------------------
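A minimal usage sketch for the vendored tokenizer classes above, assuming this bundled `transformers` package (rather than a pip-installed copy) is on the import path and the Hugging Face URLs listed in the vocab maps are reachable; the checkpoint name is one of those referenced in tokenization_squeezebert_fast.py and the example sentence is arbitrary:

    from transformers import AutoTokenizer

    # Downloads the vocab.txt / tokenizer.json registered for this checkpoint and
    # builds the fast, `tokenizers`-backed variant when it is available.
    tokenizer = AutoTokenizer.from_pretrained("squeezebert/squeezebert-uncased", use_fast=True)
    encoded = tokenizer("Perplexity can be used as a signal for fact checking.")
    print(encoded["input_ids"], encoded["attention_mask"])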