├── .gitignore
├── README.md
├── few_shot_ppl.py
├── mlm
│   ├── CODE_OF_CONDUCT.md
│   ├── CONTRIBUTING.md
│   ├── LICENSE
│   ├── NOTICE
│   ├── README.md
│   ├── main.py
│   ├── mlm-scoring.png
│   ├── obtain_ppl_for_mlm.sh
│   ├── scripts
│   │   ├── librispeech-download-text.sh
│   │   └── librispeech-score.sh
│   ├── setup.py
│   ├── src
│   │   ├── main.py
│   │   ├── mlm.egg-info
│   │   │   ├── PKG-INFO
│   │   │   ├── SOURCES.txt
│   │   │   ├── dependency_links.txt
│   │   │   ├── entry_points.txt
│   │   │   ├── not-zip-safe
│   │   │   ├── requires.txt
│   │   │   └── top_level.txt
│   │   └── mlm
│   │       ├── __init__.py
│   │       ├── __pycache__
│   │       │   ├── __init__.cpython-36.pyc
│   │       │   ├── batchify.cpython-36.pyc
│   │       │   ├── loaders.cpython-36.pyc
│   │       │   └── scorers.cpython-36.pyc
│   │       ├── batchify.py
│   │       ├── cmds.py
│   │       ├── loaders.py
│   │       ├── models
│   │       │   ├── __init__.py
│   │       │   ├── __pycache__
│   │       │   │   ├── __init__.cpython-36.pyc
│   │       │   │   ├── bert.cpython-36.pyc
│   │       │   │   └── gpt2.cpython-36.pyc
│   │       │   ├── bert.py
│   │       │   └── gpt2.py
│   │       └── scorers.py
│   └── tests
│       ├── test_cmds.py
│       ├── test_loaders.py
│       ├── test_models.py
│       └── test_scorers.py
├── obtain_ppl_for_clm.sh
├── plot
│   ├── HKUST.jpg
│   ├── method_illustration.png
│   └── pytorch-logo-dark.png
├── run_few_shot.sh
├── run_language_modelling_clm.py
└── transformers
├── __init__.py
├── __pycache__
├── __init__.cpython-37.pyc
├── activations.cpython-37.pyc
├── configuration_albert.cpython-37.pyc
├── configuration_auto.cpython-37.pyc
├── configuration_bart.cpython-37.pyc
├── configuration_bert.cpython-37.pyc
├── configuration_bert_generation.cpython-37.pyc
├── configuration_blenderbot.cpython-37.pyc
├── configuration_camembert.cpython-37.pyc
├── configuration_ctrl.cpython-37.pyc
├── configuration_deberta.cpython-37.pyc
├── configuration_distilbert.cpython-37.pyc
├── configuration_dpr.cpython-37.pyc
├── configuration_electra.cpython-37.pyc
├── configuration_encoder_decoder.cpython-37.pyc
├── configuration_flaubert.cpython-37.pyc
├── configuration_fsmt.cpython-37.pyc
├── configuration_funnel.cpython-37.pyc
├── configuration_gpt2.cpython-37.pyc
├── configuration_layoutlm.cpython-37.pyc
├── configuration_longformer.cpython-37.pyc
├── configuration_lxmert.cpython-37.pyc
├── configuration_marian.cpython-37.pyc
├── configuration_mbart.cpython-37.pyc
├── configuration_mmbt.cpython-37.pyc
├── configuration_mobilebert.cpython-37.pyc
├── configuration_openai.cpython-37.pyc
├── configuration_pegasus.cpython-37.pyc
├── configuration_prophetnet.cpython-37.pyc
├── configuration_rag.cpython-37.pyc
├── configuration_reformer.cpython-37.pyc
├── configuration_retribert.cpython-37.pyc
├── configuration_roberta.cpython-37.pyc
├── configuration_squeezebert.cpython-37.pyc
├── configuration_t5.cpython-37.pyc
├── configuration_transfo_xl.cpython-37.pyc
├── configuration_utils.cpython-37.pyc
├── configuration_xlm.cpython-37.pyc
├── configuration_xlm_prophetnet.cpython-37.pyc
├── configuration_xlm_roberta.cpython-37.pyc
├── configuration_xlnet.cpython-37.pyc
├── convert_slow_tokenizer.cpython-37.pyc
├── file_utils.cpython-37.pyc
├── generation_beam_search.cpython-37.pyc
├── generation_logits_process.cpython-37.pyc
├── generation_utils.cpython-37.pyc
├── hf_argparser.cpython-37.pyc
├── integrations.cpython-37.pyc
├── modelcard.cpython-37.pyc
├── modeling_albert.cpython-37.pyc
├── modeling_auto.cpython-37.pyc
├── modeling_bart.cpython-37.pyc
├── modeling_bert.cpython-37.pyc
├── modeling_bert_generation.cpython-37.pyc
├── modeling_blenderbot.cpython-37.pyc
├── modeling_camembert.cpython-37.pyc
├── modeling_ctrl.cpython-37.pyc
├── modeling_deberta.cpython-37.pyc
├── modeling_distilbert.cpython-37.pyc
├── modeling_dpr.cpython-37.pyc
├── modeling_electra.cpython-37.pyc
├── modeling_encoder_decoder.cpython-37.pyc
├── modeling_flaubert.cpython-37.pyc
├── modeling_fsmt.cpython-37.pyc
├── modeling_funnel.cpython-37.pyc
├── modeling_gpt2.cpython-37.pyc
├── modeling_layoutlm.cpython-37.pyc
├── modeling_longformer.cpython-37.pyc
├── modeling_lxmert.cpython-37.pyc
├── modeling_marian.cpython-37.pyc
├── modeling_mbart.cpython-37.pyc
├── modeling_mmbt.cpython-37.pyc
├── modeling_mobilebert.cpython-37.pyc
├── modeling_openai.cpython-37.pyc
├── modeling_outputs.cpython-37.pyc
├── modeling_pegasus.cpython-37.pyc
├── modeling_prophetnet.cpython-37.pyc
├── modeling_rag.cpython-37.pyc
├── modeling_reformer.cpython-37.pyc
├── modeling_retribert.cpython-37.pyc
├── modeling_roberta.cpython-37.pyc
├── modeling_squeezebert.cpython-37.pyc
├── modeling_t5.cpython-37.pyc
├── modeling_tf_pytorch_utils.cpython-37.pyc
├── modeling_transfo_xl.cpython-37.pyc
├── modeling_transfo_xl_utilities.cpython-37.pyc
├── modeling_utils.cpython-37.pyc
├── modeling_xlm.cpython-37.pyc
├── modeling_xlm_prophetnet.cpython-37.pyc
├── modeling_xlm_roberta.cpython-37.pyc
├── modeling_xlnet.cpython-37.pyc
├── optimization.cpython-37.pyc
├── pipelines.cpython-37.pyc
├── retrieval_rag.cpython-37.pyc
├── tokenization_albert.cpython-37.pyc
├── tokenization_albert_fast.cpython-37.pyc
├── tokenization_auto.cpython-37.pyc
├── tokenization_bart.cpython-37.pyc
├── tokenization_bart_fast.cpython-37.pyc
├── tokenization_bert.cpython-37.pyc
├── tokenization_bert_fast.cpython-37.pyc
├── tokenization_bert_generation.cpython-37.pyc
├── tokenization_bert_japanese.cpython-37.pyc
├── tokenization_bertweet.cpython-37.pyc
├── tokenization_blenderbot.cpython-37.pyc
├── tokenization_camembert.cpython-37.pyc
├── tokenization_camembert_fast.cpython-37.pyc
├── tokenization_ctrl.cpython-37.pyc
├── tokenization_deberta.cpython-37.pyc
├── tokenization_distilbert.cpython-37.pyc
├── tokenization_distilbert_fast.cpython-37.pyc
├── tokenization_dpr.cpython-37.pyc
├── tokenization_dpr_fast.cpython-37.pyc
├── tokenization_electra.cpython-37.pyc
├── tokenization_electra_fast.cpython-37.pyc
├── tokenization_flaubert.cpython-37.pyc
├── tokenization_fsmt.cpython-37.pyc
├── tokenization_funnel.cpython-37.pyc
├── tokenization_funnel_fast.cpython-37.pyc
├── tokenization_gpt2.cpython-37.pyc
├── tokenization_gpt2_fast.cpython-37.pyc
├── tokenization_herbert.cpython-37.pyc
├── tokenization_herbert_fast.cpython-37.pyc
├── tokenization_layoutlm.cpython-37.pyc
├── tokenization_layoutlm_fast.cpython-37.pyc
├── tokenization_longformer.cpython-37.pyc
├── tokenization_longformer_fast.cpython-37.pyc
├── tokenization_lxmert.cpython-37.pyc
├── tokenization_lxmert_fast.cpython-37.pyc
├── tokenization_marian.cpython-37.pyc
├── tokenization_mbart.cpython-37.pyc
├── tokenization_mbart_fast.cpython-37.pyc
├── tokenization_mobilebert.cpython-37.pyc
├── tokenization_mobilebert_fast.cpython-37.pyc
├── tokenization_openai.cpython-37.pyc
├── tokenization_openai_fast.cpython-37.pyc
├── tokenization_pegasus.cpython-37.pyc
├── tokenization_pegasus_fast.cpython-37.pyc
├── tokenization_phobert.cpython-37.pyc
├── tokenization_prophetnet.cpython-37.pyc
├── tokenization_rag.cpython-37.pyc
├── tokenization_reformer.cpython-37.pyc
├── tokenization_reformer_fast.cpython-37.pyc
├── tokenization_retribert.cpython-37.pyc
├── tokenization_retribert_fast.cpython-37.pyc
├── tokenization_roberta.cpython-37.pyc
├── tokenization_roberta_fast.cpython-37.pyc
├── tokenization_squeezebert.cpython-37.pyc
├── tokenization_squeezebert_fast.cpython-37.pyc
├── tokenization_t5.cpython-37.pyc
├── tokenization_t5_fast.cpython-37.pyc
├── tokenization_transfo_xl.cpython-37.pyc
├── tokenization_utils.cpython-37.pyc
├── tokenization_utils_base.cpython-37.pyc
├── tokenization_utils_fast.cpython-37.pyc
├── tokenization_xlm.cpython-37.pyc
├── tokenization_xlm_prophetnet.cpython-37.pyc
├── tokenization_xlm_roberta.cpython-37.pyc
├── tokenization_xlm_roberta_fast.cpython-37.pyc
├── tokenization_xlnet.cpython-37.pyc
├── tokenization_xlnet_fast.cpython-37.pyc
├── trainer.cpython-37.pyc
├── trainer_callback.cpython-37.pyc
├── trainer_pt_utils.cpython-37.pyc
├── trainer_utils.cpython-37.pyc
├── training_args.cpython-37.pyc
└── training_args_tf.cpython-37.pyc
├── activations.py
├── activations_tf.py
├── benchmark
├── __init__.py
├── __pycache__
│ ├── __init__.cpython-37.pyc
│ ├── benchmark.cpython-37.pyc
│ ├── benchmark_args.cpython-37.pyc
│ ├── benchmark_args_utils.cpython-37.pyc
│ └── benchmark_utils.cpython-37.pyc
├── benchmark.py
├── benchmark_args.py
├── benchmark_args_tf.py
├── benchmark_args_utils.py
├── benchmark_tf.py
└── benchmark_utils.py
├── commands
├── __init__.py
├── add_new_model.py
├── convert.py
├── download.py
├── env.py
├── run.py
├── serving.py
├── train.py
├── transformers_cli.py
└── user.py
├── configuration_albert.py
├── configuration_auto.py
├── configuration_bart.py
├── configuration_bert.py
├── configuration_bert_generation.py
├── configuration_blenderbot.py
├── configuration_camembert.py
├── configuration_ctrl.py
├── configuration_deberta.py
├── configuration_distilbert.py
├── configuration_dpr.py
├── configuration_electra.py
├── configuration_encoder_decoder.py
├── configuration_flaubert.py
├── configuration_fsmt.py
├── configuration_funnel.py
├── configuration_gpt2.py
├── configuration_layoutlm.py
├── configuration_longformer.py
├── configuration_lxmert.py
├── configuration_marian.py
├── configuration_mbart.py
├── configuration_mmbt.py
├── configuration_mobilebert.py
├── configuration_openai.py
├── configuration_pegasus.py
├── configuration_prophetnet.py
├── configuration_rag.py
├── configuration_reformer.py
├── configuration_retribert.py
├── configuration_roberta.py
├── configuration_squeezebert.py
├── configuration_t5.py
├── configuration_transfo_xl.py
├── configuration_utils.py
├── configuration_xlm.py
├── configuration_xlm_prophetnet.py
├── configuration_xlm_roberta.py
├── configuration_xlnet.py
├── convert_albert_original_tf_checkpoint_to_pytorch.py
├── convert_bart_original_pytorch_checkpoint_to_pytorch.py
├── convert_bert_original_tf2_checkpoint_to_pytorch.py
├── convert_bert_original_tf_checkpoint_to_pytorch.py
├── convert_bert_pytorch_checkpoint_to_original_tf.py
├── convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py
├── convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py
├── convert_dpr_original_checkpoint_to_pytorch.py
├── convert_electra_original_tf_checkpoint_to_pytorch.py
├── convert_fsmt_original_pytorch_checkpoint_to_pytorch.py
├── convert_funnel_original_tf_checkpoint_to_pytorch.py
├── convert_gpt2_original_tf_checkpoint_to_pytorch.py
├── convert_graph_to_onnx.py
├── convert_longformer_original_pytorch_lightning_to_pytorch.py
├── convert_lxmert_original_tf_checkpoint_to_pytorch.py
├── convert_marian_tatoeba_to_pytorch.py
├── convert_marian_to_pytorch.py
├── convert_mbart_original_checkpoint_to_pytorch.py
├── convert_mobilebert_original_tf_checkpoint_to_pytorch.py
├── convert_openai_original_tf_checkpoint_to_pytorch.py
├── convert_pegasus_tf_to_pytorch.py
├── convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py
├── convert_pytorch_checkpoint_to_tf2.py
├── convert_reformer_trax_checkpoint_to_pytorch.py
├── convert_roberta_original_pytorch_checkpoint_to_pytorch.py
├── convert_slow_tokenizer.py
├── convert_slow_tokenizers_checkpoints_to_fast.py
├── convert_t5_original_tf_checkpoint_to_pytorch.py
├── convert_tf_hub_seq_to_seq_bert_to_pytorch.py
├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py
├── convert_xlnet_original_tf_checkpoint_to_pytorch.py
├── data
├── __init__.py
├── __pycache__
│ ├── __init__.cpython-37.pyc
│ └── data_collator.cpython-37.pyc
├── data_collator.py
├── datasets
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-37.pyc
│ │ ├── glue.cpython-37.pyc
│ │ ├── language_modeling.cpython-37.pyc
│ │ ├── lm.cpython-37.pyc
│ │ └── squad.cpython-37.pyc
│ ├── glue.py
│ ├── language_modeling.py
│ ├── lm.py
│ └── squad.py
├── metrics
│ ├── __init__.py
│ ├── __pycache__
│ │ └── __init__.cpython-37.pyc
│ └── squad_metrics.py
├── processors
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-37.pyc
│ │ ├── factcheck.cpython-37.pyc
│ │ ├── glue.cpython-37.pyc
│ │ ├── squad.cpython-37.pyc
│ │ ├── utils.cpython-37.pyc
│ │ └── xnli.cpython-37.pyc
│ ├── factcheck.py
│ ├── factcheck_old.py
│ ├── glue.py
│ ├── squad.py
│ ├── utils.py
│ └── xnli.py
└── test_generation_utils.py
├── file_utils.py
├── generation_beam_search.py
├── generation_logits_process.py
├── generation_tf_utils.py
├── generation_utils.py
├── hf_api.py
├── hf_argparser.py
├── integrations.py
├── modelcard.py
├── modeling_albert.py
├── modeling_auto.py
├── modeling_bart.py
├── modeling_bert.py
├── modeling_bert_generation.py
├── modeling_blenderbot.py
├── modeling_camembert.py
├── modeling_ctrl.py
├── modeling_deberta.py
├── modeling_distilbert.py
├── modeling_dpr.py
├── modeling_electra.py
├── modeling_encoder_decoder.py
├── modeling_flaubert.py
├── modeling_flax_auto.py
├── modeling_flax_bert.py
├── modeling_flax_roberta.py
├── modeling_flax_utils.py
├── modeling_fsmt.py
├── modeling_funnel.py
├── modeling_gpt2.py
├── modeling_layoutlm.py
├── modeling_longformer.py
├── modeling_lxmert.py
├── modeling_marian.py
├── modeling_mbart.py
├── modeling_mmbt.py
├── modeling_mobilebert.py
├── modeling_openai.py
├── modeling_outputs.py
├── modeling_pegasus.py
├── modeling_prophetnet.py
├── modeling_rag.py
├── modeling_reformer.py
├── modeling_retribert.py
├── modeling_roberta.py
├── modeling_squeezebert.py
├── modeling_t5.py
├── modeling_tf_albert.py
├── modeling_tf_auto.py
├── modeling_tf_bart.py
├── modeling_tf_bert.py
├── modeling_tf_blenderbot.py
├── modeling_tf_camembert.py
├── modeling_tf_ctrl.py
├── modeling_tf_distilbert.py
├── modeling_tf_dpr.py
├── modeling_tf_electra.py
├── modeling_tf_flaubert.py
├── modeling_tf_funnel.py
├── modeling_tf_gpt2.py
├── modeling_tf_longformer.py
├── modeling_tf_lxmert.py
├── modeling_tf_marian.py
├── modeling_tf_mbart.py
├── modeling_tf_mobilebert.py
├── modeling_tf_openai.py
├── modeling_tf_outputs.py
├── modeling_tf_pegasus.py
├── modeling_tf_pytorch_utils.py
├── modeling_tf_roberta.py
├── modeling_tf_t5.py
├── modeling_tf_transfo_xl.py
├── modeling_tf_transfo_xl_utilities.py
├── modeling_tf_utils.py
├── modeling_tf_xlm.py
├── modeling_tf_xlm_roberta.py
├── modeling_tf_xlnet.py
├── modeling_transfo_xl.py
├── modeling_transfo_xl_utilities.py
├── modeling_utils.py
├── modeling_xlm.py
├── modeling_xlm_prophetnet.py
├── modeling_xlm_roberta.py
├── modeling_xlnet.py
├── optimization.py
├── optimization_tf.py
├── pipelines.py
├── retrieval_rag.py
├── testing_utils.py
├── tokenization_albert.py
├── tokenization_albert_fast.py
├── tokenization_auto.py
├── tokenization_bart.py
├── tokenization_bart_fast.py
├── tokenization_bert.py
├── tokenization_bert_fast.py
├── tokenization_bert_generation.py
├── tokenization_bert_japanese.py
├── tokenization_bertweet.py
├── tokenization_blenderbot.py
├── tokenization_camembert.py
├── tokenization_camembert_fast.py
├── tokenization_ctrl.py
├── tokenization_deberta.py
├── tokenization_distilbert.py
├── tokenization_distilbert_fast.py
├── tokenization_dpr.py
├── tokenization_dpr_fast.py
├── tokenization_electra.py
├── tokenization_electra_fast.py
├── tokenization_flaubert.py
├── tokenization_fsmt.py
├── tokenization_funnel.py
├── tokenization_funnel_fast.py
├── tokenization_gpt2.py
├── tokenization_gpt2_fast.py
├── tokenization_herbert.py
├── tokenization_herbert_fast.py
├── tokenization_layoutlm.py
├── tokenization_layoutlm_fast.py
├── tokenization_longformer.py
├── tokenization_longformer_fast.py
├── tokenization_lxmert.py
├── tokenization_lxmert_fast.py
├── tokenization_marian.py
├── tokenization_mbart.py
├── tokenization_mbart_fast.py
├── tokenization_mobilebert.py
├── tokenization_mobilebert_fast.py
├── tokenization_openai.py
├── tokenization_openai_fast.py
├── tokenization_pegasus.py
├── tokenization_pegasus_fast.py
├── tokenization_phobert.py
├── tokenization_prophetnet.py
├── tokenization_rag.py
├── tokenization_reformer.py
├── tokenization_reformer_fast.py
├── tokenization_retribert.py
├── tokenization_retribert_fast.py
├── tokenization_roberta.py
├── tokenization_roberta_fast.py
├── tokenization_squeezebert.py
├── tokenization_squeezebert_fast.py
├── tokenization_t5.py
├── tokenization_t5_fast.py
├── tokenization_transfo_xl.py
├── tokenization_utils.py
├── tokenization_utils_base.py
├── tokenization_utils_fast.py
├── tokenization_xlm.py
├── tokenization_xlm_prophetnet.py
├── tokenization_xlm_roberta.py
├── tokenization_xlm_roberta_fast.py
├── tokenization_xlnet.py
├── tokenization_xlnet_fast.py
├── trainer.py
├── trainer_callback.py
├── trainer_pt_utils.py
├── trainer_tf.py
├── trainer_utils.py
├── training_args.py
├── training_args_tf.py
└── utils
    ├── __init__.py
    ├── __pycache__
    │   ├── __init__.cpython-37.pyc
    │   ├── dummy_flax_objects.cpython-37.pyc
    │   ├── dummy_tf_objects.cpython-37.pyc
    │   ├── logging.cpython-37.pyc
    │   └── sentencepiece_model_pb2.cpython-37.pyc
    ├── dummy_flax_objects.py
    ├── dummy_pt_objects.py
    ├── dummy_sentencepiece_objects.py
    ├── dummy_tf_objects.py
    ├── dummy_tokenizers_objects.py
    ├── hp_naming.py
    ├── logging.py
    ├── notebook.py
    └── sentencepiece_model_pb2.py
/.gitignore:
--------------------------------------------------------------------------------
1 | ppl_results/
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Perplexity-FactChecking
2 |
3 |
[License: MIT](https://opensource.org/licenses/MIT)
4 |
5 |
This repository contains the code for our paper:
6 | **Towards Few-Shot Fact-Checking via Perplexity**. Nayeon Lee\*, Yejin Bang\*, [Andrea Madotto](https://andreamad8.github.io/), Madian Khabsa, Pascale Fung. *NAACL 2021* [[PDF]](https://www.aclweb.org/anthology/2021.naacl-main.158.pdf)
7 |
8 |
9 |
10 | ## How to run
11 |
12 | #### 1. Dataset Preparation
13 |
14 | To download the test set with evidence used for the experiments described in the paper, please fill in the request form: https://forms.gle/5key5cTqCu5ZLTnr7
15 | Details of the test set can be found in the paper.
16 |
17 | After downloading, please place the data files under the 'data/' directory.
18 |
19 | #### 2. Obtain Evidence-conditioned Perplexity
20 | Running the scripts below will save files with perplexity scores to the "ppl_results/" directory.
21 |
22 | **a. Causal Language Model**
23 |
24 | ```
25 | bash obtain_ppl_for_clm.sh
26 | ```
27 |
28 | **b. Masked Language Model**
29 |
30 | ```
31 | bash mlm/obtain_ppl_for_mlm.sh
32 | ```
33 |
34 |
35 | #### 3. Hyper-parameter search (for the optimal threshold) and performance evaluation
36 |
37 | ```
38 | bash run_few_shot.sh
39 | ```
40 |
41 | ## Citation:
42 |
43 | If you find this paper and code useful, please cite our paper:
44 |
45 | ```
46 | @inproceedings{lee-etal-2021-towards,
47 | title = "Towards Few-shot Fact-Checking via Perplexity",
48 | author = "Lee, Nayeon and
49 | Bang, Yejin and
50 | Madotto, Andrea and
51 | Fung, Pascale",
52 | booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies",
53 | month = jun,
54 | year = "2021",
55 | address = "Online",
56 | publisher = "Association for Computational Linguistics",
57 | url = "https://www.aclweb.org/anthology/2021.naacl-main.158",
58 | pages = "1971--1981"
59 | }
60 | ```
61 |
62 | ## Acknowledgement
63 | This repository is implemented using the [**Huggingface Transformers**](https://github.com/huggingface/transformers) codebase.
64 | For MLM scoring, we utilize code from the [**mlm-scoring**](https://github.com/awslabs/mlm-scoring) repository.
65 |
66 |
--------------------------------------------------------------------------------
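Step 2 of the README above ("Obtain Evidence-conditioned Perplexity") concatenates each evidence sentence with its claim and scores the result with a language model. The repository's `run_language_modelling_clm.py` is not reproduced in this listing, so the following is only a minimal, hedged sketch of the causal-LM case using Huggingface GPT-2; the `claim`/`evidences` field names follow `mlm/main.py`:

```python
# Hedged sketch of evidence-conditioned perplexity with GPT-2
# (illustrative only; not the repository's run_language_modelling_clm.py).
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").eval()

def evidence_conditioned_ppl(evidence: str, claim: str) -> float:
    """Perplexity of the claim tokens, conditioned on the evidence prefix."""
    evidence_ids = tokenizer(evidence, return_tensors="pt").input_ids
    claim_ids = tokenizer(" " + claim, return_tensors="pt").input_ids
    input_ids = torch.cat([evidence_ids, claim_ids], dim=1)
    labels = input_ids.clone()
    labels[:, : evidence_ids.size(1)] = -100  # ignore evidence positions in the loss
    with torch.no_grad():
        loss = model(input_ids, labels=labels).loss  # mean NLL over the claim tokens
    return torch.exp(loss).item()

print(evidence_conditioned_ppl("the earth revolves around the sun .",
                               "the sun revolves around the earth ."))
```

Masking the evidence positions in `labels` restricts the loss, and therefore the perplexity, to the claim given its evidence prefix.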
/mlm/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
--------------------------------------------------------------------------------
/mlm/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing Guidelines
2 |
3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
4 | documentation, we greatly value feedback and contributions from our community.
5 |
6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
7 | information to effectively respond to your bug report or contribution.
8 |
9 |
10 | ## Reporting Bugs/Feature Requests
11 |
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 |
14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 |
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 |
22 |
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 |
26 | 1. You are working against the latest source on the *master* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 |
30 | To send us a pull request, please:
31 |
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 |
39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 |
42 |
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to contribute to. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
45 |
46 |
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 |
52 |
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue.
55 |
56 |
57 | ## Licensing
58 |
59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 |
61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes.
--------------------------------------------------------------------------------
/mlm/NOTICE:
--------------------------------------------------------------------------------
1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
--------------------------------------------------------------------------------
/mlm/main.py:
--------------------------------------------------------------------------------
1 | # import mlm
2 | # from mlm.scorers import MLMScorer, MLMScorerPT, LMScorer
3 | # from mlm.models import get_pretrained
4 | # import mxnet as mx
5 | # ctxs = [mx.cpu()] # or, e.g., [mx.gpu(0), mx.gpu(1)]
6 |
7 | # model, vocab, tokenizer = get_pretrained(ctxs, 'bert-base-en-uncased') # bert-large-en-uncased
8 | # scorer = MLMScorer(model, vocab, tokenizer, ctxs)
9 |
10 | # print(scorer.score_sentences(["Hello world!"]))
11 |
12 | # # for line in all_lines:
13 | # # print(scorer.score_sentences(["Hello world!"]))
14 | # # # >> [-12.410664200782776]
15 | # # # print(scorer.score_sentences(["Hello world!"], per_token=True))
16 | from mlm.scorers import MLMScorer, MLMScorerPT, LMScorer
17 | from mlm.models import get_pretrained
18 | import mxnet as mx
19 | import numpy as np
20 | import argparse
21 | import jsonlines
22 | from tqdm import tqdm
23 |
24 | def fever_data_cleaning(sent):
25 | sent = sent.replace('-LRB-', '(')
26 | sent = sent.replace('-RRB-', ')')
27 | sent = sent.replace('-LSB-', '[')
28 | sent = sent.replace('-RSB-', ']')
29 | return sent
30 |
31 | def prepare_data(file_path):
32 | test_set = []
33 | with jsonlines.open(file_path) as reader:
34 | objs = [obj for obj in reader]
35 |
36 | for obj in objs:
37 | claim = obj['claim'].lower().strip()
38 | evs_line = fever_data_cleaning(obj['evidences'][0][0]).lower().strip()
39 | test_sent = " ".join([evs_line, claim])
40 | test_set.append(test_sent)
41 | return test_set
42 |
43 | models_mapping = {
44 | 'bert-large': 'bert-large-en-uncased',
45 | 'bert-base':'bert-base-en-uncased'
46 | }
47 | if __name__ == "__main__":
48 | parser = argparse.ArgumentParser()
49 | parser.add_argument(
50 | "--train_data_file", default=None, type=str, required=False, help=""
51 | )
52 | parser.add_argument(
53 | "--output_eval_file", default=None, type=str, required=False, help=""
54 | )
55 | parser.add_argument(
56 | "--model_name", default=None, type=str, required=False, help=""
57 | )
58 | args = parser.parse_args()
59 |
60 | modelName = models_mapping[args.model_name]
61 | ctxs = [mx.cpu()] # or, e.g., [mx.gpu(0), mx.gpu(1)]
62 | model, vocab, tokenizer = get_pretrained(ctxs, modelName) # bert-base-en-uncased bert-large-en-uncased
63 | scorer = MLMScorer(model, vocab, tokenizer, ctxs)
64 |
65 | ppl_results=[]
66 | file_path = args.train_data_file
67 |
68 | print("Evaluating ", file_path)
69 |
70 | with jsonlines.open(file_path) as reader:
71 | objs = [obj for obj in reader]
72 |
73 | for i in tqdm(range(len(objs))):
74 | obj = objs[i]
75 | claim = fever_data_cleaning(obj['claim'].lower().strip())
76 | evs_line = fever_data_cleaning(obj['evidences'][0][0]).lower().strip()
77 | test_sent = " ".join([evs_line, claim])
78 | ppl = {'perplexity': scorer.score_sentences([test_sent])[0]}
79 | ppl_results.append(ppl)
80 |
81 | with jsonlines.open(args.output_eval_file.replace(".npy", ".jsonl"), 'a') as writer:
82 | writer.write(ppl)
83 | # ppl_results = scorer.score_sentences(test_lines)
84 | np.save(args.output_eval_file, ppl_results)
85 |
--------------------------------------------------------------------------------
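`MLMScorer.score_sentences` above returns a pseudo-log-likelihood (the sum of log-probabilities of each token when it is masked in turn), which `main.py` stores under the key `'perplexity'`. If a length-normalized pseudo-perplexity is preferred downstream, one way to derive it from that score is sketched below; the helper name and the token count are illustrative, not part of the repository:

```python
import math

def pseudo_perplexity(pll: float, num_tokens: int) -> float:
    """Convert a summed pseudo-log-likelihood (<= 0) into a per-token pseudo-perplexity."""
    return math.exp(-pll / max(num_tokens, 1))

# Example: the "Hello world!" score quoted in the comments above, over 3 scored tokens.
print(pseudo_perplexity(-12.410664200782776, 3))
```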
/mlm/mlm-scoring.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/mlm/mlm-scoring.png
--------------------------------------------------------------------------------
/mlm/obtain_ppl_for_mlm.sh:
--------------------------------------------------------------------------------
1 | TEST_DATA_PATH=/path/to/test/data.jsonl # e.g. covid_scientific.jsonl
2 | EXP_NAME=output-name
3 |
4 | LM_MODEL_TYPE=bert-base # bert-large
5 | python main.py \
6 | --train_data_file=$TEST_DATA_PATH \
7 | --output_eval_file=/path-to-project/ppl_results/$LM_MODEL_TYPE.$EXP_NAME.npy \
8 | --model_name=$LM_MODEL_TYPE
--------------------------------------------------------------------------------
/mlm/scripts/librispeech-download-text.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -e
4 | set -x
5 |
6 | target_dir=$1
7 |
8 | mkdir -p ${1}
9 | wget -O ${1}/librispeech-lm-norm.txt.gz http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz
10 | gunzip ${1}/librispeech-lm-norm.txt.gz
11 | # To avoid tripping up cased models
12 | tr '[:upper:]' '[:lower:]' < ${1}/librispeech-lm-norm.txt > ${1}/librispeech-lm-norm.lower.txt
13 | # Split to a number that's divisible by 10, and 4/8/16 GPUs ;)
14 | split --numeric-suffixes --suffix-length 2 --number l/80 ${1}/librispeech-lm-norm.lower.txt ${1}/part.
15 | # Clean up
16 | rm ${1}/librispeech-lm-norm.txt ${1}/librispeech-lm-norm.lower.txt
17 | echo "There should be 80 parts in ${1}; I found $(ls -1q ${1}/part.* | wc -l)."
18 |
--------------------------------------------------------------------------------
/mlm/scripts/librispeech-score.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # set -e
4 | set -x
5 |
6 | source_dir=$1
7 | target_dir=$2
8 | start=$3
9 | end=$4
10 | gpus=$5
11 | split_size=$6
12 | model=$7
13 |
14 | if [ "$8" != "" ]; then
15 | model_weights_arg="--weights $8"
16 | else
17 | model_weights_arg=""
18 | fi
19 |
20 | ### TODO: Scale better so that split sizes are not absurdly low
21 |
22 | for x in `seq -w ${start} ${end}`
23 | do
24 | mkdir -p ${target_dir}
25 | mlm score ${model_weights_arg} \
26 | --mode ref \
27 | --model ${model} \
28 | --gpus ${gpus} \
29 | --split-size ${split_size} \
30 | ${source_dir}/part.${x} \
31 | > ${target_dir}/part.${x}.ref.scores \
32 | 2> >(tee ${target_dir}/part.${x}.ref.log >&2)
33 | done
34 |
--------------------------------------------------------------------------------
/mlm/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from setuptools import find_packages, setup
4 |
5 | setup(
6 | name='mlm',
7 | version='0.1',
8 | description="Masked Language Model Scoring",
9 | author='Julian Salazar',
10 | packages=find_packages('src'),
11 | package_dir={'': 'src'},
12 | entry_points = {
13 | 'console_scripts': ['mlm=mlm.cmds:main'],
14 | },
15 |
16 | install_requires=[
17 | 'gluonnlp~=0.8.3',
18 | 'regex',
19 | 'sacrebleu',
20 | 'mosestokenizer',
21 | 'transformers~=3.3.1'
22 | ],
23 |
24 | extras_require={
25 | 'dev': [
26 | 'pylint',
27 | 'pytest',
28 | 'pytest-cov',
29 | 'mypy'
30 | ]
31 | },
32 |
33 | # Needed for static type checking
34 | # https://mypy.readthedocs.io/en/latest/installed_packages.html
35 | zip_safe=False
36 | )
37 |
--------------------------------------------------------------------------------
/mlm/src/main.py:
--------------------------------------------------------------------------------
1 | from mlm.scorers import MLMScorer, MLMScorerPT, LMScorer
2 | from mlm.models import get_pretrained
3 | import mxnet as mx
4 | ctxs = [mx.cpu()] # or, e.g., [mx.gpu(0), mx.gpu(1)]
5 |
6 | # MXNet MLMs (use names from mlm.models.SUPPORTED_MLMS)
7 | model, vocab, tokenizer = get_pretrained(ctxs, 'bert-base-en-cased')
8 | scorer = MLMScorer(model, vocab, tokenizer, ctxs)
9 | print(scorer.score_sentences(["Hello world!"]))
10 | # >> [-12.410664200782776]
11 | print(scorer.score_sentences(["Hello world!"], per_token=True))
12 |
13 |
--------------------------------------------------------------------------------
/mlm/src/mlm.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 2.1
2 | Name: mlm
3 | Version: 0.1
4 | Summary: Masked Language Model Scoring
5 | Home-page: UNKNOWN
6 | Author: Julian Salazar
7 | License: UNKNOWN
8 | Description: UNKNOWN
9 | Platform: UNKNOWN
10 | Provides-Extra: dev
11 |
--------------------------------------------------------------------------------
/mlm/src/mlm.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | README.md
2 | setup.py
3 | src/mlm/__init__.py
4 | src/mlm/batchify.py
5 | src/mlm/cmds.py
6 | src/mlm/loaders.py
7 | src/mlm/scorers.py
8 | src/mlm.egg-info/PKG-INFO
9 | src/mlm.egg-info/SOURCES.txt
10 | src/mlm.egg-info/dependency_links.txt
11 | src/mlm.egg-info/entry_points.txt
12 | src/mlm.egg-info/not-zip-safe
13 | src/mlm.egg-info/requires.txt
14 | src/mlm.egg-info/top_level.txt
15 | src/mlm/models/__init__.py
16 | src/mlm/models/bert.py
17 | src/mlm/models/gpt2.py
--------------------------------------------------------------------------------
/mlm/src/mlm.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/mlm/src/mlm.egg-info/entry_points.txt:
--------------------------------------------------------------------------------
1 | [console_scripts]
2 | mlm = mlm.cmds:main
3 |
4 |
--------------------------------------------------------------------------------
/mlm/src/mlm.egg-info/not-zip-safe:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/mlm/src/mlm.egg-info/requires.txt:
--------------------------------------------------------------------------------
1 | gluonnlp~=0.8.3
2 | regex
3 | sacrebleu
4 | mosestokenizer
5 | transformers~=3.3.1
6 |
7 | [dev]
8 | pylint
9 | pytest
10 | pytest-cov
11 | mypy
12 |
--------------------------------------------------------------------------------
/mlm/src/mlm.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | mlm
2 |
--------------------------------------------------------------------------------
/mlm/src/mlm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/mlm/src/mlm/__init__.py
--------------------------------------------------------------------------------
/mlm/src/mlm/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/mlm/src/mlm/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/mlm/src/mlm/__pycache__/batchify.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/mlm/src/mlm/__pycache__/batchify.cpython-36.pyc
--------------------------------------------------------------------------------
/mlm/src/mlm/__pycache__/loaders.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/mlm/src/mlm/__pycache__/loaders.cpython-36.pyc
--------------------------------------------------------------------------------
/mlm/src/mlm/__pycache__/scorers.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/mlm/src/mlm/__pycache__/scorers.cpython-36.pyc
--------------------------------------------------------------------------------
/mlm/src/mlm/models/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/mlm/src/mlm/models/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/mlm/src/mlm/models/__pycache__/bert.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/mlm/src/mlm/models/__pycache__/bert.cpython-36.pyc
--------------------------------------------------------------------------------
/mlm/src/mlm/models/__pycache__/gpt2.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/mlm/src/mlm/models/__pycache__/gpt2.cpython-36.pyc
--------------------------------------------------------------------------------
/mlm/tests/test_cmds.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import random
4 |
5 | import mxnet as mx
6 | import numpy as np
7 |
8 | from mlm.cmds import setup_ctxs
9 |
10 |
11 | def test_setup_ctxs():
12 |
13 | # CPU
14 | ctxs = setup_ctxs('-1')
15 | assert len(ctxs) == 1
16 | assert ctxs[0] == mx.cpu()
17 | # Test randomness
18 | assert random.randint(0, 1000000) == 885440
19 | assert np.random.randint(0, 1000000) == 985772
20 | assert mx.random.randint(0, 1000000, ctx=ctxs[0])[0] == 656751
21 |
22 | # GPU
23 | ctxs = setup_ctxs('0,2')
24 | assert len(ctxs) == 2
25 | assert ctxs[0] == mx.gpu(0)
26 | assert ctxs[1] == mx.gpu(2)
27 | # Test randomness
28 | for ctx in ctxs:
29 | assert mx.random.randint(0, 1000000, ctx=ctx)[0] == 248005
30 |
--------------------------------------------------------------------------------
/mlm/tests/test_scorers.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import mxnet as mx
4 | from mxnet.gluon.data import Dataset
5 |
6 | from mlm.loaders import Corpus
7 | from mlm.models import get_pretrained
8 | from mlm.scorers import LMScorer, MLMScorer, MLMScorerPT
9 |
10 |
11 | # The ASR case, where we append . as an EOS
12 |
13 | def _get_scorer_and_corpus_eos():
14 | ctxs = [mx.cpu()]
15 | model, vocab, tokenizer = get_pretrained(ctxs, 'bert-base-en-uncased')
16 | scorer_mx = MLMScorer(model, vocab, tokenizer, ctxs, eos=True, wwm=False)
17 | model, vocab, tokenizer = get_pretrained(ctxs, 'bert-base-uncased')
18 | scorer_pt = MLMScorerPT(model, vocab, tokenizer, ctxs, eos=True, wwm=False)
19 | corpus = Corpus.from_dict({'utt': {'ref': "I am Sam"}})
20 | return scorer_mx, scorer_pt, corpus
21 |
22 |
23 | def test_mlmscorer_corpus_to_dataset():
24 | scorer_mx, scorer_pt, corpus = _get_scorer_and_corpus_eos()
25 | dataset = scorer_mx.corpus_to_dataset(corpus)
26 | assert isinstance(dataset, Dataset)
27 | # Our three tokens, plus the EOS
28 | assert len(dataset) == 4
29 |
30 |
31 | def test_mlmscorer_score_eos():
32 | scorer_mx, scorer_pt, corpus = _get_scorer_and_corpus_eos()
33 | scores, _ = scorer_mx.score(corpus)
34 | assert len(scores) == 1
35 | assert pytest.approx(scores[0], abs=0.0001) == -13.3065947
36 | scores, _ = scorer_pt.score(corpus)
37 | assert len(scores) == 1
38 | assert pytest.approx(scores[0], abs=0.0001) == -13.3065947
39 |
40 |
41 | # The general case
42 |
43 | def test_mlmscorer_score_sentences():
44 |
45 | TEST_CASES = (
46 | # README examples
47 | ('bert-base-en-cased', MLMScorer, (None, -6.126666069030762, -5.50140380859375, -0.7823182344436646, None)),
48 | ('bert-base-cased', MLMScorerPT, (None, -6.126738548278809, -5.501765727996826, -0.782496988773346, None)),
49 | ('gpt2-117m-en-cased', LMScorer, (-8.293947219848633, -6.387561798095703, -1.3138668537139893)),
50 | # etc.
51 | ('albert-base-v2', MLMScorerPT, (None, -16.480087280273438, -12.897505760192871, -4.277405738830566, None)),
52 | ('distilbert-base-cased', MLMScorerPT, (None, -5.1874895095825195, -6.390861511230469, -3.8225560188293457, None)),
53 | )
54 |
55 | for name, scorer_cls, expected_scores in TEST_CASES:
56 | model, vocab, tokenizer = get_pretrained([mx.cpu()], name)
57 | scorer = scorer_cls(model, vocab, tokenizer, [mx.cpu()])
58 | scores = scorer.score_sentences(["Hello world!"], per_token=True)[0]
59 | expected_total = 0
60 | for score, expected_score in zip(scores, expected_scores):
61 | if score is None and expected_score is None:
62 | continue
63 | assert pytest.approx(score, abs=0.0001) == expected_score
64 | expected_total += expected_score
65 | score_total = scorer.score_sentences(["Hello world!"], per_token=False)[0]
66 | assert pytest.approx(score_total, abs=0.0001) == expected_total
67 |
--------------------------------------------------------------------------------
/obtain_ppl_for_clm.sh:
--------------------------------------------------------------------------------
1 | # covid myth
2 | COVID_MYTH_PATH='data/covid_scientific.jsonl'
3 | COVID_MYTH_EXP_NAME=covid_scientific
4 |
5 | # covid politifact
6 | COVID_POLITIFACT_W_JUSTIFICATION_PATH='data/covid_social.jsonl'
7 | COVID_POLITIFACT_EXP_NAME=covid_politifact_justification
8 |
9 | # fever
10 | FEVER_TEST_PATH='data/fever_test.jsonl'
11 | FEVER_TEST_EXP_NAME=fever_test
12 |
13 | PATHS=( $COVID_MYTH_PATH $COVID_POLITIFACT_W_JUSTIFICATION_PATH $FEVER_TEST_PATH )
14 | NAMES=( $COVID_MYTH_EXP_NAME $COVID_POLITIFACT_EXP_NAME $FEVER_TEST_EXP_NAME )
15 |
16 | LM_MODEL_TYPE=gpt2 # Options: gpt2 gpt2-medium gpt2-large gpt2-xl
17 |
18 | mkdir -p ppl_results
19 |
20 | for i in 0 #1 2
21 | do
22 | INPUT_FILE_NAME=${PATHS[$i]}
23 | EXP_NAME=${NAMES[$i]}
24 | CUDA_VISIBLE_DEVICES=0 python run_language_modelling_clm.py \
25 | --model_name_or_path $LM_MODEL_TYPE \
26 | --data_file_path $INPUT_FILE_NAME \
27 | --do_eval \
28 | --per_gpu_train_batch_size 1 \
29 | --per_device_train_batch_size 1 \
30 | --block_size 128 \
31 | --per_gpu_eval_batch_size 1 \
32 | --per_device_eval_batch_size 1 \
33 | --result_path ppl_results/$LM_MODEL_TYPE.$EXP_NAME.npy \
34 | --overwrite_output_dir
35 | done
--------------------------------------------------------------------------------
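Both `obtain_ppl_for_clm.sh` and `mlm/obtain_ppl_for_mlm.sh` write one `.npy` file per model/dataset pair into `ppl_results/`. In `mlm/main.py` the saved object is a Python list of `{'perplexity': ...}` dicts rather than a plain numeric array, so loading it back needs `allow_pickle=True`. A minimal sketch, assuming the CLM script saves its results in the same shape:

```python
import numpy as np

# Load the scores written by the scoring scripts into ppl_results/.
# allow_pickle=True is required because the file holds a list of dicts, not a numeric array.
results = np.load("ppl_results/gpt2.covid_scientific.npy", allow_pickle=True)

scores = [entry["perplexity"] for entry in results]
print(f"{len(scores)} claims scored; first score: {scores[0]}")
```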
/plot/HKUST.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/plot/HKUST.jpg
--------------------------------------------------------------------------------
/plot/method_illustration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/plot/method_illustration.png
--------------------------------------------------------------------------------
/plot/pytorch-logo-dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/plot/pytorch-logo-dark.png
--------------------------------------------------------------------------------
/run_few_shot.sh:
--------------------------------------------------------------------------------
1 | # ===============================================================================================================
2 | # EXP 1: COVID SCIENTIFIC
3 | # ===============================================================================================================
4 | COVID_SCIENTIFIC_PATH=/data/covid_scientific.jsonl
5 | EXP_NAME=covid_scientific
6 |
7 | for K in 50 10 2
8 | do
9 | for LM_MODEL_TYPE in gpt2 #gpt2-medium gpt2-large gpt2-xl bert-base
10 | do
11 | python few_shot_ppl.py \
12 | --test_data_file $COVID_SCIENTIFIC_PATH \
13 | --test_result_path /ppl_results/$LM_MODEL_TYPE.$EXP_NAME.npy \
14 | --k $K \
15 | --covid_data \
16 | --exp_name $EXP_NAME
17 | done
18 | done
19 |
20 | # ===============================================================================================================
21 | # EXP 2: COVID SOCIAL
22 | # ===============================================================================================================
23 | COVID_POLITIFACT_PATH=/data/covid_social.jsonl
24 | EXP_NAME=covid_politifact_justification
25 |
26 | for K in 50 10 2
27 | do
28 | for LM_MODEL_TYPE in gpt2 #gpt2-medium gpt2-large gpt2-xl bert-base
29 | do
30 | python few_shot_ppl.py \
31 | --test_data_file $COVID_POLITIFACT_PATH \
32 | --test_result_path /ppl_results/$LM_MODEL_TYPE.$EXP_NAME.npy \
33 | --k $K \
34 | --covid_data \
35 | --exp_name $EXP_NAME
36 | done
37 | done
38 |
39 | # ===============================================================================================================
40 | # EXP 3: FEVER
41 | # ===============================================================================================================
42 | FEVER_TRAIN_PATH=/data/fever_train.jsonl
43 | TRAIN_EXP_NAME=fever_train_small
44 |
45 | FEVER_TEST_PATH=/data/fever_test.jsonl
46 | TEST_EXP_NAME=fever_test
47 |
48 | EXP_NAME_FOR_SAVE=fever
49 |
50 | for K in 50 10 2
51 | do
52 | for LM_MODEL_TYPE in gpt2 #gpt2-medium gpt2-large gpt2-xl bert-base
53 | do
54 | python few_shot_ppl.py \
55 | --train_data_file $FEVER_TRAIN_PATH \
56 | --train_result_path /ppl_results/$LM_MODEL_TYPE.$TRAIN_EXP_NAME.npy \
57 | --test_data_file $FEVER_TEST_PATH \
58 | --test_result_path /ppl_results/$LM_MODEL_TYPE.$TEST_EXP_NAME.npy \
59 | --k $K \
60 | --exp_name $EXP_NAME_FOR_SAVE
61 | done
62 | done
63 |
64 |
--------------------------------------------------------------------------------
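`run_few_shot.sh` sweeps the number of labeled examples `k` and the model type, delegating the actual work to `few_shot_ppl.py`, whose source is not included in this listing. As a hedged illustration of the underlying idea (a claim is predicted unsupported when its evidence-conditioned perplexity exceeds a threshold tuned on the k labeled examples), a simple threshold search might look like the sketch below; every name here is illustrative rather than the script's actual interface:

```python
from typing import Sequence

def search_threshold(ppls: Sequence[float], labels: Sequence[int]) -> float:
    """Pick the perplexity threshold that best separates k labeled claims.

    labels: 1 = supported claim, 0 = unsupported claim.
    A claim is predicted supported when its perplexity is <= the threshold.
    """
    best_thr, best_acc = 0.0, -1.0
    for thr in sorted(set(ppls)):
        preds = [1 if p <= thr else 0 for p in ppls]
        acc = sum(int(p == y) for p, y in zip(preds, labels)) / len(labels)
        if acc > best_acc:
            best_thr, best_acc = thr, acc
    return best_thr

def evaluate(ppls: Sequence[float], labels: Sequence[int], thr: float) -> float:
    preds = [1 if p <= thr else 0 for p in ppls]
    return sum(int(p == y) for p, y in zip(preds, labels)) / len(labels)

# k labeled claims -> threshold -> score the held-out test claims.
few_shot_ppls, few_shot_labels = [12.3, 55.1, 8.7, 80.2], [1, 0, 1, 0]
thr = search_threshold(few_shot_ppls, few_shot_labels)
print("threshold:", thr, "test accuracy:", evaluate([10.0, 70.0], [1, 0], thr))
```

Accuracy is used only to keep the sketch short; any metric computable from the k labels could drive the search.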
/transformers/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/activations.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/activations.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_albert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_albert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_auto.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_auto.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_bart.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_bart.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_bert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_bert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_bert_generation.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_bert_generation.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_blenderbot.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_blenderbot.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_camembert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_camembert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_ctrl.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_ctrl.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_deberta.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_deberta.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_distilbert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_distilbert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_dpr.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_dpr.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_electra.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_electra.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_encoder_decoder.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_encoder_decoder.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_flaubert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_flaubert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_fsmt.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_fsmt.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_funnel.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_funnel.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_gpt2.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_gpt2.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_layoutlm.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_layoutlm.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_longformer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_longformer.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_lxmert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_lxmert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_marian.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_marian.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_mbart.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_mbart.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_mmbt.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_mmbt.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_mobilebert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_mobilebert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_openai.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_openai.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_pegasus.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_pegasus.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_prophetnet.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_prophetnet.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_rag.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_rag.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_reformer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_reformer.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_retribert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_retribert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_roberta.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_roberta.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_squeezebert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_squeezebert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_t5.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_t5.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_transfo_xl.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_transfo_xl.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_utils.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_xlm.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_xlm.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_xlm_prophetnet.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_xlm_prophetnet.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_xlm_roberta.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_xlm_roberta.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/configuration_xlnet.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/configuration_xlnet.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/convert_slow_tokenizer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/convert_slow_tokenizer.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/file_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/file_utils.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/generation_beam_search.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/generation_beam_search.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/generation_logits_process.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/generation_logits_process.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/generation_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/generation_utils.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/hf_argparser.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/hf_argparser.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/integrations.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/integrations.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modelcard.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modelcard.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_albert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_albert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_auto.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_auto.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_bart.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_bart.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_bert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_bert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_bert_generation.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_bert_generation.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_blenderbot.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_blenderbot.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_camembert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_camembert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_ctrl.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_ctrl.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_deberta.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_deberta.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_distilbert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_distilbert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_dpr.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_dpr.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_electra.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_electra.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_encoder_decoder.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_encoder_decoder.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_flaubert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_flaubert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_fsmt.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_fsmt.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_funnel.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_funnel.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_gpt2.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_gpt2.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_layoutlm.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_layoutlm.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_longformer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_longformer.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_lxmert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_lxmert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_marian.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_marian.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_mbart.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_mbart.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_mmbt.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_mmbt.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_mobilebert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_mobilebert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_openai.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_openai.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_outputs.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_outputs.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_pegasus.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_pegasus.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_prophetnet.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_prophetnet.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_rag.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_rag.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_reformer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_reformer.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_retribert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_retribert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_roberta.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_roberta.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_squeezebert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_squeezebert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_t5.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_t5.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_tf_pytorch_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_tf_pytorch_utils.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_transfo_xl.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_transfo_xl.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_transfo_xl_utilities.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_transfo_xl_utilities.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_utils.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_xlm.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_xlm.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_xlm_prophetnet.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_xlm_prophetnet.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_xlm_roberta.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_xlm_roberta.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/modeling_xlnet.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/modeling_xlnet.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/optimization.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/optimization.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/pipelines.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/pipelines.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/retrieval_rag.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/retrieval_rag.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_albert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_albert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_albert_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_albert_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_auto.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_auto.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_bart.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_bart.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_bart_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_bart_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_bert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_bert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_bert_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_bert_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_bert_generation.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_bert_generation.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_bert_japanese.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_bert_japanese.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_bertweet.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_bertweet.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_blenderbot.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_blenderbot.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_camembert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_camembert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_camembert_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_camembert_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_ctrl.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_ctrl.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_deberta.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_deberta.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_distilbert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_distilbert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_distilbert_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_distilbert_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_dpr.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_dpr.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_dpr_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_dpr_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_electra.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_electra.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_electra_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_electra_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_flaubert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_flaubert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_fsmt.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_fsmt.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_funnel.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_funnel.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_funnel_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_funnel_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_gpt2.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_gpt2.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_gpt2_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_gpt2_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_herbert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_herbert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_herbert_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_herbert_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_layoutlm.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_layoutlm.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_layoutlm_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_layoutlm_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_longformer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_longformer.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_longformer_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_longformer_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_lxmert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_lxmert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_lxmert_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_lxmert_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_marian.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_marian.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_mbart.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_mbart.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_mbart_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_mbart_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_mobilebert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_mobilebert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_mobilebert_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_mobilebert_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_openai.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_openai.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_openai_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_openai_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_pegasus.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_pegasus.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_pegasus_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_pegasus_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_phobert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_phobert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_prophetnet.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_prophetnet.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_rag.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_rag.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_reformer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_reformer.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_reformer_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_reformer_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_retribert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_retribert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_retribert_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_retribert_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_roberta.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_roberta.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_roberta_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_roberta_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_squeezebert.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_squeezebert.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_squeezebert_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_squeezebert_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_t5.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_t5.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_t5_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_t5_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_transfo_xl.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_transfo_xl.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_utils.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_utils_base.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_utils_base.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_utils_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_utils_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_xlm.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_xlm.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_xlm_prophetnet.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_xlm_prophetnet.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_xlm_roberta.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_xlm_roberta.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_xlm_roberta_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_xlm_roberta_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_xlnet.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_xlnet.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/tokenization_xlnet_fast.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/tokenization_xlnet_fast.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/trainer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/trainer.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/trainer_callback.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/trainer_callback.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/trainer_pt_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/trainer_pt_utils.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/trainer_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/trainer_utils.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/training_args.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/training_args.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/__pycache__/training_args_tf.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/__pycache__/training_args_tf.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/activations.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 | import torch.nn.functional as F
5 | from packaging import version
6 |
7 | from .utils import logging
8 |
9 |
10 | logger = logging.get_logger(__name__)
11 |
12 |
13 | def _gelu_python(x):
14 | """
15 | Original Implementation of the GELU activation function in Google BERT repo when initially created. For
16 | information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
17 | torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in
18 | torch.nn.functional Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
19 | """
20 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
21 |
22 |
23 | def gelu_new(x):
24 | """
25 | Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
26 | the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
27 | """
28 | return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
29 |
30 |
31 | if version.parse(torch.__version__) < version.parse("1.4"):
32 | gelu = _gelu_python
33 | else:
34 | gelu = F.gelu
35 |
36 |
37 | def gelu_fast(x):
38 | return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x)))
39 |
40 |
41 | def _silu_python(x):
42 | """
43 | See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
44 | Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
45 | Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
46 | Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
47 | later.
48 | """
49 | return x * torch.sigmoid(x)
50 |
51 |
52 | if version.parse(torch.__version__) < version.parse("1.7"):
53 | silu = _silu_python
54 | else:
55 | silu = F.silu
56 |
57 |
58 | def mish(x):
59 | return x * torch.tanh(torch.nn.functional.softplus(x))
60 |
61 |
62 | def linear_act(x):
63 | return x
64 |
65 |
66 | ACT2FN = {
67 | "relu": F.relu,
68 | "silu": silu,
69 | "swish": silu,
70 | "gelu": gelu,
71 | "tanh": torch.tanh,
72 | "gelu_new": gelu_new,
73 | "gelu_fast": gelu_fast,
74 | "mish": mish,
75 | "linear": linear_act,
76 | "sigmoid": torch.sigmoid,
77 | }
78 |
79 |
80 | def get_activation(activation_string):
81 | if activation_string in ACT2FN:
82 | return ACT2FN[activation_string]
83 | else:
84 | raise KeyError("function {} not found in ACT2FN mapping {}".format(activation_string, list(ACT2FN.keys())))
85 |
--------------------------------------------------------------------------------
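A minimal usage sketch of the helpers above, assuming the bundled transformers package is importable; the input tensor is arbitrary. It looks up an activation by name, as model configs do, and shows that the exact-erf GELU and the tanh approximation agree only approximately.

import torch

from transformers.activations import gelu, gelu_new, get_activation

# Resolve an activation by the string a model config would store.
act = get_activation("gelu_new")

x = torch.linspace(-3.0, 3.0, steps=7)
exact = gelu(x)    # erf-based GELU (F.gelu on recent torch versions)
approx = act(x)    # tanh approximation used by the GPT-2 / newer BERT code

# The two variants differ only by a small numerical amount.
print(torch.max(torch.abs(exact - approx)))
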
/transformers/activations_tf.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import tensorflow as tf
4 |
5 |
6 | def gelu(x):
7 | """
8 | Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
9 | initially created. For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
10 | 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) Also see
11 | https://arxiv.org/abs/1606.08415
12 | """
13 | x = tf.convert_to_tensor(x)
14 | cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))
15 |
16 | return x * cdf
17 |
18 |
19 | def gelu_new(x):
20 | """
21 | Gaussian Error Linear Unit. This is a smoother version of the GELU. Original paper: https://arxiv.org/abs/1606.08415
22 |
23 | Args:
24 | x: float Tensor to perform activation
25 |
26 | Returns:
27 | `x` with the GELU activation applied.
28 | """
29 | x = tf.convert_to_tensor(x)
30 | pi = tf.cast(math.pi, x.dtype)
31 | coeff = tf.cast(0.044715, x.dtype)
32 | cdf = 0.5 * (1.0 + tf.tanh(tf.sqrt(2.0 / pi) * (x + coeff * tf.pow(x, 3))))
33 |
34 | return x * cdf
35 |
36 |
37 | def mish(x):
38 | x = tf.convert_to_tensor(x)
39 |
40 | return x * tf.tanh(tf.math.softplus(x))
41 |
42 |
43 | def gelu_fast(x):
44 | x = tf.convert_to_tensor(x)
45 | coeff1 = tf.cast(0.044715, x.dtype)
46 | coeff2 = tf.cast(0.7978845608, x.dtype)
47 |
48 | return 0.5 * x * (1.0 + tf.tanh(x * coeff2 * (1.0 + coeff1 * x * x)))
49 |
50 |
51 | ACT2FN = {
52 | "gelu": tf.keras.layers.Activation(gelu),
53 | "relu": tf.keras.activations.relu,
54 | "swish": tf.keras.activations.swish,
55 | "silu": tf.keras.activations.swish,
56 | "gelu_new": tf.keras.layers.Activation(gelu_new),
57 | "mish": tf.keras.layers.Activation(mish),
58 | "tanh": tf.keras.activations.tanh,
59 | "gelu_fast": tf.keras.layers.Activation(gelu_fast),
60 | }
61 |
62 |
63 | def get_tf_activation(activation_string):
64 | if activation_string in ACT2FN:
65 | return ACT2FN[activation_string]
66 | else:
67 | raise KeyError("function {} not found in ACT2FN mapping {}".format(activation_string, list(ACT2FN.keys())))
68 |
--------------------------------------------------------------------------------
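The same lookup exists on the TensorFlow side; a small sketch, using an arbitrary constant tensor as input.

import tensorflow as tf

from transformers.activations_tf import get_tf_activation

act = get_tf_activation("gelu")   # a tf.keras.layers.Activation wrapping gelu()
x = tf.constant([-1.0, 0.0, 1.0])
print(act(x))                     # GELU applied elementwise

# Unknown names raise KeyError, mirroring the PyTorch helper above.
try:
    get_tf_activation("not_a_real_activation")
except KeyError as err:
    print(err)
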
/transformers/benchmark/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/benchmark/__init__.py
--------------------------------------------------------------------------------
/transformers/benchmark/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/benchmark/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/benchmark/__pycache__/benchmark.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/benchmark/__pycache__/benchmark.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/benchmark/__pycache__/benchmark_args.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/benchmark/__pycache__/benchmark_args.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/benchmark/__pycache__/benchmark_args_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/benchmark/__pycache__/benchmark_args_utils.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/benchmark/__pycache__/benchmark_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/benchmark/__pycache__/benchmark_utils.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/commands/__init__.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from argparse import ArgumentParser
3 |
4 |
5 | class BaseTransformersCLICommand(ABC):
6 | @staticmethod
7 | @abstractmethod
8 | def register_subcommand(parser: ArgumentParser):
9 | raise NotImplementedError()
10 |
11 | @abstractmethod
12 | def run(self):
13 | raise NotImplementedError()
14 |
--------------------------------------------------------------------------------
/transformers/commands/download.py:
--------------------------------------------------------------------------------
1 | from argparse import ArgumentParser
2 |
3 | from transformers.commands import BaseTransformersCLICommand
4 |
5 |
6 | def download_command_factory(args):
7 | return DownloadCommand(args.model, args.cache_dir, args.force)
8 |
9 |
10 | class DownloadCommand(BaseTransformersCLICommand):
11 | @staticmethod
12 | def register_subcommand(parser: ArgumentParser):
13 | download_parser = parser.add_parser("download")
14 | download_parser.add_argument(
15 | "--cache-dir", type=str, default=None, help="Path to location to store the models"
16 | )
17 | download_parser.add_argument(
18 | "--force", action="store_true", help="Force the model to be download even if already in cache-dir"
19 | )
20 | download_parser.add_argument("model", type=str, help="Name of the model to download")
21 | download_parser.set_defaults(func=download_command_factory)
22 |
23 | def __init__(self, model: str, cache: str, force: bool):
24 | self._model = model
25 | self._cache = cache
26 | self._force = force
27 |
28 | def run(self):
29 | from transformers import AutoModel, AutoTokenizer
30 |
31 | AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
32 | AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
33 |
--------------------------------------------------------------------------------
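For reference, the command registered above boils down to the two from_pretrained calls in run(); a sketch of the programmatic equivalent, with a placeholder model name and cache directory.

from transformers import AutoModel, AutoTokenizer

# Roughly what `transformers-cli download bert-base-uncased --cache-dir ./cache` does.
AutoModel.from_pretrained("bert-base-uncased", cache_dir="./cache", force_download=False)
AutoTokenizer.from_pretrained("bert-base-uncased", cache_dir="./cache", force_download=False)
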
/transformers/commands/env.py:
--------------------------------------------------------------------------------
1 | import platform
2 | from argparse import ArgumentParser
3 |
4 | from transformers import __version__ as version
5 | from transformers import is_tf_available, is_torch_available
6 | from transformers.commands import BaseTransformersCLICommand
7 |
8 |
9 | def info_command_factory(_):
10 | return EnvironmentCommand()
11 |
12 |
13 | class EnvironmentCommand(BaseTransformersCLICommand):
14 | @staticmethod
15 | def register_subcommand(parser: ArgumentParser):
16 | download_parser = parser.add_parser("env")
17 | download_parser.set_defaults(func=info_command_factory)
18 |
19 | def run(self):
20 | pt_version = "not installed"
21 | pt_cuda_available = "NA"
22 | if is_torch_available():
23 | import torch
24 |
25 | pt_version = torch.__version__
26 | pt_cuda_available = torch.cuda.is_available()
27 |
28 | tf_version = "not installed"
29 | tf_cuda_available = "NA"
30 | if is_tf_available():
31 | import tensorflow as tf
32 |
33 | tf_version = tf.__version__
34 | try:
35 | # deprecated in v2.1
36 | tf_cuda_available = tf.test.is_gpu_available()
37 | except AttributeError:
38 | # returns list of devices, convert to bool
39 | tf_cuda_available = bool(tf.config.list_physical_devices("GPU"))
40 |
41 | info = {
42 | "`transformers` version": version,
43 | "Platform": platform.platform(),
44 | "Python version": platform.python_version(),
45 | "PyTorch version (GPU?)": "{} ({})".format(pt_version, pt_cuda_available),
46 | "Tensorflow version (GPU?)": "{} ({})".format(tf_version, tf_cuda_available),
47 | "Using GPU in script?": "",
48 | "Using distributed or parallel set-up in script?": "",
49 | }
50 |
51 | print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n")
52 | print(self.format_dict(info))
53 |
54 | return info
55 |
56 | @staticmethod
57 | def format_dict(d):
58 | return "\n".join(["- {}: {}".format(prop, val) for prop, val in d.items()]) + "\n"
59 |
--------------------------------------------------------------------------------
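format_dict is a pure helper, so its behaviour is easy to illustrate; the values below are made up.

from transformers.commands.env import EnvironmentCommand

info = {"Platform": "Linux", "Python version": "3.7.9"}
print(EnvironmentCommand.format_dict(info))
# - Platform: Linux
# - Python version: 3.7.9
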
/transformers/commands/transformers_cli.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from argparse import ArgumentParser
3 |
4 | from transformers.commands.add_new_model import AddNewModelCommand
5 | from transformers.commands.convert import ConvertCommand
6 | from transformers.commands.download import DownloadCommand
7 | from transformers.commands.env import EnvironmentCommand
8 | from transformers.commands.run import RunCommand
9 | from transformers.commands.serving import ServeCommand
10 | from transformers.commands.user import UserCommands
11 |
12 |
13 | def main():
14 | parser = ArgumentParser("Transformers CLI tool", usage="transformers-cli <command> [<args>]")
15 | commands_parser = parser.add_subparsers(help="transformers-cli command helpers")
16 |
17 | # Register commands
18 | ConvertCommand.register_subcommand(commands_parser)
19 | DownloadCommand.register_subcommand(commands_parser)
20 | EnvironmentCommand.register_subcommand(commands_parser)
21 | RunCommand.register_subcommand(commands_parser)
22 | ServeCommand.register_subcommand(commands_parser)
23 | UserCommands.register_subcommand(commands_parser)
24 | AddNewModelCommand.register_subcommand(commands_parser)
25 |
26 | # Let's go
27 | args = parser.parse_args()
28 |
29 | if not hasattr(args, "func"):
30 | parser.print_help()
31 | exit(1)
32 |
33 | # Run
34 | service = args.func(args)
35 | service.run()
36 |
37 |
38 | if __name__ == "__main__":
39 | main()
40 |
--------------------------------------------------------------------------------
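The dispatch in main() relies on argparse's set_defaults(func=...) idiom: every subcommand stores a factory on the parsed namespace, and main() calls it. A stripped-down, self-contained sketch of that pattern (not part of the repository):

from argparse import ArgumentParser


def greet_factory(args):
    print("hello, {}".format(args.name))


parser = ArgumentParser("demo")
subparsers = parser.add_subparsers()

greet = subparsers.add_parser("greet")
greet.add_argument("name")
greet.set_defaults(func=greet_factory)   # same trick as register_subcommand above

args = parser.parse_args(["greet", "world"])
if not hasattr(args, "func"):
    parser.print_help()
else:
    args.func(args)                       # prints "hello, world"
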
/transformers/configuration_camembert.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ CamemBERT configuration """
17 |
18 | from .configuration_roberta import RobertaConfig
19 | from .utils import logging
20 |
21 |
22 | logger = logging.get_logger(__name__)
23 |
24 | CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
25 | "camembert-base": "https://huggingface.co/camembert-base/resolve/main/config.json",
26 | "umberto-commoncrawl-cased-v1": "https://huggingface.co/Musixmatch/umberto-commoncrawl-cased-v1/resolve/main/config.json",
27 | "umberto-wikipedia-uncased-v1": "https://huggingface.co/Musixmatch/umberto-wikipedia-uncased-v1/resolve/main/config.json",
28 | }
29 |
30 |
31 | class CamembertConfig(RobertaConfig):
32 | """
33 | This class overrides :class:`~transformers.RobertaConfig`. Please check the superclass for the appropriate
34 | documentation alongside usage examples.
35 | """
36 |
37 | model_type = "camembert"
38 |
--------------------------------------------------------------------------------
/transformers/configuration_mmbt.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 | # Copyright (c) HuggingFace Inc. team.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ MMBT configuration """
17 |
18 | from .utils import logging
19 |
20 |
21 | logger = logging.get_logger(__name__)
22 |
23 |
24 | class MMBTConfig(object):
25 | """
26 | This is the configuration class to store the configuration of a :class:`~transformers.MMBTModel`. It is used to
27 | instantiate a MMBT model according to the specified arguments, defining the model architecture.
28 |
29 | Args:
30 | config (:class:`~transformers.PreTrainedConfig`):
31 | Config of the underlying Transformer models. Its values are copied over to use a single config.
32 | num_labels (:obj:`int`, `optional`):
33 | Size of final Linear layer for classification.
34 | modal_hidden_size (:obj:`int`, `optional`, defaults to 2048):
35 | Embedding dimension of the non-text modality encoder.
36 | """
37 |
38 | def __init__(self, config, num_labels=None, modal_hidden_size=2048):
39 | self.__dict__ = config.__dict__
40 | self.modal_hidden_size = modal_hidden_size
41 | if num_labels:
42 | self.num_labels = num_labels
43 |
--------------------------------------------------------------------------------
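A small sketch of how MMBTConfig wraps an existing text-encoder config; BertConfig is used here only as an example of the underlying Transformer config.

from transformers import BertConfig
from transformers.configuration_mmbt import MMBTConfig

text_config = BertConfig()
config = MMBTConfig(text_config, num_labels=2, modal_hidden_size=2048)

print(config.hidden_size)        # copied over from the BERT config (768)
print(config.modal_hidden_size)  # 2048
print(config.num_labels)         # 2
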
/transformers/configuration_roberta.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ RoBERTa configuration """
17 |
18 | from .configuration_bert import BertConfig
19 | from .utils import logging
20 |
21 |
22 | logger = logging.get_logger(__name__)
23 |
24 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
25 | "roberta-base": "https://huggingface.co/roberta-base/resolve/main/config.json",
26 | "roberta-large": "https://huggingface.co/roberta-large/resolve/main/config.json",
27 | "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/config.json",
28 | "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/config.json",
29 | "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/config.json",
30 | "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/config.json",
31 | }
32 |
33 |
34 | class RobertaConfig(BertConfig):
35 | r"""
36 | This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel` or a
37 | :class:`~transformers.TFRobertaModel`. It is used to instantiate a RoBERTa model according to the specified
38 | arguments, defining the model architecture.
39 |
40 |
41 | Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
42 | outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
43 |
44 | The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`. It reuses the
45 | same defaults. Please check the parent class for more information.
46 |
47 | Examples::
48 |
49 | >>> from transformers import RobertaConfig, RobertaModel
50 |
51 | >>> # Initializing a RoBERTa configuration
52 | >>> configuration = RobertaConfig()
53 |
54 | >>> # Initializing a model from the configuration
55 | >>> model = RobertaModel(configuration)
56 |
57 | >>> # Accessing the model configuration
58 | >>> configuration = model.config
59 | """
60 | model_type = "roberta"
61 |
62 | def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs):
63 | """Constructs RobertaConfig."""
64 | super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
65 |
--------------------------------------------------------------------------------
/transformers/configuration_xlm_prophetnet.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """ XLM-ProphetNet model configuration """
16 |
17 |
18 | from .configuration_prophetnet import ProphetNetConfig
19 | from .utils import logging
20 |
21 |
22 | logger = logging.get_logger(__name__)
23 |
24 | XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
25 | "microsoft/xprophetnet-large-wiki100-cased": "https://huggingface.co/microsoft/xprophetnet-large-wiki100-cased/resolve/main/config.json",
26 | }
27 |
28 |
29 | class XLMProphetNetConfig(ProphetNetConfig):
30 | """
31 | This class overrides :class:`~transformers.ProphetNetConfig`. Please check the superclass for the appropriate
32 | documentation alongside usage examples.
33 | """
34 |
35 | model_type = "xlm-prophetnet"
36 |
--------------------------------------------------------------------------------
/transformers/configuration_xlm_roberta.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ XLM-RoBERTa configuration """
17 |
18 | from .configuration_roberta import RobertaConfig
19 | from .utils import logging
20 |
21 |
22 | logger = logging.get_logger(__name__)
23 |
24 | XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
25 | "xlm-roberta-base": "https://huggingface.co/xlm-roberta-base/resolve/main/config.json",
26 | "xlm-roberta-large": "https://huggingface.co/xlm-roberta-large/resolve/main/config.json",
27 | "xlm-roberta-large-finetuned-conll02-dutch": "https://huggingface.co/xlm-roberta-large-finetuned-conll02-dutch/resolve/main/config.json",
28 | "xlm-roberta-large-finetuned-conll02-spanish": "https://huggingface.co/xlm-roberta-large-finetuned-conll02-spanish/resolve/main/config.json",
29 | "xlm-roberta-large-finetuned-conll03-english": "https://huggingface.co/xlm-roberta-large-finetuned-conll03-english/resolve/main/config.json",
30 | "xlm-roberta-large-finetuned-conll03-german": "https://huggingface.co/xlm-roberta-large-finetuned-conll03-german/resolve/main/config.json",
31 | }
32 |
33 |
34 | class XLMRobertaConfig(RobertaConfig):
35 | """
36 | This class overrides :class:`~transformers.RobertaConfig`. Please check the superclass for the appropriate
37 | documentation alongside usage examples.
38 | """
39 |
40 | model_type = "xlm-roberta"
41 |
--------------------------------------------------------------------------------
/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert ALBERT checkpoint."""
16 |
17 |
18 | import argparse
19 |
20 | import torch
21 |
22 | from transformers import AlbertConfig, AlbertForPreTraining, load_tf_weights_in_albert
23 | from transformers.utils import logging
24 |
25 |
26 | logging.set_verbosity_info()
27 |
28 |
29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
30 | # Initialise PyTorch model
31 | config = AlbertConfig.from_json_file(albert_config_file)
32 | print("Building PyTorch model from configuration: {}".format(str(config)))
33 | model = AlbertForPreTraining(config)
34 |
35 | # Load weights from tf checkpoint
36 | load_tf_weights_in_albert(model, config, tf_checkpoint_path)
37 |
38 | # Save pytorch-model
39 | print("Save PyTorch model to {}".format(pytorch_dump_path))
40 | torch.save(model.state_dict(), pytorch_dump_path)
41 |
42 |
43 | if __name__ == "__main__":
44 | parser = argparse.ArgumentParser()
45 | # Required parameters
46 | parser.add_argument(
47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
48 | )
49 | parser.add_argument(
50 | "--albert_config_file",
51 | default=None,
52 | type=str,
53 | required=True,
54 | help="The config json file corresponding to the pre-trained ALBERT model. \n"
55 | "This specifies the model architecture.",
56 | )
57 | parser.add_argument(
58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
59 | )
60 | args = parser.parse_args()
61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path)
62 |
--------------------------------------------------------------------------------
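A sketch of calling the conversion entry point above directly from Python; all paths are placeholders for an ALBERT TF checkpoint, its config file, and the desired output file.

from transformers.convert_albert_original_tf_checkpoint_to_pytorch import (
    convert_tf_checkpoint_to_pytorch,
)

convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path="./albert_base/model.ckpt-best",
    albert_config_file="./albert_base/albert_config.json",
    pytorch_dump_path="./albert_base/pytorch_model.bin",
)
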
/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert BERT checkpoint."""
16 |
17 |
18 | import argparse
19 |
20 | import torch
21 |
22 | from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert
23 | from transformers.utils import logging
24 |
25 |
26 | logging.set_verbosity_info()
27 |
28 |
29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
30 | # Initialise PyTorch model
31 | config = BertConfig.from_json_file(bert_config_file)
32 | print("Building PyTorch model from configuration: {}".format(str(config)))
33 | model = BertForPreTraining(config)
34 |
35 | # Load weights from tf checkpoint
36 | load_tf_weights_in_bert(model, config, tf_checkpoint_path)
37 |
38 | # Save pytorch-model
39 | print("Save PyTorch model to {}".format(pytorch_dump_path))
40 | torch.save(model.state_dict(), pytorch_dump_path)
41 |
42 |
43 | if __name__ == "__main__":
44 | parser = argparse.ArgumentParser()
45 | # Required parameters
46 | parser.add_argument(
47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
48 | )
49 | parser.add_argument(
50 | "--bert_config_file",
51 | default=None,
52 | type=str,
53 | required=True,
54 | help="The config json file corresponding to the pre-trained BERT model. \n"
55 | "This specifies the model architecture.",
56 | )
57 | parser.add_argument(
58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
59 | )
60 | args = parser.parse_args()
61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path)
62 |
--------------------------------------------------------------------------------
/transformers/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | import torch
5 |
6 | from transformers.file_utils import WEIGHTS_NAME
7 |
8 |
9 | DIALOGPT_MODELS = ["small", "medium", "large"]
10 |
11 | OLD_KEY = "lm_head.decoder.weight"
12 | NEW_KEY = "lm_head.weight"
13 |
14 |
15 | def convert_dialogpt_checkpoint(checkpoint_path: str, pytorch_dump_folder_path: str):
16 | d = torch.load(checkpoint_path)
17 | d[NEW_KEY] = d.pop(OLD_KEY)
18 | os.makedirs(pytorch_dump_folder_path, exist_ok=True)
19 | torch.save(d, os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME))
20 |
21 |
22 | if __name__ == "__main__":
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument("--dialogpt_path", default=".", type=str)
25 | args = parser.parse_args()
26 | for MODEL in DIALOGPT_MODELS:
27 | checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl")
28 | pytorch_dump_folder_path = f"./DialoGPT-{MODEL}"
29 | convert_dialogpt_checkpoint(
30 | checkpoint_path,
31 | pytorch_dump_folder_path,
32 | )
33 |
--------------------------------------------------------------------------------
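The DialoGPT script above iterates over small/medium/large and expects {size}_ft.pkl files inside --dialogpt_path; a single conversion can also be run directly, with placeholder paths.

from transformers.convert_dialogpt_original_pytorch_checkpoint_to_pytorch import (
    convert_dialogpt_checkpoint,
)

# Renames lm_head.decoder.weight -> lm_head.weight and saves pytorch_model.bin.
convert_dialogpt_checkpoint(
    checkpoint_path="./dialogpt/medium_ft.pkl",
    pytorch_dump_folder_path="./DialoGPT-medium",
)
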
/transformers/convert_electra_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert ELECTRA checkpoint."""
16 |
17 |
18 | import argparse
19 |
20 | import torch
21 |
22 | from transformers import ElectraConfig, ElectraForMaskedLM, ElectraForPreTraining, load_tf_weights_in_electra
23 | from transformers.utils import logging
24 |
25 |
26 | logging.set_verbosity_info()
27 |
28 |
29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, discriminator_or_generator):
30 | # Initialise PyTorch model
31 | config = ElectraConfig.from_json_file(config_file)
32 | print("Building PyTorch model from configuration: {}".format(str(config)))
33 |
34 | if discriminator_or_generator == "discriminator":
35 | model = ElectraForPreTraining(config)
36 | elif discriminator_or_generator == "generator":
37 | model = ElectraForMaskedLM(config)
38 | else:
39 | raise ValueError("The discriminator_or_generator argument should be either 'discriminator' or 'generator'")
40 |
41 | # Load weights from tf checkpoint
42 | load_tf_weights_in_electra(
43 | model, config, tf_checkpoint_path, discriminator_or_generator=discriminator_or_generator
44 | )
45 |
46 | # Save pytorch-model
47 | print("Save PyTorch model to {}".format(pytorch_dump_path))
48 | torch.save(model.state_dict(), pytorch_dump_path)
49 |
50 |
51 | if __name__ == "__main__":
52 | parser = argparse.ArgumentParser()
53 | # Required parameters
54 | parser.add_argument(
55 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
56 | )
57 | parser.add_argument(
58 | "--config_file",
59 | default=None,
60 | type=str,
61 | required=True,
62 | help="The config json file corresponding to the pre-trained model. \n"
63 | "This specifies the model architecture.",
64 | )
65 | parser.add_argument(
66 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
67 | )
68 | parser.add_argument(
69 | "--discriminator_or_generator",
70 | default=None,
71 | type=str,
72 | required=True,
73 | help="Whether to export the generator or the discriminator. Should be a string, either 'discriminator' or "
74 | "'generator'.",
75 | )
76 | args = parser.parse_args()
77 | convert_tf_checkpoint_to_pytorch(
78 | args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path, args.discriminator_or_generator
79 | )
80 |
--------------------------------------------------------------------------------
/transformers/convert_funnel_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert Funnel checkpoint."""
16 |
17 |
18 | import argparse
19 | import logging
20 |
21 | import torch
22 |
23 | from transformers import FunnelConfig, FunnelForPreTraining, load_tf_weights_in_funnel
24 |
25 |
26 | logging.basicConfig(level=logging.INFO)
27 |
28 |
29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
30 | # Initialise PyTorch model
31 | config = FunnelConfig.from_json_file(config_file)
32 | print("Building PyTorch model from configuration: {}".format(str(config)))
33 | model = FunnelForPreTraining(config)
34 |
35 | # Load weights from tf checkpoint
36 | load_tf_weights_in_funnel(model, config, tf_checkpoint_path)
37 |
38 | # Save pytorch-model
39 | print("Save PyTorch model to {}".format(pytorch_dump_path))
40 | torch.save(model.state_dict(), pytorch_dump_path)
41 |
42 |
43 | if __name__ == "__main__":
44 | parser = argparse.ArgumentParser()
45 | # Required parameters
46 | parser.add_argument(
47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
48 | )
49 | parser.add_argument(
50 | "--config_file",
51 | default=None,
52 | type=str,
53 | required=True,
54 | help="The config json file corresponding to the pre-trained model. \n"
55 | "This specifies the model architecture.",
56 | )
57 | parser.add_argument(
58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
59 | )
60 | args = parser.parse_args()
61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path)
62 |
--------------------------------------------------------------------------------
/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert OpenAI GPT checkpoint."""
16 |
17 |
18 | import argparse
19 |
20 | import torch
21 |
22 | from transformers import CONFIG_NAME, WEIGHTS_NAME, GPT2Config, GPT2Model, load_tf_weights_in_gpt2
23 | from transformers.utils import logging
24 |
25 |
26 | logging.set_verbosity_info()
27 |
28 |
29 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path):
30 | # Construct model
31 | if gpt2_config_file == "":
32 | config = GPT2Config()
33 | else:
34 | config = GPT2Config.from_json_file(gpt2_config_file)
35 | model = GPT2Model(config)
36 |
37 | # Load weights from numpy
38 | load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path)
39 |
40 | # Save pytorch-model
41 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
42 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
43 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
44 | torch.save(model.state_dict(), pytorch_weights_dump_path)
45 | print("Save configuration file to {}".format(pytorch_config_dump_path))
46 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
47 | f.write(config.to_json_string())
48 |
49 |
50 | if __name__ == "__main__":
51 | parser = argparse.ArgumentParser()
52 | # Required parameters
53 | parser.add_argument(
54 | "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
55 | )
56 | parser.add_argument(
57 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
58 | )
59 | parser.add_argument(
60 | "--gpt2_config_file",
61 | default="",
62 | type=str,
63 | help="An optional config json file corresponding to the pre-trained OpenAI model. \n"
64 | "This specifies the model architecture.",
65 | )
66 | args = parser.parse_args()
67 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path)
68 |
--------------------------------------------------------------------------------
/transformers/convert_longformer_original_pytorch_lightning_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert RoBERTa checkpoint."""
16 |
17 |
18 | import argparse
19 |
20 | import pytorch_lightning as pl
21 | import torch
22 |
23 | from transformers.modeling_longformer import LongformerForQuestionAnswering, LongformerModel
24 |
25 |
26 | class LightningModel(pl.LightningModule):
27 | def __init__(self, model):
28 | super().__init__()
29 | self.model = model
30 | self.num_labels = 2
31 | self.qa_outputs = torch.nn.Linear(self.model.config.hidden_size, self.num_labels)
32 |
33 | # implement only because lightning requires to do so
34 | def forward(self):
35 | pass
36 |
37 |
38 | def convert_longformer_qa_checkpoint_to_pytorch(
39 | longformer_model: str, longformer_question_answering_ckpt_path: str, pytorch_dump_folder_path: str
40 | ):
41 |
42 | # load longformer model from model identifier
43 | longformer = LongformerModel.from_pretrained(longformer_model)
44 | lightning_model = LightningModel(longformer)
45 |
46 | ckpt = torch.load(longformer_question_answering_ckpt_path, map_location=torch.device("cpu"))
47 | lightning_model.load_state_dict(ckpt["state_dict"])
48 |
49 | # init longformer question answering model
50 | longformer_for_qa = LongformerForQuestionAnswering.from_pretrained(longformer_model)
51 |
52 | # transfer weights
53 | longformer_for_qa.longformer.load_state_dict(lightning_model.model.state_dict())
54 | longformer_for_qa.qa_outputs.load_state_dict(lightning_model.qa_outputs.state_dict())
55 | longformer_for_qa.eval()
56 |
57 | # save model
58 | longformer_for_qa.save_pretrained(pytorch_dump_folder_path)
59 |
60 | print("Conversion successful. Model saved under {}".format(pytorch_dump_folder_path))
61 |
62 |
63 | if __name__ == "__main__":
64 | parser = argparse.ArgumentParser()
65 | # Required parameters
66 | parser.add_argument(
67 | "--longformer_model",
68 | default=None,
69 | type=str,
70 | required=True,
71 | help="model identifier of longformer. Should be either `longformer-base-4096` or `longformer-large-4096`.",
72 | )
73 | parser.add_argument(
74 | "--longformer_question_answering_ckpt_path",
75 | default=None,
76 | type=str,
77 | required=True,
78 | help="Path the official PyTorch Lightning Checkpoint.",
79 | )
80 | parser.add_argument(
81 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
82 | )
83 | args = parser.parse_args()
84 | convert_longformer_qa_checkpoint_to_pytorch(
85 | args.longformer_model, args.longformer_question_answering_ckpt_path, args.pytorch_dump_folder_path
86 | )
87 |
--------------------------------------------------------------------------------
/transformers/convert_lxmert_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert LXMERT checkpoint."""
16 |
17 |
18 | import argparse
19 | import logging
20 |
21 | import torch
22 |
23 | from transformers import LxmertConfig, LxmertForPreTraining, load_tf_weights_in_lxmert
24 |
25 |
26 | logging.basicConfig(level=logging.INFO)
27 |
28 |
29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
30 | # Initialise PyTorch model
31 | config = LxmertConfig.from_json_file(config_file)
32 | print("Building PyTorch model from configuration: {}".format(str(config)))
33 | model = LxmertForPreTraining(config)
34 |
35 | # Load weights from tf checkpoint
36 | load_tf_weights_in_lxmert(model, config, tf_checkpoint_path)
37 |
38 | # Save pytorch-model
39 | print("Save PyTorch model to {}".format(pytorch_dump_path))
40 | torch.save(model.state_dict(), pytorch_dump_path)
41 |
42 |
43 | if __name__ == "__main__":
44 | parser = argparse.ArgumentParser()
45 | # Required parameters
46 | parser.add_argument(
47 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
48 | )
49 | parser.add_argument(
50 | "--config_file",
51 | default=None,
52 | type=str,
53 | required=True,
54 | help="The config json file corresponding to the pre-trained model. \n"
55 | "This specifies the model architecture.",
56 | )
57 | parser.add_argument(
58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
59 | )
60 | args = parser.parse_args()
61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path)
62 |
--------------------------------------------------------------------------------
/transformers/convert_mbart_original_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import torch
4 |
5 | from transformers import BartForConditionalGeneration, MBartConfig
6 |
7 | from .convert_bart_original_pytorch_checkpoint_to_pytorch import remove_ignore_keys_
8 |
9 |
10 | def convert_fairseq_mbart_checkpoint_from_disk(checkpoint_path, hf_config_path="facebook/mbart-large-en-ro"):
11 | state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
12 | remove_ignore_keys_(state_dict)
13 | vocab_size = state_dict["encoder.embed_tokens.weight"].shape[0]
14 | mbart_config = MBartConfig.from_pretrained(hf_config_path, vocab_size=vocab_size)
15 | state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
16 | model = BartForConditionalGeneration(mbart_config)
17 | model.model.load_state_dict(state_dict)
18 | return model
19 |
20 |
21 | if __name__ == "__main__":
22 | parser = argparse.ArgumentParser()
23 | # Required parameters
24 | parser.add_argument(
25 | "fairseq_path", type=str, help="bart.large, bart.large.cnn or a path to a model.pt on local filesystem."
26 | )
27 | parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
28 | parser.add_argument(
29 | "--hf_config",
30 | default="facebook/mbart-large-cc25",
31 | type=str,
32 | help="Which huggingface architecture to use: bart-large-xsum",
33 | )
34 | args = parser.parse_args()
35 | model = convert_fairseq_mbart_checkpoint_from_disk(args.fairseq_path, hf_config_path=args.hf_config)
36 | model.save_pretrained(args.pytorch_dump_folder_path)
37 |
--------------------------------------------------------------------------------
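A sketch of the mBART conversion used programmatically; the fairseq checkpoint path and output directory are placeholders, and hf_config_path is set to the same value as the CLI's --hf_config default.

from transformers.convert_mbart_original_checkpoint_to_pytorch import (
    convert_fairseq_mbart_checkpoint_from_disk,
)

model = convert_fairseq_mbart_checkpoint_from_disk(
    "./mbart.cc25/model.pt", hf_config_path="facebook/mbart-large-cc25"
)
model.save_pretrained("./mbart-large-cc25-converted")
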
/transformers/convert_mobilebert_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import torch
4 |
5 | from transformers import MobileBertConfig, MobileBertForPreTraining, load_tf_weights_in_mobilebert
6 | from transformers.utils import logging
7 |
8 |
9 | logging.set_verbosity_info()
10 |
11 |
12 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, mobilebert_config_file, pytorch_dump_path):
13 | # Initialise PyTorch model
14 | config = MobileBertConfig.from_json_file(mobilebert_config_file)
15 | print("Building PyTorch model from configuration: {}".format(str(config)))
16 | model = MobileBertForPreTraining(config)
17 | # Load weights from tf checkpoint
18 | model = load_tf_weights_in_mobilebert(model, config, tf_checkpoint_path)
19 | # Save pytorch-model
20 | print("Save PyTorch model to {}".format(pytorch_dump_path))
21 | torch.save(model.state_dict(), pytorch_dump_path)
22 |
23 |
24 | if __name__ == "__main__":
25 | parser = argparse.ArgumentParser()
26 | # Required parameters
27 | parser.add_argument(
28 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
29 | )
30 | parser.add_argument(
31 | "--mobilebert_config_file",
32 | default=None,
33 | type=str,
34 | required=True,
35 | help="The config json file corresponding to the pre-trained MobileBERT model. \n"
36 | "This specifies the model architecture.",
37 | )
38 | parser.add_argument(
39 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
40 | )
41 | args = parser.parse_args()
42 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.mobilebert_config_file, args.pytorch_dump_path)
43 |
--------------------------------------------------------------------------------
/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert OpenAI GPT checkpoint."""
16 |
17 |
18 | import argparse
19 |
20 | import torch
21 |
22 | from transformers import CONFIG_NAME, WEIGHTS_NAME, OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt
23 | from transformers.utils import logging
24 |
25 |
26 | logging.set_verbosity_info()
27 |
28 |
29 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path):
30 | # Construct model
31 | if openai_config_file == "":
32 | config = OpenAIGPTConfig()
33 | else:
34 | config = OpenAIGPTConfig.from_json_file(openai_config_file)
35 | model = OpenAIGPTModel(config)
36 |
37 | # Load weights from numpy
38 | load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path)
39 |
40 | # Save pytorch-model
41 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
42 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
43 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
44 | torch.save(model.state_dict(), pytorch_weights_dump_path)
45 | print("Save configuration file to {}".format(pytorch_config_dump_path))
46 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
47 | f.write(config.to_json_string())
48 |
49 |
50 | if __name__ == "__main__":
51 | parser = argparse.ArgumentParser()
52 | # Required parameters
53 | parser.add_argument(
54 | "--openai_checkpoint_folder_path",
55 | default=None,
56 | type=str,
57 | required=True,
58 | help="Path to the TensorFlow checkpoint path.",
59 | )
60 | parser.add_argument(
61 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
62 | )
63 | parser.add_argument(
64 | "--openai_config_file",
65 | default="",
66 | type=str,
67 | help="An optional config json file corresponding to the pre-trained OpenAI model. \n"
68 | "This specifies the model architecture.",
69 | )
70 | args = parser.parse_args()
71 | convert_openai_checkpoint_to_pytorch(
72 | args.openai_checkpoint_folder_path, args.openai_config_file, args.pytorch_dump_folder_path
73 | )
74 |
--------------------------------------------------------------------------------
/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The T5 authors and HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert T5 checkpoint."""
16 |
17 |
18 | import argparse
19 |
20 | from transformers import T5Config, T5Model, load_tf_weights_in_t5
21 | from transformers.utils import logging
22 |
23 |
24 | logging.set_verbosity_info()
25 |
26 |
27 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
28 | # Initialise PyTorch model
29 | config = T5Config.from_json_file(config_file)
30 | print("Building PyTorch model from configuration: {}".format(str(config)))
31 | model = T5Model(config)
32 |
33 | # Load weights from tf checkpoint
34 | load_tf_weights_in_t5(model, config, tf_checkpoint_path)
35 |
36 | # Save pytorch-model
37 | print("Save PyTorch model to {}".format(pytorch_dump_path))
38 | model.save_pretrained(pytorch_dump_path)
39 |
40 |
41 | if __name__ == "__main__":
42 | parser = argparse.ArgumentParser()
43 | # Required parameters
44 | parser.add_argument(
45 | "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
46 | )
47 | parser.add_argument(
48 | "--config_file",
49 | default=None,
50 | type=str,
51 | required=True,
52 | help="The config json file corresponding to the pre-trained T5 model. \n"
53 | "This specifies the model architecture.",
54 | )
55 | parser.add_argument(
56 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
57 | )
58 | args = parser.parse_args()
59 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path)
60 |
--------------------------------------------------------------------------------
/transformers/convert_tf_hub_seq_to_seq_bert_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert Seq2Seq TF Hub checkpoint."""
16 |
17 |
18 | import argparse
19 |
20 | from transformers import (
21 | BertConfig,
22 | BertGenerationConfig,
23 | BertGenerationDecoder,
24 | BertGenerationEncoder,
25 | load_tf_weights_in_bert_generation,
26 | logging,
27 | )
28 |
29 |
30 | logging.set_verbosity_info()
31 |
32 |
33 | def convert_tf_checkpoint_to_pytorch(tf_hub_path, pytorch_dump_path, is_encoder_named_decoder, vocab_size, is_encoder):
34 | # Initialise PyTorch model
35 | bert_config = BertConfig.from_pretrained(
36 | "bert-large-cased",
37 | vocab_size=vocab_size,
38 | max_position_embeddings=512,
39 | is_decoder=True,
40 | add_cross_attention=True,
41 | )
42 | bert_config_dict = bert_config.to_dict()
43 | del bert_config_dict["type_vocab_size"]
44 | config = BertGenerationConfig(**bert_config_dict)
45 | if is_encoder:
46 | model = BertGenerationEncoder(config)
47 | else:
48 | model = BertGenerationDecoder(config)
49 | print("Building PyTorch model from configuration: {}".format(str(config)))
50 |
51 | # Load weights from tf checkpoint
52 | load_tf_weights_in_bert_generation(
53 | model,
54 | tf_hub_path,
55 | model_class="bert",
56 | is_encoder_named_decoder=is_encoder_named_decoder,
57 | is_encoder=is_encoder,
58 | )
59 |
60 | # Save pytorch-model
61 | print("Save PyTorch model and config to {}".format(pytorch_dump_path))
62 | model.save_pretrained(pytorch_dump_path)
63 |
64 |
65 | if __name__ == "__main__":
66 | parser = argparse.ArgumentParser()
67 | # Required parameters
68 | parser.add_argument(
69 | "--tf_hub_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
70 | )
71 | parser.add_argument(
72 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
73 | )
74 | parser.add_argument(
75 | "--is_encoder_named_decoder",
76 | action="store_true",
77 | help="If decoder has to be renamed to encoder in PyTorch model.",
78 | )
79 | parser.add_argument("--is_encoder", action="store_true", help="If model is an encoder.")
80 | parser.add_argument("--vocab_size", default=50358, type=int, help="Vocab size of model")
81 | args = parser.parse_args()
82 | convert_tf_checkpoint_to_pytorch(
83 | args.tf_hub_path,
84 | args.pytorch_dump_path,
85 | args.is_encoder_named_decoder,
86 | args.vocab_size,
87 | is_encoder=args.is_encoder,
88 | )
89 |
--------------------------------------------------------------------------------
/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert OpenAI GPT checkpoint."""
16 |
17 |
18 | import argparse
19 | import json
20 |
21 | import numpy
22 | import torch
23 |
24 | from transformers import CONFIG_NAME, WEIGHTS_NAME
25 | from transformers.tokenization_xlm import VOCAB_FILES_NAMES
26 | from transformers.utils import logging
27 |
28 |
29 | logging.set_verbosity_info()
30 |
31 |
32 | def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path):
33 | # Load checkpoint
34 | chkpt = torch.load(xlm_checkpoint_path, map_location="cpu")
35 |
36 | state_dict = chkpt["model"]
37 |
38 | # We have the base model one level deeper than the original XLM repository
39 | two_levels_state_dict = {}
40 | for k, v in state_dict.items():
41 | if "pred_layer" in k:
42 | two_levels_state_dict[k] = v
43 | else:
44 | two_levels_state_dict["transformer." + k] = v
45 |
46 | config = chkpt["params"]
47 | config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray)))
48 |
49 | vocab = chkpt["dico_word2id"]
50 | vocab = dict((s + "" if s.find("@@") == -1 and i > 13 else s.replace("@@", ""), i) for s, i in vocab.items())
51 |
52 | # Save pytorch-model
53 | pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
54 | pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
55 | pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["vocab_file"]
56 |
57 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
58 | torch.save(two_levels_state_dict, pytorch_weights_dump_path)
59 |
60 | print("Save configuration file to {}".format(pytorch_config_dump_path))
61 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
62 | f.write(json.dumps(config, indent=2) + "\n")
63 |
64 | print("Save vocab file to {}".format(pytorch_vocab_dump_path))
65 | with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f:
66 | f.write(json.dumps(vocab, indent=2) + "\n")
67 |
68 |
69 | if __name__ == "__main__":
70 | parser = argparse.ArgumentParser()
71 | # Required parameters
72 | parser.add_argument(
73 | "--xlm_checkpoint_path", default=None, type=str, required=True, help="Path to the official PyTorch dump."
74 | )
75 | parser.add_argument(
76 | "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
77 | )
78 | args = parser.parse_args()
79 | convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path)
80 |
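
The only non-obvious step in this converter is the key remapping: every parameter except the prediction layer is prefixed with `transformer.` so the flat names from the original XLM checkpoint line up with the nesting of the HuggingFace XLM model. A standalone sketch of that rule on a toy state dict (tensor names and shapes are illustrative only):

    import torch

    # Toy stand-in for chkpt["model"]; a real checkpoint holds full XLM tensors.
    state_dict = {
        "pred_layer.proj.weight": torch.zeros(3, 2),
        "embeddings.weight": torch.zeros(3, 2),
        "attentions.0.q_lin.weight": torch.zeros(2, 2),
    }

    # Same rule as above: prediction-layer weights stay at the top level,
    # everything else moves one level deeper under "transformer.".
    two_levels_state_dict = {}
    for k, v in state_dict.items():
        if "pred_layer" in k:
            two_levels_state_dict[k] = v
        else:
            two_levels_state_dict["transformer." + k] = v

    print(sorted(two_levels_state_dict))
    # ['pred_layer.proj.weight', 'transformer.attentions.0.q_lin.weight', 'transformer.embeddings.weight']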
--------------------------------------------------------------------------------
/transformers/data/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this
3 | # module, but to preserve other warnings. So, don't check this module at all.
4 |
5 | from .metrics import glue_compute_metrics, xnli_compute_metrics
6 | from .processors import (
7 | DataProcessor,
8 | InputExample,
9 | InputFeatures,
10 | SingleSentenceClassificationProcessor,
11 | SquadExample,
12 | SquadFeatures,
13 | SquadV1Processor,
14 | SquadV2Processor,
15 | glue_convert_examples_to_features,
16 | glue_output_modes,
17 | glue_processors,
18 | glue_tasks_num_labels,
19 | squad_convert_examples_to_features,
20 | xnli_output_modes,
21 | xnli_processors,
22 | xnli_tasks_num_labels,
23 | factcheck_convert_examples_to_features,
24 | # factcheck_output_modes,
25 | factcheck_processors,
26 | )
27 |
28 | from .metrics import glue_compute_metrics, xnli_compute_metrics, factcheck_compute_metrics
29 |
--------------------------------------------------------------------------------
/transformers/data/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/data/__pycache__/data_collator.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/__pycache__/data_collator.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/data/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this
3 | # module, but to preserve other warnings. So, don't check this module at all.
4 |
5 | from .glue import GlueDataset, GlueDataTrainingArguments
6 | from .language_modeling import (
7 | LineByLineTextDataset,
8 | LineByLineWithRefDataset,
9 | LineByLineWithSOPTextDataset,
10 | TextDataset,
11 | TextDatasetForNextSentencePrediction,
12 | )
13 | from .squad import SquadDataset, SquadDataTrainingArguments
14 |
--------------------------------------------------------------------------------
/transformers/data/datasets/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/datasets/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/data/datasets/__pycache__/glue.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/datasets/__pycache__/glue.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/data/datasets/__pycache__/language_modeling.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/datasets/__pycache__/language_modeling.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/data/datasets/__pycache__/lm.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/datasets/__pycache__/lm.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/data/datasets/__pycache__/squad.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/datasets/__pycache__/squad.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/data/metrics/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/metrics/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/data/processors/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this
3 | # module, but to preserve other warnings. So, don't check this module at all.
4 |
5 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels
6 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features
7 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor
8 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
9 | from .factcheck import factcheck_convert_examples_to_features, factcheck_processors, DatasetForClassification
10 |
--------------------------------------------------------------------------------
/transformers/data/processors/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/processors/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/data/processors/__pycache__/factcheck.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/processors/__pycache__/factcheck.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/data/processors/__pycache__/glue.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/processors/__pycache__/glue.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/data/processors/__pycache__/squad.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/processors/__pycache__/squad.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/data/processors/__pycache__/utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/processors/__pycache__/utils.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/data/processors/__pycache__/xnli.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/data/processors/__pycache__/xnli.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/data/test_generation_utils.py:
--------------------------------------------------------------------------------
1 | import random
2 | import unittest
3 |
4 | import timeout_decorator
5 |
6 | from transformers import is_torch_available
7 | from transformers.file_utils import cached_property
8 | from transformers.testing_utils import require_torch
9 |
10 |
11 | if is_torch_available():
12 | import torch
13 |
14 | from transformers import MarianConfig, MarianMTModel
15 |
16 |
17 | @require_torch
18 | class GenerationUtilsTest(unittest.TestCase):
19 | @cached_property
20 | def config(self):
21 | config = MarianConfig.from_pretrained("sshleifer/tiny-marian-en-de")
22 | return config
23 |
24 | @cached_property
25 | def model(self):
26 | return MarianMTModel(self.config)
27 |
28 | def test_postprocess_next_token_scores(self):
29 | config = self.config
30 | model = self.model
31 | # Initialize an input id tensor with batch size 8 and sequence length 12
32 | input_ids = torch.arange(0, 96, 1).view((8, 12))
33 | eos = config.eos_token_id
34 | bad_words_ids_test_cases = [[[299]], [[23, 24], [54]], [[config.eos_token_id]], []]
35 | masked_scores = [
36 | [(0, 299), (1, 299), (2, 299), (3, 299), (4, 299), (5, 299), (6, 299), (7, 299)],
37 | [(1, 24), (0, 54), (1, 54), (2, 54), (3, 54), (4, 54), (5, 54), (6, 54), (7, 54)],
38 | [(0, eos), (1, eos), (2, eos), (3, eos), (4, eos), (5, eos), (6, eos), (7, eos)],
39 | [],
40 | ]
41 |
42 | for test_case_index, bad_words_ids in enumerate(bad_words_ids_test_cases):
43 | # Initialize a scores tensor with batch size 8 and vocabulary size 300
44 | scores = torch.rand((8, 300))
45 | output = model.postprocess_next_token_scores(
46 | scores,
47 | input_ids,
48 | 0,
49 | bad_words_ids,
50 | 13,
51 | 15,
52 | config.max_length,
53 | config.eos_token_id,
54 | config.repetition_penalty,
55 | 32,
56 | 5,
57 | )
58 | for masked_score in masked_scores[test_case_index]:
59 | self.assertTrue(output[masked_score[0], masked_score[1]] == -float("inf"))
60 |
61 | @timeout_decorator.timeout(10)
62 | def test_postprocess_next_token_scores_large_bad_words_list(self):
63 |
64 | config = self.config
65 | model = self.model
66 | # Initialize an input id tensor with batch size 8 and sequence length 12
67 | input_ids = torch.arange(0, 96, 1).view((8, 12))
68 |
69 | bad_words_ids = []
70 | for _ in range(100):
71 | length_bad_word = random.randint(1, 4)
72 | bad_words_ids.append(random.sample(range(1, 300), length_bad_word))
73 |
74 | scores = torch.rand((8, 300))
75 | _ = model.postprocess_next_token_scores(
76 | scores,
77 | input_ids,
78 | 0,
79 | bad_words_ids,
80 | 13,
81 | 15,
82 | config.max_length,
83 | config.eos_token_id,
84 | config.repetition_penalty,
85 | 32,
86 | 5,
87 | )
88 |
--------------------------------------------------------------------------------
/transformers/modeling_blenderbot.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # coding=utf-8
3 | # Copyright (c) Facebook, Inc. and its affiliates.
4 | #
5 | # This source code is licensed under the MIT license found in the;
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | # LICENSE file in the root directory of this source tree.
17 | """BlenderbotForConditionalGeneration which inherits from BART"""
18 |
19 | import torch
20 |
21 | from .configuration_blenderbot import BlenderbotConfig
22 | from .file_utils import add_start_docstrings
23 | from .modeling_bart import BartForConditionalGeneration
24 |
25 |
26 | BLENDER_START_DOCSTRING = r"""
27 |
28 | This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
29 | methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
30 | pruning heads etc.)
31 |
32 | This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
33 | subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
34 | general usage and behavior.
35 |
36 | """
37 |
38 | BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST = ["facebook/blenderbot-3B", "facebook/blenderbot-90M"]
39 |
40 |
41 | @add_start_docstrings(
42 | "The BART Model with a language modeling head. Can be used for summarization.", BLENDER_START_DOCSTRING
43 | )
44 | class BlenderbotForConditionalGeneration(BartForConditionalGeneration):
45 | """
46 | This class overrides :class:`~transformers.BartForConditionalGeneration`. Please check the superclass for the
47 | appropriate documentation alongside usage examples.
48 | """
49 |
50 | config_class = BlenderbotConfig
51 |
52 | def adjust_logits_during_generation(self, logits, cur_len, max_length):
53 | logits[:, self.config.bos_token_id] = -torch.finfo(torch.float16).max # near infinity fp16
54 | if cur_len == max_length - 1 and self.config.eos_token_id is not None:
55 | self._force_token_id_to_be_generated(logits, self.config.eos_token_id)
56 | return logits
57 |
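
A note on the logit adjustment above: rather than a literal float("-inf"), the BOS logit is set to the most negative finite half-precision value, which keeps fp16 arithmetic well-behaved while still driving that token's probability to essentially zero after softmax. A tiny illustration with made-up numbers:

    import torch

    neg_near_inf = -torch.finfo(torch.float16).max  # -65504.0, the most negative finite fp16 value

    logits = torch.tensor([[2.0, 1.0, 0.5]], dtype=torch.float16)
    logits[:, 0] = neg_near_inf  # suppress token 0 (standing in for bos_token_id)

    probs = torch.softmax(logits.float(), dim=-1)
    print(probs)  # token 0 ends up with ~0 probability; the rest renormalize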
--------------------------------------------------------------------------------
/transformers/modeling_marian.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 Marian Team Authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """PyTorch MarianMTModel model, ported from the Marian C++ repo."""
16 |
17 |
18 | from .configuration_marian import MarianConfig
19 | from .modeling_bart import BartForConditionalGeneration
20 |
21 |
22 | # See all Marian models at https://huggingface.co/models?search=Helsinki-NLP
23 |
24 |
25 | class MarianMTModel(BartForConditionalGeneration):
26 | r"""
27 | Pytorch version of marian-nmt's transformer.h (c++). Designed for the OPUS-NMT translation checkpoints. Available
28 | models are listed `here <https://huggingface.co/models?search=Helsinki-NLP>`__.
29 |
30 | This class overrides :class:`~transformers.BartForConditionalGeneration`. Please check the superclass for the
31 | appropriate documentation alongside usage examples.
32 |
33 | Examples::
34 |
35 | >>> from transformers import MarianTokenizer, MarianMTModel
36 | >>> from typing import List
37 | >>> src = 'fr' # source language
38 | >>> trg = 'en' # target language
39 | >>> sample_text = "où est l'arrêt de bus ?"
40 | >>> mname = f'Helsinki-NLP/opus-mt-{src}-{trg}'
41 |
42 | >>> model = MarianMTModel.from_pretrained(mname)
43 | >>> tok = MarianTokenizer.from_pretrained(mname)
44 | >>> batch = tok.prepare_seq2seq_batch(src_texts=[sample_text]) # don't need tgt_text for inference
45 | >>> gen = model.generate(**batch) # for forward pass: model(**batch)
46 | >>> words: List[str] = tok.batch_decode(gen, skip_special_tokens=True) # returns "Where is the bus stop ?"
47 |
48 | """
49 | config_class = MarianConfig
50 | authorized_missing_keys = [
51 | "model.encoder.embed_positions.weight",
52 | "model.decoder.embed_positions.weight",
53 | ]
54 | keys_to_never_save = [
55 | "model.encoder.embed_positions.weight",
56 | "model.decoder.embed_positions.weight",
57 | ]
58 |
59 | def adjust_logits_during_generation(self, logits, cur_len, max_length):
60 | logits[:, self.config.pad_token_id] = float("-inf") # never predict pad token.
61 | if cur_len == max_length - 1 and self.config.eos_token_id is not None:
62 | self._force_token_id_to_be_generated(logits, self.config.eos_token_id)
63 | return logits
64 |
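
The hook above is the whole Marian-specific decoding logic: the pad token is banned at every step, and on the final step everything except EOS is masked, so that `_force_token_id_to_be_generated` (defined in the BART base class) leaves EOS as the only choice. A toy sketch of the same masking outside the model, with assumed ids pad_token_id=0 and eos_token_id=2:

    import torch

    pad_token_id, eos_token_id = 0, 2   # assumed ids for this sketch
    logits = torch.randn(1, 5)          # (batch, vocab) scores for one decoding step

    # Never predict the pad token.
    logits[:, pad_token_id] = float("-inf")

    # On the last step (cur_len == max_length - 1) keep only EOS.
    cur_len, max_length = 9, 10
    if cur_len == max_length - 1:
        mask = torch.full_like(logits, float("-inf"))
        mask[:, eos_token_id] = 0.0
        logits = logits + mask

    print(logits.argmax(dim=-1))  # tensor([2]) -> EOS is the only token left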
--------------------------------------------------------------------------------
/transformers/modeling_mbart.py:
--------------------------------------------------------------------------------
1 | from .configuration_mbart import MBartConfig
2 | from .modeling_bart import BartForConditionalGeneration
3 |
4 |
5 | _CONFIG_FOR_DOC = "MBartConfig"
6 | _TOKENIZER_FOR_DOC = "MBartTokenizer"
7 |
8 | MBART_PRETRAINED_MODEL_ARCHIVE_LIST = [
9 | "facebook/mbart-large-cc25",
10 | "facebook/mbart-large-en-ro",
11 | # See all multilingual BART models at https://huggingface.co/models?filter=mbart
12 | ]
13 |
14 |
15 | class MBartForConditionalGeneration(BartForConditionalGeneration):
16 | r"""
17 | This class overrides :class:`~transformers.BartForConditionalGeneration`. Please check the superclass for the
18 | appropriate documentation alongside usage examples.
19 |
20 | Examples::
21 | >>> from transformers import MBartForConditionalGeneration, MBartTokenizer
22 | >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
23 | >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")
24 | >>> article = "UN Chief Says There Is No Military Solution in Syria"
25 | >>> batch = tokenizer.prepare_seq2seq_batch(src_texts=[article])
26 | >>> translated_tokens = model.generate(**batch)
27 | >>> translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
28 | >>> assert translation == "Şeful ONU declară că nu există o soluţie militară în Siria"
29 | """
30 | model_type = "mbart"
31 | config_class = MBartConfig
32 | authorized_missing_keys = [
33 | "model.encoder.embed_positions.weight",
34 | "model.decoder.embed_positions.weight",
35 | ]
36 | keys_to_never_save = [
37 | "model.encoder.embed_positions.weight",
38 | "model.decoder.embed_positions.weight",
39 | ]
40 |
--------------------------------------------------------------------------------
/transformers/modeling_pegasus.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 Google and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """PyTorch Pegasus model, ported from https://github.com/google-research/pegasus"""
16 |
17 |
18 | from .configuration_pegasus import PegasusConfig
19 | from .file_utils import add_start_docstrings
20 | from .modeling_bart import BART_START_DOCSTRING, BartForConditionalGeneration
21 |
22 |
23 | @add_start_docstrings("The Pegasus Model for summarization ", BART_START_DOCSTRING)
24 | class PegasusForConditionalGeneration(BartForConditionalGeneration):
25 | r"""
26 | Pytorch version of google's pegasus model for summarization. Available models are listed `here
27 | <https://huggingface.co/models?search=pegasus>`__.
28 |
29 | This class overrides :class:`~transformers.BartForConditionalGeneration`. Please check the superclass for the
30 | appropriate documentation alongside usage examples.
31 |
32 | Examples::
33 |
34 | >>> from transformers import PegasusTokenizer, PegasusForConditionalGeneration
35 | >>> from typing import List
36 | >>> PGE_ARTICLE = "PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
37 | >>> mname = "google/pegasus-xsum"
38 |
39 | >>> model = PegasusForConditionalGeneration.from_pretrained(mname)
40 | >>> tok = PegasusTokenizer.from_pretrained(mname)
41 | >>> batch = tok.prepare_seq2seq_batch(src_texts=[PGE_ARTICLE]) # don't need tgt_text for inference
42 | >>> gen = model.generate(**batch) # for forward pass: model(**batch)
43 | >>> summary: List[str] = tok.batch_decode(gen, skip_special_tokens=True)
44 | >>> assert summary == "California's largest electricity provider has turned off power to tens of thousands of customers."
45 |
46 | """
47 | # All the code is in src/transformers/modeling_bart.py
48 | config_class = PegasusConfig
49 | authorized_missing_keys = [
50 | r"final_logits_bias",
51 | r"encoder\.version",
52 | r"decoder\.version",
53 | "model.encoder.embed_positions",
54 | "model.decoder.embed_positions",
55 | ]
56 | keys_to_never_save = [
57 | "model.encoder.embed_positions.weight",
58 | "model.decoder.embed_positions.weight",
59 | ]
60 |
--------------------------------------------------------------------------------
/transformers/modeling_tf_blenderbot.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """TF BlenderBot model, ported from the fairseq repo."""
16 | from .configuration_blenderbot import BlenderbotConfig
17 | from .file_utils import add_start_docstrings, is_tf_available
18 | from .modeling_tf_bart import BART_START_DOCSTRING, LARGE_NEGATIVE, TFBartForConditionalGeneration
19 | from .utils import logging
20 |
21 |
22 | if is_tf_available():
23 | import tensorflow as tf
24 |
25 |
26 | _CONFIG_FOR_DOC = "BlenderbotConfig"
27 |
28 | START_DOCSTRING = BART_START_DOCSTRING.replace(
29 | "inherits from :class:`~transformers.TFPreTrainedModel`",
30 | "inherits from :class:`~transformers.TFBartForConditionalGeneration`",
31 | ).replace("BartConfig", _CONFIG_FOR_DOC)
32 |
33 |
34 | logger = logging.get_logger(__name__)
35 |
36 |
37 | @add_start_docstrings("Blenderbot model for open domain dialogue", START_DOCSTRING)
38 | class TFBlenderbotForConditionalGeneration(TFBartForConditionalGeneration):
39 | config_class = BlenderbotConfig
40 |
41 | def adjust_logits_during_generation(self, logits, cur_len, max_length):
42 | """Never predict pad_token_id. Predict </s> when max_length is reached."""
43 | vocab_range = tf.constant(range(self.config.vocab_size))
44 | logits = tf.where(vocab_range == self.config.pad_token_id, LARGE_NEGATIVE, logits)
45 | if cur_len == max_length - 1:
46 | logits = tf.where(vocab_range != self.config.eos_token_id, LARGE_NEGATIVE, logits)
47 | return logits
48 |
--------------------------------------------------------------------------------
/transformers/modeling_tf_marian.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """TF Marian model, ported from the fairseq repo."""
16 |
17 | from .configuration_marian import MarianConfig
18 | from .file_utils import add_start_docstrings, is_tf_available
19 | from .modeling_tf_bart import BART_START_DOCSTRING, LARGE_NEGATIVE, TFBartForConditionalGeneration
20 | from .utils import logging
21 |
22 |
23 | if is_tf_available():
24 | import tensorflow as tf
25 |
26 |
27 | _CONFIG_FOR_DOC = "MarianConfig"
28 |
29 | START_DOCSTRING = BART_START_DOCSTRING.replace(
30 | "inherits from :class:`~transformers.TFPreTrainedModel`",
31 | "inherits from :class:`~transformers.TFBartForConditionalGeneration`",
32 | ).replace("BartConfig", _CONFIG_FOR_DOC)
33 |
34 |
35 | logger = logging.get_logger(__name__)
36 |
37 |
38 | @add_start_docstrings("Marian model for machine translation", START_DOCSTRING)
39 | class TFMarianMTModel(TFBartForConditionalGeneration):
40 | authorized_missing_keys = [
41 | r"model.encoder.embed_positions.weight",
42 | r"model.decoder.embed_positions.weight",
43 | ]
44 | config_class = MarianConfig
45 |
46 | def adjust_logits_during_generation(self, logits, cur_len, max_length):
47 | """Never predict pad_token_id. Predict </s> when max_length is reached."""
48 | vocab_range = tf.constant(range(self.config.vocab_size))
49 | logits = tf.where(vocab_range == self.config.pad_token_id, LARGE_NEGATIVE, logits)
50 | if cur_len == max_length - 1:
51 | logits = tf.where(vocab_range != self.config.eos_token_id, LARGE_NEGATIVE, logits)
52 | return logits
53 |
--------------------------------------------------------------------------------
/transformers/modeling_tf_mbart.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """TF mBART model, originally from fairseq."""
16 | from .configuration_mbart import MBartConfig
17 | from .file_utils import add_start_docstrings
18 | from .modeling_tf_bart import BART_START_DOCSTRING, TFBartForConditionalGeneration
19 | from .utils import logging
20 |
21 |
22 | _CONFIG_FOR_DOC = "MBartConfig"
23 |
24 | START_DOCSTRING = BART_START_DOCSTRING.replace(
25 | "inherits from :class:`~transformers.TFPreTrainedModel`",
26 | "inherits from :class:`~transformers.TFBartForConditionalGeneration`",
27 | ).replace("BartConfig", _CONFIG_FOR_DOC)
28 |
29 |
30 | logger = logging.get_logger(__name__)
31 |
32 |
33 | @add_start_docstrings("mBART (multilingual BART) model for machine translation", START_DOCSTRING)
34 | class TFMBartForConditionalGeneration(TFBartForConditionalGeneration):
35 | config_class = MBartConfig
36 | # All the code is in src/transformers/modeling_tf_bart.py
37 |
--------------------------------------------------------------------------------
/transformers/modeling_tf_pegasus.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """TF Pegasus model, ported from the fairseq repo."""
16 | from .configuration_pegasus import PegasusConfig
17 | from .file_utils import add_start_docstrings
18 | from .modeling_tf_bart import BART_START_DOCSTRING, TFBartForConditionalGeneration
19 | from .utils import logging
20 |
21 |
22 | _CONFIG_FOR_DOC = "PegasusConfig"
23 |
24 | START_DOCSTRING = BART_START_DOCSTRING.replace(
25 | "inherits from :class:`~transformers.TFPreTrainedModel`",
26 | "inherits from :class:`~transformers.TFBartForConditionalGeneration`",
27 | ).replace("BartConfig", _CONFIG_FOR_DOC)
28 |
29 |
30 | logger = logging.get_logger(__name__)
31 |
32 |
33 | @add_start_docstrings("Pegasus model for summarization", START_DOCSTRING)
34 | class TFPegasusForConditionalGeneration(TFBartForConditionalGeneration):
35 | authorized_missing_keys = [
36 | r"final_logits_bias",
37 | r"model.encoder.embed_positions.weight",
38 | r"model.decoder.embed_positions.weight",
39 | ]
40 | config_class = PegasusConfig
41 | # All the code is in src/transformers/modeling_tf_bart.py
42 |
--------------------------------------------------------------------------------
/transformers/tokenization_distilbert.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes for DistilBERT."""
16 |
17 | from .tokenization_bert import BertTokenizer
18 | from .utils import logging
19 |
20 |
21 | logger = logging.get_logger(__name__)
22 |
23 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
24 |
25 | PRETRAINED_VOCAB_FILES_MAP = {
26 | "vocab_file": {
27 | "distilbert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
28 | "distilbert-base-uncased-distilled-squad": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt",
29 | "distilbert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt",
30 | "distilbert-base-cased-distilled-squad": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt",
31 | "distilbert-base-german-cased": "https://huggingface.co/distilbert-base-german-cased/resolve/main/vocab.txt",
32 | "distilbert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt",
33 | }
34 | }
35 |
36 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
37 | "distilbert-base-uncased": 512,
38 | "distilbert-base-uncased-distilled-squad": 512,
39 | "distilbert-base-cased": 512,
40 | "distilbert-base-cased-distilled-squad": 512,
41 | "distilbert-base-german-cased": 512,
42 | "distilbert-base-multilingual-cased": 512,
43 | }
44 |
45 |
46 | PRETRAINED_INIT_CONFIGURATION = {
47 | "distilbert-base-uncased": {"do_lower_case": True},
48 | "distilbert-base-uncased-distilled-squad": {"do_lower_case": True},
49 | "distilbert-base-cased": {"do_lower_case": False},
50 | "distilbert-base-cased-distilled-squad": {"do_lower_case": False},
51 | "distilbert-base-german-cased": {"do_lower_case": False},
52 | "distilbert-base-multilingual-cased": {"do_lower_case": False},
53 | }
54 |
55 |
56 | class DistilBertTokenizer(BertTokenizer):
57 | r"""
58 | Construct a DistilBERT tokenizer.
59 |
60 | :class:`~transformers.DistilBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
61 | tokenization: punctuation splitting and wordpiece.
62 |
63 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
64 | parameters.
65 | """
66 |
67 | vocab_files_names = VOCAB_FILES_NAMES
68 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
69 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
70 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
71 | model_input_names = ["attention_mask"]
72 |
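
One detail worth noting above is `model_input_names = ["attention_mask"]`: DistilBERT has no segment embeddings, so the tokenizer's default output omits token_type_ids. A quick sketch, assuming the vocab can be downloaded:

    from transformers import DistilBertTokenizer

    tok = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    enc = tok("perplexity-based fact checking")
    print(sorted(enc.keys()))
    # ['attention_mask', 'input_ids'] -- no token_type_ids, since it is not in model_input_names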
--------------------------------------------------------------------------------
/transformers/tokenization_electra.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | from .tokenization_bert import BertTokenizer
17 |
18 |
19 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
20 |
21 | PRETRAINED_VOCAB_FILES_MAP = {
22 | "vocab_file": {
23 | "google/electra-small-generator": "https://huggingface.co/google/electra-small-generator/resolve/main/vocab.txt",
24 | "google/electra-base-generator": "https://huggingface.co/google/electra-base-generator/resolve/main/vocab.txt",
25 | "google/electra-large-generator": "https://huggingface.co/google/electra-large-generator/resolve/main/vocab.txt",
26 | "google/electra-small-discriminator": "https://huggingface.co/google/electra-small-discriminator/resolve/main/vocab.txt",
27 | "google/electra-base-discriminator": "https://huggingface.co/google/electra-base-discriminator/resolve/main/vocab.txt",
28 | "google/electra-large-discriminator": "https://huggingface.co/google/electra-large-discriminator/resolve/main/vocab.txt",
29 | }
30 | }
31 |
32 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
33 | "google/electra-small-generator": 512,
34 | "google/electra-base-generator": 512,
35 | "google/electra-large-generator": 512,
36 | "google/electra-small-discriminator": 512,
37 | "google/electra-base-discriminator": 512,
38 | "google/electra-large-discriminator": 512,
39 | }
40 |
41 |
42 | PRETRAINED_INIT_CONFIGURATION = {
43 | "google/electra-small-generator": {"do_lower_case": True},
44 | "google/electra-base-generator": {"do_lower_case": True},
45 | "google/electra-large-generator": {"do_lower_case": True},
46 | "google/electra-small-discriminator": {"do_lower_case": True},
47 | "google/electra-base-discriminator": {"do_lower_case": True},
48 | "google/electra-large-discriminator": {"do_lower_case": True},
49 | }
50 |
51 |
52 | class ElectraTokenizer(BertTokenizer):
53 | r"""
54 | Construct an ELECTRA tokenizer.
55 |
56 | :class:`~transformers.ElectraTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
57 | tokenization: punctuation splitting and wordpiece.
58 |
59 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
60 | parameters.
61 | """
62 |
63 | vocab_files_names = VOCAB_FILES_NAMES
64 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
65 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
66 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
67 |
--------------------------------------------------------------------------------
/transformers/tokenization_herbert.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The Google AI Language Team Authors, Allegro.pl, Facebook Inc. and the HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | from .tokenization_bert import BasicTokenizer
17 | from .tokenization_xlm import XLMTokenizer
18 | from .utils import logging
19 |
20 |
21 | logger = logging.get_logger(__name__)
22 |
23 | VOCAB_FILES_NAMES = {
24 | "vocab_file": "vocab.json",
25 | "merges_file": "merges.txt",
26 | }
27 |
28 | PRETRAINED_VOCAB_FILES_MAP = {
29 | "vocab_file": {"allegro/herbert-base-cased": "https://cdn.huggingface.co/allegro/herbert-base-cased/vocab.json"},
30 | "merges_file": {"allegro/herbert-base-cased": "https://cdn.huggingface.co/allegro/herbert-base-cased/merges.txt"},
31 | }
32 |
33 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"allegro/herbert-base-cased": 514}
34 | PRETRAINED_INIT_CONFIGURATION = {}
35 |
36 |
37 | class HerbertTokenizer(XLMTokenizer):
38 | """
39 | Construct a BPE tokenizer for HerBERT.
40 |
41 | Peculiarities:
42 |
43 | - uses BERT's pre-tokenizer: BasicTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of a
44 | punctuation character will be treated separately.
45 |
46 | - Such pre-tokenized input is then BPE-subtokenized.
47 |
48 | This tokenizer inherits from :class:`~transformers.XLMTokenizer` which contains most of the methods. Users should
49 | refer to the superclass for more information regarding methods.
50 | """
51 |
52 | vocab_files_names = VOCAB_FILES_NAMES
53 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
54 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
55 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
56 |
57 | def __init__(self, **kwargs):
58 |
59 | kwargs["cls_token"] = "<s>"
60 | kwargs["unk_token"] = "<unk>"
61 | kwargs["pad_token"] = "<pad>"
62 | kwargs["mask_token"] = "<mask>"
63 | kwargs["sep_token"] = "</s>"
64 | kwargs["do_lowercase_and_remove_accent"] = False
65 | kwargs["additional_special_tokens"] = []
66 |
67 | super().__init__(**kwargs)
68 | self.bert_pre_tokenizer = BasicTokenizer(
69 | do_lower_case=False, never_split=self.all_special_tokens, tokenize_chinese_chars=False, strip_accents=False
70 | )
71 |
72 | def _tokenize(self, text):
73 |
74 | pre_tokens = self.bert_pre_tokenizer.tokenize(text)
75 |
76 | split_tokens = []
77 | for token in pre_tokens:
78 | if token:
79 | split_tokens.extend([t for t in self.bpe(token).split(" ")])
80 |
81 | return split_tokens
82 |
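
The two-stage `_tokenize` above first pre-tokenizes with BERT's BasicTokenizer (whitespace and punctuation splitting, no lowercasing or accent stripping) and then BPE-splits each pre-token. The first stage can be tried on its own; a small sketch using the same basic settings as the constructor above (the Polish sentence is just an arbitrary example):

    from transformers.tokenization_bert import BasicTokenizer

    # Same configuration HerbertTokenizer uses for its pre-tokenizer.
    pre_tokenizer = BasicTokenizer(
        do_lower_case=False, tokenize_chinese_chars=False, strip_accents=False
    )

    pre_tokens = pre_tokenizer.tokenize("Herbert lubi pisać testy, naprawdę!")
    print(pre_tokens)
    # ['Herbert', 'lubi', 'pisać', 'testy', ',', 'naprawdę', '!']
    # HerbertTokenizer would then BPE-split each pre-token via self.bpe(token).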
--------------------------------------------------------------------------------
/transformers/tokenization_layoutlm.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """ Tokenization class for model LayoutLM."""
16 |
17 |
18 | from .tokenization_bert import BertTokenizer
19 | from .utils import logging
20 |
21 |
22 | logger = logging.get_logger(__name__)
23 |
24 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
25 |
26 | PRETRAINED_VOCAB_FILES_MAP = {
27 | "vocab_file": {
28 | "microsoft/layoutlm-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
29 | "microsoft/layoutlm-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt",
30 | }
31 | }
32 |
33 |
34 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
35 | "microsoft/layoutlm-base-uncased": 512,
36 | "microsoft/layoutlm-large-uncased": 512,
37 | }
38 |
39 |
40 | PRETRAINED_INIT_CONFIGURATION = {
41 | "microsoft/layoutlm-base-uncased": {"do_lower_case": True},
42 | "microsoft/layoutlm-large-uncased": {"do_lower_case": True},
43 | }
44 |
45 |
46 | class LayoutLMTokenizer(BertTokenizer):
47 | r"""
48 | Constructs a LayoutLM tokenizer.
49 |
50 | :class:`~transformers.LayoutLMTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
51 | tokenization: punctuation splitting + wordpiece.
52 |
53 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
54 | parameters.
55 | """
56 |
57 | vocab_files_names = VOCAB_FILES_NAMES
58 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
59 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
60 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
61 |
--------------------------------------------------------------------------------
/transformers/tokenization_layoutlm_fast.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """ Tokenization class for model LayoutLM."""
16 |
17 |
18 | from .tokenization_bert_fast import BertTokenizerFast
19 | from .tokenization_layoutlm import LayoutLMTokenizer
20 | from .utils import logging
21 |
22 |
23 | logger = logging.get_logger(__name__)
24 |
25 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
26 |
27 | PRETRAINED_VOCAB_FILES_MAP = {
28 | "vocab_file": {
29 | "microsoft/layoutlm-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
30 | "microsoft/layoutlm-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt",
31 | },
32 | "tokenizer_file": {
33 | "microsoft/layoutlm-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
34 | "microsoft/layoutlm-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/tokenizer.json",
35 | },
36 | }
37 |
38 |
39 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
40 | "microsoft/layoutlm-base-uncased": 512,
41 | "microsoft/layoutlm-large-uncased": 512,
42 | }
43 |
44 |
45 | PRETRAINED_INIT_CONFIGURATION = {
46 | "microsoft/layoutlm-base-uncased": {"do_lower_case": True},
47 | "microsoft/layoutlm-large-uncased": {"do_lower_case": True},
48 | }
49 |
50 |
51 | class LayoutLMTokenizerFast(BertTokenizerFast):
52 | r"""
53 | Constructs a "Fast" LayoutLMTokenizer.
54 |
55 | :class:`~transformers.LayoutLMTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
56 | end-to-end tokenization: punctuation splitting + wordpiece.
57 |
58 | Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
59 | parameters.
60 | """
61 |
62 | vocab_files_names = VOCAB_FILES_NAMES
63 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
64 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
65 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
66 | slow_tokenizer_class = LayoutLMTokenizer
67 |
--------------------------------------------------------------------------------
/transformers/tokenization_longformer.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | from .tokenization_roberta import RobertaTokenizer
17 | from .utils import logging
18 |
19 |
20 | logger = logging.get_logger(__name__)
21 |
22 |
23 | # vocab and merges same as roberta
24 | vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json"
25 | merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt"
26 | _all_longformer_models = [
27 | "allenai/longformer-base-4096",
28 | "allenai/longformer-large-4096",
29 | "allenai/longformer-large-4096-finetuned-triviaqa",
30 | "allenai/longformer-base-4096-extra.pos.embd.only",
31 | "allenai/longformer-large-4096-extra.pos.embd.only",
32 | ]
33 |
34 |
35 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
36 | "allenai/longformer-base-4096": 4096,
37 | "allenai/longformer-large-4096": 4096,
38 | "allenai/longformer-large-4096-finetuned-triviaqa": 4096,
39 | "allenai/longformer-base-4096-extra.pos.embd.only": 4096,
40 | "allenai/longformer-large-4096-extra.pos.embd.only": 4096,
41 | }
42 |
43 |
44 | class LongformerTokenizer(RobertaTokenizer):
45 | r"""
46 | Construct a Longformer tokenizer.
47 |
48 | :class:`~transformers.LongformerTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to the
49 | superclass for usage examples and documentation concerning parameters.
50 | """
51 | # merges and vocab same as Roberta
52 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
53 | pretrained_vocab_files_map = {
54 | "vocab_file": {m: vocab_url for m in _all_longformer_models},
55 | "merges_file": {m: merges_url for m in _all_longformer_models},
56 | }
57 |
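
Since the map above points every Longformer checkpoint at RoBERTa's vocab and merges files, the tokenizer's behavior is identical to RoBERTa's; only the maximum input length differs. A small sanity-check sketch, assuming the files can be downloaded:

    from transformers import LongformerTokenizer, RobertaTokenizer

    lt = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
    rt = RobertaTokenizer.from_pretrained("roberta-large")

    text = "Longformer reuses the RoBERTa vocabulary."
    assert lt.tokenize(text) == rt.tokenize(text)    # same BPE vocab and merges
    print(lt.model_max_length, rt.model_max_length)  # 4096 vs. 512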
--------------------------------------------------------------------------------
/transformers/tokenization_longformer_fast.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | from .tokenization_longformer import LongformerTokenizer
17 | from .tokenization_roberta_fast import RobertaTokenizerFast
18 | from .utils import logging
19 |
20 |
21 | logger = logging.get_logger(__name__)
22 |
23 |
24 | # vocab and merges same as roberta
25 | vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json"
26 | merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt"
27 | tokenizer_url = "https://huggingface.co/roberta-large/resolve/main/tokenizer.json"
28 | _all_longformer_models = [
29 | "allenai/longformer-base-4096",
30 | "allenai/longformer-large-4096",
31 | "allenai/longformer-large-4096-finetuned-triviaqa",
32 | "allenai/longformer-base-4096-extra.pos.embd.only",
33 | "allenai/longformer-large-4096-extra.pos.embd.only",
34 | ]
35 |
36 |
37 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
38 | "allenai/longformer-base-4096": 4096,
39 | "allenai/longformer-large-4096": 4096,
40 | "allenai/longformer-large-4096-finetuned-triviaqa": 4096,
41 | "allenai/longformer-base-4096-extra.pos.embd.only": 4096,
42 | "allenai/longformer-large-4096-extra.pos.embd.only": 4096,
43 | }
44 |
45 |
46 | class LongformerTokenizerFast(RobertaTokenizerFast):
47 | r"""
48 | Construct a "fast" Longformer tokenizer (backed by HuggingFace's `tokenizers` library).
49 |
50 | :class:`~transformers.LongformerTokenizerFast` is identical to :class:`~transformers.RobertaTokenizerFast`. Refer
51 | to the superclass for usage examples and documentation concerning parameters.
52 | """
53 | # merges and vocab same as Roberta
54 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
55 | pretrained_vocab_files_map = {
56 | "vocab_file": {m: vocab_url for m in _all_longformer_models},
57 | "merges_file": {m: merges_url for m in _all_longformer_models},
58 | "tokenizer_file": {m: tokenizer_url for m in _all_longformer_models},
59 | }
60 | slow_tokenizer_class = LongformerTokenizer
61 |
--------------------------------------------------------------------------------
/transformers/tokenization_lxmert.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | from .tokenization_bert import BertTokenizer
17 |
18 |
19 | ####################################################
20 | # Mapping from the keyword arguments names of Tokenizer `__init__`
21 | # to file names for serializing Tokenizer instances
22 | ####################################################
23 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
24 |
25 | ####################################################
26 | # Mapping from the keyword arguments names of Tokenizer `__init__`
27 | # to pretrained vocabulary URL for all the model shortcut names.
28 | ####################################################
29 | PRETRAINED_VOCAB_FILES_MAP = {
30 | "vocab_file": {
31 | "unc-nlp/lxmert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
32 | }
33 | }
34 |
35 | ####################################################
36 | # Mapping from model shortcut names to max length of inputs
37 | ####################################################
38 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
39 | "unc-nlp/lxmert-base-uncased": 512,
40 | }
41 | ####################################################
42 | # Mapping from model shortcut names to a dictionary of additional
43 | # keyword arguments for Tokenizer `__init__`.
44 | # To be used for checkpoint specific configurations.
45 | ####################################################
46 | PRETRAINED_INIT_CONFIGURATION = {
47 | "unc-nlp/lxmert-base-uncased": {"do_lower_case": True},
48 | }
49 |
50 |
51 | class LxmertTokenizer(BertTokenizer):
52 | r"""
53 | Construct an LXMERT tokenizer.
54 |
55 | :class:`~transformers.LxmertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
56 | tokenization: punctuation splitting and wordpiece.
57 |
58 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
59 | parameters.
60 | """
61 |
62 | vocab_files_names = VOCAB_FILES_NAMES
63 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
64 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
65 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
66 |
--------------------------------------------------------------------------------
/transformers/tokenization_lxmert_fast.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | from .tokenization_bert_fast import BertTokenizerFast
17 | from .tokenization_lxmert import LxmertTokenizer
18 |
19 |
20 | ####################################################
21 | # Mapping from the keyword arguments names of Tokenizer `__init__`
22 | # to file names for serializing Tokenizer instances
23 | ####################################################
24 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
25 |
26 | ####################################################
27 | # Mapping from the keyword arguments names of Tokenizer `__init__`
28 | # to pretrained vocabulary URL for all the model shortcut names.
29 | ####################################################
30 | PRETRAINED_VOCAB_FILES_MAP = {
31 | "vocab_file": {
32 | "unc-nlp/lxmert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
33 | },
34 | "tokenizer_file": {
35 | "unc-nlp/lxmert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
36 | },
37 | }
38 |
39 | ####################################################
40 | # Mapping from model shortcut names to max length of inputs
41 | ####################################################
42 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
43 | "unc-nlp/lxmert-base-uncased": 512,
44 | }
45 | ####################################################
46 | # Mapping from model shortcut names to a dictionary of additional
47 | # keyword arguments for Tokenizer `__init__`.
48 | # To be used for checkpoint specific configurations.
49 | ####################################################
50 | PRETRAINED_INIT_CONFIGURATION = {
51 | "unc-nlp/lxmert-base-uncased": {"do_lower_case": True},
52 | }
53 |
54 |
55 | class LxmertTokenizerFast(BertTokenizerFast):
56 | r"""
57 | Construct a "fast" LXMERT tokenizer (backed by HuggingFace's `tokenizers` library).
58 |
59 | :class:`~transformers.LxmertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
60 | end-to-end tokenization: punctuation splitting and wordpiece.
61 |
62 | Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
63 | parameters.
64 | """
65 | vocab_files_names = VOCAB_FILES_NAMES
66 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
67 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
68 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
69 | slow_tokenizer_class = LxmertTokenizer
70 |
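A sketch of what the fast variant adds, assuming the same checkpoint: the `tokenizers`-backed class can return character offsets for each produced token, which the slow class cannot.

from transformers import LxmertTokenizerFast

fast_tokenizer = LxmertTokenizerFast.from_pretrained("unc-nlp/lxmert-base-uncased")
enc = fast_tokenizer("Where is the cat sitting?", return_offsets_mapping=True)
tokens = fast_tokenizer.convert_ids_to_tokens(enc["input_ids"])
for token, (start, end) in zip(tokens, enc["offset_mapping"]):
    # Special tokens such as [CLS]/[SEP] get the empty span (0, 0).
    print(token, start, end)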
--------------------------------------------------------------------------------
/transformers/tokenization_mobilebert.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Tokenization classes for MobileBERT."""
15 |
16 | from .tokenization_bert import BertTokenizer
17 | from .utils import logging
18 |
19 |
20 | logger = logging.get_logger(__name__)
21 |
22 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
23 |
24 | PRETRAINED_VOCAB_FILES_MAP = {
25 | "vocab_file": {"mobilebert-uncased": "https://huggingface.co/google/mobilebert-uncased/resolve/main/vocab.txt"}
26 | }
27 |
28 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"mobilebert-uncased": 512}
29 |
30 |
31 | PRETRAINED_INIT_CONFIGURATION = {}
32 |
33 |
34 | class MobileBertTokenizer(BertTokenizer):
35 | r"""
36 | Construct a MobileBERT tokenizer.
37 |
38 | :class:`~transformers.MobileBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
39 | tokenization: punctuation splitting and wordpiece.
40 |
41 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
42 | parameters.
43 | """
44 |
45 | vocab_files_names = VOCAB_FILES_NAMES
46 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
47 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
48 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
49 |
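A minimal loading sketch, assuming the `google/mobilebert-uncased` checkpoint (the hub repository behind the `mobilebert-uncased` shortcut above) is available:

from transformers import MobileBertTokenizer

tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
# Plain BertTokenizer defaults apply (basic tokenization + WordPiece), since
# PRETRAINED_INIT_CONFIGURATION above adds no checkpoint-specific overrides.
print(tokenizer.tokenize("Perplexity-based fact checking"))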
--------------------------------------------------------------------------------
/transformers/tokenization_mobilebert_fast.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Tokenization classes for MobileBERT."""
15 |
16 | from .tokenization_bert_fast import BertTokenizerFast
17 | from .tokenization_mobilebert import MobileBertTokenizer
18 | from .utils import logging
19 |
20 |
21 | logger = logging.get_logger(__name__)
22 |
23 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
24 |
25 | PRETRAINED_VOCAB_FILES_MAP = {
26 | "vocab_file": {"mobilebert-uncased": "https://huggingface.co/google/mobilebert-uncased/resolve/main/vocab.txt"},
27 | "tokenizer_file": {
28 | "mobilebert-uncased": "https://huggingface.co/google/mobilebert-uncased/resolve/main/tokenizer.json"
29 | },
30 | }
31 |
32 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"mobilebert-uncased": 512}
33 |
34 |
35 | PRETRAINED_INIT_CONFIGURATION = {}
36 |
37 |
38 | class MobileBertTokenizerFast(BertTokenizerFast):
39 | r"""
40 | Construct a "fast" MobileBERT tokenizer (backed by HuggingFace's `tokenizers` library).
41 |
42 | :class:`~transformers.MobileBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
43 | end-to-end tokenization: punctuation splitting and wordpiece.
44 |
45 | Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
46 | parameters.
47 | """
48 |
49 | vocab_files_names = VOCAB_FILES_NAMES
50 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
51 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
52 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
53 | slow_tokenizer_class = MobileBertTokenizer
54 |
--------------------------------------------------------------------------------
/transformers/tokenization_openai_fast.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Fast Tokenization classes for OpenAI GPT."""
16 |
17 |
18 | from typing import Optional, Tuple
19 |
20 | from .tokenization_openai import OpenAIGPTTokenizer
21 | from .tokenization_utils_fast import PreTrainedTokenizerFast
22 | from .utils import logging
23 |
24 |
25 | logger = logging.get_logger(__name__)
26 |
27 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
28 |
29 | PRETRAINED_VOCAB_FILES_MAP = {
30 | "vocab_file": {"openai-gpt": "https://huggingface.co/openai-gpt/resolve/main/vocab.json"},
31 | "merges_file": {"openai-gpt": "https://huggingface.co/openai-gpt/resolve/main/merges.txt"},
32 | "tokenizer_file": {"openai-gpt": "https://huggingface.co/openai-gpt/resolve/main/tokenizer.json"},
33 | }
34 |
35 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
36 | "openai-gpt": 512,
37 | }
38 |
39 |
40 | class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
41 | """
42 | Construct a "fast" GPT Tokenizer (backed by HuggingFace's `tokenizers` library). Based on Byte-Pair-Encoding with
43 | the following peculiarities:
44 |
45 | - lower case all inputs
46 | - uses BERT's BasicTokenizer for pre-BPE tokenization
47 |
48 | This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
49 | methods. Users should refer to this superclass for more information regarding those methods.
50 |
51 | Args:
52 | vocab_file (:obj:`str`):
53 | Path to the vocabulary file.
54 | merges_file (:obj:`str`):
55 | Path to the merges file.
56 | unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
57 | The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
58 | token instead.
59 | """
60 |
61 | vocab_files_names = VOCAB_FILES_NAMES
62 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
63 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
64 | model_input_names = ["attention_mask"]
65 | slow_tokenizer_class = OpenAIGPTTokenizer
66 |
67 | def __init__(self, vocab_file, merges_file, tokenizer_file=None, unk_token="<unk>", **kwargs):
68 | super().__init__(vocab_file, merges_file, tokenizer_file=tokenizer_file, unk_token=unk_token, **kwargs)
69 |
70 | @property
71 | def do_lower_case(self):
72 | return True
73 |
74 | def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
75 | files = self._tokenizer.model.save(save_directory, name=filename_prefix)
76 | return tuple(files)
77 |
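A short sketch of the two behaviours pinned down above, assuming the `openai-gpt` checkpoint: inputs are always lower-cased, and `save_vocabulary` delegates to the backing `tokenizers` model.

import tempfile

from transformers import OpenAIGPTTokenizerFast

tok = OpenAIGPTTokenizerFast.from_pretrained("openai-gpt")
print(tok.do_lower_case)             # True, hard-coded by the property above
print(tok.tokenize("Hello WORLD"))   # BPE tokens of the lower-cased text

with tempfile.TemporaryDirectory() as tmp_dir:
    # Returns the paths of the serialized vocabulary/merges files.
    print(tok.save_vocabulary(tmp_dir))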
--------------------------------------------------------------------------------
/transformers/tokenization_retribert.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes for RetriBERT."""
16 |
17 | from .tokenization_bert import BertTokenizer
18 | from .utils import logging
19 |
20 |
21 | logger = logging.get_logger(__name__)
22 |
23 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
24 |
25 | PRETRAINED_VOCAB_FILES_MAP = {
26 | "vocab_file": {
27 | "yjernite/retribert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
28 | }
29 | }
30 |
31 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
32 | "yjernite/retribert-base-uncased": 512,
33 | }
34 |
35 |
36 | PRETRAINED_INIT_CONFIGURATION = {
37 | "yjernite/retribert-base-uncased": {"do_lower_case": True},
38 | }
39 |
40 |
41 | class RetriBertTokenizer(BertTokenizer):
42 | r"""
43 | Constructs a RetriBERT tokenizer.
44 |
45 | :class:`~transformers.RetriBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
46 | tokenization: punctuation splitting and wordpiece.
47 |
48 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
49 | parameters.
50 | """
51 |
52 | vocab_files_names = VOCAB_FILES_NAMES
53 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
54 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
55 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
56 | model_input_names = ["attention_mask"]
57 |
--------------------------------------------------------------------------------
/transformers/tokenization_retribert_fast.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes for RetriBERT."""
16 |
17 | from .tokenization_bert_fast import BertTokenizerFast
18 | from .tokenization_retribert import RetriBertTokenizer
19 | from .utils import logging
20 |
21 |
22 | logger = logging.get_logger(__name__)
23 |
24 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
25 |
26 | PRETRAINED_VOCAB_FILES_MAP = {
27 | "vocab_file": {
28 | "yjernite/retribert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
29 | },
30 | "tokenizer_file": {
31 | "yjernite/retribert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
32 | },
33 | }
34 |
35 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
36 | "yjernite/retribert-base-uncased": 512,
37 | }
38 |
39 |
40 | PRETRAINED_INIT_CONFIGURATION = {
41 | "yjernite/retribert-base-uncased": {"do_lower_case": True},
42 | }
43 |
44 |
45 | class RetriBertTokenizerFast(BertTokenizerFast):
46 | r"""
47 | Construct a "fast" RetriBERT tokenizer (backed by HuggingFace's `tokenizers` library).
48 |
49 | :class:`~transformers.RetriBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
50 | end-to-end tokenization: punctuation splitting and wordpiece.
51 |
52 | Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
53 | parameters.
54 | """
55 |
56 | vocab_files_names = VOCAB_FILES_NAMES
57 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
58 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
59 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
60 | slow_tokenizer_class = RetriBertTokenizer
61 | model_input_names = ["attention_mask"]
62 |
--------------------------------------------------------------------------------
/transformers/tokenization_squeezebert.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The SqueezeBert authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes for SqueezeBERT."""
16 |
17 | from .tokenization_bert import BertTokenizer
18 | from .utils import logging
19 |
20 |
21 | logger = logging.get_logger(__name__)
22 |
23 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
24 |
25 | PRETRAINED_VOCAB_FILES_MAP = {
26 | "vocab_file": {
27 | "squeezebert/squeezebert-uncased": "https://huggingface.co/squeezebert/squeezebert-uncased/resolve/main/vocab.txt",
28 | "squeezebert/squeezebert-mnli": "https://huggingface.co/squeezebert/squeezebert-mnli/resolve/main/vocab.txt",
29 | "squeezebert/squeezebert-mnli-headless": "https://huggingface.co/squeezebert/squeezebert-mnli-headless/resolve/main/vocab.txt",
30 | }
31 | }
32 |
33 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
34 | "squeezebert/squeezebert-uncased": 512,
35 | "squeezebert/squeezebert-mnli": 512,
36 | "squeezebert/squeezebert-mnli-headless": 512,
37 | }
38 |
39 |
40 | PRETRAINED_INIT_CONFIGURATION = {
41 | "squeezebert/squeezebert-uncased": {"do_lower_case": True},
42 | "squeezebert/squeezebert-mnli": {"do_lower_case": True},
43 | "squeezebert/squeezebert-mnli-headless": {"do_lower_case": True},
44 | }
45 |
46 |
47 | class SqueezeBertTokenizer(BertTokenizer):
48 | r"""
49 | Constructs a SqueezeBert tokenizer.
50 |
51 | :class:`~transformers.SqueezeBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
52 | tokenization: punctuation splitting + wordpiece.
53 |
54 | Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
55 | parameters.
56 | """
57 |
58 | vocab_files_names = VOCAB_FILES_NAMES
59 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
60 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
61 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
62 |
--------------------------------------------------------------------------------
/transformers/tokenization_squeezebert_fast.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The SqueezeBert authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes for SqueezeBERT."""
16 |
17 | from .tokenization_bert_fast import BertTokenizerFast
18 | from .tokenization_squeezebert import SqueezeBertTokenizer
19 | from .utils import logging
20 |
21 |
22 | logger = logging.get_logger(__name__)
23 |
24 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
25 |
26 | PRETRAINED_VOCAB_FILES_MAP = {
27 | "vocab_file": {
28 | "squeezebert/squeezebert-uncased": "https://huggingface.co/squeezebert/squeezebert-uncased/resolve/main/vocab.txt",
29 | "squeezebert/squeezebert-mnli": "https://huggingface.co/squeezebert/squeezebert-mnli/resolve/main/vocab.txt",
30 | "squeezebert/squeezebert-mnli-headless": "https://huggingface.co/squeezebert/squeezebert-mnli-headless/resolve/main/vocab.txt",
31 | },
32 | "tokenizer_file": {
33 | "squeezebert/squeezebert-uncased": "https://huggingface.co/squeezebert/squeezebert-uncased/resolve/main/tokenizer.json",
34 | "squeezebert/squeezebert-mnli": "https://huggingface.co/squeezebert/squeezebert-mnli/resolve/main/tokenizer.json",
35 | "squeezebert/squeezebert-mnli-headless": "https://huggingface.co/squeezebert/squeezebert-mnli-headless/resolve/main/tokenizer.json",
36 | },
37 | }
38 |
39 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
40 | "squeezebert/squeezebert-uncased": 512,
41 | "squeezebert/squeezebert-mnli": 512,
42 | "squeezebert/squeezebert-mnli-headless": 512,
43 | }
44 |
45 |
46 | PRETRAINED_INIT_CONFIGURATION = {
47 | "squeezebert/squeezebert-uncased": {"do_lower_case": True},
48 | "squeezebert/squeezebert-mnli": {"do_lower_case": True},
49 | "squeezebert/squeezebert-mnli-headless": {"do_lower_case": True},
50 | }
51 |
52 |
53 | class SqueezeBertTokenizerFast(BertTokenizerFast):
54 | r"""
55 | Construct a "fast" SqueezeBert tokenizer (backed by HuggingFace's `tokenizers` library).
56 |
57 | :class:`~transformers.SqueezeBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
58 | end-to-end tokenization: punctuation splitting + wordpiece.
59 |
60 | Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
61 | parameters.
62 | """
63 |
64 | vocab_files_names = VOCAB_FILES_NAMES
65 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
66 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
67 | pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
68 | slow_tokenizer_class = SqueezeBertTokenizer
69 |
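A sketch showing the effect of the checkpoint-specific `do_lower_case` entry above, assuming the `squeezebert/squeezebert-mnli` checkpoint is reachable:

from transformers import SqueezeBertTokenizerFast

tokenizer = SqueezeBertTokenizerFast.from_pretrained("squeezebert/squeezebert-mnli")
# With do_lower_case=True applied from PRETRAINED_INIT_CONFIGURATION, casing
# differences should not change the produced WordPiece tokens.
assert tokenizer.tokenize("Fact Checking") == tokenizer.tokenize("fact checking")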
--------------------------------------------------------------------------------
/transformers/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/utils/__init__.py
--------------------------------------------------------------------------------
/transformers/utils/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/utils/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/utils/__pycache__/dummy_flax_objects.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/utils/__pycache__/dummy_flax_objects.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/utils/__pycache__/dummy_tf_objects.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/utils/__pycache__/dummy_tf_objects.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/utils/__pycache__/logging.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/utils/__pycache__/logging.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/utils/__pycache__/sentencepiece_model_pb2.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HLTCHKUST/Perplexity-FactChecking/aec341410d7c66273ecdb52daa7d39abd786edc3/transformers/utils/__pycache__/sentencepiece_model_pb2.cpython-37.pyc
--------------------------------------------------------------------------------
/transformers/utils/dummy_flax_objects.py:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by the command `make fix-copies`, do not edit.
2 | from ..file_utils import requires_flax
3 |
4 |
5 | class FlaxBertModel:
6 | def __init__(self, *args, **kwargs):
7 | requires_flax(self)
8 |
9 | @classmethod
10 | def from_pretrained(self, *args, **kwargs):
11 | requires_flax(self)
12 |
13 |
14 | class FlaxRobertaModel:
15 | def __init__(self, *args, **kwargs):
16 | requires_flax(self)
17 |
18 | @classmethod
19 | def from_pretrained(self, *args, **kwargs):
20 | requires_flax(self)
21 |
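These placeholders exist so that `from transformers import FlaxBertModel` still succeeds when `flax` is not installed; the failure is deferred until the object is actually constructed. A simplified sketch of the pattern, with a hypothetical `requires_flax_sketch` helper standing in for the real `requires_flax` in `file_utils` (whose exact message may differ):

def requires_flax_sketch(obj):
    # Raise only when the dummy object is used, not at import time.
    name = obj if isinstance(obj, str) else obj.__class__.__name__
    raise ImportError(f"{name} requires the flax library, but flax is not installed.")

class FlaxBertModelSketch:
    def __init__(self, *args, **kwargs):
        requires_flax_sketch(self)

# The import of the dummy class costs nothing; instantiating it fails clearly:
# FlaxBertModelSketch()  ->  ImportError: FlaxBertModelSketch requires the flax library, ...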
--------------------------------------------------------------------------------
/transformers/utils/dummy_sentencepiece_objects.py:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by the command `make fix-copies`, do not edit.
2 | from ..file_utils import requires_sentencepiece
3 |
4 |
5 | class AlbertTokenizer:
6 | def __init__(self, *args, **kwargs):
7 | requires_sentencepiece(self)
8 |
9 | @classmethod
10 | def from_pretrained(self, *args, **kwargs):
11 | requires_sentencepiece(self)
12 |
13 |
14 | class BertGenerationTokenizer:
15 | def __init__(self, *args, **kwargs):
16 | requires_sentencepiece(self)
17 |
18 | @classmethod
19 | def from_pretrained(self, *args, **kwargs):
20 | requires_sentencepiece(self)
21 |
22 |
23 | class CamembertTokenizer:
24 | def __init__(self, *args, **kwargs):
25 | requires_sentencepiece(self)
26 |
27 | @classmethod
28 | def from_pretrained(self, *args, **kwargs):
29 | requires_sentencepiece(self)
30 |
31 |
32 | class MarianTokenizer:
33 | def __init__(self, *args, **kwargs):
34 | requires_sentencepiece(self)
35 |
36 | @classmethod
37 | def from_pretrained(self, *args, **kwargs):
38 | requires_sentencepiece(self)
39 |
40 |
41 | class MBartTokenizer:
42 | def __init__(self, *args, **kwargs):
43 | requires_sentencepiece(self)
44 |
45 | @classmethod
46 | def from_pretrained(self, *args, **kwargs):
47 | requires_sentencepiece(self)
48 |
49 |
50 | class PegasusTokenizer:
51 | def __init__(self, *args, **kwargs):
52 | requires_sentencepiece(self)
53 |
54 | @classmethod
55 | def from_pretrained(self, *args, **kwargs):
56 | requires_sentencepiece(self)
57 |
58 |
59 | class ReformerTokenizer:
60 | def __init__(self, *args, **kwargs):
61 | requires_sentencepiece(self)
62 |
63 | @classmethod
64 | def from_pretrained(self, *args, **kwargs):
65 | requires_sentencepiece(self)
66 |
67 |
68 | class T5Tokenizer:
69 | def __init__(self, *args, **kwargs):
70 | requires_sentencepiece(self)
71 |
72 | @classmethod
73 | def from_pretrained(self, *args, **kwargs):
74 | requires_sentencepiece(self)
75 |
76 |
77 | class XLMProphetNetTokenizer:
78 | def __init__(self, *args, **kwargs):
79 | requires_sentencepiece(self)
80 |
81 | @classmethod
82 | def from_pretrained(self, *args, **kwargs):
83 | requires_sentencepiece(self)
84 |
85 |
86 | class XLMRobertaTokenizer:
87 | def __init__(self, *args, **kwargs):
88 | requires_sentencepiece(self)
89 |
90 | @classmethod
91 | def from_pretrained(self, *args, **kwargs):
92 | requires_sentencepiece(self)
93 |
94 |
95 | class XLNetTokenizer:
96 | def __init__(self, *args, **kwargs):
97 | requires_sentencepiece(self)
98 |
99 | @classmethod
100 | def from_pretrained(self, *args, **kwargs):
101 | requires_sentencepiece(self)
102 |
--------------------------------------------------------------------------------