├── .gitignore
├── README.md
├── bin
    ├── acc_loss_figure.py
    ├── aidrtokenize.py
    ├── bert_multiclass.sh
    ├── callbacks.py
    ├── cnn_filter.py
    ├── cnn_filter.pyc
    ├── compute_results.py
    ├── data_process.py
    ├── performance.py
    ├── stop_words_english.txt
    ├── text_cnn_classifier.py
    ├── text_cnn_pipeline_unimodal.py
    └── transformers
    │   ├── CONTRIBUTING.md
    │   ├── LICENSE
    │   ├── MANIFEST.in
    │   ├── README.md
    │   ├── docker
    │       └── Dockerfile
    │   ├── docs
    │       ├── Makefile
    │       ├── README.md
    │       ├── requirements.txt
    │       └── source
    │       │   ├── _static
    │       │       ├── css
    │       │       │   ├── Calibre-Light.ttf
    │       │       │   ├── Calibre-Medium.otf
    │       │       │   ├── Calibre-Regular.otf
    │       │       │   ├── Calibre-Thin.otf
    │       │       │   ├── code-snippets.css
    │       │       │   └── huggingface.css
    │       │       └── js
    │       │       │   ├── custom.js
    │       │       │   └── huggingface_logo.svg
    │       │   ├── benchmarks.md
    │       │   ├── bertology.rst
    │       │   ├── conf.py
    │       │   ├── converting_tensorflow_models.rst
    │       │   ├── imgs
    │       │       ├── transformers_logo_name.png
    │       │       ├── warmup_constant_schedule.png
    │       │       ├── warmup_cosine_hard_restarts_schedule.png
    │       │       ├── warmup_cosine_schedule.png
    │       │       ├── warmup_cosine_warm_restarts_schedule.png
    │       │       └── warmup_linear_schedule.png
    │       │   ├── index.rst
    │       │   ├── installation.md
    │       │   ├── main_classes
    │       │       ├── configuration.rst
    │       │       ├── model.rst
    │       │       ├── optimizer_schedules.rst
    │       │       ├── processors.rst
    │       │       └── tokenizer.rst
    │       │   ├── migration.md
    │       │   ├── model_doc
    │       │       ├── auto.rst
    │       │       ├── bert.rst
    │       │       ├── ctrl.rst
    │       │       ├── distilbert.rst
    │       │       ├── gpt.rst
    │       │       ├── gpt2.rst
    │       │       ├── roberta.rst
    │       │       ├── transformerxl.rst
    │       │       ├── xlm.rst
    │       │       └── xlnet.rst
    │       │   ├── multilingual.rst
    │       │   ├── notebooks.rst
    │       │   ├── pretrained_models.rst
    │       │   ├── quickstart.md
    │       │   ├── serialization.rst
    │       │   └── torchscript.rst
    │   ├── examples
    │       ├── README.md
    │       ├── benchmarks.py
    │       ├── contrib
    │       │   ├── README.md
    │       │   ├── run_camembert.py
    │       │   ├── run_openai_gpt.py
    │       │   ├── run_swag.py
    │       │   └── run_transfo_xl.py
    │       ├── distillation
    │       │   ├── README.md
    │       │   ├── distiller.py
    │       │   ├── grouped_batch_sampler.py
    │       │   ├── lm_seqs_dataset.py
    │       │   ├── requirements.txt
    │       │   ├── run_squad_w_distillation.py
    │       │   ├── scripts
    │       │   │   ├── binarized_data.py
    │       │   │   ├── extract.py
    │       │   │   ├── extract_distilbert.py
    │       │   │   └── token_counts.py
    │       │   ├── train.py
    │       │   ├── training_configs
    │       │   │   ├── distilbert-base-uncased.json
    │       │   │   └── distilgpt2.json
    │       │   └── utils.py
    │       ├── etc
    │       │   └── stop_words_english.txt
    │       ├── requirements.txt
    │       ├── run_bertology.py
    │       ├── run_generation.py
    │       ├── run_glue.py
    │       ├── run_glue_multiclass.py
    │       ├── run_glue_multitask.py
    │       ├── run_lm_finetuning.py
    │       ├── run_multiple_choice.py
    │       ├── run_ner.py
    │       ├── run_squad.py
    │       ├── run_summarization_finetuning.py
    │       ├── run_tf_glue.py
    │       ├── test_examples.py
    │       ├── tests_samples
    │       │   ├── .gitignore
    │       │   ├── MRPC
    │       │   │   ├── dev.tsv
    │       │   │   └── train.tsv
    │       │   └── SQUAD
    │       │   │   └── dev-v2.0-small.json
    │       ├── transformers
    │       │   ├── __init__.py
    │       │   ├── __main__.py
    │       │   ├── __pycache__
    │       │   │   ├── __init__.cpython-36.pyc
    │       │   │   ├── configuration_auto.cpython-36.pyc
    │       │   │   ├── configuration_bert.cpython-36.pyc
    │       │   │   ├── configuration_camembert.cpython-36.pyc
    │       │   │   ├── configuration_ctrl.cpython-36.pyc
    │       │   │   ├── configuration_distilbert.cpython-36.pyc
    │       │   │   ├── configuration_gpt2.cpython-36.pyc
    │       │   │   ├── configuration_openai.cpython-36.pyc
    │       │   │   ├── configuration_roberta.cpython-36.pyc
    │       │   │   ├── configuration_transfo_xl.cpython-36.pyc
    │       │   │   ├── configuration_utils.cpython-36.pyc
    │       │   │   ├── configuration_xlm.cpython-36.pyc
    │       │   │   ├── configuration_xlnet.cpython-36.pyc
    │       │   │   ├── file_utils.cpython-36.pyc
    │       │   │   ├── modeling_auto.cpython-36.pyc
    │       │   │   ├── modeling_bert.cpython-36.pyc
    │       │   │   ├── modeling_camembert.cpython-36.pyc
    │       │   │   ├── modeling_ctrl.cpython-36.pyc
    │       │   │   ├── modeling_distilbert.cpython-36.pyc
    │       │   │   ├── modeling_encoder_decoder.cpython-36.pyc
    │       │   │   ├── modeling_gpt2.cpython-36.pyc
    │       │   │   ├── modeling_openai.cpython-36.pyc
    │       │   │   ├── modeling_roberta.cpython-36.pyc
    │       │   │   ├── modeling_tf_pytorch_utils.cpython-36.pyc
    │       │   │   ├── modeling_transfo_xl.cpython-36.pyc
    │       │   │   ├── modeling_transfo_xl_utilities.cpython-36.pyc
    │       │   │   ├── modeling_utils.cpython-36.pyc
    │       │   │   ├── modeling_xlm.cpython-36.pyc
    │       │   │   ├── modeling_xlnet.cpython-36.pyc
    │       │   │   ├── optimization.cpython-36.pyc
    │       │   │   ├── tokenization_auto.cpython-36.pyc
    │       │   │   ├── tokenization_bert.cpython-36.pyc
    │       │   │   ├── tokenization_camembert.cpython-36.pyc
    │       │   │   ├── tokenization_ctrl.cpython-36.pyc
    │       │   │   ├── tokenization_distilbert.cpython-36.pyc
    │       │   │   ├── tokenization_gpt2.cpython-36.pyc
    │       │   │   ├── tokenization_openai.cpython-36.pyc
    │       │   │   ├── tokenization_roberta.cpython-36.pyc
    │       │   │   ├── tokenization_transfo_xl.cpython-36.pyc
    │       │   │   ├── tokenization_utils.cpython-36.pyc
    │       │   │   ├── tokenization_xlm.cpython-36.pyc
    │       │   │   └── tokenization_xlnet.cpython-36.pyc
    │       │   ├── configuration_auto.py
    │       │   ├── configuration_bert.py
    │       │   ├── configuration_camembert.py
    │       │   ├── configuration_ctrl.py
    │       │   ├── configuration_distilbert.py
    │       │   ├── configuration_gpt2.py
    │       │   ├── configuration_openai.py
    │       │   ├── configuration_roberta.py
    │       │   ├── configuration_transfo_xl.py
    │       │   ├── configuration_utils.py
    │       │   ├── configuration_xlm.py
    │       │   ├── configuration_xlnet.py
    │       │   ├── convert_bert_original_tf_checkpoint_to_pytorch.py
    │       │   ├── convert_bert_pytorch_checkpoint_to_original_tf.py
    │       │   ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py
    │       │   ├── convert_openai_original_tf_checkpoint_to_pytorch.py
    │       │   ├── convert_pytorch_checkpoint_to_tf2.py
    │       │   ├── convert_roberta_original_pytorch_checkpoint_to_pytorch.py
    │       │   ├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
    │       │   ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py
    │       │   ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py
    │       │   ├── data
    │       │   │   ├── __init__.py
    │       │   │   ├── __pycache__
    │       │   │   │   └── __init__.cpython-36.pyc
    │       │   │   ├── metrics
    │       │   │   │   ├── __init__.py
    │       │   │   │   └── __pycache__
    │       │   │   │   │   └── __init__.cpython-36.pyc
    │       │   │   └── processors
    │       │   │   │   ├── __init__.py
    │       │   │   │   ├── __pycache__
    │       │   │   │       ├── __init__.cpython-36.pyc
    │       │   │   │       ├── aidrtokenize.cpython-36.pyc
    │       │   │   │       ├── glue.cpython-36.pyc
    │       │   │   │       └── utils.cpython-36.pyc
    │       │   │   │   ├── aidrtokenize.py
    │       │   │   │   ├── glue.py
    │       │   │   │   └── utils.py
    │       │   ├── file_utils.py
    │       │   ├── modeling_auto.py
    │       │   ├── modeling_beam_search.py
    │       │   ├── modeling_bert.py
    │       │   ├── modeling_camembert.py
    │       │   ├── modeling_ctrl.py
    │       │   ├── modeling_distilbert.py
    │       │   ├── modeling_encoder_decoder.py
    │       │   ├── modeling_gpt2.py
    │       │   ├── modeling_openai.py
    │       │   ├── modeling_roberta.py
    │       │   ├── modeling_tf_auto.py
    │       │   ├── modeling_tf_bert.py
    │       │   ├── modeling_tf_ctrl.py
    │       │   ├── modeling_tf_distilbert.py
    │       │   ├── modeling_tf_gpt2.py
    │       │   ├── modeling_tf_openai.py
    │       │   ├── modeling_tf_pytorch_utils.py
    │       │   ├── modeling_tf_roberta.py
    │       │   ├── modeling_tf_transfo_xl.py
    │       │   ├── modeling_tf_transfo_xl_utilities.py
    │       │   ├── modeling_tf_utils.py
    │       │   ├── modeling_tf_xlm.py
    │       │   ├── modeling_tf_xlnet.py
    │       │   ├── modeling_transfo_xl.py
    │       │   ├── modeling_transfo_xl_utilities.py
    │       │   ├── modeling_utils.py
    │       │   ├── modeling_xlm.py
    │       │   ├── modeling_xlnet.py
    │       │   ├── optimization.py
    │       │   ├── tests
    │       │   │   ├── __init__.py
    │       │   │   ├── configuration_common_test.py
    │       │   │   ├── conftest.py
    │       │   │   ├── fixtures
    │       │   │   │   ├── input.txt
    │       │   │   │   ├── sample_text.txt
    │       │   │   │   └── test_sentencepiece.model
    │       │   │   ├── modeling_auto_test.py
    │       │   │   ├── modeling_bert_test.py
    │       │   │   ├── modeling_common_test.py
    │       │   │   ├── modeling_ctrl_test.py
    │       │   │   ├── modeling_distilbert_test.py
    │       │   │   ├── modeling_encoder_decoder_test.py
    │       │   │   ├── modeling_gpt2_test.py
    │       │   │   ├── modeling_openai_test.py
    │       │   │   ├── modeling_roberta_test.py
    │       │   │   ├── modeling_tf_auto_test.py
    │       │   │   ├── modeling_tf_bert_test.py
    │       │   │   ├── modeling_tf_common_test.py
    │       │   │   ├── modeling_tf_ctrl_test.py
    │       │   │   ├── modeling_tf_distilbert_test.py
    │       │   │   ├── modeling_tf_gpt2_test.py
    │       │   │   ├── modeling_tf_openai_gpt_test.py
    │       │   │   ├── modeling_tf_roberta_test.py
    │       │   │   ├── modeling_tf_transfo_xl_test.py
    │       │   │   ├── modeling_tf_xlm_test.py
    │       │   │   ├── modeling_tf_xlnet_test.py
    │       │   │   ├── modeling_transfo_xl_test.py
    │       │   │   ├── modeling_xlm_test.py
    │       │   │   ├── modeling_xlnet_test.py
    │       │   │   ├── optimization_test.py
    │       │   │   ├── tokenization_auto_test.py
    │       │   │   ├── tokenization_bert_test.py
    │       │   │   ├── tokenization_ctrl_test.py
    │       │   │   ├── tokenization_distilbert_test.py
    │       │   │   ├── tokenization_gpt2_test.py
    │       │   │   ├── tokenization_openai_test.py
    │       │   │   ├── tokenization_roberta_test.py
    │       │   │   ├── tokenization_tests_commons.py
    │       │   │   ├── tokenization_transfo_xl_test.py
    │       │   │   ├── tokenization_utils_test.py
    │       │   │   ├── tokenization_xlm_test.py
    │       │   │   └── tokenization_xlnet_test.py
    │       │   ├── tokenization_auto.py
    │       │   ├── tokenization_bert.py
    │       │   ├── tokenization_camembert.py
    │       │   ├── tokenization_ctrl.py
    │       │   ├── tokenization_distilbert.py
    │       │   ├── tokenization_gpt2.py
    │       │   ├── tokenization_openai.py
    │       │   ├── tokenization_roberta.py
    │       │   ├── tokenization_transfo_xl.py
    │       │   ├── tokenization_utils.py
    │       │   ├── tokenization_xlm.py
    │       │   └── tokenization_xlnet.py
    │       ├── utils_multiple_choice.py
    │       ├── utils_ner.py
    │       ├── utils_squad.py
    │       ├── utils_squad_evaluate.py
    │       ├── utils_summarization.py
    │       └── utils_summarization_test.py
    │   ├── hubconf.py
    │   ├── notebooks
    │       ├── Comparing-PT-and-TF-models.ipynb
    │       ├── Comparing-TF-and-PT-models-MLM-NSP.ipynb
    │       ├── Comparing-TF-and-PT-models-SQuAD.ipynb
    │       └── Comparing-TF-and-PT-models.ipynb
    │   ├── requirements-dev.txt
    │   ├── requirements.txt
    │   ├── setup.py
    │   └── templates
    │       ├── adding_a_new_example_script
    │           ├── README.md
    │           ├── run_xxx.py
    │           └── utils_xxx.py
    │       └── adding_a_new_model
    │           ├── README.md
    │           ├── configuration_xxx.py
    │           ├── convert_xxx_original_tf_checkpoint_to_pytorch.py
    │           ├── modeling_tf_xxx.py
    │           ├── modeling_xxx.py
    │           ├── tests
    │               ├── modeling_tf_xxx_test.py
    │               ├── modeling_xxx_test.py
    │               └── tokenization_xxx_test.py
    │           └── tokenization_xxx.py
├── environment_crisis_bert_env.yml
├── etc
    └── stop_words_english.txt
├── license_by_sa_4.0_legalcode.txt
└── requirements.txt


/.gitignore:
--------------------------------------------------------------------------------
 1 | data
 2 | results.xlsx
 3 | .idea
 4 | crisis_datasets_web
 5 | crisis_datasets_benchmarks_v1.0.tar.gz
 6 | repository-open-graph-template.png
 7 | repository-open-graph-template.psd
 8 | Untitled-1.ai
 9 | bin/.ipynb_checkpoints
10 | .idea
11 | 


--------------------------------------------------------------------------------
/bin/acc_loss_figure.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import numpy as np
 3 | import math
 4 | import warnings
 5 | import datetime
 6 | import optparse
 7 | import os, errno
 8 | import matplotlib.pyplot as plt
 9 | import pandas as pd
10 | 
11 | def plot_fig_image(history,epochs, outfile):
12 |     fig = plt.figure()
13 |     plt.plot(range(1,epochs+1),history['image_output_acc'],label='training-acc')
14 |     plt.plot(range(1,epochs+1),history['image_output_loss'],label='training-loss')
15 |     plt.plot(range(1,epochs+1),history['val_image_output_acc'],label='validation-acc')
16 |     plt.plot(range(1,epochs+1),history['val_image_output_loss'],label='validation-loss')
17 | 
18 |     plt.legend(loc=0)
19 |     plt.xlabel('epochs')
20 |     plt.ylabel('accuracy-loss')
21 |     plt.xlim([1,epochs])
22 | #     plt.ylim([0,1])
23 |     plt.grid(True)
24 |     plt.title("Model Performance")
25 |     # plt.show()
26 |     fig.savefig(outfile)
27 |     plt.close(fig)
28 | 
29 | def plot_fig_text(history,epochs, outfile):
30 |     fig = plt.figure()
31 |     plt.plot(range(1,epochs+1),history['acc'],label='training-acc')
32 |     plt.plot(range(1,epochs+1),history['loss'],label='training-loss')
33 |     plt.plot(range(1,epochs+1),history['val_acc'],label='validation-acc')
34 |     plt.plot(range(1,epochs+1),history['val_loss'],label='validation-loss')
35 | 
36 |     plt.legend(loc=0)
37 |     plt.xlabel('epochs')
38 |     plt.ylabel('accuracy-loss')
39 |     plt.xlim([1,epochs])
40 | #     plt.ylim([0,1])
41 |     plt.grid(True)
42 |     plt.title("Model Performance")
43 |     # plt.show()
44 |     fig.savefig(outfile)
45 |     plt.close(fig)
46 | 
47 | 
48 | if __name__ == "__main__":
49 |     warnings.filterwarnings("ignore")
50 |     parser = optparse.OptionParser()
51 |     parser.add_option('-i', action="store", dest="data_file")
52 |     parser.add_option('-o', action="store", dest="out_file")
53 | 
54 |     options, args = parser.parse_args()
55 |     a = datetime.datetime.now().replace(microsecond=0)
56 | 
57 |     data_file = options.data_file
58 |     out_file = options.out_file
59 |     out_dir = os.path.dirname(out_file)
60 |     out_base =os.path.basename(out_file)
61 |     out_base = os.path.splitext(out_base)[0]
62 |     # out_file_image = out_dir + "/" + out_base +"_image.jpg"
63 |     out_file_text = out_dir + "/" + out_base + "_text.jpg"
64 | 
65 |     # train_img_file = options.data_file  # "exp/iraq_earthquake_task_text_train.csv"
66 | 
67 |     df = pd.read_csv(data_file, sep="\t")
68 |     epochs=df.shape[0]
69 |     # print(df)
70 | 
71 |     # plot_fig_image(df,epochs, out_file_image)
72 |     plot_fig_text(df, epochs, out_file_text)
73 | 


--------------------------------------------------------------------------------
/bin/bert_multiclass.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | cd bin/transformers/examples
 4 | source your_path/bin/activate your_path/envs/transformers
 5 | 
 6 | event=$1
 7 | train_file=$2
 8 | dev_file=$3
 9 | test_file=$4
10 | results_file=$5
11 | 
12 | 
13 | num_epoch=3.0
14 | export HOME_DIR="$PWD/"
15 | export TASK_NAME=multiclass
16 | model=bert-base-uncased
17 | outputDir=$HOME_DIR/output_multi_class_${model}_$event
18 | cache_dir=$HOME_DIR/exp_cache
19 | export data_dir=$HOME_DIR"/data_bert_model_$event/"
20 | mkdir -p $data_dir
21 | mkdir -p $outputDir
22 | 
23 | 
24 | python run_glue_multiclass.py --model_type bert --model_name_or_path bert-base-uncased --task_name $TASK_NAME --do_train --do_eval --do_lower_case \
25 |     --data_dir $data_dir --max_seq_length 128 --per_gpu_eval_batch_size=8  --per_gpu_train_batch_size=8 --learning_rate 2e-5 --num_train_epochs $num_epoch \
26 |     --train_file $HOME_DIR/$train_file --dev_file $HOME_DIR/$dev_file \
27 |     --output_dir $outputDir --overwrite_output_dir 
28 | 
29 | rm $data_dir/cached_test_bert-base-uncased_128_multiclass
30 | 
31 | python run_glue_multiclass.py --model_type bert --model_name_or_path bert-base-uncased --task_name $TASK_NAME --do_test --do_lower_case --data_dir $data_dir \
32 | --max_seq_length 128 --per_gpu_eval_batch_size=8  --per_gpu_train_batch_size=8   --learning_rate 2e-5 --num_train_epochs $num_epoch \
33 | --test_file $HOME_DIR/$dev_file  --output_dir $outputDir --results_file $results_file"_dev.txt"
34 | 
35 | rm $data_dir/cached_test_bert-base-uncased_128_multiclass
36 | 
37 | python run_glue_multiclass.py --model_type bert --model_name_or_path bert-base-uncased --task_name $TASK_NAME --do_test --do_lower_case --data_dir $data_dir \
38 | --max_seq_length 128 --per_gpu_eval_batch_size=8  --per_gpu_train_batch_size=8   --learning_rate 2e-5 --num_train_epochs $num_epoch \
39 | --test_file $HOME_DIR/$test_file  --output_dir $outputDir --results_file $results_file"_test.txt"
40 | 
41 | 


--------------------------------------------------------------------------------
/bin/cnn_filter.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/cnn_filter.pyc


--------------------------------------------------------------------------------
/bin/compute_results.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python2
 2 | # -*- coding: utf-8 -*-
 3 | """
 4 | Created on Sun Oct 17 10:24:54 2018
 5 | 
 6 | @author: Firoj Alam
 7 | 
 8 | """
 9 | 
10 | import sys
11 | import os
12 | import optparse
13 | import datetime
14 | import numpy as np
15 | from sklearn import metrics
16 | import sklearn.metrics as metrics
17 | from sklearn import preprocessing
18 | import pandas as pd
19 | import re
20 | 
21 | def classifaction_report(report):
22 |     report_data = []
23 |     lines = report.split('\n')
24 |     #    print lines
25 | 
26 |     for line in lines[2:-3]:
27 |         # print line
28 |         line = line.strip()
29 |         row = {}
30 |         row_data = re.split('\s+', line)
31 |         #        print row_data
32 |         row['class'] = row_data[0]
33 |         row['precision'] = float(row_data[1])
34 |         row['recall'] = float(row_data[2])
35 |         row['f1_score'] = float(row_data[3])
36 |         row['support'] = float(row_data[4])
37 |         report_data.append(row)
38 |     (P, R, F1, sumClassCnt) = (0, 0, 0, 0)
39 | 
40 |     for row in report_data:
41 |         tmp = row['precision']
42 |         P = P + (tmp * row['support'])
43 |         tmp = row['recall']
44 |         R = R + (tmp * row['support'])
45 |         tmp = row['f1_score']
46 |         F1 = F1 + (tmp * row['support'])
47 |         sumClassCnt = sumClassCnt + row['support']
48 |     precision = P / sumClassCnt;
49 |     recall = R / sumClassCnt;
50 |     f1_score = F1 / sumClassCnt;
51 |     print(str(precision) + "\t" + str(recall) + "\t" + str(f1_score) + "\n")
52 |     return precision, recall, f1_score
53 | 
54 | if __name__ == '__main__':
55 |     a = datetime.datetime.now().replace(microsecond=0)
56 |     parser = optparse.OptionParser()
57 |     parser.add_option('-i', action="store", dest="infile")
58 |     options, args = parser.parse_args()
59 | 
60 |     file_name = options.infile
61 |     actual=[]
62 |     predicted=[]
63 |     with open(file_name, 'rU') as f:
64 |         for line in f:
65 |             line = line.strip()
66 |             if("inst#" in line):
67 |                 break
68 |         for line in f:
69 |             line = line.strip()
70 |             # print line
71 |             if (line==""):
72 |                 continue
73 |             arr=line.split()
74 |             actual_lab = arr[1].strip().split(":")[1]
75 |             actual.append(actual_lab)
76 |             predicted_lab = arr[2].strip().split(":")[1]
77 |             predicted.append(predicted_lab)
78 | 
79 |     actual=np.array(actual)
80 |     predicted = np.array(predicted)
81 |     acc = precision = recall = f1_score = 0.0
82 |     report = ""
83 |     try:
84 |         acc = metrics.accuracy_score(actual, predicted)*100
85 |         # report = metrics.classification_report(actual, predicted)
86 |         # precision, recall, f1_score = classifaction_report(report)
87 |         precision = metrics.precision_score(actual, predicted, average="weighted")*100
88 |         recall = metrics.recall_score(actual, predicted, average="weighted")*100
89 |         f1_score = metrics.f1_score(actual, predicted, average="weighted")*100
90 |     except Exception as e:
91 |         print (e)
92 |     pass
93 |     result = str("{0:.2f}".format(acc)) + "\t" + str(
94 |         "{0:.2f}".format(precision)) + "\t" + str("{0:.2f}".format(recall)) + "\t" + str(
95 |         "{0:.2f}".format(f1_score))
96 |     print(result)
97 | 
98 |     b = datetime.datetime.now().replace(microsecond=0)
99 |     #print("Time taken: " + str((b - a)))


--------------------------------------------------------------------------------
/bin/stop_words_english.txt:
--------------------------------------------------------------------------------
  1 | a
  2 | about
  3 | above
  4 | after
  5 | again
  6 | against
  7 | all
  8 | am
  9 | an
 10 | and
 11 | any
 12 | are
 13 | aren't
 14 | as
 15 | at
 16 | be
 17 | because
 18 | been
 19 | before
 20 | being
 21 | below
 22 | between
 23 | both
 24 | but
 25 | by
 26 | can't
 27 | cannot
 28 | could
 29 | couldn't
 30 | did
 31 | didn't
 32 | do
 33 | does
 34 | doesn't
 35 | doing
 36 | don't
 37 | down
 38 | during
 39 | each
 40 | few
 41 | for
 42 | from
 43 | further
 44 | had
 45 | hadn't
 46 | has
 47 | hasn't
 48 | have
 49 | haven't
 50 | having
 51 | he
 52 | he'd
 53 | he'll
 54 | he's
 55 | her
 56 | here
 57 | here's
 58 | hers
 59 | herself
 60 | him
 61 | himself
 62 | his
 63 | how
 64 | how's
 65 | i
 66 | i'd
 67 | i'll
 68 | i'm
 69 | i've
 70 | if
 71 | in
 72 | into
 73 | is
 74 | isn't
 75 | it
 76 | it's
 77 | its
 78 | itself
 79 | let's
 80 | me
 81 | more
 82 | most
 83 | mustn't
 84 | my
 85 | myself
 86 | no
 87 | nor
 88 | not
 89 | of
 90 | off
 91 | on
 92 | once
 93 | only
 94 | or
 95 | other
 96 | ought
 97 | our
 98 | ours	ourselves
 99 | out
100 | over
101 | own
102 | same
103 | shan't
104 | she
105 | she'd
106 | she'll
107 | she's
108 | should
109 | shouldn't
110 | so
111 | some
112 | such
113 | than
114 | that
115 | that's
116 | the
117 | their
118 | theirs
119 | them
120 | themselves
121 | then
122 | there
123 | there's
124 | these
125 | they
126 | they'd
127 | they'll
128 | they're
129 | they've
130 | this
131 | those
132 | through
133 | to
134 | too
135 | under
136 | until
137 | up
138 | very
139 | was
140 | wasn't
141 | we
142 | we'd
143 | we'll
144 | we're
145 | we've
146 | were
147 | weren't
148 | what
149 | what's
150 | when
151 | when's
152 | where
153 | where's
154 | which
155 | while
156 | who
157 | who's
158 | whom
159 | why
160 | why's
161 | with
162 | won't
163 | would
164 | wouldn't
165 | you
166 | you'd
167 | you'll
168 | you're
169 | you've
170 | your
171 | yours
172 | yourself
173 | yourselves
174 | 


--------------------------------------------------------------------------------
/bin/transformers/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | 


--------------------------------------------------------------------------------
/bin/transformers/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM pytorch/pytorch:latest
2 | 
3 | RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext
4 | 
5 | RUN pip install transformers
6 | 
7 | WORKDIR /workspace


--------------------------------------------------------------------------------
/bin/transformers/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = sphinx-build
 7 | SOURCEDIR     = source
 8 | BUILDDIR      = _build
 9 | 
10 | # Put it first so that "make" without argument is like "make help".
11 | help:
12 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
13 | 
14 | .PHONY: help Makefile
15 | 
16 | # Catch-all target: route all unknown targets to Sphinx using the new
17 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
18 | %: Makefile
19 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)


--------------------------------------------------------------------------------
/bin/transformers/docs/README.md:
--------------------------------------------------------------------------------
 1 | # Generating the documentation
 2 | 
 3 | To generate the documentation, you first have to build it. Several packages are necessary to build the doc,
 4 | you can install them using:
 5 | 
 6 | ```bash
 7 | pip install -r requirements.txt
 8 | ```
 9 |  
10 | ## Packages installed
11 | 
12 | Here's an overview of all the packages installed. If you ran the previous command installing all packages from 
13 | `requirements.txt`, you do not need to run the following commands.
14 | 
15 | Building it requires the package `sphinx` that you can 
16 | install using:
17 | 
18 | ```bash
19 | pip install -U sphinx
20 | ```
21 | 
22 | You would also need the custom installed [theme](https://github.com/readthedocs/sphinx_rtd_theme) by 
23 | [Read The Docs](https://readthedocs.org/). You can install it using the following command:
24 | 
25 | ```bash
26 | pip install sphinx_rtd_theme
27 | ```
28 | 
29 | The third necessary package is the `recommonmark` package to accept Markdown as well as Restructured text:
30 | 
31 | ```bash
32 | pip install recommonmark
33 | ```
34 | 
35 | ## Building the documentation
36 | 
37 | Make sure that there is a symlink from the `example` file (in /examples) inside the source folder. Run the following 
38 | command to generate it:
39 | 
40 | ```bash
41 | ln -s ../../examples/README.md examples.md
42 | ```
43 | 
44 | Once you have setup `sphinx`, you can build the documentation by running the following command in the `/docs` folder:
45 | 
46 | ```bash
47 | make html
48 | ```
49 | 
50 | ---
51 | **NOTE**
52 | 
53 | If you are adding/removing elements from the toc-tree or from any structural item, it is recommended to clean the build
54 | directory before rebuilding. Run the following command to clean and build:
55 | 
56 | ```bash
57 | make clean && make html
58 | ```
59 | 
60 | ---
61 | 
62 | It should build the static app that will be available under `/docs/_build/html`
63 | 
64 | ## Adding a new element to the tree (toc-tree)
65 | 
66 | Accepted files are reStructuredText (.rst) and Markdown (.md). Create a file with its extension and put it
67 | in the source directory. You can then link it to the toc-tree by putting the filename without the extension.
68 | 


--------------------------------------------------------------------------------
/bin/transformers/docs/requirements.txt:
--------------------------------------------------------------------------------
 1 | alabaster==0.7.12
 2 | Babel==2.7.0
 3 | certifi==2019.6.16
 4 | chardet==3.0.4
 5 | commonmark==0.9.0
 6 | docutils==0.14
 7 | future==0.17.1
 8 | idna==2.8
 9 | imagesize==1.1.0
10 | Jinja2==2.10.1
11 | MarkupSafe==1.1.1
12 | packaging==19.0
13 | Pygments==2.4.2
14 | pyparsing==2.4.0
15 | pytz==2019.1
16 | recommonmark==0.5.0
17 | requests==2.22.0
18 | six==1.12.0
19 | snowballstemmer==1.9.0
20 | Sphinx==2.1.2
21 | sphinx-rtd-theme==0.4.3
22 | sphinxcontrib-applehelp==1.0.1
23 | sphinxcontrib-devhelp==1.0.1
24 | sphinxcontrib-htmlhelp==1.0.2
25 | sphinxcontrib-jsmath==1.0.1
26 | sphinxcontrib-qthelp==1.0.2
27 | sphinxcontrib-serializinghtml==1.1.3
28 | urllib3==1.25.8
29 | sphinx-markdown-tables==0.0.9
30 | numpy==1.17.2
31 | tensorflow==2.0.0rc2
32 | torch==1.2.0


--------------------------------------------------------------------------------
/bin/transformers/docs/source/_static/css/Calibre-Light.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/docs/source/_static/css/Calibre-Light.ttf


--------------------------------------------------------------------------------
/bin/transformers/docs/source/_static/css/Calibre-Medium.otf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/docs/source/_static/css/Calibre-Medium.otf


--------------------------------------------------------------------------------
/bin/transformers/docs/source/_static/css/Calibre-Regular.otf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/docs/source/_static/css/Calibre-Regular.otf


--------------------------------------------------------------------------------
/bin/transformers/docs/source/_static/css/Calibre-Thin.otf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/docs/source/_static/css/Calibre-Thin.otf


--------------------------------------------------------------------------------
/bin/transformers/docs/source/_static/css/code-snippets.css:
--------------------------------------------------------------------------------
 1 | 
 2 | .highlight .c1, .highlight .sd{
 3 |     color: #999
 4 | }
 5 | 
 6 | .highlight .nn, .highlight .k, .highlight .s1, .highlight .nb, .highlight .bp, .highlight .kc {
 7 |     color: #FB8D68;
 8 | }
 9 | 
10 | .highlight .kn, .highlight .nv, .highlight .s2, .highlight .ow {
11 |     color: #6670FF;
12 | }


--------------------------------------------------------------------------------
/bin/transformers/docs/source/benchmarks.md:
--------------------------------------------------------------------------------
 1 | # Benchmarks
 2 | 
 3 | This section is dedicated to the Benchmarks done by the library, both by maintainers, contributors and users. These 
 4 | benchmark will help keep track of the preformance improvements that are brought to our models across versions.
 5 | 
 6 | ## Benchmarking all models for inference
 7 | 
 8 | As of version 2.1 we have benchmarked all models for inference, across many different settings: using PyTorch, with
 9 | and without TorchScript, using TensorFlow, with and without XLA. All of those tests were done across CPUs (except for
10 | TensorFlow XLA) and GPUs.
11 | 
12 | The approach is detailed in the [following blogpost](https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2)
13 | 
14 | The results are available [here](https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing).
15 | 
16 | ## TF2 with mixed precision, XLA, Distribution (@tlkh)
17 | 
18 | This work was done by [Timothy Liu](https://github.com/tlkh).
19 | 
20 | There are very positive results to be gained from the various TensorFlow 2.0 features:
21 | 
22 | - Automatic Mixed Precision (AMP)
23 | - XLA compiler
24 | - Distribution strategies (multi-GPU)
25 | 
26 | The benefits are listed here (tested on CoLA, MRPC, SST-2):
27 | 
28 | - AMP: Between 1.4x to 1.6x decrease in overall time without change in batch size
29 | - AMP+XLA: Up to 2.5x decrease in overall time on SST-2 (larger dataset)
30 | - Distribution: Between 1.4x to 3.4x decrease in overall time on 4xV100
31 | - Combined: Up to 5.7x decrease in overall training time, or 9.1x training throughput
32 | 
33 | The model quality (measured by the validation accuracy) fluctuates slightly. Taking an average of 4 training runs 
34 | on a single GPU gives the following results:
35 | 
36 | - CoLA: AMP results in slighter lower acc (0.820 vs 0.824)
37 | - MRPC: AMP results in lower acc (0.823 vs 0.835)
38 | - SST-2: AMP results in slighter lower acc (0.918 vs 0.922)
39 | 
40 | However, in a distributed setting with 4xV100 (4x batch size), AMP can yield in better results:
41 | 
42 | CoLA: AMP results in higher acc (0.828 vs 0.812)
43 | MRPC: AMP results in lower acc (0.817 vs 0.827)
44 | SST-2: AMP results in slightly lower acc (0.926 vs 0.929)
45 | 
46 | The benchmark script is available [here](https://github.com/NVAITC/benchmarking/blob/master/tf2/bert_dist.py).
47 | 
48 | Note: on some tasks (e.g. MRPC), the dataset is too small. The overhead due to the model compilation with XLA as well
49 | as the distribution strategy setup does not speed things up. The XLA compile time is also the reason why although throughput 
50 | can increase a lot (e.g. 2.7x for single GPU), overall (end-to-end) training speed-up is not as fast (as low as 1.4x)
51 | 
52 | The benefits as seen on SST-2 (larger dataset) is much clear.
53 | 
54 | All results can be seen on this [Google Sheet](https://docs.google.com/spreadsheets/d/1538MN224EzjbRL239sqSiUy6YY-rAjHyXhTzz_Zptls/edit#gid=960868445).
55 | 


--------------------------------------------------------------------------------
/bin/transformers/docs/source/bertology.rst:
--------------------------------------------------------------------------------
 1 | BERTology
 2 | ---------
 3 | 
 4 | There is a growing field of study concerned with investigating the inner working of large-scale transformers like BERT (that some call "BERTology"). Some good examples of this field are:
 5 | 
 6 | 
 7 | * BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: https://arxiv.org/abs/1905.05950
 8 | * Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650
 9 | * What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning: https://arxiv.org/abs/1906.04341
10 | 
11 | In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted  from the great work of Paul Michel (https://arxiv.org/abs/1905.10650):
12 | 
13 | 
14 | * accessing all the hidden-states of BERT/GPT/GPT-2,
15 | * accessing all the attention weights for each head of BERT/GPT/GPT-2,
16 | * retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in https://arxiv.org/abs/1905.10650.
17 | 
18 | To help you understand and use these features, we have added a specific example script: `bertology.py <https://github.com/huggingface/transformers/blob/master/examples/run_bertology.py>`_ while extract information and prune a model pre-trained on GLUE.
19 | 


--------------------------------------------------------------------------------
/bin/transformers/docs/source/converting_tensorflow_models.rst:
--------------------------------------------------------------------------------
  1 | Converting Tensorflow Checkpoints
  2 | ================================================
  3 | 
  4 | A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints in models than be loaded using the ``from_pretrained`` methods of the library.
  5 | 
  6 | BERT
  7 | ^^^^
  8 | 
  9 | You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `convert_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/transformers/blob/master/transformers/convert_tf_checkpoint_to_pytorch.py>`_ script.
 10 | 
 11 | This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_extract_features.py>`_\ , `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\ ).
 12 | 
 13 | You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow checkpoint (the three files starting with ``bert_model.ckpt``\ ) but be sure to keep the configuration file (\ ``bert_config.json``\ ) and the vocabulary file (\ ``vocab.txt``\ ) as these are needed for the PyTorch model too.
 14 | 
 15 | To run this specific conversion script you will need to have TensorFlow and PyTorch installed (\ ``pip install tensorflow``\ ). The rest of the repository only requires PyTorch.
 16 | 
 17 | Here is an example of the conversion process for a pre-trained ``BERT-Base Uncased`` model:
 18 | 
 19 | .. code-block:: shell
 20 | 
 21 |    export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
 22 | 
 23 |    transformers bert \
 24 |      $BERT_BASE_DIR/bert_model.ckpt \
 25 |      $BERT_BASE_DIR/bert_config.json \
 26 |      $BERT_BASE_DIR/pytorch_model.bin
 27 | 
 28 | You can download Google's pre-trained models for the conversion `here <https://github.com/google-research/bert#pre-trained-models>`__.
 29 | 
 30 | OpenAI GPT
 31 | ^^^^^^^^^^
 32 | 
 33 | Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint save as the same format than OpenAI pretrained model (see `here <https://github.com/openai/finetune-transformer-lm>`__\ )
 34 | 
 35 | .. code-block:: shell
 36 | 
 37 |    export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
 38 | 
 39 |    transformers gpt \
 40 |      $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
 41 |      $PYTORCH_DUMP_OUTPUT \
 42 |      [OPENAI_GPT_CONFIG]
 43 | 
 44 | OpenAI GPT-2
 45 | ^^^^^^^^^^^^
 46 | 
 47 | Here is an example of the conversion process for a pre-trained OpenAI GPT-2 model (see `here <https://github.com/openai/gpt-2>`__\ )
 48 | 
 49 | .. code-block:: shell
 50 | 
 51 |    export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
 52 | 
 53 |    transformers gpt2 \
 54 |      $OPENAI_GPT2_CHECKPOINT_PATH \
 55 |      $PYTORCH_DUMP_OUTPUT \
 56 |      [OPENAI_GPT2_CONFIG]
 57 | 
 58 | Transformer-XL
 59 | ^^^^^^^^^^^^^^
 60 | 
 61 | Here is an example of the conversion process for a pre-trained Transformer-XL model (see `here <https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models>`__\ )
 62 | 
 63 | .. code-block:: shell
 64 | 
 65 |    export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
 66 | 
 67 |    transformers transfo_xl \
 68 |      $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
 69 |      $PYTORCH_DUMP_OUTPUT \
 70 |      [TRANSFO_XL_CONFIG]
 71 | 
 72 | 
 73 | XLNet
 74 | ^^^^^
 75 | 
 76 | Here is an example of the conversion process for a pre-trained XLNet model, fine-tuned on STS-B using the TensorFlow script:
 77 | 
 78 | .. code-block:: shell
 79 | 
 80 |    export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
 81 |    export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
 82 | 
 83 |    transformers xlnet \
 84 |      $TRANSFO_XL_CHECKPOINT_PATH \
 85 |      $TRANSFO_XL_CONFIG_PATH \
 86 |      $PYTORCH_DUMP_OUTPUT \
 87 |      STS-B \
 88 | 
 89 | 
 90 | XLM
 91 | ^^^
 92 | 
 93 | Here is an example of the conversion process for a pre-trained XLM model:
 94 | 
 95 | .. code-block:: shell
 96 | 
 97 |    export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
 98 | 
 99 |    transformers xlm \
100 |      $XLM_CHECKPOINT_PATH \
101 |      $PYTORCH_DUMP_OUTPUT \
102 | 


--------------------------------------------------------------------------------
/bin/transformers/docs/source/imgs/transformers_logo_name.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/docs/source/imgs/transformers_logo_name.png


--------------------------------------------------------------------------------
/bin/transformers/docs/source/imgs/warmup_constant_schedule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/docs/source/imgs/warmup_constant_schedule.png


--------------------------------------------------------------------------------
/bin/transformers/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png


--------------------------------------------------------------------------------
/bin/transformers/docs/source/imgs/warmup_cosine_schedule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/docs/source/imgs/warmup_cosine_schedule.png


--------------------------------------------------------------------------------
/bin/transformers/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png


--------------------------------------------------------------------------------
/bin/transformers/docs/source/imgs/warmup_linear_schedule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/docs/source/imgs/warmup_linear_schedule.png


--------------------------------------------------------------------------------
/bin/transformers/docs/source/installation.md:
--------------------------------------------------------------------------------
 1 | # Installation
 2 | 
 3 | Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.1.0
 4 | 
 5 | ## With pip
 6 | 
 7 | PyTorch Transformers can be installed using pip as follows:
 8 | 
 9 | ``` bash
10 | pip install transformers
11 | ```
12 | 
13 | ## From source
14 | 
15 | To install from source, clone the repository and install with:
16 | 
17 | ``` bash
18 | git clone https://github.com/huggingface/transformers.git
19 | cd transformers
20 | pip install [--editable] .
21 | ```
22 | 
23 | ## Tests
24 | 
25 | An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
26 | 
27 | Tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
28 | 
29 | Run all the tests from the root of the cloned repository with the commands:
30 | 
31 | ``` bash
32 | python -m pytest -sv ./transformers/tests/
33 | python -m pytest -sv ./examples/
34 | ```
35 | 
36 | ## OpenAI GPT original tokenization workflow
37 | 
38 | If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` (use version 4.4.3 if you are using Python 2) and `SpaCy`:
39 | 
40 | ``` bash
41 | pip install spacy ftfy==4.4.3
42 | python -m spacy download en
43 | ```
44 | 
45 | If you don't install `ftfy` and `SpaCy`, the `OpenAI GPT` tokenizer will default to tokenize using BERT's `BasicTokenizer` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).
46 | 
47 | ## Note on model downloads (Continuous Integration or large-scale deployments)
48 | 
49 | If you expect to be downloading large volumes of models (more than 1,000) from our hosted bucket (for instance through your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way faster, and cheaper. Feel free to contact us privately if you need any help.
50 | 
51 | ## Do you want to run a Transformer model on a mobile device?
52 | 
53 | You should check out our [swift-coreml-transformers](https://github.com/huggingface/swift-coreml-transformers) repo.
54 | 
55 | It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`, `DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices.
56 | 
57 | At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML,
58 | or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting!
59 | 


--------------------------------------------------------------------------------
/bin/transformers/docs/source/main_classes/configuration.rst:
--------------------------------------------------------------------------------
 1 | Configuration
 2 | ----------------------------------------------------
 3 | 
 4 | The base class ``PretrainedConfig`` implements the common methods for loading/saving a configuration either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS S3 repository).
 5 | 
 6 | ``PretrainedConfig``
 7 | ~~~~~~~~~~~~~~~~~~~~~
 8 | 
 9 | .. autoclass:: transformers.PretrainedConfig
10 |     :members:
11 | 


--------------------------------------------------------------------------------
/bin/transformers/docs/source/main_classes/model.rst:
--------------------------------------------------------------------------------
 1 | Models
 2 | ----------------------------------------------------
 3 | 
 4 | The base class ``PreTrainedModel`` implements the common methods for loading/saving a model either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS S3 repository).
 5 | 
 6 | ``PreTrainedModel`` also implements a few methods which are common among all the models to:
 7 | 
 8 | - resize the input token embeddings when new tokens are added to the vocabulary
 9 | - prune the attention heads of the model.
10 | 
11 | ``PreTrainedModel``
12 | ~~~~~~~~~~~~~~~~~~~~~
13 | 
14 | .. autoclass:: transformers.PreTrainedModel
15 |     :members:
16 | 
17 | ``TFPreTrainedModel``
18 | ~~~~~~~~~~~~~~~~~~~~~
19 | 
20 | .. autoclass:: transformers.TFPreTrainedModel
21 |     :members:
22 | 


--------------------------------------------------------------------------------
/bin/transformers/docs/source/main_classes/optimizer_schedules.rst:
--------------------------------------------------------------------------------
 1 | Optimizer
 2 | ----------------------------------------------------
 3 | 
 4 | The ``.optimization`` module provides:
 5 | 
 6 | - an optimizer with weight decay fixed that can be used to fine-tuned models, and
 7 | - several schedules in the form of schedule objects that inherit from ``_LRSchedule``:
 8 | 
 9 | ``AdamW``
10 | ~~~~~~~~~~~~~~~~
11 | 
12 | .. autoclass:: transformers.AdamW
13 |     :members:
14 | 
15 | Schedules
16 | ----------------------------------------------------
17 | 
18 | Learning Rate Schedules
19 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20 | 
21 | .. autofunction:: transformers.get_constant_schedule
22 | 
23 | 
24 | .. autofunction:: transformers.get_constant_schedule_with_warmup
25 | 
26 | .. image:: /imgs/warmup_constant_schedule.png
27 |     :target: /imgs/warmup_constant_schedule.png
28 |     :alt:
29 | 
30 | 
31 | .. autofunction:: transformers.get_cosine_schedule_with_warmup
32 |     :members:
33 | 
34 | .. image:: /imgs/warmup_cosine_schedule.png
35 |     :target: /imgs/warmup_cosine_schedule.png
36 |     :alt:
37 | 
38 | 
39 | .. autofunction:: transformers.get_cosine_with_hard_restarts_schedule_with_warmup
40 | 
41 | .. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
42 |     :target: /imgs/warmup_cosine_hard_restarts_schedule.png
43 |     :alt:
44 | 
45 | 
46 | 
47 | .. autofunction:: transformers.get_linear_schedule_with_warmup
48 | 
49 | .. image:: /imgs/warmup_linear_schedule.png
50 |     :target: /imgs/warmup_linear_schedule.png
51 |     :alt:
52 | 


--------------------------------------------------------------------------------
/bin/transformers/docs/source/main_classes/processors.rst:
--------------------------------------------------------------------------------
 1 | Processors
 2 | ----------------------------------------------------
 3 | 
 4 | This library includes processors for several traditional tasks. These processors can be used to process a dataset into
 5 | examples that can be fed to a model.
 6 | 
 7 | Processors
 8 | ~~~~~~~~~~~~~~~~~~~~~
 9 | 
10 | All processors follow the same architecture which is that of the
11 | :class:`~transformers.data.processors.utils.DataProcessor`. The processor returns a list
12 | of :class:`~transformers.data.processors.utils.InputExample`. These
13 | :class:`~transformers.data.processors.utils.InputExample` can be converted to
14 | :class:`~transformers.data.processors.utils.InputFeatures` in order to be fed to the model.
15 | 
16 | .. autoclass:: transformers.data.processors.utils.DataProcessor
17 |     :members:
18 | 
19 | 
20 | .. autoclass:: transformers.data.processors.utils.InputExample
21 |     :members:
22 | 
23 | 
24 | .. autoclass:: transformers.data.processors.utils.InputFeatures
25 |     :members:
26 | 
27 | 
28 | GLUE
29 | ~~~~~~~~~~~~~~~~~~~~~
30 | 
31 | `General Language Understanding Evaluation (GLUE) <https://gluebenchmark.com/>`__ is a benchmark that evaluates
32 | the performance of models across a diverse set of existing NLU tasks. It was released together with the paper
33 | `GLUE: A multi-task benchmark and analysis platform for natural language understanding <https://openreview.net/pdf?id=rJ4km2R5t7>`__
34 | 
35 | This library hosts a total of 10 processors for the following tasks: MRPC, MNLI, MNLI (mismatched),
36 | CoLA, SST2, STSB, QQP, QNLI, RTE and WNLI.
37 | 
38 | Those processors are:
39 |     - :class:`~transformers.data.processors.utils.MrpcProcessor`
40 |     - :class:`~transformers.data.processors.utils.MnliProcessor`
41 |     - :class:`~transformers.data.processors.utils.MnliMismatchedProcessor`
42 |     - :class:`~transformers.data.processors.utils.Sst2Processor`
43 |     - :class:`~transformers.data.processors.utils.StsbProcessor`
44 |     - :class:`~transformers.data.processors.utils.QqpProcessor`
45 |     - :class:`~transformers.data.processors.utils.QnliProcessor`
46 |     - :class:`~transformers.data.processors.utils.RteProcessor`
47 |     - :class:`~transformers.data.processors.utils.WnliProcessor`
48 | 
49 | Additionally, the following method  can be used to load values from a data file and convert them to a list of
50 | :class:`~transformers.data.processors.utils.InputExample`.
51 | 
52 | .. automethod:: transformers.data.processors.glue.glue_convert_examples_to_features
53 | 
54 | Example usage
55 | ^^^^^^^^^^^^^^^^^^^^^^^^^
56 | 
57 | An example using these processors is given in the
58 | `run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py>`__ script.


--------------------------------------------------------------------------------
/bin/transformers/docs/source/main_classes/tokenizer.rst:
--------------------------------------------------------------------------------
 1 | Tokenizer
 2 | ----------------------------------------------------
 3 | 
 4 | The base class ``PreTrainedTokenizer`` implements the common methods for loading/saving a tokenizer either from a local file or directory, or from a pretrained tokenizer provided by the library (downloaded from HuggingFace's AWS S3 repository).
 5 | 
 6 | ``PreTrainedTokenizer`` is the main entry point into tokenizers as it also implements the main methods for using all the tokenizers:
 7 | 
 8 | - tokenizing, converting tokens to ids and back and encoding/decoding,
 9 | - adding new tokens to the vocabulary in a way that is independant of the underlying structure (BPE, SentencePiece...),
10 | - managing special tokens (adding them, assigning them to roles, making sure they are not split during tokenization)
11 | 
12 | ``PreTrainedTokenizer``
13 | ~~~~~~~~~~~~~~~~~~~~~~~~
14 | 
15 | .. autoclass:: transformers.PreTrainedTokenizer
16 |     :members:
17 | 


--------------------------------------------------------------------------------
/bin/transformers/docs/source/model_doc/auto.rst:
--------------------------------------------------------------------------------
 1 | AutoModels
 2 | -----------
 3 | 
 4 | In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you are supplying to the ``from_pretrained`` method.
 5 | 
 6 | AutoClasses are here to do this job for you so that you automatically retreive the relevant model given the name/path to the pretrained weights/config/vocabulary:
 7 | 
 8 | Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will directly create a class of the relevant architecture (ex: ``model = AutoModel.from_pretrained('bert-base-cased')`` will create a instance of ``BertModel``).
 9 | 
10 | 
11 | ``AutoConfig``
12 | ~~~~~~~~~~~~~~~~~~~~~
13 | 
14 | .. autoclass:: transformers.AutoConfig
15 |     :members:
16 | 
17 | 
18 | ``AutoModel``
19 | ~~~~~~~~~~~~~~~~~~~~~
20 | 
21 | .. autoclass:: transformers.AutoModel
22 |     :members:
23 | 
24 | 
25 | ``AutoTokenizer``
26 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 | 
28 | .. autoclass:: transformers.AutoTokenizer
29 |     :members:
30 | 


--------------------------------------------------------------------------------
/bin/transformers/docs/source/model_doc/bert.rst:
--------------------------------------------------------------------------------
  1 | BERT
  2 | ----------------------------------------------------
  3 | 
  4 | ``BertConfig``
  5 | ~~~~~~~~~~~~~~~~~~~~~
  6 | 
  7 | .. autoclass:: transformers.BertConfig
  8 |     :members:
  9 | 
 10 | 
 11 | ``BertTokenizer``
 12 | ~~~~~~~~~~~~~~~~~~~~~
 13 | 
 14 | .. autoclass:: transformers.BertTokenizer
 15 |     :members:
 16 | 
 17 | 
 18 | ``BertModel``
 19 | ~~~~~~~~~~~~~~~~~~~~
 20 | 
 21 | .. autoclass:: transformers.BertModel
 22 |     :members:
 23 | 
 24 | 
 25 | ``BertForPreTraining``
 26 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 27 | 
 28 | .. autoclass:: transformers.BertForPreTraining
 29 |     :members:
 30 | 
 31 | 
 32 | ``BertForMaskedLM``
 33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
 34 | 
 35 | .. autoclass:: transformers.BertForMaskedLM
 36 |     :members:
 37 | 
 38 | 
 39 | ``BertForNextSentencePrediction``
 40 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 41 | 
 42 | .. autoclass:: transformers.BertForNextSentencePrediction
 43 |     :members:
 44 | 
 45 | 
 46 | ``BertForSequenceClassification``
 47 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 48 | 
 49 | .. autoclass:: transformers.BertForSequenceClassification
 50 |     :members:
 51 | 
 52 | 
 53 | ``BertForMultipleChoice``
 54 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 55 | 
 56 | .. autoclass:: transformers.BertForMultipleChoice
 57 |     :members:
 58 | 
 59 | 
 60 | ``BertForTokenClassification``
 61 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 62 | 
 63 | .. autoclass:: transformers.BertForTokenClassification
 64 |     :members:
 65 | 
 66 | 
 67 | ``BertForQuestionAnswering``
 68 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 69 | 
 70 | .. autoclass:: transformers.BertForQuestionAnswering
 71 |     :members:
 72 | 
 73 | 
 74 | ``TFBertModel``
 75 | ~~~~~~~~~~~~~~~~~~~~
 76 | 
 77 | .. autoclass:: transformers.TFBertModel
 78 |     :members:
 79 | 
 80 | 
 81 | ``TFBertForPreTraining``
 82 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 83 | 
 84 | .. autoclass:: transformers.TFBertForPreTraining
 85 |     :members:
 86 | 
 87 | 
 88 | ``TFBertForMaskedLM``
 89 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
 90 | 
 91 | .. autoclass:: transformers.TFBertForMaskedLM
 92 |     :members:
 93 | 
 94 | 
 95 | ``TFBertForNextSentencePrediction``
 96 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 97 | 
 98 | .. autoclass:: transformers.TFBertForNextSentencePrediction
 99 |     :members:
100 | 
101 | 
102 | ``TFBertForSequenceClassification``
103 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
104 | 
105 | .. autoclass:: transformers.TFBertForSequenceClassification
106 |     :members:
107 | 
108 | 
109 | ``TFBertForMultipleChoice``
110 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
111 | 
112 | .. autoclass:: transformers.TFBertForMultipleChoice
113 |     :members:
114 | 
115 | 
116 | ``TFBertForTokenClassification``
117 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
118 | 
119 | .. autoclass:: transformers.TFBertForTokenClassification
120 |     :members:
121 | 
122 | 
123 | ``TFBertForQuestionAnswering``
124 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
125 | 
126 | .. autoclass:: transformers.TFBertForQuestionAnswering
127 |     :members:
128 | 
129 | 


--------------------------------------------------------------------------------
/bin/transformers/docs/source/model_doc/ctrl.rst:
--------------------------------------------------------------------------------
 1 | CTRL
 2 | ----------------------------------------------------
 3 | 
 4 | Note: if you fine-tune a CTRL model using the Salesforce code (https://github.com/salesforce/ctrl),
 5 | you'll be able to convert from TF to our HuggingFace/Transformers format using the 
 6 | ``convert_tf_to_huggingface_pytorch.py`` script (see `issue #1654 <https://github.com/huggingface/transformers/issues/1654>`_).
 7 | 
 8 | 
 9 | ``CTRLConfig``
10 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
11 | 
12 | .. autoclass:: transformers.CTRLConfig
13 |     :members:
14 | 
15 | 
16 | ``CTRLTokenizer``
17 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
18 | 
19 | .. autoclass:: transformers.CTRLTokenizer
20 |     :members:
21 | 
22 | 
23 | ``CTRLModel``
24 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
25 | 
26 | .. autoclass:: transformers.CTRLModel
27 |     :members:
28 | 
29 | 
30 | ``CTRLLMHeadModel``
31 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
32 | 
33 | .. autoclass:: transformers.CTRLLMHeadModel
34 |     :members:
35 | 
36 | 
37 | ``TFCTRLModel``
38 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
39 | 
40 | .. autoclass:: transformers.TFCTRLModel
41 |     :members:
42 | 
43 | 
44 | ``TFCTRLLMHeadModel``
45 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
46 | 
47 | .. autoclass:: transformers.TFCTRLLMHeadModel
48 |     :members:
49 | 
50 | 


--------------------------------------------------------------------------------
/bin/transformers/docs/source/model_doc/distilbert.rst:
--------------------------------------------------------------------------------
 1 | DistilBERT
 2 | ----------------------------------------------------
 3 | 
 4 | ``DistilBertConfig``
 5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 6 | 
 7 | .. autoclass:: transformers.DistilBertConfig
 8 |     :members:
 9 | 
10 | 
11 | ``DistilBertTokenizer``
12 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
13 | 
14 | .. autoclass:: transformers.DistilBertTokenizer
15 |     :members:
16 | 
17 | 
18 | ``DistilBertModel``
19 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
20 | 
21 | .. autoclass:: transformers.DistilBertModel
22 |     :members:
23 | 
24 | 
25 | ``DistilBertForMaskedLM``
26 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 | 
28 | .. autoclass:: transformers.DistilBertForMaskedLM
29 |     :members:
30 | 
31 | 
32 | ``DistilBertForSequenceClassification``
33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
34 | 
35 | .. autoclass:: transformers.DistilBertForSequenceClassification
36 |     :members:
37 | 
38 | 
39 | ``DistilBertForQuestionAnswering``
40 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
41 | 
42 | .. autoclass:: transformers.DistilBertForQuestionAnswering
43 |     :members:
44 | 
45 | ``TFDistilBertModel``
46 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
47 | 
48 | .. autoclass:: transformers.TFDistilBertModel
49 |     :members:
50 | 
51 | 
52 | ``TFDistilBertForMaskedLM``
53 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
54 | 
55 | .. autoclass:: transformers.TFDistilBertForMaskedLM
56 |     :members:
57 | 
58 | 
59 | ``TFDistilBertForSequenceClassification``
60 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
61 | 
62 | .. autoclass:: transformers.TFDistilBertForSequenceClassification
63 |     :members:
64 | 
65 | 
66 | ``TFDistilBertForQuestionAnswering``
67 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
68 | 
69 | .. autoclass:: transformers.TFDistilBertForQuestionAnswering
70 |     :members:
71 | 


--------------------------------------------------------------------------------
/bin/transformers/docs/source/model_doc/gpt.rst:
--------------------------------------------------------------------------------
 1 | OpenAI GPT
 2 | ----------------------------------------------------
 3 | 
 4 | ``OpenAIGPTConfig``
 5 | ~~~~~~~~~~~~~~~~~~~~~
 6 | 
 7 | .. autoclass:: transformers.OpenAIGPTConfig
 8 |     :members:
 9 | 
10 | 
11 | ``OpenAIGPTTokenizer``
12 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
13 | 
14 | .. autoclass:: transformers.OpenAIGPTTokenizer
15 |     :members:
16 | 
17 | 
18 | ``OpenAIGPTModel``
19 | ~~~~~~~~~~~~~~~~~~~~~~~~~
20 | 
21 | .. autoclass:: transformers.OpenAIGPTModel
22 |     :members:
23 | 
24 | 
25 | ``OpenAIGPTLMHeadModel``
26 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 | 
28 | .. autoclass:: transformers.OpenAIGPTLMHeadModel
29 |     :members:
30 | 
31 | 
32 | ``OpenAIGPTDoubleHeadsModel``
33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
34 | 
35 | .. autoclass:: transformers.OpenAIGPTDoubleHeadsModel
36 |     :members:
37 | 
38 | 
39 | ``TFOpenAIGPTModel``
40 | ~~~~~~~~~~~~~~~~~~~~~~~~~
41 | 
42 | .. autoclass:: transformers.TFOpenAIGPTModel
43 |     :members:
44 | 
45 | 
46 | ``TFOpenAIGPTLMHeadModel``
47 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
48 | 
49 | .. autoclass:: transformers.TFOpenAIGPTLMHeadModel
50 |     :members:
51 | 
52 | 
53 | ``TFOpenAIGPTDoubleHeadsModel``
54 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
55 | 
56 | .. autoclass:: transformers.TFOpenAIGPTDoubleHeadsModel
57 |     :members:
58 | 


--------------------------------------------------------------------------------
/bin/transformers/docs/source/model_doc/gpt2.rst:
--------------------------------------------------------------------------------
 1 | OpenAI GPT2
 2 | ----------------------------------------------------
 3 | 
 4 | ``GPT2Config``
 5 | ~~~~~~~~~~~~~~~~~~~~~
 6 | 
 7 | .. autoclass:: transformers.GPT2Config
 8 |     :members:
 9 | 
10 | 
11 | ``GPT2Tokenizer``
12 | ~~~~~~~~~~~~~~~~~~~~~
13 | 
14 | .. autoclass:: transformers.GPT2Tokenizer
15 |     :members:
16 | 
17 | 
18 | ``GPT2Model``
19 | ~~~~~~~~~~~~~~~~~~~~~
20 | 
21 | .. autoclass:: transformers.GPT2Model
22 |     :members:
23 | 
24 | 
25 | ``GPT2LMHeadModel``
26 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 | 
28 | .. autoclass:: transformers.GPT2LMHeadModel
29 |     :members:
30 | 
31 | 
32 | ``GPT2DoubleHeadsModel``
33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
34 | 
35 | .. autoclass:: transformers.GPT2DoubleHeadsModel
36 |     :members:
37 | 
38 | 
39 | ``TFGPT2Model``
40 | ~~~~~~~~~~~~~~~~~~~~~
41 | 
42 | .. autoclass:: transformers.TFGPT2Model
43 |     :members:
44 | 
45 | 
46 | ``TFGPT2LMHeadModel``
47 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
48 | 
49 | .. autoclass:: transformers.TFGPT2LMHeadModel
50 |     :members:
51 | 
52 | 
53 | ``TFGPT2DoubleHeadsModel``
54 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
55 | 
56 | .. autoclass:: transformers.TFGPT2DoubleHeadsModel
57 |     :members:
58 | 


--------------------------------------------------------------------------------
/bin/transformers/docs/source/model_doc/roberta.rst:
--------------------------------------------------------------------------------
 1 | RoBERTa
 2 | ----------------------------------------------------
 3 | 
 4 | ``RobertaConfig``
 5 | ~~~~~~~~~~~~~~~~~~~~~
 6 | 
 7 | .. autoclass:: transformers.RobertaConfig
 8 |     :members:
 9 | 
10 | 
11 | ``RobertaTokenizer``
12 | ~~~~~~~~~~~~~~~~~~~~~
13 | 
14 | .. autoclass:: transformers.RobertaTokenizer
15 |     :members:
16 | 
17 | 
18 | ``RobertaModel``
19 | ~~~~~~~~~~~~~~~~~~~~
20 | 
21 | .. autoclass:: transformers.RobertaModel
22 |     :members:
23 | 
24 | 
25 | ``RobertaForMaskedLM``
26 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
27 | 
28 | .. autoclass:: transformers.RobertaForMaskedLM
29 |     :members:
30 | 
31 | 
32 | ``RobertaForSequenceClassification``
33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
34 | 
35 | .. autoclass:: transformers.RobertaForSequenceClassification
36 |     :members:
37 | 
38 | 
39 | ``TFRobertaModel``
40 | ~~~~~~~~~~~~~~~~~~~~
41 | 
42 | .. autoclass:: transformers.TFRobertaModel
43 |     :members:
44 | 
45 | 
46 | ``TFRobertaForMaskedLM``
47 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
48 | 
49 | .. autoclass:: transformers.TFRobertaForMaskedLM
50 |     :members:
51 | 
52 | 
53 | ``TFRobertaForSequenceClassification``
54 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
55 | 
56 | .. autoclass:: transformers.TFRobertaForSequenceClassification
57 |     :members:
58 | 


--------------------------------------------------------------------------------
/bin/transformers/docs/source/model_doc/transformerxl.rst:
--------------------------------------------------------------------------------
 1 | Transformer XL
 2 | ----------------------------------------------------
 3 | 
 4 | 
 5 | ``TransfoXLConfig``
 6 | ~~~~~~~~~~~~~~~~~~~~~
 7 | 
 8 | .. autoclass:: transformers.TransfoXLConfig
 9 |     :members:
10 | 
11 | 
12 | ``TransfoXLTokenizer``
13 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
14 | 
15 | .. autoclass:: transformers.TransfoXLTokenizer
16 |     :members:
17 | 
18 | 
19 | ``TransfoXLModel``
20 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
21 | 
22 | .. autoclass:: transformers.TransfoXLModel
23 |     :members:
24 | 
25 | 
26 | ``TransfoXLLMHeadModel``
27 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
28 | 
29 | .. autoclass:: transformers.TransfoXLLMHeadModel
30 |     :members:
31 | 
32 | 
33 | ``TFTransfoXLModel``
34 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
35 | 
36 | .. autoclass:: transformers.TFTransfoXLModel
37 |     :members:
38 | 
39 | 
40 | ``TFTransfoXLLMHeadModel``
41 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
42 | 
43 | .. autoclass:: transformers.TFTransfoXLLMHeadModel
44 |     :members:
45 | 


--------------------------------------------------------------------------------
/bin/transformers/docs/source/model_doc/xlm.rst:
--------------------------------------------------------------------------------
 1 | XLM
 2 | ----------------------------------------------------
 3 | 
 4 | ``XLMConfig``
 5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 6 | 
 7 | .. autoclass:: transformers.XLMConfig
 8 |     :members:
 9 | 
10 | ``XLMTokenizer``
11 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
12 | 
13 | .. autoclass:: transformers.XLMTokenizer
14 |     :members:
15 | 
16 | ``XLMModel``
17 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
18 | 
19 | .. autoclass:: transformers.XLMModel
20 |     :members:
21 | 
22 | 
23 | ``XLMWithLMHeadModel``
24 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
25 | 
26 | .. autoclass:: transformers.XLMWithLMHeadModel
27 |     :members:
28 | 
29 | 
30 | ``XLMForSequenceClassification``
31 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
32 | 
33 | .. autoclass:: transformers.XLMForSequenceClassification
34 |     :members:
35 | 
36 | 
37 | ``XLMForQuestionAnswering``
38 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
39 | 
40 | .. autoclass:: transformers.XLMForQuestionAnswering
41 |     :members:
42 | 
43 | 
44 | ``TFXLMModel``
45 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
46 | 
47 | .. autoclass:: transformers.TFXLMModel
48 |     :members:
49 | 
50 | 
51 | ``TFXLMWithLMHeadModel``
52 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
53 | 
54 | .. autoclass:: transformers.TFXLMWithLMHeadModel
55 |     :members:
56 | 
57 | 
58 | ``TFXLMForSequenceClassification``
59 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
60 | 
61 | .. autoclass:: transformers.TFXLMForSequenceClassification
62 |     :members:
63 | 
64 | 
65 | ``TFXLMForQuestionAnsweringSimple``
66 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
67 | 
68 | .. autoclass:: transformers.TFXLMForQuestionAnsweringSimple
69 |     :members:
70 | 


--------------------------------------------------------------------------------
/bin/transformers/docs/source/model_doc/xlnet.rst:
--------------------------------------------------------------------------------
 1 | XLNet
 2 | ----------------------------------------------------
 3 | 
 4 | ``XLNetConfig``
 5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 6 | 
 7 | .. autoclass:: transformers.XLNetConfig
 8 |     :members:
 9 | 
10 | 
11 | ``XLNetTokenizer``
12 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
13 | 
14 | .. autoclass:: transformers.XLNetTokenizer
15 |     :members:
16 | 
17 | 
18 | ``XLNetModel``
19 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
20 | 
21 | .. autoclass:: transformers.XLNetModel
22 |     :members:
23 | 
24 | 
25 | ``XLNetLMHeadModel``
26 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 | 
28 | .. autoclass:: transformers.XLNetLMHeadModel
29 |     :members:
30 | 
31 | 
32 | ``XLNetForSequenceClassification``
33 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
34 | 
35 | .. autoclass:: transformers.XLNetForSequenceClassification
36 |     :members:
37 | 
38 | 
39 | ``XLNetForQuestionAnswering``
40 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
41 | 
42 | .. autoclass:: transformers.XLNetForQuestionAnswering
43 |     :members:
44 | 
45 | 
46 | ``TFXLNetModel``
47 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
48 | 
49 | .. autoclass:: transformers.TFXLNetModel
50 |     :members:
51 | 
52 | 
53 | ``TFXLNetLMHeadModel``
54 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
55 | 
56 | .. autoclass:: transformers.TFXLNetLMHeadModel
57 |     :members:
58 | 
59 | 
60 | ``TFXLNetForSequenceClassification``
61 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
62 | 
63 | .. autoclass:: transformers.TFXLNetForSequenceClassification
64 |     :members:
65 | 
66 | 
67 | ``TFXLNetForQuestionAnsweringSimple``
68 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
69 | 
70 | .. autoclass:: transformers.TFXLNetForQuestionAnsweringSimple
71 |     :members:
72 | 


--------------------------------------------------------------------------------
/bin/transformers/docs/source/multilingual.rst:
--------------------------------------------------------------------------------
  1 | Multi-lingual models
  2 | ================================================
  3 | 
  4 | Most of the models available in this library are mono-lingual models (English, Chinese and German). A few
  5 | multi-lingual models are available and have a different mechanisms than mono-lingual models.
  6 | This page details the usage of these models.
  7 | 
  8 | The two models that currently support multiple languages are BERT and XLM.
  9 | 
 10 | XLM
 11 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 12 | 
 13 | XLM has a total of 10 different checkpoints, only one of which is mono-lingual. The 9 remaining model checkpoints can
 14 | be split in two categories: the checkpoints that make use of language embeddings, and those that don't
 15 | 
 16 | XLM & Language Embeddings
 17 | ------------------------------------------------
 18 | 
 19 | This section concerns the following checkpoints:
 20 | 
 21 | - ``xlm-mlm-ende-1024`` (Masked language modeling, English-German)
 22 | - ``xlm-mlm-enfr-1024`` (Masked language modeling, English-French)
 23 | - ``xlm-mlm-enro-1024`` (Masked language modeling, English-Romanian)
 24 | - ``xlm-mlm-xnli15-1024`` (Masked language modeling, XNLI languages)
 25 | - ``xlm-mlm-tlm-xnli15-1024`` (Masked language modeling + Translation, XNLI languages)
 26 | - ``xlm-clm-enfr-1024`` (Causal language modeling, English-French)
 27 | - ``xlm-clm-ende-1024`` (Causal language modeling, English-German)
 28 | 
 29 | These checkpoints require language embeddings that will specify the language used at inference time. These language
 30 | embeddings are represented as a tensor that is of the same shape as the input ids passed to the model. The values in
 31 | these tensors depend on the language used and are identifiable using the ``lang2id`` and ``id2lang`` attributes
 32 | from the tokenizer.
 33 | 
 34 | Here is an example using the ``xlm-clm-enfr-1024`` checkpoint (Causal language modeling, English-French):
 35 | 
 36 | 
 37 | .. code-block::
 38 | 
 39 |     import torch
 40 |     from transformers import XLMTokenizer, XLMWithLMHeadModel
 41 | 
 42 |     tokenizer = XLMTokenizer.from_pretrained("xlm-clm-1024-enfr")
 43 | 
 44 | 
 45 | The different languages this model/tokenizer handles, as well as the ids of these languages are visible using the
 46 | ``lang2id`` attribute:
 47 | 
 48 | .. code-block::
 49 | 
 50 |     print(tokenizer.lang2id)  # {'en': 0, 'fr': 1}
 51 | 
 52 | 
 53 | These ids should be used when passing a language parameter during a model pass. Let's define our inputs:
 54 | 
 55 | .. code-block::
 56 | 
 57 |     input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1
 58 | 
 59 | 
 60 | We should now define the language embedding by using the previously defined language id. We want to create a tensor
 61 | filled with the appropriate language ids, of the same size as input_ids. For english, the id is 0:
 62 | 
 63 | .. code-block::
 64 | 
 65 |     language_id = tokenizer.lang2id['en']  # 0
 66 |     langs = torch.tensor([language_id] * input_ids.shape[1])  # torch.tensor([0, 0, 0, ..., 0])
 67 | 
 68 |     # We reshape it to be of size (batch_size, sequence_length)
 69 |     langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1)
 70 | 
 71 | 
 72 | You can then feed it all as input to your model:
 73 | 
 74 | .. code-block::
 75 | 
 76 |     outputs = model(input_ids, langs=langs)
 77 | 
 78 | 
 79 | The example `run_generation.py <https://github.com/huggingface/transformers/blob/master/examples/run_generation.py>`__
 80 | can generate text using the CLM checkpoints from XLM, using the language embeddings.
 81 | 
 82 | XLM without Language Embeddings
 83 | ------------------------------------------------
 84 | 
 85 | This section concerns the following checkpoints:
 86 | 
 87 | - ``xlm-mlm-17-1280`` (Masked language modeling, 17 languages)
 88 | - ``xlm-mlm-100-1280`` (Masked language modeling, 100 languages)
 89 | 
 90 | These checkpoints do not require language embeddings at inference time. These models are used to have generic
 91 | sentence representations, differently from previously-mentioned XLM checkpoints.
 92 | 
 93 | 
 94 | BERT
 95 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 96 | 
 97 | BERT has two checkpoints that can be used for multi-lingual tasks:
 98 | 
 99 | - ``bert-base-multilingual-uncased`` (Masked language modeling + Next sentence prediction, 102 languages)
100 | - ``bert-base-multilingual-cased`` (Masked language modeling + Next sentence prediction, 104 languages)
101 | 
102 | These checkpoints do not require language embeddings at inference time. They should identify the language
103 | used in the context and infer accordingly.


--------------------------------------------------------------------------------
/bin/transformers/docs/source/notebooks.rst:
--------------------------------------------------------------------------------
 1 | Notebooks
 2 | ================================================
 3 | 
 4 | We include `three Jupyter Notebooks <https://github.com/huggingface/transformers/tree/master/notebooks>`_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
 5 | 
 6 | 
 7 | *
 8 |   The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models.ipynb>`_\ ) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models.
 9 | 
10 | *
11 |   The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
12 | 
13 | *
14 |   The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
15 | 
16 | Please follow the instructions given in the notebooks to run and modify them.
17 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/contrib/README.md:
--------------------------------------------------------------------------------
1 | # Community contributed examples
2 | 
3 | This folder contains examples which are not actively maintained (mostly contributed by the community).
4 | 
5 | Using these examples together with a recent version of the library usually requires to make small (sometimes big) adaptations to get the scripts working.
6 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/contrib/run_camembert.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | import tarfile
 3 | import urllib.request
 4 | 
 5 | import torch
 6 | 
 7 | from transformers.tokenization_camembert import CamembertTokenizer
 8 | from transformers.modeling_camembert import CamembertForMaskedLM
 9 | 
10 | 
11 | def fill_mask(masked_input, model, tokenizer, topk=5):
12 |     # Adapted from https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py
13 |     assert masked_input.count('<mask>') == 1
14 |     input_ids = torch.tensor(tokenizer.encode(masked_input, add_special_tokens=True)).unsqueeze(0)  # Batch size 1
15 |     logits = model(input_ids)[0]  # The last hidden-state is the first element of the output tuple
16 |     masked_index = (input_ids.squeeze() == tokenizer.mask_token_id).nonzero().item()
17 |     logits = logits[0, masked_index, :]
18 |     prob = logits.softmax(dim=0)
19 |     values, indices = prob.topk(k=topk, dim=0)
20 |     topk_predicted_token_bpe = ' '.join([tokenizer.convert_ids_to_tokens(indices[i].item())
21 |                                          for i in range(len(indices))])
22 |     masked_token = tokenizer.mask_token
23 |     topk_filled_outputs = []
24 |     for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(' ')):
25 |         predicted_token = predicted_token_bpe.replace('\u2581', ' ')
26 |         if " {0}".format(masked_token) in masked_input:
27 |             topk_filled_outputs.append((
28 |                 masked_input.replace(
29 |                     ' {0}'.format(masked_token), predicted_token
30 |                 ),
31 |                 values[index].item(),
32 |                 predicted_token,
33 |             ))
34 |         else:
35 |             topk_filled_outputs.append((
36 |                 masked_input.replace(masked_token, predicted_token),
37 |                 values[index].item(),
38 |                 predicted_token,
39 |             ))
40 |     return topk_filled_outputs
41 | 
42 | 
43 | tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
44 | model = CamembertForMaskedLM.from_pretrained('camembert-base')
45 | model.eval()
46 | 
47 | masked_input = "Le camembert est <mask> :)"
48 | print(fill_mask(masked_input, model, tokenizer, topk=3))
49 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/distillation/grouped_batch_sampler.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2019-present, the HuggingFace Inc. team and Facebook, Inc.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """ Adapted from PyTorch Vision (https://github.com/pytorch/vision/blob/master/references/detection/group_by_aspect_ratio.py)
 16 | """
 17 | import bisect
 18 | import copy
 19 | from collections import defaultdict
 20 | import numpy as np
 21 | 
 22 | from torch.utils.data.sampler import BatchSampler, Sampler
 23 | 
 24 | from utils import logger
 25 | 
 26 | def _quantize(x, bins):
 27 |     bins = copy.deepcopy(bins)
 28 |     bins = sorted(bins)
 29 |     quantized = list(map(lambda y: bisect.bisect_right(bins, y), x))
 30 |     return quantized
 31 | 
 32 | def create_lengths_groups(lengths, k=0):
 33 |     bins = np.arange(start=3, stop=k, step=4).tolist() if k > 0 else [10]
 34 |     groups = _quantize(lengths, bins)
 35 |     # count number of elements per group
 36 |     counts = np.unique(groups, return_counts=True)[1]
 37 |     fbins = [0] + bins + [np.inf]
 38 |     logger.info("Using {} as bins for aspect lengths quantization".format(fbins))
 39 |     logger.info("Count of instances per bin: {}".format(counts))
 40 |     return groups
 41 | 
 42 | class GroupedBatchSampler(BatchSampler):
 43 |     """
 44 |     Wraps another sampler to yield a mini-batch of indices.
 45 |     It enforces that the batch only contain elements from the same group.
 46 |     It also tries to provide mini-batches which follows an ordering which is
 47 |     as close as possible to the ordering from the original sampler.
 48 |     Arguments:
 49 |         sampler (Sampler): Base sampler.
 50 |         group_ids (list[int]): If the sampler produces indices in range [0, N),
 51 |             `group_ids` must be a list of `N` ints which contains the group id of each sample.
 52 |             The group ids must be a continuous set of integers starting from
 53 |             0, i.e. they must be in the range [0, num_groups).
 54 |         batch_size (int): Size of mini-batch.
 55 |     """
 56 |     def __init__(self, sampler, group_ids, batch_size):
 57 |         if not isinstance(sampler, Sampler):
 58 |             raise ValueError(
 59 |                 "sampler should be an instance of "
 60 |                 "torch.utils.data.Sampler, but got sampler={}".format(sampler)
 61 |             )
 62 |         self.sampler = sampler
 63 |         self.group_ids = group_ids
 64 |         self.batch_size = batch_size
 65 | 
 66 |     def __iter__(self):
 67 |         buffer_per_group = defaultdict(list)
 68 |         samples_per_group = defaultdict(list)
 69 | 
 70 |         num_batches = 0
 71 |         for idx in self.sampler:
 72 |             group_id = self.group_ids[idx]
 73 |             buffer_per_group[group_id].append(idx)
 74 |             samples_per_group[group_id].append(idx)
 75 |             if len(buffer_per_group[group_id]) == self.batch_size:
 76 |                 yield buffer_per_group[group_id] #TODO
 77 |                 num_batches += 1
 78 |                 del buffer_per_group[group_id]
 79 |             assert len(buffer_per_group[group_id]) < self.batch_size
 80 | 
 81 |         # now we have run out of elements that satisfy
 82 |         # the group criteria, let's return the remaining
 83 |         # elements so that the size of the sampler is
 84 |         # deterministic
 85 |         expected_num_batches = len(self)
 86 |         num_remaining = expected_num_batches - num_batches
 87 |         if num_remaining > 0:
 88 |             # for the remaining batches, group the batches by similar lengths
 89 |             batch_idx = []
 90 |             for group_id, idxs in sorted(buffer_per_group.items(), key=lambda x: x[0]):
 91 |                 batch_idx.extend(idxs)
 92 |                 if len(batch_idx) >= self.batch_size:
 93 |                     yield batch_idx[:self.batch_size]
 94 |                     batch_idx = batch_idx[self.batch_size:]
 95 |                     num_remaining -= 1
 96 |             if len(batch_idx) > 0:
 97 |                 yield batch_idx
 98 |                 num_remaining -= 1
 99 |         assert num_remaining == 0
100 | 
101 |     def __len__(self):
102 |         """
103 |         Return the number of mini-batches rather than the number of samples.
104 |         """
105 |         return (len(self.sampler) + self.batch_size - 1) // self.batch_size
106 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/distillation/requirements.txt:
--------------------------------------------------------------------------------
1 | gitpython==3.0.2
2 | tensorboard>=1.14.0
3 | tensorboardX==1.8
4 | psutil==5.6.3
5 | scipy==1.3.1
6 | transformers==2.0.0
7 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/distillation/scripts/binarized_data.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2019-present, the HuggingFace Inc. team.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """
16 | Preprocessing script before distillation.
17 | """
18 | import argparse
19 | import pickle
20 | import random
21 | import time
22 | import numpy as np
23 | from transformers import BertTokenizer, RobertaTokenizer, GPT2Tokenizer
24 | import logging
25 | 
26 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
27 |                     datefmt = '%m/%d/%Y %H:%M:%S',
28 |                     level = logging.INFO)
29 | logger = logging.getLogger(__name__)
30 | 
31 | def main():
32 |     parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).")
33 |     parser.add_argument('--file_path', type=str, default='data/dump.txt',
34 |                         help='The path to the data.')
35 |     parser.add_argument('--tokenizer_type', type=str, default='bert', choices=['bert', 'roberta', 'gpt2'])
36 |     parser.add_argument('--tokenizer_name', type=str, default='bert-base-uncased',
37 |                         help="The tokenizer to use.")
38 |     parser.add_argument('--dump_file', type=str, default='data/dump',
39 |                         help='The dump file prefix.')
40 |     args = parser.parse_args()
41 | 
42 | 
43 |     logger.info(f'Loading Tokenizer ({args.tokenizer_name})')
44 |     if args.tokenizer_type == 'bert':
45 |         tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
46 |         bos = tokenizer.special_tokens_map['cls_token'] # `[CLS]`
47 |         sep = tokenizer.special_tokens_map['sep_token'] # `[SEP]`
48 |     elif args.tokenizer_type == 'roberta':
49 |         tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
50 |         bos = tokenizer.special_tokens_map['cls_token'] # `<s>`
51 |         sep = tokenizer.special_tokens_map['sep_token'] # `</s>`
52 |     elif args.tokenizer_type == 'gpt2':
53 |         tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name)
54 |         bos = tokenizer.special_tokens_map['bos_token'] # `<|endoftext|>`
55 |         sep = tokenizer.special_tokens_map['eos_token'] # `<|endoftext|>`    
56 | 
57 |     logger.info(f'Loading text from {args.file_path}')
58 |     with open(args.file_path, 'r', encoding='utf8') as fp:
59 |         data = fp.readlines()
60 | 
61 | 
62 |     logger.info(f'Start encoding')
63 |     logger.info(f'{len(data)} examples to process.')
64 | 
65 |     rslt = []
66 |     iter = 0
67 |     interval = 10000
68 |     start = time.time()
69 |     for text in data:
70 |         text = f'{bos} {text.strip()} {sep}'
71 |         token_ids = tokenizer.encode(text, add_special_tokens=False)
72 |         rslt.append(token_ids)
73 | 
74 |         iter += 1
75 |         if iter % interval == 0:
76 |             end = time.time()
77 |             logger.info(f'{iter} examples processed. - {(end-start)/interval:.2f}s/expl')
78 |             start = time.time()
79 |     logger.info('Finished binarization')
80 |     logger.info(f'{len(data)} examples processed.')
81 | 
82 | 
83 |     dp_file = f'{args.dump_file}.{args.tokenizer_name}.pickle'
84 |     rslt_ = [np.uint16(d) for d in rslt]
85 |     random.shuffle(rslt_)
86 |     logger.info(f'Dump to {dp_file}')
87 |     with open(dp_file, 'wb') as handle:
88 |         pickle.dump(rslt_, handle, protocol=pickle.HIGHEST_PROTOCOL)
89 | 
90 | 
91 | if __name__ == "__main__":
92 |     main()
93 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/distillation/scripts/extract.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2019-present, the HuggingFace Inc. team.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """
16 | Preprocessing script before training the distilled model.
17 | Specific to RoBERTa -> DistilRoBERTa and GPT2 -> DistilGPT2.
18 | """
19 | from transformers import BertForMaskedLM, RobertaForMaskedLM, GPT2LMHeadModel
20 | import torch
21 | import argparse
22 | 
23 | if __name__ == '__main__':
24 |     parser = argparse.ArgumentParser(description="Extraction some layers of the full RobertaForMaskedLM or GPT2LMHeadModel for Transfer Learned Distillation")
25 |     parser.add_argument("--model_type", default="roberta", choices=["roberta", "gpt2"])
26 |     parser.add_argument("--model_name", default='roberta-large', type=str)
27 |     parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_roberta_048131723.pth', type=str)
28 |     parser.add_argument("--vocab_transform", action='store_true')
29 |     args = parser.parse_args()
30 | 
31 | 
32 |     if args.model_type == 'roberta':
33 |         model = RobertaForMaskedLM.from_pretrained(args.model_name)
34 |         prefix = 'roberta'
35 |     elif args.model_type == 'gpt2':
36 |         model = GPT2LMHeadModel.from_pretrained(args.model_name)
37 |         prefix = 'transformer'
38 | 
39 |     state_dict = model.state_dict()
40 |     compressed_sd = {}
41 | 
42 |     ### Embeddings ###
43 |     if args.model_type == 'gpt2':
44 |         for param_name in ['wte.weight', 'wpe.weight']:
45 |             compressed_sd[f'{prefix}.{param_name}'] = state_dict[f'{prefix}.{param_name}']
46 |     else:
47 |         for w in ['word_embeddings', 'position_embeddings', 'token_type_embeddings']:
48 |             param_name = f'{prefix}.embeddings.{w}.weight'
49 |             compressed_sd[param_name] = state_dict[param_name]
50 |         for w in ['weight', 'bias']:
51 |             param_name = f'{prefix}.embeddings.LayerNorm.{w}'
52 |             compressed_sd[param_name] = state_dict[param_name]
53 | 
54 |     ### Transformer Blocks ###
55 |     std_idx = 0
56 |     for teacher_idx in [0, 2, 4, 7, 9, 11]:
57 |         if args.model_type == 'gpt2':
58 |             for layer in ['ln_1', 'attn.c_attn', 'attn.c_proj', 'ln_2', 'mlp.c_fc', 'mlp.c_proj']:
59 |                 for w in ['weight', 'bias']:
60 |                     compressed_sd[f'{prefix}.h.{std_idx}.{layer}.{w}'] = \
61 |                         state_dict[f'{prefix}.h.{teacher_idx}.{layer}.{w}']
62 |             compressed_sd[f'{prefix}.h.{std_idx}.attn.bias'] = state_dict[f'{prefix}.h.{teacher_idx}.attn.bias']
63 |         else:
64 |             for layer in ['attention.self.query', 'attention.self.key', 'attention.self.value',
65 |                         'attention.output.dense', 'attention.output.LayerNorm',
66 |                         'intermediate.dense', 'output.dense', 'output.LayerNorm']:
67 |                 for w in ['weight', 'bias']:
68 |                     compressed_sd[f'{prefix}.encoder.layer.{std_idx}.{layer}.{w}'] = \
69 |                         state_dict[f'{prefix}.encoder.layer.{teacher_idx}.{layer}.{w}']
70 |         std_idx += 1
71 | 
72 |     ### Language Modeling Head ###s
73 |     if args.model_type == 'roberta':
74 |         for layer in ['lm_head.decoder.weight', 'lm_head.bias']:
75 |             compressed_sd[f'{layer}'] = state_dict[f'{layer}']
76 |         if args.vocab_transform:
77 |             for w in ['weight', 'bias']:
78 |                 compressed_sd[f'lm_head.dense.{w}'] = state_dict[f'lm_head.dense.{w}']
79 |                 compressed_sd[f'lm_head.layer_norm.{w}'] = state_dict[f'lm_head.layer_norm.{w}']
80 |     elif args.model_type == 'gpt2':
81 |         for w in ['weight', 'bias']:
82 |             compressed_sd[f'{prefix}.ln_f.{w}'] = state_dict[f'{prefix}.ln_f.{w}']
83 |         compressed_sd[f'lm_head.weight'] = state_dict[f'lm_head.weight']
84 | 
85 |     print(f'N layers selected for distillation: {std_idx}')
86 |     print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}')
87 | 
88 |     print(f'Save transfered checkpoint to {args.dump_checkpoint}.')
89 |     torch.save(compressed_sd, args.dump_checkpoint)
90 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/distillation/scripts/extract_distilbert.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2019-present, the HuggingFace Inc. team.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """
16 | Preprocessing script before training DistilBERT.
17 | Specific to BERT -> DistilBERT.
18 | """
19 | from transformers import BertForMaskedLM, RobertaForMaskedLM
20 | import torch
21 | import argparse
22 | 
23 | if __name__ == '__main__':
24 |     parser = argparse.ArgumentParser(description="Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation")
25 |     parser.add_argument("--model_type", default="bert", choices=["bert"])
26 |     parser.add_argument("--model_name", default='bert-base-uncased', type=str)
27 |     parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_bert-base-uncased_0247911.pth', type=str)
28 |     parser.add_argument("--vocab_transform", action='store_true')
29 |     args = parser.parse_args()
30 | 
31 | 
32 |     if args.model_type == 'bert':
33 |         model = BertForMaskedLM.from_pretrained(args.model_name)
34 |         prefix = 'bert'
35 |     else:
36 |         raise ValueError(f'args.model_type should be "bert".')
37 | 
38 |     state_dict = model.state_dict()
39 |     compressed_sd = {}
40 | 
41 |     for w in ['word_embeddings', 'position_embeddings']:
42 |         compressed_sd[f'distilbert.embeddings.{w}.weight'] = \
43 |             state_dict[f'{prefix}.embeddings.{w}.weight']
44 |     for w in ['weight', 'bias']:
45 |         compressed_sd[f'distilbert.embeddings.LayerNorm.{w}'] = \
46 |             state_dict[f'{prefix}.embeddings.LayerNorm.{w}']
47 | 
48 |     std_idx = 0
49 |     for teacher_idx in [0, 2, 4, 7, 9, 11]:
50 |         for w in ['weight', 'bias']:
51 |             compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.q_lin.{w}'] = \
52 |                 state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.query.{w}']
53 |             compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.k_lin.{w}'] = \
54 |                 state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.key.{w}']
55 |             compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.v_lin.{w}'] = \
56 |                 state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.value.{w}']
57 | 
58 |             compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.out_lin.{w}'] = \
59 |                 state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.output.dense.{w}']
60 |             compressed_sd[f'distilbert.transformer.layer.{std_idx}.sa_layer_norm.{w}'] = \
61 |                 state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.output.LayerNorm.{w}']
62 | 
63 |             compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin1.{w}'] = \
64 |                 state_dict[f'{prefix}.encoder.layer.{teacher_idx}.intermediate.dense.{w}']
65 |             compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin2.{w}'] = \
66 |                 state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.dense.{w}']
67 |             compressed_sd[f'distilbert.transformer.layer.{std_idx}.output_layer_norm.{w}'] = \
68 |                 state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.LayerNorm.{w}']
69 |         std_idx += 1
70 | 
71 |     compressed_sd[f'vocab_projector.weight'] = state_dict[f'cls.predictions.decoder.weight']
72 |     compressed_sd[f'vocab_projector.bias'] = state_dict[f'cls.predictions.bias']
73 |     if args.vocab_transform:
74 |         for w in ['weight', 'bias']:
75 |             compressed_sd[f'vocab_transform.{w}'] = state_dict[f'cls.predictions.transform.dense.{w}']
76 |             compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'cls.predictions.transform.LayerNorm.{w}']
77 | 
78 |     print(f'N layers selected for distillation: {std_idx}')
79 |     print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}')
80 | 
81 |     print(f'Save transfered checkpoint to {args.dump_checkpoint}.')
82 |     torch.save(compressed_sd, args.dump_checkpoint)
83 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/distillation/scripts/token_counts.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2019-present, the HuggingFace Inc. team.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """
16 | Preprocessing script before training the distilled model.
17 | """
18 | from collections import Counter
19 | import argparse
20 | import pickle
21 | import logging
22 | 
23 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
24 |                     datefmt = '%m/%d/%Y %H:%M:%S',
25 |                     level = logging.INFO)
26 | logger = logging.getLogger(__name__)
27 | 
28 | if __name__ == '__main__':
29 |     parser = argparse.ArgumentParser(description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)")
30 |     parser.add_argument("--data_file", type=str, default="data/dump.bert-base-uncased.pickle",
31 |                         help="The binarized dataset.")
32 |     parser.add_argument("--token_counts_dump", type=str, default="data/token_counts.bert-base-uncased.pickle",
33 |                         help="The dump file.")
34 |     parser.add_argument("--vocab_size", default=30522, type=int)
35 |     args = parser.parse_args()
36 | 
37 |     logger.info(f'Loading data from {args.data_file}')
38 |     with open(args.data_file, 'rb') as fp:
39 |         data = pickle.load(fp)
40 | 
41 |     logger.info('Counting occurences for MLM.')
42 |     counter = Counter()
43 |     for tk_ids in data:
44 |         counter.update(tk_ids)
45 |     counts = [0]*args.vocab_size
46 |     for k, v in counter.items():
47 |         counts[k] = v
48 | 
49 |     logger.info(f'Dump to {args.token_counts_dump}')
50 |     with open(args.token_counts_dump, 'wb') as handle:
51 |         pickle.dump(counts, handle, protocol=pickle.HIGHEST_PROTOCOL)
52 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/distillation/training_configs/distilbert-base-uncased.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"activation": "gelu",
 3 | 	"attention_dropout": 0.1,
 4 | 	"dim": 768,
 5 | 	"dropout": 0.1,
 6 | 	"hidden_dim": 3072,
 7 | 	"initializer_range": 0.02,
 8 | 	"max_position_embeddings": 512,
 9 | 	"n_heads": 12,
10 | 	"n_layers": 6,
11 | 	"sinusoidal_pos_embds": true,
12 | 	"tie_weights_": true,
13 | 	"vocab_size": 30522
14 |   }
15 |   


--------------------------------------------------------------------------------
/bin/transformers/examples/distillation/training_configs/distilgpt2.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"initializer_range": 0.02,
 3 | 	"layer_norm_epsilon": 0.00001,
 4 | 	"n_ctx": 1024,
 5 | 	"n_embd": 768,
 6 | 	"n_head": 12,
 7 | 	"n_layer": 6,
 8 | 	"n_positions": 1024,
 9 | 	"vocab_size": 50257
10 | }


--------------------------------------------------------------------------------
/bin/transformers/examples/distillation/utils.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2019-present, the HuggingFace Inc. team and Facebook, Inc.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """ Utils to train DistilBERT
 16 |     adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
 17 | """
 18 | import git
 19 | import json
 20 | import os
 21 | import socket
 22 | import torch
 23 | import numpy as np
 24 | 
 25 | import logging
 26 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d -  %(message)s',
 27 |                     datefmt = '%m/%d/%Y %H:%M:%S',
 28 |                     level = logging.INFO)
 29 | logger = logging.getLogger(__name__)
 30 | 
 31 | 
 32 | def git_log(folder_path: str):
 33 |     """
 34 |     Log commit info.
 35 |     """
 36 |     repo = git.Repo(search_parent_directories=True)
 37 |     repo_infos = {
 38 |         'repo_id': str(repo),
 39 |         'repo_sha': str(repo.head.object.hexsha),
 40 |         'repo_branch': str(repo.active_branch)
 41 |     }
 42 | 
 43 |     with open(os.path.join(folder_path, 'git_log.json'), 'w') as f:
 44 |         json.dump(repo_infos, f, indent=4)
 45 | 
 46 | 
 47 | def init_gpu_params(params):
 48 |     """
 49 |     Handle single and multi-GPU / multi-node.
 50 |     """
 51 |     if params.n_gpu <= 0:
 52 |         params.local_rank = 0
 53 |         params.master_port = -1
 54 |         params.is_master = True
 55 |         params.multi_gpu = False
 56 |         return
 57 | 
 58 |     assert torch.cuda.is_available()
 59 | 
 60 |     logger.info('Initializing GPUs')
 61 |     if params.n_gpu > 1:
 62 |         assert params.local_rank != -1
 63 | 
 64 |         params.world_size = int(os.environ['WORLD_SIZE'])
 65 |         params.n_gpu_per_node = int(os.environ['N_GPU_NODE'])
 66 |         params.global_rank = int(os.environ['RANK'])
 67 | 
 68 |         # number of nodes / node ID
 69 |         params.n_nodes = params.world_size // params.n_gpu_per_node
 70 |         params.node_id = params.global_rank // params.n_gpu_per_node
 71 |         params.multi_gpu = True
 72 | 
 73 |         assert params.n_nodes == int(os.environ['N_NODES'])
 74 |         assert params.node_id == int(os.environ['NODE_RANK'])
 75 | 
 76 |     # local job (single GPU)
 77 |     else:
 78 |         assert params.local_rank == -1
 79 | 
 80 |         params.n_nodes = 1
 81 |         params.node_id = 0
 82 |         params.local_rank = 0
 83 |         params.global_rank = 0
 84 |         params.world_size = 1
 85 |         params.n_gpu_per_node = 1
 86 |         params.multi_gpu = False
 87 | 
 88 |     # sanity checks
 89 |     assert params.n_nodes >= 1
 90 |     assert 0 <= params.node_id < params.n_nodes
 91 |     assert 0 <= params.local_rank <= params.global_rank < params.world_size
 92 |     assert params.world_size == params.n_nodes * params.n_gpu_per_node
 93 | 
 94 |     # define whether this is the master process / if we are in multi-node distributed mode
 95 |     params.is_master = params.node_id == 0 and params.local_rank == 0
 96 |     params.multi_node = params.n_nodes > 1
 97 | 
 98 |     # summary
 99 |     PREFIX = f"--- Global rank: {params.global_rank} - "
100 |     logger.info(PREFIX + "Number of nodes: %i" % params.n_nodes)
101 |     logger.info(PREFIX + "Node ID        : %i" % params.node_id)
102 |     logger.info(PREFIX + "Local rank     : %i" % params.local_rank)
103 |     logger.info(PREFIX + "World size     : %i" % params.world_size)
104 |     logger.info(PREFIX + "GPUs per node  : %i" % params.n_gpu_per_node)
105 |     logger.info(PREFIX + "Master         : %s" % str(params.is_master))
106 |     logger.info(PREFIX + "Multi-node     : %s" % str(params.multi_node))
107 |     logger.info(PREFIX + "Multi-GPU      : %s" % str(params.multi_gpu))
108 |     logger.info(PREFIX + "Hostname       : %s" % socket.gethostname())
109 | 
110 |     # set GPU device
111 |     torch.cuda.set_device(params.local_rank)
112 | 
113 |     # initialize multi-GPU
114 |     if params.multi_gpu:
115 |         logger.info("Initializing PyTorch distributed")
116 |         torch.distributed.init_process_group(
117 |             init_method='env://',
118 |             backend='nccl',
119 |         )
120 | 
121 | 
122 | def set_seed(args):
123 |     """
124 |     Set the random seed.
125 |     """
126 |     np.random.seed(args.seed)
127 |     torch.manual_seed(args.seed)
128 |     if args.n_gpu > 0:
129 |         torch.cuda.manual_seed_all(args.seed)
130 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/etc/stop_words_english.txt:
--------------------------------------------------------------------------------
  1 | a
  2 | about
  3 | above
  4 | after
  5 | again
  6 | against
  7 | all
  8 | am
  9 | an
 10 | and
 11 | any
 12 | are
 13 | aren't
 14 | as
 15 | at
 16 | be
 17 | because
 18 | been
 19 | before
 20 | being
 21 | below
 22 | between
 23 | both
 24 | but
 25 | by
 26 | can't
 27 | cannot
 28 | could
 29 | couldn't
 30 | did
 31 | didn't
 32 | do
 33 | does
 34 | doesn't
 35 | doing
 36 | don't
 37 | down
 38 | during
 39 | each
 40 | few
 41 | for
 42 | from
 43 | further
 44 | had
 45 | hadn't
 46 | has
 47 | hasn't
 48 | have
 49 | haven't
 50 | having
 51 | he
 52 | he'd
 53 | he'll
 54 | he's
 55 | her
 56 | here
 57 | here's
 58 | hers
 59 | herself
 60 | him
 61 | himself
 62 | his
 63 | how
 64 | how's
 65 | i
 66 | i'd
 67 | i'll
 68 | i'm
 69 | i've
 70 | if
 71 | in
 72 | into
 73 | is
 74 | isn't
 75 | it
 76 | it's
 77 | its
 78 | itself
 79 | let's
 80 | me
 81 | more
 82 | most
 83 | mustn't
 84 | my
 85 | myself
 86 | no
 87 | nor
 88 | not
 89 | of
 90 | off
 91 | on
 92 | once
 93 | only
 94 | or
 95 | other
 96 | ought
 97 | our
 98 | ours	ourselves
 99 | out
100 | over
101 | own
102 | same
103 | shan't
104 | she
105 | she'd
106 | she'll
107 | she's
108 | should
109 | shouldn't
110 | so
111 | some
112 | such
113 | than
114 | that
115 | that's
116 | the
117 | their
118 | theirs
119 | them
120 | themselves
121 | then
122 | there
123 | there's
124 | these
125 | they
126 | they'd
127 | they'll
128 | they're
129 | they've
130 | this
131 | those
132 | through
133 | to
134 | too
135 | under
136 | until
137 | up
138 | very
139 | was
140 | wasn't
141 | we
142 | we'd
143 | we'll
144 | we're
145 | we've
146 | were
147 | weren't
148 | what
149 | what's
150 | when
151 | when's
152 | where
153 | where's
154 | which
155 | while
156 | who
157 | who's
158 | whom
159 | why
160 | why's
161 | with
162 | won't
163 | would
164 | wouldn't
165 | you
166 | you'd
167 | you'll
168 | you're
169 | you've
170 | your
171 | yours
172 | yourself
173 | yourselves
174 | 
175 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorboardX
2 | tensorboard
3 | scikit-learn
4 | seqeval
5 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/run_tf_glue.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import tensorflow as tf
 3 | import tensorflow_datasets
 4 | from transformers import BertTokenizer, TFBertForSequenceClassification, BertConfig, glue_convert_examples_to_features, BertForSequenceClassification, glue_processors
 5 | 
 6 | # script parameters
 7 | BATCH_SIZE = 32
 8 | EVAL_BATCH_SIZE = BATCH_SIZE * 2
 9 | USE_XLA = False
10 | USE_AMP = False
11 | EPOCHS = 3
12 | 
13 | TASK = "mrpc"
14 | 
15 | if TASK == "sst-2":
16 |     TFDS_TASK = "sst2"
17 | elif TASK == "sts-b":
18 |     TFDS_TASK = "stsb"
19 | else: 
20 |     TFDS_TASK = TASK
21 | 
22 | num_labels = len(glue_processors[TASK]().get_labels())
23 | print(num_labels)
24 | 
25 | tf.config.optimizer.set_jit(USE_XLA)
26 | tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})
27 | 
28 | # Load tokenizer and model from pretrained model/vocabulary. Specify the number of labels to classify (2+: classification, 1: regression)
29 | config = BertConfig.from_pretrained("bert-base-cased", num_labels=num_labels)
30 | tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
31 | model = TFBertForSequenceClassification.from_pretrained('bert-base-cased', config=config)
32 | 
33 | # Load dataset via TensorFlow Datasets
34 | data, info = tensorflow_datasets.load(f'glue/{TFDS_TASK}', with_info=True)
35 | train_examples = info.splits['train'].num_examples
36 | 
37 | # MNLI expects either validation_matched or validation_mismatched
38 | valid_examples = info.splits['validation'].num_examples
39 | 
40 | # Prepare dataset for GLUE as a tf.data.Dataset instance
41 | train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, TASK)
42 | 
43 | # MNLI expects either validation_matched or validation_mismatched
44 | valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, TASK)
45 | train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1)
46 | valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)
47 | 
48 | # Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule 
49 | opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
50 | if USE_AMP:
51 |     # loss scaling is currently required when using mixed precision
52 |     opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')
53 | 
54 | 
55 | if num_labels == 1:
56 |     loss = tf.keras.losses.MeanSquaredError()
57 | else:
58 |     loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
59 | 
60 | metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
61 | model.compile(optimizer=opt, loss=loss, metrics=[metric])
62 | 
63 | # Train and evaluate using tf.keras.Model.fit()
64 | train_steps = train_examples//BATCH_SIZE
65 | valid_steps = valid_examples//EVAL_BATCH_SIZE
66 | 
67 | history = model.fit(train_dataset, epochs=EPOCHS, steps_per_epoch=train_steps,
68 |                     validation_data=valid_dataset, validation_steps=valid_steps)
69 | 
70 | # Save TF2 model
71 | os.makedirs('./save/', exist_ok=True)
72 | model.save_pretrained('./save/')
73 | 
74 | if TASK == "mrpc":
75 |     # Load the TensorFlow model in PyTorch for inspection
76 |     # This is to demo the interoperability between the two frameworks, you don't have to 
77 |     # do this in real life (you can run the inference on the TF model).
78 |     pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
79 | 
80 |     # Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
81 |     sentence_0 = 'This research was consistent with his findings.'
82 |     sentence_1 = 'His findings were compatible with this research.'
83 |     sentence_2 = 'His findings were not compatible with this research.'
84 |     inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
85 |     inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
86 | 
87 |     del inputs_1["special_tokens_mask"]
88 |     del inputs_2["special_tokens_mask"]
89 | 
90 |     pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
91 |     pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
92 |     print('sentence_1 is', 'a paraphrase' if pred_1 else 'not a paraphrase', 'of sentence_0')
93 |     print('sentence_2 is', 'a paraphrase' if pred_2 else 'not a paraphrase', 'of sentence_0')
94 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/test_examples.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 HuggingFace Inc..
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | from __future__ import absolute_import
 16 | from __future__ import division
 17 | from __future__ import print_function
 18 | 
 19 | import sys
 20 | import unittest
 21 | import argparse
 22 | import logging
 23 | 
 24 | try:
 25 |     # python 3.4+ can use builtin unittest.mock instead of mock package
 26 |     from unittest.mock import patch
 27 | except ImportError:
 28 |     from mock import patch
 29 | 
 30 | import run_glue_multitask
 31 | import run_squad
 32 | import run_generation
 33 | 
 34 | logging.basicConfig(level=logging.DEBUG)
 35 | 
 36 | logger = logging.getLogger()
 37 | 
 38 | def get_setup_file():
 39 |     parser = argparse.ArgumentParser()
 40 |     parser.add_argument('-f')
 41 |     args = parser.parse_args()
 42 |     return args.f
 43 | 
 44 | class ExamplesTests(unittest.TestCase):
 45 | 
 46 |     def test_run_glue(self):
 47 |         stream_handler = logging.StreamHandler(sys.stdout)
 48 |         logger.addHandler(stream_handler)
 49 | 
 50 |         testargs = ["run_glue_multitask.py",
 51 |                     "--data_dir=./examples/tests_samples/MRPC/",
 52 |                     "--task_name=mrpc",
 53 |                     "--do_train",
 54 |                     "--do_eval",
 55 |                     "--output_dir=./examples/tests_samples/temp_dir",
 56 |                     "--per_gpu_train_batch_size=2",
 57 |                     "--per_gpu_eval_batch_size=1",
 58 |                     "--learning_rate=1e-4",
 59 |                     "--max_steps=10",
 60 |                     "--warmup_steps=2",
 61 |                     "--overwrite_output_dir",
 62 |                     "--seed=42"]
 63 |         model_type, model_name = ("--model_type=bert",
 64 |                                   "--model_name_or_path=bert-base-uncased")
 65 |         with patch.object(sys, 'argv', testargs + [model_type, model_name]):
 66 |             result = run_glue_multitask.main()
 67 |             for value in result.values():
 68 |                 self.assertGreaterEqual(value, 0.75)
 69 | 
 70 |     def test_run_squad(self):
 71 |         stream_handler = logging.StreamHandler(sys.stdout)
 72 |         logger.addHandler(stream_handler)
 73 | 
 74 |         testargs = ["run_squad.py",
 75 |                     "--train_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
 76 |                     "--predict_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
 77 |                     "--model_name=bert-base-uncased",
 78 |                     "--output_dir=./examples/tests_samples/temp_dir",
 79 |                     "--max_steps=10",
 80 |                     "--warmup_steps=2",
 81 |                     "--do_train",
 82 |                     "--do_eval",
 83 |                     "--version_2_with_negative",
 84 |                     "--learning_rate=2e-4",
 85 |                     "--per_gpu_train_batch_size=2",
 86 |                     "--per_gpu_eval_batch_size=1",
 87 |                     "--overwrite_output_dir",
 88 |                     "--seed=42"]
 89 |         model_type, model_name = ("--model_type=bert",
 90 |                                   "--model_name_or_path=bert-base-uncased")
 91 |         with patch.object(sys, 'argv', testargs + [model_type, model_name]):
 92 |             result = run_squad.main()
 93 |             self.assertGreaterEqual(result['f1'], 30)
 94 |             self.assertGreaterEqual(result['exact'], 30)
 95 | 
 96 |     def test_generation(self):
 97 |         stream_handler = logging.StreamHandler(sys.stdout)
 98 |         logger.addHandler(stream_handler)
 99 | 
100 |         testargs = ["run_generation.py",
101 |                     "--prompt=Hello",
102 |                     "--length=10",
103 |                     "--seed=42"]
104 |         model_type, model_name = ("--model_type=openai-gpt",
105 |                                   "--model_name_or_path=openai-gpt")
106 |         with patch.object(sys, 'argv', testargs + [model_type, model_name]):
107 |             result = run_generation.main()
108 |             self.assertGreaterEqual(len(result), 10)
109 | 
110 | if __name__ == "__main__":
111 |     unittest.main()
112 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/tests_samples/.gitignore:
--------------------------------------------------------------------------------
1 | *.*
2 | cache*
3 | temp*
4 | !*.tsv
5 | !*.json
6 | !.gitignore


--------------------------------------------------------------------------------
/bin/transformers/examples/tests_samples/MRPC/dev.tsv:
--------------------------------------------------------------------------------
1 | ﻿Quality	#1 ID	#2 ID	#1 String	#2 String
2 | 1	1355540	1355592	He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .	" The foodservice pie business does not fit our long-term growth strategy .
3 | 0	2029631	2029565	Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .	His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war .
4 | 0	487993	487952	The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .	The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
5 | 1	1989515	1989458	The AFL-CIO is waiting until October to decide if it will endorse a candidate .	The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
6 | 0	1783137	1782659	No dates have been set for the civil or the criminal trial .	No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty .
7 | 1	3039165	3039036	Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .	It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
8 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/tests_samples/MRPC/train.tsv:
--------------------------------------------------------------------------------
1 | ﻿Quality	#1 ID	#2 ID	#1 String	#2 String
2 | 1	1355540	1355592	He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .	" The foodservice pie business does not fit our long-term growth strategy .
3 | 0	2029631	2029565	Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .	His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war .
4 | 0	487993	487952	The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .	The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
5 | 1	1989515	1989458	The AFL-CIO is waiting until October to decide if it will endorse a candidate .	The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
6 | 0	1783137	1782659	No dates have been set for the civil or the criminal trial .	No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty .
7 | 1	3039165	3039036	Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .	It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
8 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/configuration_auto.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/configuration_auto.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/configuration_bert.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/configuration_bert.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/configuration_camembert.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/configuration_camembert.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/configuration_ctrl.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/configuration_ctrl.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/configuration_distilbert.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/configuration_distilbert.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/configuration_gpt2.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/configuration_gpt2.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/configuration_openai.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/configuration_openai.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/configuration_roberta.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/configuration_roberta.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/configuration_transfo_xl.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/configuration_transfo_xl.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/configuration_utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/configuration_utils.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/configuration_xlm.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/configuration_xlm.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/configuration_xlnet.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/configuration_xlnet.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/file_utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/file_utils.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/modeling_auto.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/modeling_auto.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/modeling_bert.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/modeling_bert.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/modeling_camembert.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/modeling_camembert.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/modeling_ctrl.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/modeling_ctrl.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/modeling_distilbert.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/modeling_distilbert.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/modeling_encoder_decoder.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/modeling_encoder_decoder.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/modeling_gpt2.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/modeling_gpt2.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/modeling_openai.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/modeling_openai.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/modeling_roberta.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/modeling_roberta.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/modeling_tf_pytorch_utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/modeling_tf_pytorch_utils.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/modeling_transfo_xl.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/modeling_transfo_xl.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/modeling_transfo_xl_utilities.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/modeling_transfo_xl_utilities.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/modeling_utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/modeling_utils.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/modeling_xlm.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/modeling_xlm.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/modeling_xlnet.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/modeling_xlnet.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/optimization.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/optimization.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/tokenization_auto.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/tokenization_auto.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/tokenization_bert.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/tokenization_bert.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/tokenization_camembert.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/tokenization_camembert.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/tokenization_ctrl.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/tokenization_ctrl.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/tokenization_distilbert.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/tokenization_distilbert.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/tokenization_gpt2.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/tokenization_gpt2.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/tokenization_openai.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/tokenization_openai.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/tokenization_roberta.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/tokenization_roberta.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/tokenization_transfo_xl.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/tokenization_transfo_xl.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/tokenization_utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/tokenization_utils.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/tokenization_xlm.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/tokenization_xlm.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/__pycache__/tokenization_xlnet.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/__pycache__/tokenization_xlnet.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/configuration_camembert.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ CamemBERT configuration """
17 | 
18 | from __future__ import (absolute_import, division, print_function,
19 |                         unicode_literals)
20 | 
21 | import logging
22 | 
23 | from .configuration_roberta import RobertaConfig
24 | 
25 | logger = logging.getLogger(__name__)
26 | 
27 | CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
28 |     'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json",
29 | }
30 | 
31 | 
32 | class CamembertConfig(RobertaConfig):
33 |     pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
34 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/configuration_distilbert.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """ DistilBERT model configuration """
16 | from __future__ import (absolute_import, division, print_function,
17 |                         unicode_literals)
18 | 
19 | import sys
20 | import json
21 | import logging
22 | from io import open
23 | 
24 | from .configuration_utils import PretrainedConfig
25 | 
26 | logger = logging.getLogger(__name__)
27 | 
28 | DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
29 |     'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json",
30 |     'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json"
31 | }
32 | 
33 | 
34 | class DistilBertConfig(PretrainedConfig):
35 |     pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
36 | 
37 |     def __init__(self,
38 |                  vocab_size_or_config_json_file=30522,
39 |                  max_position_embeddings=512,
40 |                  sinusoidal_pos_embds=False,
41 |                  n_layers=6,
42 |                  n_heads=12,
43 |                  dim=768,
44 |                  hidden_dim=4*768,
45 |                  dropout=0.1,
46 |                  attention_dropout=0.1,
47 |                  activation='gelu',
48 |                  initializer_range=0.02,
49 |                  tie_weights_=True,
50 |                  qa_dropout=0.1,
51 |                  seq_classif_dropout=0.2,
52 |                  **kwargs):
53 |         super(DistilBertConfig, self).__init__(**kwargs)
54 | 
55 |         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
56 |                         and isinstance(vocab_size_or_config_json_file, unicode)):
57 |             with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
58 |                 json_config = json.loads(reader.read())
59 |             for key, value in json_config.items():
60 |                 self.__dict__[key] = value
61 |         elif isinstance(vocab_size_or_config_json_file, int):
62 |             self.vocab_size = vocab_size_or_config_json_file
63 |             self.max_position_embeddings = max_position_embeddings
64 |             self.sinusoidal_pos_embds = sinusoidal_pos_embds
65 |             self.n_layers = n_layers
66 |             self.n_heads = n_heads
67 |             self.dim = dim
68 |             self.hidden_dim = hidden_dim
69 |             self.dropout = dropout
70 |             self.attention_dropout = attention_dropout
71 |             self.activation = activation
72 |             self.initializer_range = initializer_range
73 |             self.tie_weights_ = tie_weights_
74 |             self.qa_dropout = qa_dropout
75 |             self.seq_classif_dropout = seq_classif_dropout
76 |         else:
77 |             raise ValueError("First argument must be either a vocabulary size (int)"
78 |                              " or the path to a pretrained model config file (str)")
79 |     @property
80 |     def hidden_size(self):
81 |         return self.dim
82 | 
83 |     @property
84 |     def num_attention_heads(self):
85 |         return self.n_heads
86 | 
87 |     @property
88 |     def num_hidden_layers(self):
89 |         return self.n_layers
90 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/configuration_roberta.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ RoBERTa configuration """
17 | 
18 | from __future__ import (absolute_import, division, print_function,
19 |                         unicode_literals)
20 | 
21 | import logging
22 | 
23 | from .configuration_bert import BertConfig
24 | 
25 | logger = logging.getLogger(__name__)
26 | 
27 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
28 |     'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json",
29 |     'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json",
30 |     'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json",
31 |     'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json",
32 |     'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-config.json",
33 |     'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-config.json",
34 | }
35 | 
36 | 
37 | class RobertaConfig(BertConfig):
38 |     pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
39 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The HuggingFace Inc. team.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert BERT checkpoint."""
16 | 
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 | 
21 | import argparse
22 | import torch
23 | 
24 | from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert
25 | 
26 | import logging
27 | logging.basicConfig(level=logging.INFO)
28 | 
29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
30 |     # Initialise PyTorch model
31 |     config = BertConfig.from_json_file(bert_config_file)
32 |     print("Building PyTorch model from configuration: {}".format(str(config)))
33 |     model = BertForPreTraining(config)
34 | 
35 |     # Load weights from tf checkpoint
36 |     load_tf_weights_in_bert(model, config, tf_checkpoint_path)
37 | 
38 |     # Save pytorch-model
39 |     print("Save PyTorch model to {}".format(pytorch_dump_path))
40 |     torch.save(model.state_dict(), pytorch_dump_path)
41 | 
42 | 
43 | if __name__ == "__main__":
44 |     parser = argparse.ArgumentParser()
45 |     ## Required parameters
46 |     parser.add_argument("--tf_checkpoint_path",
47 |                         default = None,
48 |                         type = str,
49 |                         required = True,
50 |                         help = "Path to the TensorFlow checkpoint path.")
51 |     parser.add_argument("--bert_config_file",
52 |                         default = None,
53 |                         type = str,
54 |                         required = True,
55 |                         help = "The config json file corresponding to the pre-trained BERT model. \n"
56 |                             "This specifies the model architecture.")
57 |     parser.add_argument("--pytorch_dump_path",
58 |                         default = None,
59 |                         type = str,
60 |                         required = True,
61 |                         help = "Path to the output PyTorch model.")
62 |     args = parser.parse_args()
63 |     convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
64 |                                      args.bert_config_file,
65 |                                      args.pytorch_dump_path)
66 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The HuggingFace Inc. team.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | """Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint."""
 17 | 
 18 | import os
 19 | import argparse
 20 | import torch
 21 | import numpy as np
 22 | import tensorflow as tf
 23 | from transformers import BertModel
 24 | 
 25 | 
 26 | def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str):
 27 | 
 28 |     """
 29 |     :param model:BertModel Pytorch model instance to be converted
 30 |     :param ckpt_dir: Tensorflow model directory
 31 |     :param model_name: model name
 32 |     :return:
 33 | 
 34 |     Currently supported HF models:
 35 |         Y BertModel
 36 |         N BertForMaskedLM
 37 |         N BertForPreTraining
 38 |         N BertForMultipleChoice
 39 |         N BertForNextSentencePrediction
 40 |         N BertForSequenceClassification
 41 |         N BertForQuestionAnswering
 42 |     """
 43 | 
 44 |     tensors_to_transpose = (
 45 |         "dense.weight",
 46 |         "attention.self.query",
 47 |         "attention.self.key",
 48 |         "attention.self.value"
 49 |     )
 50 | 
 51 |     var_map = (
 52 |         ('layer.', 'layer_'),
 53 |         ('word_embeddings.weight', 'word_embeddings'),
 54 |         ('position_embeddings.weight', 'position_embeddings'),
 55 |         ('token_type_embeddings.weight', 'token_type_embeddings'),
 56 |         ('.', '/'),
 57 |         ('LayerNorm/weight', 'LayerNorm/gamma'),
 58 |         ('LayerNorm/bias', 'LayerNorm/beta'),
 59 |         ('weight', 'kernel')
 60 |     )
 61 | 
 62 |     if not os.path.isdir(ckpt_dir):
 63 |         os.makedirs(ckpt_dir)
 64 | 
 65 |     state_dict = model.state_dict()
 66 | 
 67 |     def to_tf_var_name(name:str):
 68 |         for patt, repl in iter(var_map):
 69 |             name = name.replace(patt, repl)
 70 |         return 'bert/{}'.format(name)
 71 | 
 72 |     def create_tf_var(tensor:np.ndarray, name:str, session:tf.Session):
 73 |         tf_dtype = tf.dtypes.as_dtype(tensor.dtype)
 74 |         tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer())
 75 |         session.run(tf.variables_initializer([tf_var]))
 76 |         session.run(tf_var)
 77 |         return tf_var
 78 | 
 79 |     tf.reset_default_graph()
 80 |     with tf.Session() as session:
 81 |         for var_name in state_dict:
 82 |             tf_name = to_tf_var_name(var_name)
 83 |             torch_tensor = state_dict[var_name].numpy()
 84 |             if any([x in var_name for x in tensors_to_transpose]):
 85 |                 torch_tensor = torch_tensor.T
 86 |             tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session)
 87 |             tf.keras.backend.set_value(tf_var, torch_tensor)
 88 |             tf_weight = session.run(tf_var)
 89 |             print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor)))
 90 | 
 91 |         saver = tf.train.Saver(tf.trainable_variables())
 92 |         saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt"))
 93 | 
 94 | 
 95 | def main(raw_args=None):
 96 |     parser = argparse.ArgumentParser()
 97 |     parser.add_argument("--model_name",
 98 |                         type=str,
 99 |                         required=True,
100 |                         help="model name e.g. bert-base-uncased")
101 |     parser.add_argument("--cache_dir",
102 |                         type=str,
103 |                         default=None,
104 |                         required=False,
105 |                         help="Directory containing pytorch model")
106 |     parser.add_argument("--pytorch_model_path",
107 |                         type=str,
108 |                         required=True,
109 |                         help="/path/to/<pytorch-model-name>.bin")
110 |     parser.add_argument("--tf_cache_dir",
111 |                         type=str,
112 |                         required=True,
113 |                         help="Directory in which to save tensorflow model")
114 |     args = parser.parse_args(raw_args)
115 |     
116 |     model = BertModel.from_pretrained(
117 |         pretrained_model_name_or_path=args.model_name,
118 |         state_dict=torch.load(args.pytorch_model_path),
119 |         cache_dir=args.cache_dir
120 |     )
121 |     
122 |     convert_pytorch_checkpoint_to_tf(
123 |         model=model,
124 |         ckpt_dir=args.tf_cache_dir,
125 |         model_name=args.model_name
126 |     )
127 | 
128 | 
129 | if __name__ == "__main__":
130 |     main()
131 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The HuggingFace Inc. team.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert OpenAI GPT checkpoint."""
16 | 
17 | from __future__ import absolute_import, division, print_function
18 | 
19 | import argparse
20 | from io import open
21 | 
22 | import torch
23 | 
24 | from transformers import (CONFIG_NAME, WEIGHTS_NAME,
25 |                                                      GPT2Config,
26 |                                                      GPT2Model,
27 |                                                      load_tf_weights_in_gpt2)
28 | 
29 | import logging
30 | logging.basicConfig(level=logging.INFO)
31 | 
32 | 
33 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path):
34 |     # Construct model
35 |     if gpt2_config_file == "":
36 |         config = GPT2Config()
37 |     else:
38 |         config = GPT2Config.from_json_file(gpt2_config_file)
39 |     model = GPT2Model(config)
40 | 
41 |     # Load weights from numpy
42 |     load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path)
43 | 
44 |     # Save pytorch-model
45 |     pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
46 |     pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
47 |     print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
48 |     torch.save(model.state_dict(), pytorch_weights_dump_path)
49 |     print("Save configuration file to {}".format(pytorch_config_dump_path))
50 |     with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
51 |         f.write(config.to_json_string())
52 | 
53 | 
54 | if __name__ == "__main__":
55 |     parser = argparse.ArgumentParser()
56 |     ## Required parameters
57 |     parser.add_argument("--gpt2_checkpoint_path",
58 |                         default = None,
59 |                         type = str,
60 |                         required = True,
61 |                         help = "Path to the TensorFlow checkpoint path.")
62 |     parser.add_argument("--pytorch_dump_folder_path",
63 |                         default = None,
64 |                         type = str,
65 |                         required = True,
66 |                         help = "Path to the output PyTorch model.")
67 |     parser.add_argument("--gpt2_config_file",
68 |                         default = "",
69 |                         type = str,
70 |                         help = "An optional config json file corresponding to the pre-trained OpenAI model. \n"
71 |                             "This specifies the model architecture.")
72 |     args = parser.parse_args()
73 |     convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path,
74 |                                          args.gpt2_config_file,
75 |                                          args.pytorch_dump_folder_path)
76 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The HuggingFace Inc. team.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert OpenAI GPT checkpoint."""
16 | 
17 | from __future__ import absolute_import, division, print_function
18 | 
19 | import argparse
20 | from io import open
21 | 
22 | import torch
23 | 
24 | from transformers import (CONFIG_NAME, WEIGHTS_NAME,
25 |                                                      OpenAIGPTConfig,
26 |                                                      OpenAIGPTModel,
27 |                                                      load_tf_weights_in_openai_gpt)
28 | 
29 | import logging
30 | logging.basicConfig(level=logging.INFO)
31 | 
32 | 
33 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path):
34 |     # Construct model
35 |     if openai_config_file == "":
36 |         config = OpenAIGPTConfig()
37 |     else:
38 |         config = OpenAIGPTConfig.from_json_file(openai_config_file)
39 |     model = OpenAIGPTModel(config)
40 | 
41 |     # Load weights from numpy
42 |     load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path)
43 | 
44 |     # Save pytorch-model
45 |     pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
46 |     pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
47 |     print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
48 |     torch.save(model.state_dict(), pytorch_weights_dump_path)
49 |     print("Save configuration file to {}".format(pytorch_config_dump_path))
50 |     with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
51 |         f.write(config.to_json_string())
52 | 
53 | 
54 | if __name__ == "__main__":
55 |     parser = argparse.ArgumentParser()
56 |     ## Required parameters
57 |     parser.add_argument("--openai_checkpoint_folder_path",
58 |                         default = None,
59 |                         type = str,
60 |                         required = True,
61 |                         help = "Path to the TensorFlow checkpoint path.")
62 |     parser.add_argument("--pytorch_dump_folder_path",
63 |                         default = None,
64 |                         type = str,
65 |                         required = True,
66 |                         help = "Path to the output PyTorch model.")
67 |     parser.add_argument("--openai_config_file",
68 |                         default = "",
69 |                         type = str,
70 |                         help = "An optional config json file corresponding to the pre-trained OpenAI model. \n"
71 |                             "This specifies the model architecture.")
72 |     args = parser.parse_args()
73 |     convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path,
74 |                                          args.openai_config_file,
75 |                                          args.pytorch_dump_folder_path)
76 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The HuggingFace Inc. team.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert OpenAI GPT checkpoint."""
16 | 
17 | from __future__ import absolute_import, division, print_function
18 | 
19 | import argparse
20 | import json
21 | from io import open
22 | 
23 | import torch
24 | import numpy
25 | 
26 | from transformers import CONFIG_NAME, WEIGHTS_NAME
27 | from transformers.tokenization_xlm import VOCAB_FILES_NAMES
28 | 
29 | import logging
30 | logging.basicConfig(level=logging.INFO)
31 | 
32 | def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path):
33 |     # Load checkpoint
34 |     chkpt = torch.load(xlm_checkpoint_path, map_location='cpu')
35 | 
36 |     state_dict = chkpt['model']
37 | 
38 |     # We have the base model one level deeper than the original XLM repository
39 |     two_levels_state_dict = {}
40 |     for k, v in state_dict.items():
41 |         if 'pred_layer' in k:
42 |             two_levels_state_dict[k] = v
43 |         else:
44 |             two_levels_state_dict['transformer.' + k] = v
45 | 
46 |     config = chkpt['params']
47 |     config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray)))
48 | 
49 |     vocab = chkpt['dico_word2id']
50 |     vocab = dict((s + '</w>' if s.find('@@') == -1 and i > 13 else s.replace('@@', ''), i) for s, i in vocab.items())
51 | 
52 |     # Save pytorch-model
53 |     pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
54 |     pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
55 |     pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' +  VOCAB_FILES_NAMES['vocab_file']
56 | 
57 |     print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
58 |     torch.save(two_levels_state_dict, pytorch_weights_dump_path)
59 | 
60 |     print("Save configuration file to {}".format(pytorch_config_dump_path))
61 |     with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
62 |         f.write(json.dumps(config, indent=2) + "\n")
63 | 
64 |     print("Save vocab file to {}".format(pytorch_config_dump_path))
65 |     with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f:
66 |         f.write(json.dumps(vocab, indent=2) + "\n")
67 | 
68 | 
69 | if __name__ == "__main__":
70 |     parser = argparse.ArgumentParser()
71 |     ## Required parameters
72 |     parser.add_argument("--xlm_checkpoint_path",
73 |                         default = None,
74 |                         type = str,
75 |                         required = True,
76 |                         help = "Path the official PyTorch dump.")
77 |     parser.add_argument("--pytorch_dump_folder_path",
78 |                         default = None,
79 |                         type = str,
80 |                         required = True,
81 |                         help = "Path to the output PyTorch model.")
82 |     args = parser.parse_args()
83 |     convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path)
84 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The HuggingFace Inc. team.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """Convert BERT checkpoint."""
 16 | 
 17 | from __future__ import absolute_import
 18 | from __future__ import division
 19 | from __future__ import print_function
 20 | 
 21 | import os
 22 | import argparse
 23 | import torch
 24 | 
 25 | from transformers import (CONFIG_NAME, WEIGHTS_NAME,
 26 |                                                     XLNetConfig,
 27 |                                                     XLNetLMHeadModel, XLNetForQuestionAnswering,
 28 |                                                     XLNetForSequenceClassification,
 29 |                                                     load_tf_weights_in_xlnet)
 30 | 
 31 | GLUE_TASKS_NUM_LABELS = {
 32 |     "cola": 2,
 33 |     "mnli": 3,
 34 |     "mrpc": 2,
 35 |     "sst-2": 2,
 36 |     "sts-b": 1,
 37 |     "qqp": 2,
 38 |     "qnli": 2,
 39 |     "rte": 2,
 40 |     "wnli": 2,
 41 | }
 42 | 
 43 | import logging
 44 | logging.basicConfig(level=logging.INFO)
 45 | 
 46 | def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None):
 47 |     # Initialise PyTorch model
 48 |     config = XLNetConfig.from_json_file(bert_config_file)
 49 | 
 50 |     finetuning_task = finetuning_task.lower() if finetuning_task is not None else ""
 51 |     if finetuning_task in GLUE_TASKS_NUM_LABELS:
 52 |         print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config)))
 53 |         config.finetuning_task = finetuning_task
 54 |         config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task]
 55 |         model = XLNetForSequenceClassification(config)
 56 |     elif 'squad' in finetuning_task:
 57 |         config.finetuning_task = finetuning_task
 58 |         model = XLNetForQuestionAnswering(config)
 59 |     else:
 60 |         model = XLNetLMHeadModel(config)
 61 | 
 62 |     # Load weights from tf checkpoint
 63 |     load_tf_weights_in_xlnet(model, config, tf_checkpoint_path)
 64 | 
 65 |     # Save pytorch-model
 66 |     pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
 67 |     pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
 68 |     print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path)))
 69 |     torch.save(model.state_dict(), pytorch_weights_dump_path)
 70 |     print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path)))
 71 |     with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
 72 |         f.write(config.to_json_string())
 73 | 
 74 | 
 75 | if __name__ == "__main__":
 76 |     parser = argparse.ArgumentParser()
 77 |     ## Required parameters
 78 |     parser.add_argument("--tf_checkpoint_path",
 79 |                         default = None,
 80 |                         type = str,
 81 |                         required = True,
 82 |                         help = "Path to the TensorFlow checkpoint path.")
 83 |     parser.add_argument("--xlnet_config_file",
 84 |                         default = None,
 85 |                         type = str,
 86 |                         required = True,
 87 |                         help = "The config json file corresponding to the pre-trained XLNet model. \n"
 88 |                                "This specifies the model architecture.")
 89 |     parser.add_argument("--pytorch_dump_folder_path",
 90 |                         default = None,
 91 |                         type = str,
 92 |                         required = True,
 93 |                         help = "Path to the folder to store the PyTorch model or dataset/vocab.")
 94 |     parser.add_argument("--finetuning_task",
 95 |                         default = None,
 96 |                         type = str,
 97 |                         help = "Name of a task on which the XLNet TensorFloaw model was fine-tuned")
 98 |     args = parser.parse_args()
 99 |     print(args)
100 | 
101 |     convert_xlnet_checkpoint_to_pytorch(args.tf_checkpoint_path,
102 |                                         args.xlnet_config_file,
103 |                                         args.pytorch_dump_folder_path,
104 |                                         args.finetuning_task)
105 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/data/__init__.py:
--------------------------------------------------------------------------------
1 | from .processors import InputExample, InputFeatures, DataProcessor
2 | from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features, glue_convert_examples_to_features_multiclass
3 | from .processors import tokenize
4 | 
5 | from .metrics import is_sklearn_available
6 | if is_sklearn_available():
7 |     from .metrics import glue_compute_metrics
8 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/data/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/data/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/data/metrics/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/data/metrics/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/data/processors/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import InputExample, InputFeatures, DataProcessor
2 | from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features, glue_convert_examples_to_features_multiclass
3 | #from aidrtokenize import aidrtokenize
4 | from .aidrtokenize import tokenize
5 | 
6 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/data/processors/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/data/processors/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/data/processors/__pycache__/aidrtokenize.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/data/processors/__pycache__/aidrtokenize.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/data/processors/__pycache__/glue.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/data/processors/__pycache__/glue.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/data/processors/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/data/processors/__pycache__/utils.cpython-36.pyc


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/tests/__init__.py


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/tests/configuration_common_test.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2019 HuggingFace Inc.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 | 
19 | import copy
20 | import os
21 | import shutil
22 | import json
23 | import random
24 | import uuid
25 | 
26 | import unittest
27 | import logging
28 | 
29 | 
30 | class ConfigTester(object):
31 |     def __init__(self, parent, config_class=None, **kwargs):
32 |         self.parent = parent
33 |         self.config_class = config_class
34 |         self.inputs_dict = kwargs
35 | 
36 |     def create_and_test_config_common_properties(self):
37 |         config = self.config_class(**self.inputs_dict)
38 |         self.parent.assertTrue(hasattr(config, 'vocab_size'))
39 |         self.parent.assertTrue(hasattr(config, 'hidden_size'))
40 |         self.parent.assertTrue(hasattr(config, 'num_attention_heads'))
41 |         self.parent.assertTrue(hasattr(config, 'num_hidden_layers'))
42 | 
43 |     def create_and_test_config_to_json_string(self):
44 |         config = self.config_class(**self.inputs_dict)
45 |         obj = json.loads(config.to_json_string())
46 |         for key, value in self.inputs_dict.items():
47 |             self.parent.assertEqual(obj[key], value)
48 | 
49 |     def create_and_test_config_to_json_file(self):
50 |         config_first = self.config_class(**self.inputs_dict)
51 |         json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json")
52 |         config_first.to_json_file(json_file_path)
53 |         config_second = self.config_class.from_json_file(json_file_path)
54 |         os.remove(json_file_path)
55 |         self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
56 | 
57 |     def run_common_tests(self):
58 |         self.create_and_test_config_common_properties()
59 |         self.create_and_test_config_to_json_string()
60 |         self.create_and_test_config_to_json_file()
61 | 
62 | if __name__ == "__main__":
63 |     unittest.main()


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | # content of conftest.py
 2 | 
 3 | import pytest
 4 | 
 5 | 
 6 | def pytest_addoption(parser):
 7 |     parser.addoption(
 8 |         "--runslow", action="store_true", default=False, help="run slow tests"
 9 |     )
10 |     parser.addoption(
11 |         "--use_cuda", action="store_true", default=False, help="run tests on gpu"
12 |     )
13 | 
14 | 
15 | def pytest_configure(config):
16 |     config.addinivalue_line("markers", "slow: mark test as slow to run")
17 | 
18 | 
19 | def pytest_collection_modifyitems(config, items):
20 |     if config.getoption("--runslow"):
21 |         # --runslow given in cli: do not skip slow tests
22 |         return
23 |     skip_slow = pytest.mark.skip(reason="need --runslow option to run")
24 |     for item in items:
25 |         if "slow" in item.keywords:
26 |             item.add_marker(skip_slow)
27 | 
28 | @pytest.fixture
29 | def use_cuda(request):
30 |     """ Run test on gpu """
31 |     return request.config.getoption("--use_cuda")
32 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/tests/fixtures/input.txt:
--------------------------------------------------------------------------------
1 | Who was Jim Henson ? ||| Jim Henson was a puppeteer
2 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/tests/fixtures/sample_text.txt:
--------------------------------------------------------------------------------
 1 | This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত
 2 | Text should be one-sentence-per-line, with empty lines between documents.
 3 | This sample text is public domain and was randomly selected from Project Guttenberg.
 4 | 
 5 | The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors.
 6 | Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity.
 7 | Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them.
 8 | "Cass" Beard had risen early that morning, but not with a view to discovery.
 9 | A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets.
10 | The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency.
11 | This was nearly opposite.
12 | Mr. Cassius crossed the highway, and stopped suddenly.
13 | Something glittered in the nearest red pool before him.
14 | Gold, surely!
15 | But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring.
16 | Looking at it more attentively, he saw that it bore the inscription, "May to Cass."
17 | Like most of his fellow gold-seekers, Cass was superstitious.
18 | 
19 | The fountain of classic wisdom, Hypatia herself.
20 | As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge.
21 | From my youth I felt in me a soul above the matter-entangled herd.
22 | She revealed to me the glorious fact, that I am a spark of Divinity itself.
23 | A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's.
24 | There is a philosophic pleasure in opening one's treasures to the modest young.
25 | Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street.
26 | Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide;
27 | but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind.
28 | Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now.
29 | His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert;
30 | while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts.
31 | At last they reached the quay at the opposite end of the street;
32 | and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers.
33 | He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him.
34 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/tests/fixtures/test_sentencepiece.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firojalam/crisis_datasets_benchmarks/2dc70b4010f6bab844a24c8c48ff3d12bc27b09c/bin/transformers/examples/transformers/tests/fixtures/test_sentencepiece.model


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/tests/modeling_auto_test.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Google AI Language Team Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 | 
19 | import unittest
20 | import shutil
21 | import pytest
22 | import logging
23 | 
24 | from transformers import is_torch_available
25 | 
26 | if is_torch_available():
27 |     from transformers import (AutoConfig, BertConfig,
28 |                                     AutoModel, BertModel,
29 |                                     AutoModelWithLMHead, BertForMaskedLM,
30 |                                     AutoModelForSequenceClassification, BertForSequenceClassification,
31 |                                     AutoModelForQuestionAnswering, BertForQuestionAnswering)
32 |     from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
33 | 
34 |     from .modeling_common_test import (CommonTestCases, ids_tensor)
35 |     from .configuration_common_test import ConfigTester
36 | else:
37 |     pytestmark = pytest.mark.skip("Require Torch")
38 | 
39 | 
40 | class AutoModelTest(unittest.TestCase):
41 |     @pytest.mark.slow
42 |     def test_model_from_pretrained(self):
43 |         logging.basicConfig(level=logging.INFO)
44 |         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
45 |             config = AutoConfig.from_pretrained(model_name)
46 |             self.assertIsNotNone(config)
47 |             self.assertIsInstance(config, BertConfig)
48 | 
49 |             model = AutoModel.from_pretrained(model_name)
50 |             model, loading_info = AutoModel.from_pretrained(model_name, output_loading_info=True)
51 |             self.assertIsNotNone(model)
52 |             self.assertIsInstance(model, BertModel)
53 |             for value in loading_info.values():
54 |                 self.assertEqual(len(value), 0)
55 | 
56 |     @pytest.mark.slow
57 |     def test_lmhead_model_from_pretrained(self):
58 |         logging.basicConfig(level=logging.INFO)
59 |         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
60 |             config = AutoConfig.from_pretrained(model_name)
61 |             self.assertIsNotNone(config)
62 |             self.assertIsInstance(config, BertConfig)
63 | 
64 |             model = AutoModelWithLMHead.from_pretrained(model_name)
65 |             model, loading_info = AutoModelWithLMHead.from_pretrained(model_name, output_loading_info=True)
66 |             self.assertIsNotNone(model)
67 |             self.assertIsInstance(model, BertForMaskedLM)
68 | 
69 |     @pytest.mark.slow
70 |     def test_sequence_classification_model_from_pretrained(self):
71 |         logging.basicConfig(level=logging.INFO)
72 |         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
73 |             config = AutoConfig.from_pretrained(model_name)
74 |             self.assertIsNotNone(config)
75 |             self.assertIsInstance(config, BertConfig)
76 | 
77 |             model = AutoModelForSequenceClassification.from_pretrained(model_name)
78 |             model, loading_info = AutoModelForSequenceClassification.from_pretrained(model_name, output_loading_info=True)
79 |             self.assertIsNotNone(model)
80 |             self.assertIsInstance(model, BertForSequenceClassification)
81 | 
82 |     @pytest.mark.slow
83 |     def test_question_answering_model_from_pretrained(self):
84 |         logging.basicConfig(level=logging.INFO)
85 |         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
86 |             config = AutoConfig.from_pretrained(model_name)
87 |             self.assertIsNotNone(config)
88 |             self.assertIsInstance(config, BertConfig)
89 | 
90 |             model = AutoModelForQuestionAnswering.from_pretrained(model_name)
91 |             model, loading_info = AutoModelForQuestionAnswering.from_pretrained(model_name, output_loading_info=True)
92 |             self.assertIsNotNone(model)
93 |             self.assertIsInstance(model, BertForQuestionAnswering)
94 | 
95 | 
96 | if __name__ == "__main__":
97 |     unittest.main()
98 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/tests/modeling_encoder_decoder_test.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Hugging Face Inc. Team
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import logging
17 | import unittest
18 | import pytest
19 | 
20 | from transformers import is_torch_available
21 | 
22 | if is_torch_available():
23 |     from transformers import BertModel, BertForMaskedLM, Model2Model
24 |     from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
25 | else:
26 |     pytestmark = pytest.mark.skip("Require Torch")
27 | 
28 | 
29 | class EncoderDecoderModelTest(unittest.TestCase):
30 |     @pytest.mark.slow
31 |     def test_model2model_from_pretrained(self):
32 |         logging.basicConfig(level=logging.INFO)
33 |         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
34 |             model = Model2Model.from_pretrained(model_name)
35 |             self.assertIsInstance(model.encoder, BertModel)
36 |             self.assertIsInstance(model.decoder, BertForMaskedLM)
37 |             self.assertEqual(model.decoder.config.is_decoder, True)
38 |             self.assertEqual(model.encoder.config.is_decoder, False)
39 | 
40 |     def test_model2model_from_pretrained_not_bert(self):
41 |         logging.basicConfig(level=logging.INFO)
42 |         with self.assertRaises(ValueError):
43 |             _ = Model2Model.from_pretrained('roberta')
44 | 
45 |         with self.assertRaises(ValueError):
46 |             _ = Model2Model.from_pretrained('distilbert')
47 | 
48 |         with self.assertRaises(ValueError):
49 |             _ = Model2Model.from_pretrained('does-not-exist')
50 | 
51 | 
52 | if __name__ == "__main__":
53 |     unittest.main()
54 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/tests/modeling_tf_auto_test.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Google AI Language Team Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 | 
19 | import unittest
20 | import shutil
21 | import pytest
22 | import logging
23 | 
24 | from transformers import is_tf_available
25 | 
26 | if is_tf_available():
27 |     from transformers import (AutoConfig, BertConfig,
28 |                                       TFAutoModel, TFBertModel,
29 |                                       TFAutoModelWithLMHead, TFBertForMaskedLM,
30 |                                       TFAutoModelForSequenceClassification, TFBertForSequenceClassification,
31 |                                       TFAutoModelForQuestionAnswering, TFBertForQuestionAnswering)
32 |     from transformers.modeling_tf_bert import TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP
33 | 
34 |     from .modeling_common_test import (CommonTestCases, ids_tensor)
35 |     from .configuration_common_test import ConfigTester
36 | else:
37 |     pytestmark = pytest.mark.skip("Require TensorFlow")
38 | 
39 | 
40 | class TFAutoModelTest(unittest.TestCase):
41 |     def test_model_from_pretrained(self):
42 |         import h5py
43 |         self.assertTrue(h5py.version.hdf5_version.startswith("1.10"))
44 | 
45 |         logging.basicConfig(level=logging.INFO)
46 |         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
47 |         for model_name in ['bert-base-uncased']:
48 |             config = AutoConfig.from_pretrained(model_name, force_download=True)
49 |             self.assertIsNotNone(config)
50 |             self.assertIsInstance(config, BertConfig)
51 | 
52 |             model = TFAutoModel.from_pretrained(model_name, force_download=True)
53 |             self.assertIsNotNone(model)
54 |             self.assertIsInstance(model, TFBertModel)
55 | 
56 |     def test_lmhead_model_from_pretrained(self):
57 |         logging.basicConfig(level=logging.INFO)
58 |         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
59 |         for model_name in ['bert-base-uncased']:
60 |             config = AutoConfig.from_pretrained(model_name, force_download=True)
61 |             self.assertIsNotNone(config)
62 |             self.assertIsInstance(config, BertConfig)
63 | 
64 |             model = TFAutoModelWithLMHead.from_pretrained(model_name, force_download=True)
65 |             self.assertIsNotNone(model)
66 |             self.assertIsInstance(model, TFBertForMaskedLM)
67 | 
68 |     def test_sequence_classification_model_from_pretrained(self):
69 |         logging.basicConfig(level=logging.INFO)
70 |         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
71 |         for model_name in ['bert-base-uncased']:
72 |             config = AutoConfig.from_pretrained(model_name, force_download=True)
73 |             self.assertIsNotNone(config)
74 |             self.assertIsInstance(config, BertConfig)
75 | 
76 |             model = TFAutoModelForSequenceClassification.from_pretrained(model_name, force_download=True)
77 |             self.assertIsNotNone(model)
78 |             self.assertIsInstance(model, TFBertForSequenceClassification)
79 | 
80 |     def test_question_answering_model_from_pretrained(self):
81 |         logging.basicConfig(level=logging.INFO)
82 |         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
83 |         for model_name in ['bert-base-uncased']:
84 |             config = AutoConfig.from_pretrained(model_name, force_download=True)
85 |             self.assertIsNotNone(config)
86 |             self.assertIsInstance(config, BertConfig)
87 | 
88 |             model = TFAutoModelForQuestionAnswering.from_pretrained(model_name, force_download=True)
89 |             self.assertIsNotNone(model)
90 |             self.assertIsInstance(model, TFBertForQuestionAnswering)
91 | 
92 | 
93 | if __name__ == "__main__":
94 |     unittest.main()
95 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/tests/tokenization_auto_test.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Google AI Language Team Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 | 
19 | import unittest
20 | import shutil
21 | import pytest
22 | import logging
23 | 
24 | from transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer
25 | from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
26 | 
27 | 
28 | class AutoTokenizerTest(unittest.TestCase):
29 |     @pytest.mark.slow
30 |     def test_tokenizer_from_pretrained(self):
31 |         logging.basicConfig(level=logging.INFO)
32 |         for model_name in list(BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys())[:1]:
33 |             tokenizer = AutoTokenizer.from_pretrained(model_name)
34 |             self.assertIsNotNone(tokenizer)
35 |             self.assertIsInstance(tokenizer, BertTokenizer)
36 |             self.assertGreater(len(tokenizer), 0)
37 | 
38 |         for model_name in list(GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP.keys())[:1]:
39 |             tokenizer = AutoTokenizer.from_pretrained(model_name)
40 |             self.assertIsNotNone(tokenizer)
41 |             self.assertIsInstance(tokenizer, GPT2Tokenizer)
42 |             self.assertGreater(len(tokenizer), 0)
43 | 
44 | 
45 | if __name__ == "__main__":
46 |     unittest.main()
47 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/tests/tokenization_ctrl_test.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 Salesforce and HuggingFace Inc. team.
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from __future__ import absolute_import, division, print_function, unicode_literals
15 | 
16 | import os
17 | import unittest
18 | import json
19 | from io import open
20 | 
21 | from transformers.tokenization_ctrl import CTRLTokenizer, VOCAB_FILES_NAMES
22 | 
23 | from .tokenization_tests_commons import CommonTestCases
24 | 
25 | class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester):
26 | 
27 |     tokenizer_class = CTRLTokenizer
28 | 
29 |     def setUp(self):
30 |         super(CTRLTokenizationTest, self).setUp()
31 | 
32 |         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
33 |         vocab = ['adapt', 're@@', 'a@@', 'apt', 'c@@', 't', '<unk>']
34 |         vocab_tokens = dict(zip(vocab, range(len(vocab))))
35 |         merges = ["#version: 0.2", 'a p', 'ap t</w>', 'r e', 'a d', 'ad apt</w>', '']
36 |         self.special_tokens_map = {"unk_token": "<unk>"}
37 | 
38 |         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
39 |         self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
40 |         with open(self.vocab_file, "w", encoding="utf-8") as fp:
41 |             fp.write(json.dumps(vocab_tokens) + "\n")
42 |         with open(self.merges_file, "w", encoding="utf-8") as fp:
43 |             fp.write("\n".join(merges))
44 | 
45 |     def get_tokenizer(self, **kwargs):
46 |         kwargs.update(self.special_tokens_map)
47 |         return CTRLTokenizer.from_pretrained(self.tmpdirname, **kwargs)
48 | 
49 |     def get_input_output_texts(self):
50 |         input_text = u"adapt react readapt apt"
51 |         output_text = u"adapt react readapt apt"
52 |         return input_text, output_text
53 | 
54 |     def test_full_tokenizer(self):
55 |         tokenizer = CTRLTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
56 |         text = "adapt react readapt apt"
57 |         bpe_tokens = 'adapt re@@ a@@ c@@ t re@@ adapt apt'.split()
58 |         tokens = tokenizer.tokenize(text)
59 |         self.assertListEqual(tokens, bpe_tokens)
60 | 
61 |         input_tokens = tokens + [tokenizer.unk_token]
62 | 
63 |         input_bpe_tokens = [0, 1, 2, 4, 5, 1, 0, 3, 6]
64 |         self.assertListEqual(
65 |             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
66 | 
67 | 
68 | if __name__ == '__main__':
69 |     unittest.main()
70 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/tests/tokenization_distilbert_test.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Google AI Language Team Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import, division, print_function, unicode_literals
16 | 
17 | import os
18 | import unittest
19 | import pytest
20 | from io import open
21 | 
22 | from transformers.tokenization_distilbert import (DistilBertTokenizer)
23 | 
24 | from .tokenization_tests_commons import CommonTestCases
25 | from .tokenization_bert_test import BertTokenizationTest
26 | 
27 | class DistilBertTokenizationTest(BertTokenizationTest):
28 | 
29 |     tokenizer_class = DistilBertTokenizer
30 | 
31 |     def get_tokenizer(self, **kwargs):
32 |         return DistilBertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
33 | 
34 |     @pytest.mark.slow
35 |     def test_sequence_builders(self):
36 |         tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
37 | 
38 |         text = tokenizer.encode("sequence builders", add_special_tokens=False)
39 |         text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
40 | 
41 |         encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
42 |         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
43 | 
44 |         assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
45 |         assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + \
46 |                text_2 + [tokenizer.sep_token_id]
47 | 
48 | 
49 | if __name__ == '__main__':
50 |     unittest.main()
51 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/tests/tokenization_gpt2_test.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Google AI Language Team Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import, division, print_function, unicode_literals
16 | 
17 | import os
18 | import unittest
19 | import json
20 | from io import open
21 | 
22 | from transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES
23 | 
24 | from .tokenization_tests_commons import CommonTestCases
25 | 
26 | class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
27 | 
28 |     tokenizer_class = GPT2Tokenizer
29 | 
30 |     def setUp(self):
31 |         super(GPT2TokenizationTest, self).setUp()
32 | 
33 |         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
34 |         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
35 |                  "\u0120", "\u0120l", "\u0120n",
36 |                  "\u0120lo", "\u0120low", "er",
37 |                  "\u0120lowest", "\u0120newer", "\u0120wider", "<unk>"]
38 |         vocab_tokens = dict(zip(vocab, range(len(vocab))))
39 |         merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
40 |         self.special_tokens_map = {"unk_token": "<unk>"}
41 | 
42 |         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
43 |         self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
44 |         with open(self.vocab_file, "w", encoding="utf-8") as fp:
45 |             fp.write(json.dumps(vocab_tokens) + "\n")
46 |         with open(self.merges_file, "w", encoding="utf-8") as fp:
47 |             fp.write("\n".join(merges))
48 | 
49 |     def get_tokenizer(self, **kwargs):
50 |         kwargs.update(self.special_tokens_map)
51 |         return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
52 | 
53 |     def get_input_output_texts(self):
54 |         input_text = u"lower newer"
55 |         output_text = u"lower newer"
56 |         return input_text, output_text
57 | 
58 |     def test_full_tokenizer(self):
59 |         tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
60 |         text = "lower newer"
61 |         bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
62 |         tokens = tokenizer.tokenize(text, add_prefix_space=True)
63 |         self.assertListEqual(tokens, bpe_tokens)
64 | 
65 |         input_tokens = tokens + [tokenizer.unk_token]
66 |         input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
67 |         self.assertListEqual(
68 |             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
69 | 
70 | 
71 | if __name__ == '__main__':
72 |     unittest.main()
73 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/tests/tokenization_openai_test.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Google AI Language Team Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import, division, print_function, unicode_literals
16 | 
17 | import os
18 | import unittest
19 | import json
20 | 
21 | from transformers.tokenization_openai import OpenAIGPTTokenizer, VOCAB_FILES_NAMES
22 | 
23 | from .tokenization_tests_commons import CommonTestCases
24 | 
25 | 
26 | class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester):
27 | 
28 |     tokenizer_class = OpenAIGPTTokenizer
29 | 
30 |     def setUp(self):
31 |         super(OpenAIGPTTokenizationTest, self).setUp()
32 | 
33 |         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
34 |         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
35 |                  "w</w>", "r</w>", "t</w>",
36 |                  "lo", "low", "er</w>",
37 |                  "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>"]
38 |         vocab_tokens = dict(zip(vocab, range(len(vocab))))
39 |         merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
40 | 
41 |         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
42 |         self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
43 |         with open(self.vocab_file, "w") as fp:
44 |             fp.write(json.dumps(vocab_tokens))
45 |         with open(self.merges_file, "w") as fp:
46 |             fp.write("\n".join(merges))
47 | 
48 |     def get_tokenizer(self, **kwargs):
49 |         return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname, **kwargs)
50 | 
51 |     def get_input_output_texts(self):
52 |         input_text = u"lower newer"
53 |         output_text = u"lower newer"
54 |         return input_text, output_text
55 | 
56 | 
57 |     def test_full_tokenizer(self):
58 |         tokenizer = OpenAIGPTTokenizer(self.vocab_file, self.merges_file)
59 | 
60 |         text = "lower"
61 |         bpe_tokens = ["low", "er</w>"]
62 |         tokens = tokenizer.tokenize(text)
63 |         self.assertListEqual(tokens, bpe_tokens)
64 | 
65 |         input_tokens = tokens + ["<unk>"]
66 |         input_bpe_tokens = [14, 15, 20]
67 |         self.assertListEqual(
68 |             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
69 | 
70 | 
71 | if __name__ == '__main__':
72 |     unittest.main()
73 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/tests/tokenization_roberta_test.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The Google AI Language Team Authors.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | from __future__ import absolute_import, division, print_function, unicode_literals
 16 | 
 17 | import os
 18 | import json
 19 | import unittest
 20 | import pytest
 21 | from io import open
 22 | 
 23 | from transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES
 24 | from .tokenization_tests_commons import CommonTestCases
 25 | 
 26 | 
 27 | class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
 28 |     tokenizer_class = RobertaTokenizer
 29 | 
 30 |     def setUp(self):
 31 |         super(RobertaTokenizationTest, self).setUp()
 32 | 
 33 |         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
 34 |         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
 35 |                  "\u0120", "\u0120l", "\u0120n",
 36 |                  "\u0120lo", "\u0120low", "er",
 37 |                  "\u0120lowest", "\u0120newer", "\u0120wider", "<unk>"]
 38 |         vocab_tokens = dict(zip(vocab, range(len(vocab))))
 39 |         merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
 40 |         self.special_tokens_map = {"unk_token": "<unk>"}
 41 | 
 42 |         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
 43 |         self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
 44 |         with open(self.vocab_file, "w", encoding="utf-8") as fp:
 45 |             fp.write(json.dumps(vocab_tokens) + "\n")
 46 |         with open(self.merges_file, "w", encoding="utf-8") as fp:
 47 |             fp.write("\n".join(merges))
 48 | 
 49 |     def get_tokenizer(self, **kwargs):
 50 |         kwargs.update(self.special_tokens_map)
 51 |         return RobertaTokenizer.from_pretrained(self.tmpdirname, **kwargs)
 52 | 
 53 |     def get_input_output_texts(self):
 54 |         input_text = u"lower newer"
 55 |         output_text = u"lower newer"
 56 |         return input_text, output_text
 57 | 
 58 |     def test_full_tokenizer(self):
 59 |         tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
 60 |         text = "lower newer"
 61 |         bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
 62 |         tokens = tokenizer.tokenize(text, add_prefix_space=True)
 63 |         self.assertListEqual(tokens, bpe_tokens)
 64 | 
 65 |         input_tokens = tokens + [tokenizer.unk_token]
 66 |         input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
 67 |         self.assertListEqual(
 68 |             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 69 | 
 70 |     def roberta_dict_integration_testing(self):
 71 |         tokenizer = self.get_tokenizer()
 72 | 
 73 |         self.assertListEqual(
 74 |             tokenizer.encode('Hello world!', add_special_tokens=False),
 75 |             [0, 31414, 232, 328, 2]
 76 |         )
 77 |         self.assertListEqual(
 78 |             tokenizer.encode('Hello world! cécé herlolip 418', add_special_tokens=False),
 79 |             [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]
 80 |         )
 81 | 
 82 |     @pytest.mark.slow
 83 |     def test_sequence_builders(self):
 84 |         tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
 85 | 
 86 |         text = tokenizer.encode("sequence builders", add_special_tokens=False)
 87 |         text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
 88 | 
 89 |         encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True)
 90 |         encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True)
 91 | 
 92 |         encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
 93 |         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
 94 | 
 95 |         assert encoded_sentence == encoded_text_from_decode
 96 |         assert encoded_pair == encoded_pair_from_decode
 97 | 
 98 | 
 99 | if __name__ == '__main__':
100 |     unittest.main()
101 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/tests/tokenization_transfo_xl_test.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Google AI Language Team Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import, division, print_function, unicode_literals
16 | 
17 | import os
18 | import unittest
19 | import pytest
20 | from io import open
21 | 
22 | from transformers import is_torch_available
23 | 
24 | if is_torch_available():
25 |     import torch
26 |     from transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES
27 | else:
28 |     pytestmark = pytest.mark.skip("Require Torch")  # TODO: untangle Transfo-XL tokenizer from torch.load and torch.save
29 | 
30 | from .tokenization_tests_commons import CommonTestCases
31 | 
32 | class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):
33 | 
34 |     tokenizer_class = TransfoXLTokenizer if is_torch_available() else None
35 | 
36 |     def setUp(self):
37 |         super(TransfoXLTokenizationTest, self).setUp()
38 | 
39 |         vocab_tokens = [
40 |             "<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un",
41 |             "running", ",", "low", "l",
42 |         ]
43 |         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
44 |         with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
45 |             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
46 | 
47 |     def get_tokenizer(self, **kwargs):
48 |         kwargs['lower_case'] = True
49 |         return TransfoXLTokenizer.from_pretrained(self.tmpdirname, **kwargs)
50 | 
51 |     def get_input_output_texts(self):
52 |         input_text = u"<unk> UNwanted , running"
53 |         output_text = u"<unk> unwanted, running"
54 |         return input_text, output_text
55 | 
56 |     def test_full_tokenizer(self):
57 |         tokenizer = TransfoXLTokenizer(vocab_file=self.vocab_file, lower_case=True)
58 | 
59 |         tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
60 |         self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
61 | 
62 |         self.assertListEqual(
63 |             tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
64 | 
65 |     def test_full_tokenizer_lower(self):
66 |         tokenizer = TransfoXLTokenizer(lower_case=True)
67 | 
68 |         self.assertListEqual(
69 |             tokenizer.tokenize(u" \tHeLLo ! how  \n Are yoU ?  "),
70 |             ["hello", "!", "how", "are", "you", "?"])
71 | 
72 |     def test_full_tokenizer_no_lower(self):
73 |         tokenizer = TransfoXLTokenizer(lower_case=False)
74 | 
75 |         self.assertListEqual(
76 |             tokenizer.tokenize(u" \tHeLLo ! how  \n Are yoU ?  "),
77 |             ["HeLLo", "!", "how", "Are", "yoU", "?"])
78 | 
79 | 
80 | if __name__ == '__main__':
81 |     unittest.main()
82 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/tests/tokenization_utils_test.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 HuggingFace Inc..
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 | 
19 | import unittest
20 | import six
21 | import pytest
22 | 
23 | from transformers import PreTrainedTokenizer
24 | from transformers.tokenization_gpt2 import GPT2Tokenizer
25 | 
26 | class TokenizerUtilsTest(unittest.TestCase):
27 |     @pytest.mark.slow
28 |     def check_tokenizer_from_pretrained(self, tokenizer_class):
29 |         s3_models = list(tokenizer_class.max_model_input_sizes.keys())
30 |         for model_name in s3_models[:1]:
31 |             tokenizer = tokenizer_class.from_pretrained(model_name)
32 |             self.assertIsNotNone(tokenizer)
33 |             self.assertIsInstance(tokenizer, tokenizer_class)
34 |             self.assertIsInstance(tokenizer, PreTrainedTokenizer)
35 | 
36 |             for special_tok in tokenizer.all_special_tokens:
37 |                 if six.PY2:
38 |                     self.assertIsInstance(special_tok, unicode)
39 |                 else:
40 |                     self.assertIsInstance(special_tok, str)
41 |                 special_tok_id = tokenizer.convert_tokens_to_ids(special_tok)
42 |                 self.assertIsInstance(special_tok_id, int)
43 | 
44 |     def test_pretrained_tokenizers(self):
45 |         self.check_tokenizer_from_pretrained(GPT2Tokenizer)
46 | 
47 | if __name__ == "__main__":
48 |     unittest.main()
49 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/tests/tokenization_xlm_test.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Google AI Language Team Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import, division, print_function, unicode_literals
16 | 
17 | import os
18 | import unittest
19 | import json
20 | import pytest
21 | 
22 | from transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES
23 | 
24 | from .tokenization_tests_commons import CommonTestCases
25 | 
26 | class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
27 | 
28 |     tokenizer_class = XLMTokenizer
29 | 
30 |     def setUp(self):
31 |         super(XLMTokenizationTest, self).setUp()
32 | 
33 |         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
34 |         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
35 |                  "w</w>", "r</w>", "t</w>",
36 |                  "lo", "low", "er</w>",
37 |                  "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>"]
38 |         vocab_tokens = dict(zip(vocab, range(len(vocab))))
39 |         merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
40 | 
41 |         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
42 |         self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
43 |         with open(self.vocab_file, "w") as fp:
44 |             fp.write(json.dumps(vocab_tokens))
45 |         with open(self.merges_file, "w") as fp:
46 |             fp.write("\n".join(merges))
47 | 
48 |     def get_tokenizer(self, **kwargs):
49 |         return XLMTokenizer.from_pretrained(self.tmpdirname, **kwargs)
50 | 
51 |     def get_input_output_texts(self):
52 |         input_text = u"lower newer"
53 |         output_text = u"lower newer"
54 |         return input_text, output_text
55 | 
56 |     def test_full_tokenizer(self):
57 |         """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
58 |         tokenizer = XLMTokenizer(self.vocab_file, self.merges_file)
59 | 
60 |         text = "lower"
61 |         bpe_tokens = ["low", "er</w>"]
62 |         tokens = tokenizer.tokenize(text)
63 |         self.assertListEqual(tokens, bpe_tokens)
64 | 
65 |         input_tokens = tokens + ["<unk>"]
66 |         input_bpe_tokens = [14, 15, 20]
67 |         self.assertListEqual(
68 |             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
69 | 
70 |     @pytest.mark.slow
71 |     def test_sequence_builders(self):
72 |         tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
73 | 
74 |         text = tokenizer.encode("sequence builders", add_special_tokens=False)
75 |         text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
76 | 
77 |         encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
78 |         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
79 | 
80 |         assert encoded_sentence == [1] + text + [1]
81 |         assert encoded_pair == [1] + text + [1] + text_2 + [1]
82 | 
83 | if __name__ == '__main__':
84 |     unittest.main()
85 | 


--------------------------------------------------------------------------------
/bin/transformers/examples/transformers/tokenization_distilbert.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The HuggingFace Inc. team.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes for DistilBERT."""
16 | 
17 | from __future__ import absolute_import, division, print_function, unicode_literals
18 | 
19 | import collections
20 | import logging
21 | import os
22 | import unicodedata
23 | from io import open
24 | 
25 | from .tokenization_bert import BertTokenizer
26 | 
27 | logger = logging.getLogger(__name__)
28 | 
29 | VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
30 | 
31 | PRETRAINED_VOCAB_FILES_MAP = {
32 |     'vocab_file':
33 |     {
34 |         'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
35 |         'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
36 |     }
37 | }
38 | 
39 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
40 |     'distilbert-base-uncased': 512,
41 |     'distilbert-base-uncased-distilled-squad': 512,
42 | }
43 | 
44 | 
45 | class DistilBertTokenizer(BertTokenizer):
46 |     r"""
47 |     Constructs a DistilBertTokenizer.
48 |     :class:`~transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece
49 | 
50 |     Args:
51 |         vocab_file: Path to a one-wordpiece-per-line vocabulary file
52 |         do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
53 |         do_basic_tokenize: Whether to do basic tokenization before wordpiece.
54 |         max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
55 |             minimum of this value (if specified) and the underlying BERT model's sequence length.
56 |         never_split: List of tokens which will never be split during tokenization. Only has an effect when
57 |             do_wordpiece_only=False
58 |     """
59 | 
60 |     vocab_files_names = VOCAB_FILES_NAMES
61 |     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
62 |     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
63 | 


--------------------------------------------------------------------------------
/bin/transformers/requirements-dev.txt:
--------------------------------------------------------------------------------
 1 | absl-py==0.8.0
 2 | astor==0.8.0
 3 | atomicwrites==1.3.0
 4 | attrs==19.2.0
 5 | boto3==1.9.243
 6 | botocore==1.12.243
 7 | certifi==2019.9.11
 8 | chardet==3.0.4
 9 | Click==7.0
10 | docutils==0.15.2
11 | gast==0.2.2
12 | google-pasta==0.1.7
13 | grpcio==1.24.1
14 | h5py==2.10.0
15 | idna==2.8
16 | importlib-metadata==0.23
17 | jmespath==0.9.4
18 | joblib==0.14.0
19 | Keras-Applications==1.0.8
20 | Keras-Preprocessing==1.1.0
21 | Markdown==3.1.1
22 | more-itertools==7.2.0
23 | numpy==1.17.2
24 | opt-einsum==3.1.0
25 | packaging==19.2
26 | pluggy==0.13.0
27 | protobuf==3.10.0
28 | py==1.8.0
29 | pyparsing==2.4.2
30 | pytest==5.2.1
31 | python-dateutil==2.8.0
32 | regex==2019.8.19
33 | requests==2.22.0
34 | s3transfer==0.2.1
35 | sacremoses==0.0.35
36 | sentencepiece==0.1.83
37 | six==1.12.0
38 | tensorboard==2.0.0
39 | tensorflow==2.0.0
40 | tensorflow-estimator==2.0.0
41 | termcolor==1.1.0
42 | torch==1.2.0
43 | tqdm==4.36.1
44 | urllib3==1.25.6
45 | wcwidth==0.1.7
46 | Werkzeug==0.16.0
47 | wrapt==1.11.2
48 | zipp==0.6.0
49 | 


--------------------------------------------------------------------------------
/bin/transformers/requirements.txt:
--------------------------------------------------------------------------------
 1 | # progress bars in model download and training scripts
 2 | tqdm
 3 | # Accessing files from S3 directly.
 4 | boto3
 5 | # Used for downloading models over HTTP
 6 | requests
 7 | # For OpenAI GPT
 8 | regex
 9 | # For XLNet
10 | sentencepiece
11 | # For XLM
12 | sacremoses


--------------------------------------------------------------------------------
/bin/transformers/setup.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Simple check list from AllenNLP repo: https://github.com/allenai/allennlp/blob/master/setup.py
 3 | 
 4 | To create the package for pypi.
 5 | 
 6 | 1. Change the version in __init__.py, setup.py as well as docs/source/conf.py.
 7 | 
 8 | 2. Commit these changes with the message: "Release: VERSION"
 9 | 
10 | 3. Add a tag in git to mark the release: "git tag VERSION -m'Adds tag VERSION for pypi' "
11 |    Push the tag to git: git push --tags origin master
12 | 
13 | 4. Build both the sources and the wheel. Do not change anything in setup.py between
14 |    creating the wheel and the source distribution (obviously).
15 | 
16 |    For the wheel, run: "python setup.py bdist_wheel" in the top level directory.
17 |    (this will build a wheel for the python version you use to build it - make sure you use python 3.x).
18 | 
19 |    For the sources, run: "python setup.py sdist"
20 |    You should now have a /dist directory with both .whl and .tar.gz source versions.
21 | 
22 | 5. Check that everything looks correct by uploading the package to the pypi test server:
23 | 
24 |    twine upload dist/* -r pypitest
25 |    (pypi suggest using twine as other methods upload files via plaintext.)
26 | 
27 |    Check that you can install it in a virtualenv by running:
28 |    pip install -i https://testpypi.python.org/pypi transformers
29 | 
30 | 6. Upload the final version to actual pypi:
31 |    twine upload dist/* -r pypi
32 | 
33 | 7. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.
34 | 
35 | """
36 | from io import open
37 | from setuptools import find_packages, setup
38 | 
39 | setup(
40 |     name="transformers",
41 |     version="2.1.1",
42 |     author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
43 |     author_email="thomas@huggingface.co",
44 |     description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
45 |     long_description=open("README.md", "r", encoding='utf-8').read(),
46 |     long_description_content_type="text/markdown",
47 |     keywords='NLP deep learning transformer pytorch tensorflow BERT GPT GPT-2 google openai CMU',
48 |     license='Apache',
49 |     url="https://github.com/huggingface/transformers",
50 |     packages=find_packages(exclude=["*.tests", "*.tests.*",
51 |                                     "tests.*", "tests"]),
52 |     install_requires=['numpy',
53 |                       'boto3',
54 |                       'requests',
55 |                       'tqdm',
56 |                       'regex',
57 |                       'sentencepiece',
58 |                       'sacremoses'],
59 |     entry_points={
60 |       'console_scripts': [
61 |         "transformers=transformers.__main__:main",
62 |       ]
63 |     },
64 |     # python_requires='>=3.5.0',
65 |     tests_require=['pytest'],
66 |     classifiers=[
67 |           'Intended Audience :: Science/Research',
68 |           'License :: OSI Approved :: Apache Software License',
69 |           'Programming Language :: Python :: 3',
70 |           'Topic :: Scientific/Engineering :: Artificial Intelligence',
71 |     ],
72 | )
73 | 


--------------------------------------------------------------------------------
/bin/transformers/templates/adding_a_new_example_script/README.md:
--------------------------------------------------------------------------------
1 | # How to add a new example script in 🤗Transformers
2 | 
3 | This folder provide a template for adding a new example script implementing a training or inference task with the models in the  🤗Transformers library.
4 | 
5 | Currently only examples for PyTorch are provided which are adaptations of the library's SQuAD examples which implement single-GPU and distributed training with gradient accumulation and mixed-precision (using NVIDIA's apex library) to cover a reasonable range of use cases.
6 | 


--------------------------------------------------------------------------------
/bin/transformers/templates/adding_a_new_model/README.md:
--------------------------------------------------------------------------------
 1 | # How to add a new model in 🤗Transformers
 2 | 
 3 | This folder describes the process to add a new model in 🤗Transformers and provide templates for the required files.
 4 | 
 5 | The library is designed to incorporate a variety of models and code bases. As such the process for adding a new model usually mostly consists in copy-pasting to relevant original code in the various sections of the templates included in the present repository.
 6 | 
 7 | One important point though is that the library has the following goals impacting the way models are incorporated:
 8 | 
 9 | - one specific feature of the API is the capability to run the model and tokenizer inline. The tokenization code thus often have to be slightly adapted to allow for running in the python interpreter.
10 | - the package is also designed to be as self-consistent and with a small and reliable set of packages dependencies. In consequence, additional dependencies are usually not allowed when adding a model but can be allowed for the inclusion of a new tokenizer (recent examples of dependencies added for tokenizer specificities include `sentencepiece` and `sacremoses`). Please make sure to check the existing dependencies when possible before adding a new one.
11 | 
12 | For a quick overview of the library organization, please check the [QuickStart section of the documentation](https://huggingface.co/transformers/quickstart.html).
13 | 
14 | # Typical workflow for including a model
15 | 
16 | Here an overview of the general workflow: 
17 | 
18 | - [ ] add model/configuration/tokenization classes
19 | - [ ] add conversion scripts
20 | - [ ] add tests
21 | - [ ] finalize
22 | 
23 | Let's detail what should be done at each step
24 | 
25 | ## Adding model/configuration/tokenization classes
26 | 
27 | Here is the workflow for adding model/configuration/tokenization classes:
28 | 
29 | - [ ] copy the python files from the present folder to the main folder and rename them, replacing `xxx` with your model name,
30 | - [ ] edit the files to replace `XXX` (with various casing) with your model name
31 | - [ ] copy-paste or create a simple configuration class for your model in the `configuration_...` file
32 | - [ ] copy-paste or create the code for your model in the `modeling_...` files (PyTorch and TF 2.0)
33 | - [ ] copy-paste or create a tokenizer class for your model in the `tokenization_...` file
34 | 
35 | # Adding conversion scripts
36 | 
37 | Here is the workflow for the conversion scripts:
38 | 
39 | - [ ] copy the conversion script (`convert_...`) from the present folder to the main folder.
40 | - [ ] edit this script to convert your original checkpoint weights to the current pytorch ones.
41 | 
42 | # Adding tests:
43 | 
44 | Here is the workflow for the adding tests:
45 | 
46 | - [ ] copy the python files from the `tests` sub-folder of the present folder to the `tests` subfolder of the main folder and rename them, replacing `xxx` with your model name,
47 | - [ ] edit the tests files to replace `XXX` (with various casing) with your model name
48 | - [ ] edit the tests code as needed
49 | 
50 | # Final steps
51 | 
52 | You can then finish the addition step by adding imports for your classes in the common files:
53 | 
54 | - [ ] add import for all the relevant classes in `__init__.py`
55 | - [ ] add your configuration in `configuration_auto.py`
56 | - [ ] add your PyTorch and TF 2.0 model respectively in `modeling_auto.py` and `modeling_tf_auto.py`
57 | - [ ] add your tokenizer in `tokenization_auto.py`
58 | - [ ] add your models and tokenizer to `pipeline.py`
59 | - [ ] add a link to your conversion script in the main conversion utility (currently in `__main__` but will be moved to the `commands` subfolder in the near future)
60 | - [ ] edit the PyTorch to TF 2.0 conversion script to add your model in the `convert_pytorch_checkpoint_to_tf2.py` file
61 | - [ ] add a mention of your model in the doc: `README.md` and the documentation itself at `docs/source/pretrained_models.rst`.
62 | - [ ] upload the pretrained weigths, configurations and vocabulary files.
63 | 


--------------------------------------------------------------------------------
/bin/transformers/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The HuggingFace Inc. team.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert XXX checkpoint."""
16 | 
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 | 
21 | import argparse
22 | import torch
23 | 
24 | from transformers import XxxConfig, XxxForPreTraining, load_tf_weights_in_xxx
25 | 
26 | import logging
27 | logging.basicConfig(level=logging.INFO)
28 | 
29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, xxx_config_file, pytorch_dump_path):
30 |     # Initialise PyTorch model
31 |     config = XxxConfig.from_json_file(xxx_config_file)
32 |     print("Building PyTorch model from configuration: {}".format(str(config)))
33 |     model = XxxForPreTraining(config)
34 | 
35 |     # Load weights from tf checkpoint
36 |     load_tf_weights_in_xxx(model, config, tf_checkpoint_path)
37 | 
38 |     # Save pytorch-model
39 |     print("Save PyTorch model to {}".format(pytorch_dump_path))
40 |     torch.save(model.state_dict(), pytorch_dump_path)
41 | 
42 | 
43 | if __name__ == "__main__":
44 |     parser = argparse.ArgumentParser()
45 |     ## Required parameters
46 |     parser.add_argument("--tf_checkpoint_path",
47 |                         default = None,
48 |                         type = str,
49 |                         required = True,
50 |                         help = "Path to the TensorFlow checkpoint path.")
51 |     parser.add_argument("--xxx_config_file",
52 |                         default = None,
53 |                         type = str,
54 |                         required = True,
55 |                         help = "The config json file corresponding to the pre-trained XXX model. \n"
56 |                             "This specifies the model architecture.")
57 |     parser.add_argument("--pytorch_dump_path",
58 |                         default = None,
59 |                         type = str,
60 |                         required = True,
61 |                         help = "Path to the output PyTorch model.")
62 |     args = parser.parse_args()
63 |     convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
64 |                                      args.xxx_config_file,
65 |                                      args.pytorch_dump_path)
66 | 


--------------------------------------------------------------------------------
/bin/transformers/templates/adding_a_new_model/tests/tokenization_xxx_test.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 XXX Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import, division, print_function, unicode_literals
16 | 
17 | import os
18 | import unittest
19 | from io import open
20 | 
21 | from transformers.tokenization_bert import (XxxTokenizer, VOCAB_FILES_NAMES)
22 | 
23 | from .tokenization_tests_commons import CommonTestCases
24 | 
25 | class XxxTokenizationTest(CommonTestCases.CommonTokenizerTester):
26 | 
27 |     tokenizer_class = XxxTokenizer
28 | 
29 |     def setUp(self):
30 |         super(XxxTokenizationTest, self).setUp()
31 | 
32 |         vocab_tokens = [
33 |             "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
34 |             "##ing", ",", "low", "lowest",
35 |         ]
36 |         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
37 |         with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
38 |             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
39 | 
40 |     def get_tokenizer(self, **kwargs):
41 |         return XxxTokenizer.from_pretrained(self.tmpdirname, **kwargs)
42 | 
43 |     def get_input_output_texts(self):
44 |         input_text = u"UNwant\u00E9d,running"
45 |         output_text = u"unwanted, running"
46 |         return input_text, output_text
47 | 
48 |     def test_full_tokenizer(self):
49 |         tokenizer = self.tokenizer_class(self.vocab_file)
50 | 
51 |         tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
52 |         self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
53 |         self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
54 | 
55 | 
56 | if __name__ == '__main__':
57 |     unittest.main()
58 | 


--------------------------------------------------------------------------------
/environment_crisis_bert_env.yml:
--------------------------------------------------------------------------------
 1 | name: transformers
 2 | channels:
 3 |   - defaults
 4 | dependencies:
 5 |   - _libgcc_mutex=0.1=main
 6 |   - ca-certificates=2019.10.16=0
 7 |   - certifi=2019.9.11=py36_0
 8 |   - libedit=3.1.20181209=hc058e9b_0
 9 |   - libffi=3.2.1=hd88cf55_4
10 |   - libgcc-ng=9.1.0=hdf63c60_0
11 |   - libstdcxx-ng=9.1.0=hdf63c60_0
12 |   - ncurses=6.1=he6710b0_1
13 |   - openssl=1.1.1d=h7b6447c_3
14 |   - pip=19.3.1=py36_0
15 |   - python=3.6.9=h265db76_0
16 |   - readline=7.0=h7b6447c_5
17 |   - setuptools=41.6.0=py36_0
18 |   - sqlite=3.30.1=h7b6447c_0
19 |   - tk=8.6.8=hbc83047_0
20 |   - wheel=0.33.6=py36_0
21 |   - xz=5.2.4=h14c3975_4
22 |   - zlib=1.2.11=h7b6447c_3
23 |   - pip:
24 |     - absl-py==0.8.1
25 |     - astor==0.8.1
26 |     - boto3==1.10.19
27 |     - botocore==1.13.19
28 |     - cachetools==3.1.1
29 |     - chardet==3.0.4
30 |     - click==7.0
31 |     - docutils==0.15.2
32 |     - emoji==0.5.4
33 |     - gast==0.3.3
34 |     - google-auth==1.7.1
35 |     - google-auth-oauthlib==0.4.1
36 |     - google-pasta==0.1.8
37 |     - grpcio==1.25.0
38 |     - h5py==2.10.0
39 |     - idna==2.8
40 |     - jmespath==0.9.4
41 |     - joblib==0.14.0
42 |     - keras==2.3.1
43 |     - keras-applications==1.0.8
44 |     - keras-preprocessing==1.1.0
45 |     - markdown==3.1.1
46 |     - nltk==3.4.5
47 |     - numpy==1.17.4
48 |     - oauthlib==3.1.0
49 |     - pandas==0.25.3
50 |     - pillow==6.2.1
51 |     - protobuf==3.10.0
52 |     - pyasn1==0.4.8
53 |     - pyasn1-modules==0.2.7
54 |     - python-dateutil==2.8.0
55 |     - pytz==2019.3
56 |     - pyyaml==5.1.2
57 |     - regex==2019.11.1
58 |     - requests==2.22.0
59 |     - requests-oauthlib==1.3.0
60 |     - rsa==4.0
61 |     - s3transfer==0.2.1
62 |     - sacremoses==0.0.35
63 |     - scikit-learn==0.21.3
64 |     - scipy==1.3.2
65 |     - sentencepiece==0.1.83
66 |     - seqeval==0.0.12
67 |     - six==1.13.0
68 |     - tensorboard==1.13.1
69 |     - tensorboardx==1.9
70 |     - tensorflow==1.14.0rc0
71 |     - termcolor==1.1.0
72 |     - tf-estimator-nightly==1.14.0.dev2019042301
73 |     - torch==1.3.1
74 |     - torchvision==0.4.2
75 |     - tqdm==4.38.0
76 |     - transformers==2.1.1
77 |     - urllib3==1.25.7
78 |     - werkzeug==0.16.0
79 |     - wordsegment==1.3.1
80 |     - wrapt==1.11.2
81 | prefix: /home/local/QCRI/fialam/anaconda3home/envs/transformers
82 | 
83 | 


--------------------------------------------------------------------------------
/etc/stop_words_english.txt:
--------------------------------------------------------------------------------
  1 | a
  2 | about
  3 | above
  4 | after
  5 | again
  6 | against
  7 | all
  8 | am
  9 | an
 10 | and
 11 | any
 12 | are
 13 | aren't
 14 | as
 15 | at
 16 | be
 17 | because
 18 | been
 19 | before
 20 | being
 21 | below
 22 | between
 23 | both
 24 | but
 25 | by
 26 | can't
 27 | cannot
 28 | could
 29 | couldn't
 30 | did
 31 | didn't
 32 | do
 33 | does
 34 | doesn't
 35 | doing
 36 | don't
 37 | down
 38 | during
 39 | each
 40 | few
 41 | for
 42 | from
 43 | further
 44 | had
 45 | hadn't
 46 | has
 47 | hasn't
 48 | have
 49 | haven't
 50 | having
 51 | he
 52 | he'd
 53 | he'll
 54 | he's
 55 | her
 56 | here
 57 | here's
 58 | hers
 59 | herself
 60 | him
 61 | himself
 62 | his
 63 | how
 64 | how's
 65 | i
 66 | i'd
 67 | i'll
 68 | i'm
 69 | i've
 70 | if
 71 | in
 72 | into
 73 | is
 74 | isn't
 75 | it
 76 | it's
 77 | its
 78 | itself
 79 | let's
 80 | me
 81 | more
 82 | most
 83 | mustn't
 84 | my
 85 | myself
 86 | no
 87 | nor
 88 | not
 89 | of
 90 | off
 91 | on
 92 | once
 93 | only
 94 | or
 95 | other
 96 | ought
 97 | our
 98 | ours	ourselves
 99 | out
100 | over
101 | own
102 | same
103 | shan't
104 | she
105 | she'd
106 | she'll
107 | she's
108 | should
109 | shouldn't
110 | so
111 | some
112 | such
113 | than
114 | that
115 | that's
116 | the
117 | their
118 | theirs
119 | them
120 | themselves
121 | then
122 | there
123 | there's
124 | these
125 | they
126 | they'd
127 | they'll
128 | they're
129 | they've
130 | this
131 | those
132 | through
133 | to
134 | too
135 | under
136 | until
137 | up
138 | very
139 | was
140 | wasn't
141 | we
142 | we'd
143 | we'll
144 | we're
145 | we've
146 | were
147 | weren't
148 | what
149 | what's
150 | when
151 | when's
152 | where
153 | where's
154 | which
155 | while
156 | who
157 | who's
158 | whom
159 | why
160 | why's
161 | with
162 | won't
163 | would
164 | wouldn't
165 | you
166 | you'd
167 | you'll
168 | you're
169 | you've
170 | your
171 | yours
172 | yourself
173 | yourselves
174 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | Keras==2.2.0
 2 | matplotlib==2.2.2
 3 | nltk==3.3
 4 | requests==2.20.1
 5 | tensorflow==1.9.0
 6 | gensim==3.4.0
 7 | pandas==0.23.0
 8 | numpy==1.15.1
 9 | six==1.11.0
10 | python_dateutil==2.8.1
11 | scikit_learn==0.22.2.post1
12 | 


--------------------------------------------------------------------------------