├── AutoTinyBERT
    ├── AutoTinyBERT_overview.PNG
    ├── LICENSE
    ├── README.md
    ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE
    ├── generate_data.py
    ├── inference_time_evaluation.py
    ├── latency_predictor.py
    ├── pre_training.py
    ├── searcher.py
    ├── submodel_extractor.py
    ├── superbert_run_en_classifier.py
    ├── transformer
    │   ├── __init__.py
    │   ├── file_utils.py
    │   ├── modeling_base.py
    │   ├── modeling_extractor.py
    │   ├── modeling_super_kd.py
    │   ├── optimization.py
    │   └── tokenization.py
    └── utils.py
├── BBPE
    ├── LICENSE
    ├── README.md
    ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.txt
    ├── bbpe
    │   ├── byteVocab.txt
    │   ├── charNumVocab.txt
    │   ├── charVocab.txt
    │   ├── cn_wiki_sample.txt
    │   ├── fastBPE-master
    │   │   ├── bpe_postprocessing.py
    │   │   ├── fastBPE
    │   │   │   ├── fastBPE.hpp
    │   │   │   ├── fastBPE.pyx
    │   │   │   └── main.cc
    │   │   └── vocab_byteTo16base.py
    │   ├── genByteVocab.py
    │   ├── genNum.py
    │   ├── map_freq.py
    │   ├── mergeVocab.py
    │   ├── protectList.txt
    │   ├── text2utf-8-mt-byte.sh
    │   ├── text2utf-8-mt-char.sh
    │   ├── tokenization.py
    │   ├── utf-8-mt-byte.py
    │   └── utf-8-mt-char.py
    └── example.png
├── BinaryBERT
    ├── LICENSE
    ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE
    ├── __init__.py
    ├── assets
    │   └── model.png
    ├── helper.py
    ├── kd_learner_glue.py
    ├── kd_learner_squad.py
    ├── quant_task_distill_glue.py
    ├── quant_task_distill_squad.py
    ├── readme.md
    ├── requirements.txt
    ├── scripts
    │   ├── terarny_glue.sh
    │   ├── terarny_squad.sh
    │   ├── tws_glue.sh
    │   └── tws_squad.sh
    ├── transformer
    │   ├── __init__.py
    │   ├── binary_model_init.py
    │   ├── configuration_bert.py
    │   ├── configuration_utils.py
    │   ├── file_utils.py
    │   ├── modeling.py
    │   ├── modeling_dynabert.py
    │   ├── modeling_dynabert_binary.py
    │   ├── modeling_dynabert_quant.py
    │   ├── modeling_utils.py
    │   ├── optimization.py
    │   ├── tokenization.py
    │   └── utils_quant.py
    ├── utils_glue.py
    └── utils_squad.py
├── CAME
    ├── .DS_Store
    ├── Dockerfile
    ├── LICENSE
    ├── NOTICE
    ├── README.md
    ├── adafactor.py
    ├── bert-large-uncased-vocab.txt
    ├── bert_config.json
    ├── bert_large_config.json
    ├── bert_pretrain.png
    ├── bind.sh
    ├── bind_pyt.py
    ├── came.py
    ├── came_pcode.png
    ├── configurations.yml
    ├── create_data.sh
    ├── create_pretraining_data.py
    ├── data
    │   ├── BooksDownloader.py
    │   ├── BookscorpusTextFormatting.py
    │   ├── Downloader.py
    │   ├── GLUEDownloader.py
    │   ├── GooglePretrainedWeightDownloader.py
    │   ├── NVIDIAPretrainedWeightDownloader.py
    │   ├── SquadDownloader.py
    │   ├── TextSharding.py
    │   ├── WikiDownloader.py
    │   ├── WikicorpusTextFormatting.py
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   └── TextSharding.cpython-36.pyc
    │   ├── bertPrep.py
    │   ├── create_datasets_from_start.sh
    │   ├── shard.py
    │   ├── squad
    │   │   └── squad_download.sh
    │   └── wikiextractor
    │   │   ├── .github
    │   │       └── workflows
    │   │       │   └── python-publish.yml
    │   │   ├── .gitignore
    │   │   ├── LICENSE
    │   │   ├── README.md
    │   │   ├── extract.sh
    │   │   ├── setup.py
    │   │   └── wikiextractor
    │   │       ├── WikiExtractor.py
    │   │       ├── __init__.py
    │   │       ├── cirrus-extract.py
    │   │       ├── clean.py
    │   │       ├── extract.py
    │   │       └── extractPage.py
    ├── extract_features.py
    ├── file_utils.py
    ├── inference.py
    ├── memory.png
    ├── modeling.py
    ├── optimization.py
    ├── processors
    │   ├── __init__.py
    │   └── glue.py
    ├── requirements.txt
    ├── run.sub
    ├── run_came_pretraining.sh
    ├── run_glue.py
    ├── run_pretraining.py
    ├── run_squad.py
    ├── run_swag.py
    ├── run_validation.sh
    ├── schedulers.py
    ├── scripts
    │   ├── configs
    │   │   ├── glue_config.sh
    │   │   ├── pretrain_config.sh
    │   │   └── squad_config.sh
    │   ├── data_download.sh
    │   ├── docker
    │   │   ├── build.sh
    │   │   └── launch.sh
    │   ├── run_glue.sh
    │   ├── run_pretraining.sh
    │   ├── run_squad.sh
    │   └── run_swag.sh
    ├── start_data.py
    ├── startup_came.py
    ├── tokenization.py
    ├── triton
    │   ├── LICENSE
    │   ├── README.md
    │   ├── client.py
    │   ├── deployer.py
    │   ├── deployer_lib.py
    │   ├── evaluate.sh
    │   ├── export_model.sh
    │   ├── generate_figures.sh
    │   ├── launch_triton_server.sh
    │   ├── profiling_data_int64
    │   │   ├── input__0
    │   │   ├── input__1
    │   │   └── input__2
    │   ├── run_perf_client.sh
    │   ├── run_squad_client.py
    │   └── wait_for_triton_server.sh
    ├── utils.py
    ├── v1.1
    │   ├── dev-v1.1.json
    │   ├── evaluate-v1.1.py
    │   └── train-v1.1.json
    └── vocab
    │   └── vocab
├── CeMAT
    ├── CeMAT_maskPredict
    │   ├── LICENSE
    │   ├── __init__.py
    │   ├── cemat_nat_options.py
    │   ├── checkpoint_utils.py
    │   ├── criterions
    │   │   ├── __init__.py
    │   │   └── label_smoothed_length_cross_entropy.py
    │   ├── data
    │   │   ├── __init__.py
    │   │   └── language_pair_self_dataset_mask.py
    │   ├── generate_cmlm.py
    │   ├── meters.py
    │   ├── models
    │   │   ├── __init__.py
    │   │   ├── bert_seq2seq.py
    │   │   └── cemat_model.py
    │   ├── pybleu.py
    │   ├── strategies
    │   │   ├── __init__.py
    │   │   ├── decoding_strategy.py
    │   │   ├── easy_first.py
    │   │   ├── left_to_right.py
    │   │   ├── mask_predict.py
    │   │   └── strategy_utils.py
    │   ├── task_NAT_cemat.sh
    │   ├── task_infer_nat.sh
    │   ├── tasks
    │   │   ├── __init__.py
    │   │   └── translation_self_from_cemat.py
    │   └── train.py
    ├── CeMAT_plugins
    │   ├── __init__.py
    │   ├── checkpoint_utils.py
    │   ├── criterions
    │   │   ├── __init__.py
    │   │   └── label_smoothed_cross_entropy_with_maskdecode.py
    │   ├── data
    │   │   ├── __init__.py
    │   │   ├── cemat_dataset.py
    │   │   ├── concat_pair_dataset.py
    │   │   ├── ddenoising_pair_dataset_dyna_replace.py
    │   │   └── language_pair_dataset.py
    │   ├── models
    │   │   ├── __init__.py
    │   │   ├── cemat_model.py
    │   │   ├── fairseq_encoder.py
    │   │   └── transformer.py
    │   ├── task_NMT_cemat.sh
    │   ├── task_infer_nmt.sh
    │   ├── task_pt_cemat.sh
    │   └── tasks
    │   │   ├── __init__.py
    │   │   ├── cemat_pretraining.py
    │   │   └── translation_from_pretrained_cemat.py
    ├── License
    ├── README.md
    ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.md
    └── cemat_scripts
    │   ├── create_trans
    │       ├── example_extract_alignedpairs.sh
    │       └── extract_aligned_pairs.py
    │   └── process
    │       ├── preprocess_Mono.sh
    │       ├── preprocess_NMT.sh
    │       └── preprocess_Para.sh
├── DynaBERT
    ├── LICENSE
    ├── README.md
    ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.txt
    ├── dynabert_overview.png
    ├── eval_glue.py
    ├── requirements.txt
    ├── run_glue.py
    └── transformers
    │   ├── __init__.py
    │   ├── __main__.py
    │   ├── configuration_bert.py
    │   ├── configuration_roberta.py
    │   ├── configuration_utils.py
    │   ├── data
    │       ├── __init__.py
    │       ├── metrics
    │       │   └── __init__.py
    │       └── processors
    │       │   ├── __init__.py
    │       │   ├── glue.py
    │       │   └── utils.py
    │   ├── file_utils.py
    │   ├── modeling_bert.py
    │   ├── modeling_roberta.py
    │   ├── modeling_utils.py
    │   ├── optimization.py
    │   ├── tokenization_bert.py
    │   ├── tokenization_gpt2.py
    │   ├── tokenization_roberta.py
    │   └── tokenization_utils.py
├── HyperText
    ├── LICENSE
    ├── README.md
    ├── __init__.py
    ├── hyperbolic
    │   ├── __init__.py
    │   ├── euclidean.py
    │   ├── math_utils.py
    │   ├── mobius_linear.py
    │   └── poincare.py
    ├── hypertext_model_architecture.png
    ├── main.py
    ├── models
    │   ├── Config.py
    │   ├── HyperText.py
    │   └── __init__.py
    ├── radam_optimizer.py
    ├── requirements.txt
    ├── train.py
    └── utils.py
├── JABER-PyTorch
    ├── LICENSE
    ├── NEZHA_PyTorch
    │   ├── LICENSE
    │   ├── README.md
    │   ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.txt
    │   ├── convert_tf_checkpoint_to_pytorch.py
    │   ├── file_utils.py
    │   ├── modeling_nezha.py
    │   ├── optimization.py
    │   └── tools
    │   │   ├── file_utils.py
    │   │   ├── official_tokenization.py
    │   │   ├── pytorch_optimization.py
    │   │   └── utils.py
    ├── README.md
    ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.txt
    ├── alue_predictions
    │   └── README.md
    ├── alue_test_submission
    │   └── README.md
    ├── compute_metrics.py
    ├── generate_data.py
    ├── pretrained_models
    │   └── README.md
    ├── processors.py
    ├── raw_datasets
    │   └── toy.mq2q.dev.tsv
    ├── requirements.txt
    ├── run_alue.py
    ├── run_alue.sh
    └── tokenizationBBPE.py
├── NEZHA-Gen-TensorFlow
    ├── LICENSE
    ├── README.md
    ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE
    ├── interactive_conditional_generation.py
    ├── poetry.py
    └── tokenization.py
├── NEZHA-PyTorch
    ├── LICENSE
    ├── README.md
    ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.txt
    ├── convert_tf_checkpoint_to_pytorch.py
    ├── data
    │   ├── chnsenti
    │   │   ├── dev.tsv
    │   │   └── train.tsv
    │   └── mrpc
    │   │   ├── dev.tsv
    │   │   └── train.tsv
    ├── file_utils.py
    ├── modeling_nezha.py
    ├── optimization.py
    ├── pretrained_models
    │   ├── nezha-cn-base
    │   │   ├── bert_config.json
    │   │   └── vocab.txt
    │   └── nezha-en-base
    │   │   ├── bert_config.json
    │   │   └── vocab.txt
    ├── run_classifier.sh
    ├── run_sequence_classifier.py
    └── tools
    │   ├── file_utils.py
    │   ├── official_tokenization.py
    │   ├── pytorch_optimization.py
    │   └── utils.py
├── NEZHA-TensorFlow
    ├── CONTRIBUTING.md
    ├── Dockerfile
    ├── LICENSE
    ├── NOTICE
    ├── README.md
    ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.txt
    ├── __init__.py
    ├── data
    │   └── pretrain-toy
    │   │   ├── tf_examples_00.tfrecord
    │   │   └── tf_examples_01.tfrecord
    ├── extract_features.py
    ├── fp16_utils.py
    ├── fused_layer_norm.py
    ├── gpu_environment.py
    ├── modeling.py
    ├── modeling_ori.py
    ├── modeling_test.py
    ├── multilingual.md
    ├── nezha
    │   ├── bert_base_rel_config_vocab_100503.json
    │   ├── bert_config.json
    │   └── vocab.txt
    ├── optimization.py
    ├── optimization_test.py
    ├── read_tf_events.py
    ├── run_classifier.py
    ├── run_classifier_ner.py
    ├── run_classifier_with_tfhub.py
    ├── run_pretraining.py
    ├── run_squad.py
    ├── run_squad_trtis_client.py
    ├── sample_text.txt
    ├── scripts
    │   ├── run_clf.sh
    │   ├── run_clf_predict.sh
    │   ├── run_ner_predict.sh
    │   ├── run_pretraining.sh
    │   ├── run_reading.sh
    │   ├── run_seq_labelling.sh
    │   └── run_seq_labelling_predict.sh
    ├── tf_metrics.py
    ├── tokenization.py
    ├── tokenization_test.py
    └── utils
    │   ├── create_glue_data.py
    │   ├── create_pretraining_data.py
    │   ├── create_squad_data.py
    │   └── utils.py
├── Noah_WuKong
    ├── LICENSE
    ├── README.md
    ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.txt
    ├── configs
    │   ├── wukong_swin
    │   │   ├── wukong_swin.py
    │   │   ├── wukong_swin_f.py
    │   │   └── wukong_swin_g.py
    │   ├── wukong_vit_b
    │   │   ├── wukong_vit_b.py
    │   │   ├── wukong_vit_b_f.py
    │   │   └── wukong_vit_b_g.py
    │   └── wukong_vit_l
    │   │   ├── wukong_vit_l.py
    │   │   ├── wukong_vit_l_f.py
    │   │   └── wukong_vit_l_g.py
    ├── data
    │   ├── __init__.py
    │   ├── datasets.py
    │   ├── res
    │   │   ├── classnames.json
    │   │   └── prompts.txt
    │   └── tokenizer
    │   │   ├── __init__.py
    │   │   ├── res
    │   │       └── vocab.txt
    │   │   └── simple_tokenizer.py
    ├── main.py
    ├── model
    │   ├── __init__.py
    │   ├── builder.py
    │   ├── language
    │   │   ├── __init__.py
    │   │   └── transformer.py
    │   ├── modules.py
    │   ├── utils.py
    │   ├── vision
    │   │   ├── __init__.py
    │   │   ├── swin_transformer.py
    │   │   └── vision_transformer.py
    │   └── wukong.py
    └── requirements.txt
├── Noah_Wukong-MindSpore
    ├── README.md
    ├── README_CN.md
    ├── eval.py
    └── src
    │   ├── __init__.py
    │   ├── config
    │       ├── wukong_vit_b_32.yaml
    │       ├── wukong_vit_b_32_clip.yaml
    │       ├── wukong_vit_b_32_filip.yaml
    │       ├── wukong_vit_l_14.yaml
    │       ├── wukong_vit_l_14_clip.yaml
    │       └── wukong_vit_l_14_filip.yaml
    │   ├── dataset
    │       ├── __init__.py
    │       ├── dataset.py
    │       ├── generate_dataset.py
    │       └── wukong_download.py
    │   ├── model
    │       ├── __init__.py
    │       ├── matrics.py
    │       ├── text_encoder.py
    │       ├── token_learner.py
    │       └── visual_encoder.py
    │   └── tools
    │       ├── __init__.py
    │       ├── model_utils.py
    │       ├── simple_tokenizer.py
    │       ├── template_generate.py
    │       └── utils.py
├── PMLM
    ├── LICENSE
    ├── README.md
    ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE
    ├── create_pretraining_data.py
    ├── en_vocab.txt
    ├── interactive_conditional_samples_sincos_acrostic.py
    ├── modeling.py
    └── tokenization.py
├── PanGu-Bot
    └── Readme.md
├── PanGu-α
    ├── .idea
    │   ├── .gitignore
    │   ├── PanGu-Alpha.iml
    │   ├── inspectionProfiles
    │   │   ├── Project_Default.xml
    │   │   └── profiles_settings.xml
    │   ├── misc.xml
    │   ├── modules.xml
    │   └── vcs.xml
    ├── LICENSE
    ├── PANGU-α.pdf
    ├── README.md
    ├── dataset.py
    ├── docs
    │   ├── 13B.png
    │   ├── 2.6B.png
    │   ├── Pipline.png
    │   ├── dataset.png
    │   ├── logos.png
    │   ├── model.png
    │   ├── task.png
    │   ├── 微信交流群2.png
    │   └── 鹏程.盘古微信交流群.png
    ├── generate.py
    ├── pangu_alpha.py
    ├── pangu_alpha_config.py
    ├── pangu_alpha_predict.py
    ├── pangu_alpha_train.py
    ├── pangu_alpha_wrapcell.py
    ├── run_pangu_alpha_predict.py
    ├── run_pangu_alpha_train.py
    ├── scripts
    │   ├── run_distribute_predict.sh
    │   └── run_distribute_train.sh
    ├── serving_demo
    │   └── PanGu-Alpha-serving-demo.avi
    ├── strategy_load_ckpt
    │   ├── pangu_alpha_13B_cktp_strategy.ckpt
    │   └── pangu_alpha_2.6B_ckpt_strategy.ckpt
    ├── tokenization_jieba.py
    ├── tokenizer
    │   ├── vocab.model
    │   └── vocab.vocab
    └── utils.py
├── README.md
├── TernaryBERT-MindSpore
    ├── LICENSE
    ├── README.md
    ├── __init__.py
    ├── eval.py
    ├── mindspore_hub_conf.py
    ├── scripts
    │   ├── run_eval.sh
    │   └── run_train.sh
    ├── src
    │   ├── __init__.py
    │   ├── assessment_method.py
    │   ├── cell_wrapper.py
    │   ├── config.py
    │   ├── dataset.py
    │   ├── quant.py
    │   ├── tinybert_model.py
    │   └── utils.py
    └── train.py
├── TernaryBERT
    ├── LICENSE
    ├── README.md
    ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE
    ├── main.png
    ├── quant_task_glue.py
    ├── quant_task_squad.py
    ├── requirements.txt
    ├── transformer
    │   ├── __init__.py
    │   ├── configuration.py
    │   ├── file_utils.py
    │   ├── modeling.py
    │   ├── modeling_quant.py
    │   ├── optimization.py
    │   ├── tokenization.py
    │   └── utils_quant.py
    ├── utils_glue.py
    └── utils_squad.py
├── TinyBERT-MindSpore
    ├── LICENSE
    ├── README.md
    ├── __init__.py
    ├── mindspore_hub_conf.py
    ├── run_general_distill.py
    ├── run_task_distill.py
    ├── scripts
    │   ├── run_distributed_gd_ascend.sh
    │   ├── run_distributed_gd_gpu.sh
    │   ├── run_standalone_gd.sh
    │   └── run_standalone_td.sh
    └── src
    │   ├── __init__.py
    │   ├── assessment_method.py
    │   ├── dataset.py
    │   ├── gd_config.py
    │   ├── td_config.py
    │   ├── tinybert_for_gd_td.py
    │   ├── tinybert_model.py
    │   └── utils.py
└── TinyBERT
    ├── LICENSE
    ├── README.md
    ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE
    ├── data_augmentation.py
    ├── general_distill.py
    ├── pregenerate_training_data.py
    ├── requirements.txt
    ├── task_distill.py
    ├── tinybert_overview.png
    └── transformer
        ├── __init__.py
        ├── file_utils.py
        ├── modeling.py
        ├── optimization.py
        └── tokenization.py


/AutoTinyBERT/AutoTinyBERT_overview.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/AutoTinyBERT/AutoTinyBERT_overview.PNG


--------------------------------------------------------------------------------
/AutoTinyBERT/submodel_extractor.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2021 Huawei Technologies Co., Ltd.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import os
17 | import json
18 | import torch
19 | import argparse
20 | 
21 | from transformer.modeling_extractor import SuperBertModel
22 | 
23 | 
def _print_parameter_sizes(model):
    """Print one ``n: <name>#@#p: <count>`` line per parameter; return the total count."""
    total = 0
    for name, param in model.named_parameters():
        total += param.nelement()
        print('n: {}#@#p: {}'.format(name, param.nelement()))
    return total


def main():
    """Extract a sub-model from a SuperBERT checkpoint and save its weights.

    Command-line arguments:
        --model   value passed to ``SuperBertModel.from_pretrained``.
        --arch    string evaluated into the architecture description that is
                  handed to ``set_sample_config``.
        --output  directory that receives ``pytorch_model.bin``.
        --kd      flag forwarded as the second argument of
                  ``set_sample_config``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model",
                        default=None,
                        type=str,
                        required=True)
    parser.add_argument('--arch',
                        type=str,
                        required=True)
    parser.add_argument('--output',
                        type=str,
                        required=True)
    parser.add_argument('--kd', action='store_true')

    args = parser.parse_args()

    model = SuperBertModel.from_pretrained(args.model)
    print('the model size is : {}'.format(_print_parameter_sizes(model)))

    # SECURITY NOTE: ``eval`` executes arbitrary Python taken from the command
    # line. It is kept because architecture strings may be expressions (for
    # example list repetition such as ``[8]*5``); never feed it untrusted input.
    # The JSON round-trip normalises the result to plain JSON types.
    arch = json.loads(json.dumps(eval(args.arch)))

    print('kd: {}'.format(args.kd))

    # ``store_true`` already yields a bool.
    kd = args.kd

    # Unwrap a wrapper that exposes the real model as ``.module``, if present.
    inner_model = model.module if hasattr(model, 'module') else model
    inner_model.set_sample_config(arch, kd)

    print('the extracted model size is : {}'.format(
        _print_parameter_sizes(model)))

    # torch.save does not create missing directories, so make sure it exists.
    os.makedirs(args.output, exist_ok=True)
    model_output = os.path.join(args.output, 'pytorch_model.bin')
    torch.save(inner_model.state_dict(), model_output)
68 | 
69 | if __name__ == "__main__":
70 |     main()
71 | 
72 | 


--------------------------------------------------------------------------------
/AutoTinyBERT/transformer/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2019, Facebook, Inc. and its affiliates. All Rights Reserved
 2 | 
 3 | __version__ = "0.6.1"
 4 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
 5 | 
 6 | from .optimization import BertAdam
 7 | from .optimization import AdamW, get_linear_schedule_with_warmup
 8 | 
 9 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path, WEIGHTS_NAME, CONFIG_NAME
10 | 


--------------------------------------------------------------------------------
/AutoTinyBERT/utils.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2021 Huawei Technologies Co., Ltd.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import random
17 | 
18 | 
def sample_arch_4_kd(layer_numbers, hidden_sizes, ffn_sizes, qkv_sizes,
                     reset_rand_seed=False, rand_seed=0):
    """Randomly sample one architecture configuration for the KD setting.

    Args:
        layer_numbers: candidate layer counts.
        hidden_sizes: candidate hidden sizes.
        ffn_sizes: candidate intermediate (FFN) sizes.
        qkv_sizes: candidate Q/K/V sizes.
        reset_rand_seed: when true, re-seed ``random`` with ``rand_seed``
            before sampling.
        rand_seed: seed used when ``reset_rand_seed`` is true.

    Returns:
        dict with the sampled layer count and hidden size plus per-layer
        lists (one sampled value repeated for every layer); the number of
        attention heads is fixed at 12 for each layer.
    """
    if reset_rand_seed:
        random.seed(rand_seed)

    pick = random.choice
    depth = pick(layer_numbers)

    # Dict values are evaluated top to bottom, so the draws happen in the
    # order: layer count, hidden size, FFN size, QKV size.
    return {
        'sample_layer_num': depth,
        'sample_hidden_size': pick(hidden_sizes),
        'sample_intermediate_sizes': [pick(ffn_sizes)] * depth,
        'sample_num_attention_heads': [12] * depth,
        'sample_qkv_sizes': [pick(qkv_sizes)] * depth,
    }
35 | 
36 | 
def sample_arch_4_mlm(layer_numbers, hidden_sizes, ffn_sizes,
                      head_numbers, reset_rand_seed=False, rand_seed=0):
    """Randomly sample one architecture configuration for the MLM setting.

    Args:
        layer_numbers: candidate layer counts.
        hidden_sizes: candidate hidden sizes.
        ffn_sizes: candidate intermediate (FFN) sizes.
        head_numbers: candidate attention-head counts.
        reset_rand_seed: when true, re-seed ``random`` with ``rand_seed``
            before sampling.
        rand_seed: seed used when ``reset_rand_seed`` is true.

    Returns:
        dict with the sampled layer count and hidden size plus per-layer
        lists; the Q/K/V size of every layer is ``64 * head count``.
    """
    if reset_rand_seed:
        random.seed(rand_seed)

    pick = random.choice
    # Draw order matters for reproducibility: layers, heads, hidden, FFN.
    depth = pick(layer_numbers)
    heads = pick(head_numbers)

    return {
        'sample_layer_num': depth,
        'sample_hidden_size': pick(hidden_sizes),
        'sample_intermediate_sizes': [pick(ffn_sizes)] * depth,
        'sample_num_attention_heads': [heads] * depth,
        'sample_qkv_sizes': [heads * 64] * depth,
    }
54 | 
55 | 


--------------------------------------------------------------------------------
/BBPE/bbpe/fastBPE-master/bpe_postprocessing.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | import re
 3 | import sys
 4 | import six
 5 | import unicodedata
 6 | import collections
 7 | import base64
 8 | 
 9 | count = 0
10 | output = open(sys.argv[1], "w", encoding = "utf-8")
11 | b16 = {}
12 | byteVocab = {}
13 | 
def getChinese(context):
    """Return ``context`` keeping only characters in the range U+4E00-U+9FA5."""
    # The negated character class matches everything outside that range,
    # which is replaced by the empty string.
    return re.sub(u'[^\u4E00-\u9FA5]', r'', context)
20 | 
# Fill the digit-value (0-15) -> upper-case hexadecimal character table.
b16.update(enumerate('0123456789ABCDEF'))
30 | 
31 | b256tob16 = {}
def base16decode(s):
    """Decode a string of hexadecimal digits into its integer value.

    Args:
        s: iterable of single hexadecimal digit characters (most significant
            digit first).

    Returns:
        The integer value; ``0`` for an empty input.

    Raises:
        ValueError: if a character is not a hexadecimal digit.
    """
    result = 0
    for c in s:
        # The module-level ``b16`` table maps value -> character, so it cannot
        # be used for the reverse lookup; convert each digit directly.
        result = result * 16 + int(c, 16)
    return result
37 | 
def base16encode(n):
    """Encode a non-negative integer as upper-case hexadecimal text.

    Args:
        n: value to encode; it is converted with ``int()`` first.

    Returns:
        The hexadecimal digits without prefix. Zero and negative values
        produce the empty string.
    """
    n = int(n)
    digits = []
    while n > 0:
        # Integer divmod keeps the arithmetic exact for arbitrarily large
        # values (float division would round above 2**53).
        n, remainder = divmod(n, 16)
        digits.append('0123456789ABCDEF'[remainder])
    return ''.join(reversed(digits))
47 | 
def base256encode(n):
    """Return the single character whose code point is ``n``."""
    # The multi-digit base-256 loop that used to follow this return was
    # unreachable and has been removed.
    return chr(n)
# Map each single-character string chr(0)..chr(255) to its integer value.
for i in range(256):
    b256tob16[str(base256encode(i))] = i

# Every stdin line is split on tabs; the first field is split on spaces and
# its first two parts are written back concatenated, one result per line.
for line in sys.stdin:
    print(line)
    line = line.split('\t')
    print(line)
    pair = line[0].split(' ')
    output.write("{}\n".format(pair[0] + pair[1]))

# Close explicitly so everything written is flushed to disk.
output.close()
65 | 


--------------------------------------------------------------------------------
/BBPE/bbpe/fastBPE-master/fastBPE/fastBPE.pyx:
--------------------------------------------------------------------------------
 1 | # distutils: language = c++
 2 | 
 3 | from libcpp.vector cimport vector
 4 | from libcpp.string cimport string
 5 | 
# Declaration of the C++ class BPEApplyer (namespace fastBPE) from fastBPE.hpp
# so that it can be used from Cython code in this file.
cdef extern from "fastBPE.hpp" namespace "fastBPE":
    cdef cppclass BPEApplyer:
        # Constructor: takes the codes path and the vocabulary path as strings.
        BPEApplyer(const string& codes_path, const string& vocab_path)
        # Takes a vector of sentences and returns a vector of strings.
        vector[string] apply(vector[string]& sentences)
10 | 
# Python-visible wrapper that owns one heap-allocated C++ BPEApplyer.
cdef class fastBPE:
    cdef BPEApplyer* c_obj

    def __dealloc__(self):
        # Deleting a NULL pointer is a no-op in C++, so this is safe even when
        # __init__ never ran.
        del self.c_obj

    def __init__(self, codes_path, vocab_path=""):
        # __init__ may be called more than once on the same object; release
        # the previous applier first so it is not leaked.
        if self.c_obj != NULL:
            del self.c_obj
            self.c_obj = NULL
        # Both paths are Python str objects encoded to bytes for std::string.
        self.c_obj = new BPEApplyer(codes_path.encode(), vocab_path.encode())

    def apply(self, sentences):
        # Encode every sentence to bytes, run the C++ applier, and decode the
        # returned byte strings back to str.
        cdef vector[string] s = [x.encode() for x in sentences]
        cdef vector[string] res = self.c_obj.apply(s)
        return [x.decode() for x in res]
24 | 


--------------------------------------------------------------------------------
/BBPE/bbpe/fastBPE-master/fastBPE/main.cc:
--------------------------------------------------------------------------------
 1 | #include "fastBPE.hpp"
 2 | 
 3 | using namespace std;
 4 | using namespace fastBPE;
 5 | 
 6 | void printUsage() {
 7 |   cerr
 8 |       << "usage: fastbpe <command> <args>\n\n"
 9 |       << "The commands supported by fastBPE are:\n\n"
10 |       << "getvocab input1 [input2]             extract the vocabulary from one "
11 |          "or two text files\n"
12 |       << "learnbpe nCodes input1 [input2]      learn BPE codes from one or two "
13 |          "text files\n"
14 |       << "applybpe output input codes [vocab]  apply BPE codes to a text file\n"
15 |       << "applybpe_stream codes [vocab]        apply BPE codes to stdin and output to stdout\n"
16 |       << endl;
17 | }
18 | 
19 | 
20 | int main(int argc, char **argv) {
21 |   if (argc < 2) {
22 |     printUsage();
23 |     exit(EXIT_FAILURE);
24 |   }
25 |   string command = argv[1];
26 |   if (command == "getvocab") {
27 |     assert(argc == 3 || argc == 4);
28 |     getvocab(argv[2], argc == 4 ? argv[3] : "");
29 |   } else if (command == "learnbpe") {
30 |     assert(argc == 4 || argc == 5);
31 |     learnbpe(stoi(argv[2]), argv[3], argc == 5 ? argv[4] : "");
32 |   } else if (command == "applybpe") {
33 |     assert(argc == 5 || argc == 6);
34 |     applybpe(argv[2], argv[3], argv[4], argc == 6 ? argv[5] : "");
35 |   } else if (command == "applybpe_stream") {
36 |     assert(argc == 3 || argc == 4);
37 |     applybpe_stream(argv[2], argc == 4 ? argv[3] : "");
38 |   } else {
39 |     printUsage();
40 |     exit(EXIT_FAILURE);
41 |   }
42 |   return 0;
43 | }
44 | 


--------------------------------------------------------------------------------
/BBPE/bbpe/genByteVocab.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | import re
 3 | import sys
 4 | import unicodedata
 5 | import collections
 6 | import base64
 7 | 
 8 | output = open("byteVocab.txt", "w")
 9 | 
10 | corpus = open("cn_wiki_sample.txt", "r")
11 | 
12 | vocab = {}
13 | 
def getChinese(context):
    """Strip every character outside U+4E00-U+9FA5 from ``context``."""
    non_chinese = re.compile(u'[^\u4E00-\u9FA5]')
    return non_chinese.sub(r'', context)
20 | 
# Upper bound on the number of distinct characters collected.
MAX_VOCAB_SIZE = 512

i = 0

# Walk the corpus character by character, assigning consecutive indices to
# the first MAX_VOCAB_SIZE distinct characters that getChinese() keeps.
for line in corpus:
    line = line.strip()
    tokens = line
    print(tokens)

    for token in tokens:
        print(token)
        if len(getChinese(token)) > 0 and token not in vocab:
            vocab[token] = i
            i += 1
        if i >= MAX_VOCAB_SIZE:
            break
    if i >= MAX_VOCAB_SIZE:
        break

# Order entries by their assigned index (ascending).
mergedVocab = sorted(vocab.items(), key=lambda item: item[1], reverse=False)

# One "<index>\t<character>" row per entry.
for item in mergedVocab:
    output.write("{}\t{}\n".format(item[1], item[0]))

# Release both file handles; closing the output also flushes it to disk.
output.close()
corpus.close()
42 | 


--------------------------------------------------------------------------------
/BBPE/bbpe/genNum.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | import re
 3 | import sys
 4 | import unicodedata
 5 | import collections
 6 | import base64
 7 | 
 8 | def base256encode(n):
 9 |     return chr(n)
10 |     result = ''
11 |     while n > 0:
12 |         n = int(n)
13 |         result = chr(n%256) + result
14 |         n /= 256
15 |     return result
16 | 
# Load "<token>\t<count>" rows from charVocab.txt into ``vocab``.
vocab = {}
with open("charVocab.txt", "r") as charvocab:
    for line in charvocab:
        line = line.strip()
        if not line:
            # Tolerate blank lines (for example a trailing empty line).
            continue
        tokens = line.split('\t')
        tk = tokens[0]
        vocab[tk] = int(tokens[1])
28 | 
def _hex_token(text):
    """Return the upper-case hex encoding of the UTF-8 bytes of ``text``."""
    return base64.b16encode(text.encode("utf-8")).decode("ascii")


# Make sure the hex form of every decimal number 0..999 is in the vocabulary;
# numbers that are not already present get count 1.
for i in range(1000):
    token = _hex_token(str(i))
    if token not in vocab: vocab[token] = 1

# Highest count first.
mergedVocab = sorted(vocab.items(), key=lambda item: item[1], reverse=True)

with open('charNumVocab.txt', 'w') as output:
    # Each token is written twice: plain and with a "##" prefix.
    for item in mergedVocab:
        output.write("{}\t{}\n".format(item[0], item[1]))
        output.write("##{}\t{}\n".format(item[0], item[1]))

    # Zero-padded three-character numbers "000".."009", "##" form only.
    for i in range(10):
        token = _hex_token('00' + str(i))
        output.write("##{}\t{}\n".format(token, 1))

    # Zero-padded three-character numbers "010".."099", "##" form only.
    for i in range(10, 100):
        token = _hex_token('0' + str(i))
        output.write("##{}\t{}\n".format(token, 1))
49 | 


--------------------------------------------------------------------------------
/BBPE/bbpe/mergeVocab.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: UTF-8 -*-
 2 | import re
 3 | import sys
 4 | import unicodedata
 5 | import collections
 6 | import base64
 7 | b16 = {}
 8 | 
 9 | for i in range(10):
10 |     b16[i] = str(i) 
11 | 
12 | b16[10] = 'A'
13 | b16[11] = 'B'
14 | b16[12] = 'C'
15 | b16[13] = 'D'
16 | b16[14] = 'E'
17 | b16[15] = 'F'
18 | 
19 | b256tob16 = {}
def base16decode(s):
    """Return the integer value of the hexadecimal digit string ``s``.

    Returns ``0`` for an empty string and raises ``ValueError`` when a
    character is not a hexadecimal digit.
    """
    result = 0
    for c in s:
        # ``b16`` maps value -> character and cannot serve as the reverse
        # lookup, so each digit is converted directly.
        result = result * 16 + int(c, 16)
    return result
25 | 
def base16encode(n):
    """Return the upper-case hexadecimal digits of ``n`` without a prefix.

    ``n`` is truncated with ``int()`` first. Zero and negative values give
    the empty string.
    """
    n = int(n)
    digits = []
    while n > 0:
        # divmod keeps the arithmetic in exact integers; float division
        # would round for values above 2**53.
        n, remainder = divmod(n, 16)
        digits.append('0123456789ABCDEF'[remainder])
    return ''.join(reversed(digits))
34 | 
def base256encode(n):
    """Return the one-character string for code point ``n``."""
    character = chr(n)
    return character
37 | 
# Single-character string chr(0)..chr(255) -> its integer value.
b256tob16.update((str(base256encode(code)), code) for code in range(256))
vocab = {}

# argv[1]: byte-level vocabulary, argv[2]: vocabulary to merge into it.
# Both handles stay open because later code still refers to them.
byteVocab = open(sys.argv[1], 'r')
Vocab = open(sys.argv[2], 'r')

# Seed ``vocab`` with the "<token>\t<count>" rows of the byte vocabulary.
for raw_line in byteVocab:
    fields = raw_line.strip().split('\t')
    vocab[fields[0]] = int(fields[1])

# Hex encodings of the ASCII digits "0".."9", each mapped to 1.
numVocab = {}

for digit in range(10):
    numVocab.setdefault(
        str(base64.b16encode(str(digit).encode("utf-8")))[2:-1], 1)
print(numVocab)
61 | for line in Vocab:
62 |     tokens = line.strip().split('\t')
63 |     if tokens[0] in byteVocab: continue
64 |     isNum = False
65 | #    print(tokens[0])
66 | #    print(int(len((tokens[0]))/2))
67 |     
68 |     for i in range(int(len((tokens[0]))/2)):
69 | #      print(tokens[0][2*i:2*i+2])
70 |       if i == 0 and tokens[0][0:2] == '##':
71 | #        print('##')
72 |         continue 
73 |       if tokens[0][2*i:2*i+2] not in numVocab: break
74 |       if i == int(len(tokens[0])/2)-1: 
75 |         isNum = True
76 |     if isNum:
77 |       print(tokens[0])
78 |       continue
79 |  #   print(line)
80 | #    if tokens[0] not in vocab:
81 |         #vocab[tokens[0]+'\t'+tokens[1]] = int(tokens[2])
82 |     vocab[tokens[0]] = int(tokens[1])
83 | 
84 | 
85 | mergedVocab = sorted(vocab.items(), key=lambda item:item[1], reverse=True)
86 | 
87 | output = open(sys.argv[3], 'w')
88 | for item in mergedVocab:
89 |     output.writelines("{}\t{}\n".format(item[0], item[1]))
90 | 


--------------------------------------------------------------------------------
/BBPE/bbpe/protectList.txt:
--------------------------------------------------------------------------------
 1 | <unk>	
 2 | <s>	
 3 | </s>	
 4 | <TERM>	
 5 | </TERM>	
 6 | <MID>	
 7 | =SYMBOL=	
 8 | =NUMBER=	
 9 | =QUANTIFIER=	
10 | =DATE=	
11 | =TIME=	
12 | =RANGE=	
13 | <PUNC>	
14 | <BT>
15 | 


--------------------------------------------------------------------------------
/BBPE/bbpe/text2utf-8-mt-byte.sh:
--------------------------------------------------------------------------------
 1 | set -e
 2 | 
 3 | TEXT_DIR=./MTData/ #./wiki_others-hebin-yafu/wiki_others-hebin-yafu/
 4 | NUM=0
 5 | for TEXT_FILE in ${TEXT_DIR}/*; do
 6 | NUM=$((NUM+1))
 7 | echo $NUM
 8 | cat $TEXT_FILE | python3 utf-8-mt-byte.py MTData_byte/$(basename "$TEXT_FILE") &
 9 | if (($NUM>60))
10 | then
11 | wait
12 | NUM=0
13 | fi
14 | done
15 | 
16 | 


--------------------------------------------------------------------------------
/BBPE/bbpe/text2utf-8-mt-char.sh:
--------------------------------------------------------------------------------
 1 | set -e
 2 | 
 3 | TEXT_DIR=./MTData/ #./wiki_others-hebin-yafu/wiki_others-hebin-yafu/
 4 | NUM=0
 5 | for TEXT_FILE in ${TEXT_DIR}/*; do
 6 | NUM=$((NUM+1))
 7 | echo $NUM
 8 | cat $TEXT_FILE | python3 utf-8-mt-char.py MTData_utf8/$(basename "$TEXT_FILE") &
 9 | if (($NUM>50))
10 | then
11 | wait
12 | NUM=0
13 | fi
14 | done
15 | 
16 | 


--------------------------------------------------------------------------------
/BBPE/example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/BBPE/example.png


--------------------------------------------------------------------------------
/BinaryBERT/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/BinaryBERT/__init__.py


--------------------------------------------------------------------------------
/BinaryBERT/assets/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/BinaryBERT/assets/model.png


--------------------------------------------------------------------------------
/BinaryBERT/helper.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2021 Huawei Technologies Co., Ltd.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | import logging
16 | import os
17 | import string
18 | import random
19 | import torch
20 | 
def generate_job_id():
  """Return a random 5-character id made of ASCII letters and digits.

  Characters are sampled without replacement, so none repeats within an id.
  """
  alphabet = string.ascii_letters + string.digits
  picked = random.sample(alphabet, 5)
  return ''.join(picked)
23 | 
def init_logging(log_path):
  """Attach a file handler and a console handler to the root logger.

  The root logger level is set to INFO. The directory of `log_path` is
  created when it is missing, and an existing file at `log_path` is removed
  so the log only holds the current run.

  Args:
    log_path: path of the log file; may be a bare file name, in which case
      the file is created in the current working directory.
  """
  log_dir = os.path.dirname(log_path)
  # A bare file name has an empty dirname, and os.makedirs('') would raise.
  if log_dir and not os.path.isdir(log_dir):
    print("Log path does not exist. Create a new one.")
    os.makedirs(log_dir, exist_ok=True)
  if os.path.exists(log_path):
    print("%s already exists. replace it with current experiment." % log_path)
    # Delete in-process: a shell `rm` breaks on paths containing spaces or
    # shell metacharacters and is not portable.
    os.remove(log_path)

  logger = logging.getLogger()
  logger.setLevel(logging.INFO)
  logFormatter = logging.Formatter('%(asctime)s [%(levelname)s]: %(message)s')

  fileHandler = logging.FileHandler(log_path)
  fileHandler.setFormatter(logFormatter)
  logger.addHandler(fileHandler)

  consoleHandler = logging.StreamHandler()
  consoleHandler.setFormatter(logFormatter)
  logger.addHandler(consoleHandler)
44 | 
def print_args(args):
    """Log every key/value pair of the mapping `args` at INFO level, one per record."""
    for name, value in args.items():
        logging.info("{0}: {1}".format(name, value))
48 | 
def soft_cross_entropy(predicts, targets):
    """Return the soft-label cross entropy between student and teacher logits.

    `targets` is turned into probabilities with a softmax over the last
    dimension, `predicts` into log-probabilities with a log-softmax over the
    last dimension. The negated elementwise product is averaged over all
    elements.
    """
    functional = torch.nn.functional
    log_student = functional.log_softmax(predicts, dim=-1)
    soft_targets = functional.softmax(targets, dim=-1)
    weighted = -soft_targets * log_student
    return weighted.mean()
53 | 
54 | 
def visualize_clip(clip_dict):
    """Log the learnable clipping values stored in `clip_dict`.

    Two-element entries are logged as a PACT clip-value pair, one-element
    entries as an LSQ step size; entries of any other size are not logged.
    """
    logging.info("Visualizing learnable clipping vals...")
    for name, param in clip_dict.items():
        count = param.nelement()
        if count == 2:
            # PACT clip val has two elements
            low, high = param[0].item(), param[1].item()
            logging.info("PACT clip_val: %s: (%.4f, %.4f)" % (name, low, high))
            continue
        if count == 1:
            # LSQ step size has only one element
            logging.info("LSQ step_size: %s: %.4f" % (name, param.item()))
65 | 
66 | 
def result_to_file(result, file_name):
    """Append the positive entries of `result` to `file_name` and log them.

    Keys are processed in sorted order. Entries whose value is not greater
    than 0.0 are neither logged nor written.
    """
    with open(file_name, "a") as writer:
        logging.info("***** Eval results *****")
        for key in sorted(result):
            value = result[key]
            if not value > 0.0:
                continue
            text = str(value)
            logging.info("  %s = %s", key, text)
            writer.write("%s = %s\n" % (key, text))
74 | 


--------------------------------------------------------------------------------
/BinaryBERT/readme.md:
--------------------------------------------------------------------------------
 1 | # BinaryBERT: Pushing the Limit of BERT Quantization
 2 | This repository contains the implementation of our paper 
 3 | "BinaryBERT: Pushing the Limit of BERT Quantization" 
 4 | in ACL 2021. 
 5 | The overall workflow of training BinaryBERT is shown below.
 6 | We first train a half-sized ternary BERT model, and then apply **ternary weight splitting** 
 7 | to initalize the full-sized BinaryBERT. We then fine-tune BinaryBERT for further refinement.
 8 | ![BinaryBERT](./assets/model.png)
 9 | 
10 | ## Dependencies
11 | ```bash
12 | pip install -r requirements.txt
13 | ```
14 | 
15 | ## Datasets
16 | 
17 | We train and test BinaryBERT on the GLUE and SQuAD benchmarks. Both datasets are available online:
18 | - **GLUE**: https://github.com/nyu-mll/GLUE-baselines
19 | - **SQuAD**: https://rajpurkar.github.io/SQuAD-explorer/
20 | 
21 | For data augmentation on GLUE, please follow the instructions in [TinyBERT](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/TinyBERT).
22 | 
23 | ## Execution 
24 | Our experiments are based on the fine-tuned full-precision DynaBERT, 
25 | which can be found [here](https://drive.google.com/file/d/1pYApaDcse5QIB6lZagWO0uElAavFazpA/view?usp=sharing).
26 | Complete running scripts and more detailed tips are provided in `./scripts`.
27 | There are two steps for execution, and we illustrate them
28 | with training BinaryBERT with 4-bit activations on MRPC.
29 | 
30 | ### Step one: Train a half-sized ternary BERT
31 | This corresponds to `scripts/terarny_glue.sh` (note the spelling of the file name). For example:
32 | ```bash
33 | sh scripts/terarny_glue.sh mrpc data/mrpc/ models/dynabert_model/mrpc/width_0.5_depth_1.0/ models/dynabert_model/mrpc/width_0.5_depth_1.0/ 2 4
34 | ```
35 | 
36 | ### Step two: Apply TWS and finetune BinaryBERT
37 | This corresponds to `scripts/tws_glue.sh`. Based on the model checkpoint of ternary BERT, execute:
38 | ```bash
39 | sh scripts/tws_glue.sh mrpc data/mrpc/ models/dynabert_model/mrpc/width_0.5_depth_1.0/ output/Ternary_W2A8/mrpc/kd_stage2/ 1 4
40 | ```
41 | Go through each script for more detail.
42 | 
43 | ## Citation
44 | If you find this repo helpful for your research, please cite:
45 | ```
46 | @inproceedings{bai2021binarybert,
47 | 	title={BinaryBERT: Pushing the Limit of BERT Quantization},
48 | 	author={Bai, H. and Zhang, W. and Hou, L. and Shang, L. and Jin, J. and Jiang, X. and Liu, Q. and Lyu, M. and King, I.},
49 | 	booktitle={Annual Meeting of the Association for Computational Linguistics},
50 | 	year={2021}
51 | }
52 | ```


--------------------------------------------------------------------------------
/BinaryBERT/requirements.txt:
--------------------------------------------------------------------------------
 1 | # progress bars in model download and training scripts
 2 | tqdm
 3 | # Used for downloading models over HTTP
 4 | requests
 5 | 
 6 | torch==1.0.0
 7 | python==3.6
 8 | seaborn
 9 | pickle
10 | collections


--------------------------------------------------------------------------------
/BinaryBERT/scripts/terarny_glue.sh:
--------------------------------------------------------------------------------
 1 | # Step 1: First train a half-sized ternary BERT model from the dynabert model checkpoint
 2 | # Tips:
 3 | # 1. If trained with data augmentation, please add --aug_train
 4 | # 2. For activation quantziation, use uniform quant for A8, with ACT2FN=gelu;
 5 | #    use lsq quant for A4, use lsq, with ACT2FN=relu to ensure non-negativity of LSQ asymmetric quantization
 6 | 
 7 | TASK_NAME=$1
 8 | GLUE_DIR=$2
 9 | TEACHER_MODEL_DIR=$3
10 | STUDENT_MODEL_DIR=$4
11 | wbits=$5
12 | abits=$6
13 | JOB_ID=Ternary_W${wbits}A${abits}
14 | echo $TASK_NAME
15 | echo $GLUE_DIR
16 | echo $TEACHER_MODEL_DIR
17 | echo $STUDENT_MODEL_DIR
18 | echo $wbits
19 | echo $abits
20 | echo $JOB_ID
21 | 
22 | if [ $abits == 4 ]
23 | then
24 | act_quan_method=lsq
25 | ACT2FN=relu
26 | else
27 | act_quan_method=uniform
28 | ACT2FN=gelu
29 | fi
30 | 
31 | export CUDA_VISIBLE_DEVICES=5
32 | python quant_task_distill_glue.py \
33 |     --data_dir ${GLUE_DIR} \
34 |     --job_id ${JOB_ID} \
35 |     --batch_size 16 \
36 |     --learning_rate 1e-5 \
37 |     --eval_step 1000 \
38 |     --num_train_epochs 2 \
39 |     --ACT2FN ${ACT2FN} \
40 |     --output_dir output/${JOB_ID}/${TASK_NAME} \
41 |     --kd_type two_stage \
42 |     --task_name $TASK_NAME \
43 |     --teacher_model ${TEACHER_MODEL_DIR} \
44 |     --student_model ${STUDENT_MODEL_DIR} \
45 |     --weight_bits ${wbits} \
46 |     --weight_quant_method twn \
47 |     --input_bits ${abits} \
48 |     --input_quant_method ${act_quan_method} \
49 |     --clip_lr 1e-4 \
50 |     --learnable_scaling
51 | 


--------------------------------------------------------------------------------
/BinaryBERT/scripts/terarny_squad.sh:
--------------------------------------------------------------------------------
 1 | # Step 1: First train a half-sized ternary BERT model from the dynabert model checkpoint
 2 | # Tips:
 3 | # 1. If trained with data augmentation, please add --aug_train
 4 | # 2. For activation quantziation, use uniform quant for A8, with ACT2FN=gelu;
 5 | #    use lsq quant for A4, use lsq, with ACT2FN=relu to ensure non-negativity of LSQ asymmetric quantization
 6 | 
 7 | TASK=$1
 8 | DATA_DIR=$2
 9 | TEACHER_MODEL_DIR=$3
10 | STUDENT_MODEL_DIR=$4
11 | wbits=$5
12 | abits=$6
13 | JOB_ID=Ternary_W${wbits}A${abits}
14 | echo $TASK
15 | echo $DATA_DIR
16 | echo $TEACHER_MODEL_DIR
17 | echo $STUDENT_MODEL_DIR
18 | echo $wbits
19 | echo $abits
20 | echo $JOB_ID
21 | 
22 | if [ $abits == 4 ]
23 | then
24 | act_quan_method=lsq
25 | ACT2FN=relu
26 | else
27 | act_quan_method=uniform
28 | ACT2FN=gelu
29 | fi
30 | 
31 | export CUDA_VISIBLE_DEVICES=7
32 | if [ $TASK == 1 ]
33 | then
34 | TASK_NAME=SQuADv1.1
35 | python quant_task_distill_squad.py \
36 |     --data_dir ${DATA_DIR} \
37 |     --job_id ${JOB_ID} \
38 |     --batch_size 4 \
39 |     --learning_rate 2e-5 \
40 |     --eval_step 1000 \
41 |     --num_train_epochs 1 \
42 |     --ACT2FN ${ACT2FN} \
43 |     --output_dir output/${JOB_ID}/${TASK_NAME} \
44 |     --kd_type two_stage \
45 |     --teacher_model ${TEACHER_MODEL_DIR} \
46 |     --student_model ${STUDENT_MODEL_DIR} \
47 |     --weight_bits ${wbits} \
48 |     --weight_quant_method twn \
49 |     --input_bits ${abits} \
50 |     --input_quant_method ${act_quan_method} \
51 |     --clip_lr 1e-3 \
52 |     --learnable_scaling
53 | else
54 | TASK_NAME=SQuADv2.0
55 | python quant_task_distill_squad.py \
56 |     --data_dir ${DATA_DIR} \
57 |     --job_id ${JOB_ID} \
58 |     --batch_size 4 \
59 |     --learning_rate 2e-5 \
60 |     --eval_step 1000 \
61 |     --num_train_epochs 1 \
62 |     --ACT2FN ${ACT2FN} \
63 |     --output_dir output/${JOB_ID}/${TASK_NAME} \
64 |     --kd_type two_stage \
65 |     --teacher_model ${TEACHER_MODEL_DIR} \
66 |     --student_model ${STUDENT_MODEL_DIR} \
67 |     --weight_bits ${wbits} \
68 |     --weight_quant_method twn \
69 |     --input_bits ${abits} \
70 |     --input_quant_method ${act_quan_method} \
71 |     --clip_lr 1e-3 \
72 |     --learnable_scaling \
73 |     --version_2_with_negative
74 | fi
75 | 
76 | 


--------------------------------------------------------------------------------
/BinaryBERT/scripts/tws_glue.sh:
--------------------------------------------------------------------------------
 1 | # Step 2: Apply ternary weight splitting and finetune BinaryBERT.
 2 | # Tips:
 3 | # 1. If trained with data augmentation, please add --aug_train
 4 | # 2. For activation quantziation, use uniform quant for A8, with ACT2FN=gelu;
 5 | #    use lsq quant for A4, use lsq, with ACT2FN=relu to ensure non-negativity of LSQ asymmetric quantization
 6 | 
 7 | TASK_NAME=$1
 8 | GLUE_DIR=$2
 9 | TEACHER_MODEL_DIR=$3
10 | STUDENT_MODEL_DIR=$4
11 | wbits=$5
12 | abits=$6
13 | JOB_ID=Ternary_W${wbits}A${abits}
14 | echo $TASK_NAME
15 | echo $GLUE_DIR
16 | echo $TEACHER_MODEL_DIR
17 | echo $STUDENT_MODEL_DIR
18 | echo $wbits
19 | echo $abits
20 | echo $JOB_ID
21 | 
22 | if [ $abits == 4 ]
23 | then
24 | act_quan_method=lsq
25 | ACT2FN=relu
26 | else
27 | act_quan_method=uniform
28 | ACT2FN=gelu
29 | fi
30 | 
31 | export CUDA_VISIBLE_DEVICES=5
32 | python quant_task_distill_glue.py \
33 |     --data_dir ${GLUE_DIR} \
34 |     --job_id ${JOB_ID} \
35 |     --batch_size 16 \
36 |     --learning_rate 5e-5 \
37 |     --eval_step 100 \
38 |     --num_train_epochs 2 \
39 |     --ACT2FN ${ACT2FN} \
40 |     --output_dir output/${JOB_ID}/${TASK_NAME} \
41 |     --kd_type two_stage \
42 |     --task_name $TASK_NAME \
43 |     --teacher_model ${TEACHER_MODEL_DIR} \
44 |     --student_model ${STUDENT_MODEL_DIR} \
45 |     --weight_bits ${wbits} \
46 |     --weight_quant_method bwn \
47 |     --input_bits ${abits} \
48 |     --input_quant_method ${act_quan_method} \
49 |     --clip_lr 1e-4 \
50 |     --learnable_scaling \
51 |     --is_binarybert \
52 |     --split


--------------------------------------------------------------------------------
/BinaryBERT/scripts/tws_squad.sh:
--------------------------------------------------------------------------------
 1 | # Step 2: Apply ternary weight splitting and finetune BinaryBERT.
 2 | # Tips:
 3 | # 1. If trained with data augmentation, please add --aug_train
 4 | # 2. For activation quantziation, use uniform quant for A8, with ACT2FN=gelu;
 5 | #    use lsq quant for A4, use lsq, with ACT2FN=relu to ensure non-negativity of LSQ asymmetric quantization
 6 | 
 7 | TASK=$1
 8 | DATA_DIR=$2
 9 | TEACHER_MODEL_DIR=$3
10 | STUDENT_MODEL_DIR=$4
11 | wbits=$5
12 | abits=$6
13 | JOB_ID=Ternary_W${wbits}A${abits}
14 | echo $TASK
15 | echo $DATA_DIR
16 | echo $TEACHER_MODEL_DIR
17 | echo $STUDENT_MODEL_DIR
18 | echo $wbits
19 | echo $abits
20 | echo $JOB_ID
21 | 
22 | if [ $abits == 4 ]
23 | then
24 | act_quan_method=lsq
25 | ACT2FN=relu
26 | else
27 | act_quan_method=uniform
28 | ACT2FN=gelu
29 | fi
30 | 
31 | export CUDA_VISIBLE_DEVICES=7
32 | if [ $TASK == 1 ]
33 | then
34 | TASK_NAME=SQuADv1.1
35 | python quant_task_distill_squad.py \
36 |     --data_dir ${DATA_DIR} \
37 |     --job_id ${JOB_ID} \
38 |     --batch_size 4 \
39 |     --learning_rate 2e-5 \
40 |     --eval_step 1000 \
41 |     --num_train_epochs 1 \
42 |     --ACT2FN ${ACT2FN} \
43 |     --output_dir output/${JOB_ID}/${TASK_NAME} \
44 |     --kd_type two_stage \
45 |     --teacher_model ${TEACHER_MODEL_DIR} \
46 |     --student_model ${STUDENT_MODEL_DIR} \
47 |     --weight_bits ${wbits} \
48 |     --weight_quant_method bwn \
49 |     --input_bits ${abits} \
50 |     --input_quant_method ${act_quan_method} \
51 |     --clip_lr 1e-3 \
52 |     --learnable_scaling \
53 |     --is_binarybert \
54 |     --split
55 | else
56 | TASK_NAME=SQuADv2.0
57 | python quant_task_distill_squad.py \
58 |     --data_dir ${DATA_DIR} \
59 |     --job_id ${JOB_ID} \
60 |     --batch_size 4 \
61 |     --learning_rate 2e-5 \
62 |     --eval_step 1000 \
63 |     --num_train_epochs 1 \
64 |     --ACT2FN gelu \
65 |     --output_dir output/${JOB_ID}/${TASK_NAME} \
66 |     --kd_type two_stage \
67 |     --teacher_model ${TEACHER_MODEL_DIR} \
68 |     --student_model ${STUDENT_MODEL_DIR} \
69 |     --weight_bits ${wbits} \
70 |     --weight_quant_method bwn \
71 |     --input_bits ${abits} \
72 |     --input_quant_method uniform \
73 |     --clip_lr 1e-3 \
74 |     --learnable_scaling \
75 |     --version_2_with_negative \
76 |     --is_binarybert \
77 |     --split
78 | fi
79 | 
80 | 


--------------------------------------------------------------------------------
/BinaryBERT/transformer/__init__.py:
--------------------------------------------------------------------------------
 1 | __version__ = "0.6.2"
 2 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
 3 | 
 4 | 
 5 | from .modeling import (BertConfig, BertModel, BertForPreTraining,
 6 |                        BertForMaskedLM, BertForNextSentencePrediction,
 7 |                        TinyBertForSequenceClassification,
 8 |                        load_tf_weights_in_bert)
 9 | 
10 | from .optimization import BertAdam
11 | 
12 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path, WEIGHTS_NAME, CONFIG_NAME
13 | 


--------------------------------------------------------------------------------
/CAME/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/CAME/.DS_Store


--------------------------------------------------------------------------------
/CAME/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
 2 | # Licensed under the Apache License, Version 2.0 (the "License");
 3 | # you may not use this file except in compliance with the License.
 4 | # You may obtain a copy of the License at
 5 | #
 6 | #     http://www.apache.org/licenses/LICENSE-2.0
 7 | #
 8 | # Unless required by applicable law or agreed to in writing, software
 9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3
FROM nvcr.io/nvidia/tritonserver:20.06-v1-py3-clientsdk as trt
FROM ${FROM_IMAGE_NAME}
RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract

ENV BERT_PREP_WORKING_DIR=/workspace/bert/data

WORKDIR /workspace
RUN git clone https://github.com/attardi/wikiextractor.git && cd wikiextractor && git checkout 6408a430fc504a38b04d37ce5e7fc740191dee16 && cd ..
RUN git clone https://github.com/soskek/bookcorpus.git

# Copy the perf_client over
COPY --from=trt /workspace/install/ /workspace/install/
ENV LD_LIBRARY_PATH=/workspace/install/lib:${LD_LIBRARY_PATH}

# Install trt python api
# Refresh the package index in the same layer and pass -y so the install
# neither relies on a stale cached index nor waits for confirmation.
RUN apt-get update && apt-get install -y libb64-0d
RUN pip install /workspace/install/python/tensorrtserver*.whl

WORKDIR /workspace/bert
RUN pip install --no-cache-dir \
 tqdm boto3 requests six ipdb h5py nltk progressbar onnxruntime \
 git+https://github.com/NVIDIA/dllogger wget

RUN apt-get update && apt-get install -y iputils-ping

COPY . .
41 | 


--------------------------------------------------------------------------------
/CAME/NOTICE:
--------------------------------------------------------------------------------
1 | BERT PyTorch
2 | 
3 | This repository includes software from https://github.com/huggingface/pytorch-pretrained-BERT
4 | licensed under the Apache License 2.0.
5 | 


--------------------------------------------------------------------------------
/CAME/README.md:
--------------------------------------------------------------------------------
 1 | # CAME Optimizer - Pytorch
 2 | 
 3 | This repository provides a script and recipe to train the BERT model with our proposed CAME optimizer in:
 4 | 
 5 | CAME: Confidence-guided Adaptive Memory Efficient Optimization
 6 | 
 7 | This work has been accepted to the **ACL 2023** main conference.
 8 | 
 9 | In this work, we studied a confidence-guided strategy to reduce the instability of existing memory efficient optimizers. 
10 | Based on this strategy, we proposed CAME to simultaneously achieve two goals: fast convergence as in traditional adaptive methods, and low memory usage as in memory-efficient methods.
11 | 
12 | 
13 | ## Training
14 | 
15 | ### Script with the hyperparameter settings for pretraining BERT:
16 | bash run_came_pretraining.sh  
17 | 
18 | ### The startup file corresponding to the script:  
19 | startup_came.py  
20 | 
21 | ### Pytorch implementation:  
22 | came.py: the Pytorch implementation of our proposed CAME optimizer.
23 | ![CAME](./came_pcode.png)
24 | 
25 | ## Pretraining Results
26 | ![BERT Pretraining](./bert_pretrain.png)
27 | 
28 | ## Memory Usage Comparison
29 | ![Memory Cost](./memory.png)
30 | 
31 | ## Usage
32 | ```
33 | from came import CAME
34 | optimizer = CAME(model.parameters(), lr=2e-4, weight_decay=1e-2, betas=(0.9, 0.999, 0.9999), eps=(1e-30, 1e-16))
35 | ```
36 | ## Citation
37 | 


--------------------------------------------------------------------------------
/CAME/bert_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	  "attention_probs_dropout_prob": 0.1,
 3 | 	  "hidden_act": "gelu",
 4 | 	  "hidden_dropout_prob": 0.1,
 5 | 	  "hidden_size": 768,
 6 | 	  "initializer_range": 0.02,
 7 | 	  "intermediate_size": 3072,
 8 | 	  "max_position_embeddings": 512,
 9 | 	  "num_attention_heads": 12,
10 | 	  "num_hidden_layers": 12,
11 | 	  "type_vocab_size": 2,
12 | 	  "vocab_size": 30528
13 | 	}
14 | 


--------------------------------------------------------------------------------
/CAME/bert_large_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "attention_probs_dropout_prob": 0.1,
 3 |   "hidden_act": "gelu",
 4 |   "hidden_dropout_prob": 0.1,
 5 |   "hidden_size": 1024,
 6 |   "initializer_range": 0.02,
 7 |   "intermediate_size": 4096,
 8 |   "max_position_embeddings": 512,
 9 |   "num_attention_heads": 16,
10 |   "num_hidden_layers": 24,
11 |   "type_vocab_size": 2,
12 |   "vocab_size": 30522
13 | }
14 | 


--------------------------------------------------------------------------------
/CAME/bert_pretrain.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/CAME/bert_pretrain.png


--------------------------------------------------------------------------------
/CAME/came_pcode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/CAME/came_pcode.png


--------------------------------------------------------------------------------
/CAME/create_data.sh:
--------------------------------------------------------------------------------
 1 | python /home/ma-user/work/Old_BERT/create_pretraining_data.py \
 2 | 	--input_file=/cache/data/book/book_corpus_2.txt  \
 3 | 	--output_file=/cache/data/book/book_corpus_2.hdf5 \
 4 | 	--vocab_file=/home/ma-user/work/Old_BERT/bert-large-uncased-vocab.txt \
 5 | 	--bert_model=bert-large-uncased \
 6 | 	--max_seq_length=128 \
 7 | 	--max_predictions_per_seq=20 \
 8 | 	--dupe_factor=5 \
 9 | 	--masked_lm_prob=0.15 
10 | 


--------------------------------------------------------------------------------
/CAME/data/BooksDownloader.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
 2 | # Licensed under the Apache License, Version 2.0 (the "License");
 3 | # you may not use this file except in compliance with the License.
 4 | # You may obtain a copy of the License at
 5 | #
 6 | #     http://www.apache.org/licenses/LICENSE-2.0
 7 | #
 8 | # Unless required by applicable law or agreed to in writing, software
 9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | import subprocess
15 | 
class BooksDownloader:
    """Runs the bookcorpus download script and stores its output under `save_path`."""

    def __init__(self, save_path):
        """Remember the directory below which the `bookscorpus` folder is written."""
        self.save_path = save_path

    def download(self):
        """Invoke /workspace/bookcorpus/download_files.py with `python3`.

        Raises:
            subprocess.CalledProcessError: if the script exits with a non-zero status.
        """
        # Pass the arguments as a list (no shell) so a save_path containing
        # spaces or shell metacharacters reaches the script as one argument.
        # The path is therefore no longer subject to shell expansion.
        bookscorpus_download_command = [
            'python3',
            '/workspace/bookcorpus/download_files.py',
            '--list', '/workspace/bookcorpus/url_list.jsonl',
            '--out', self.save_path + '/bookscorpus',
            '--trash-bad-count',
        ]
        subprocess.run(bookscorpus_download_command, check=True)
27 | 


--------------------------------------------------------------------------------
/CAME/data/BookscorpusTextFormatting.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
 2 | # Licensed under the Apache License, Version 2.0 (the "License");
 3 | # you may not use this file except in compliance with the License.
 4 | # You may obtain a copy of the License at
 5 | #
 6 | #     http://www.apache.org/licenses/LICENSE-2.0
 7 | #
 8 | # Unless required by applicable law or agreed to in writing, software
 9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | import glob
15 | import os
16 | 
class BookscorpusTextFormatting:
    """Merges the `.txt` files found under `books_path` into one output file."""

    def __init__(self, books_path, output_filename, recursive = False):
        """Store the merge settings.

        Args:
            books_path: directory that contains the `.txt` book files.
            output_filename: path of the merged file to write.
            recursive: when True, also pick up `.txt` files in subdirectories.
        """
        self.books_path = books_path
        self.recursive = recursive
        self.output_filename = output_filename


    # This puts one book per line
    def merge(self):
        """Write every book as a single line, each followed by a blank line.

        Non-empty input lines are stripped and joined with a trailing space.
        """
        # Honour the `recursive` flag: `**` only descends into subdirectories
        # when it is part of the pattern and recursive=True is passed to glob.
        if self.recursive:
            pattern = os.path.join(self.books_path, '**', '*.txt')
        else:
            pattern = os.path.join(self.books_path, '*.txt')
        # Sort so the merged file does not depend on directory listing order.
        filenames = sorted(glob.glob(pattern, recursive=self.recursive))

        # Inputs are decoded as UTF-8, so write UTF-8 explicitly instead of
        # relying on the locale's default encoding.
        with open(self.output_filename, mode='w', encoding='utf-8', newline='\n') as ofile:
            for filename in filenames:
                with open(filename, mode='r', encoding='utf-8-sig', newline='\n') as file:
                    for line in file:
                        stripped = line.strip()
                        if stripped != '':
                            ofile.write(stripped + ' ')
                ofile.write("\n\n")


--------------------------------------------------------------------------------
/CAME/data/GLUEDownloader.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
 2 | # Licensed under the Apache License, Version 2.0 (the "License");
 3 | # you may not use this file except in compliance with the License.
 4 | # You may obtain a copy of the License at
 5 | #
 6 | #     http://www.apache.org/licenses/LICENSE-2.0
 7 | #
 8 | # Unless required by applicable law or agreed to in writing, software
 9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | import sys
15 | import wget
16 | 
17 | from pathlib import Path
18 | 
19 | 
def mkdir(path):
    """Create directory `path` together with missing parents; an existing directory is left alone."""
    target = Path(path)
    target.mkdir(parents=True, exist_ok=True)
22 | 
23 | 
class GLUEDownloader:
    """Downloads GLUE task data into `<save_path>/glue` via the download_glue_data helper script."""

    # Accepted task names mapped to the spelling passed to download_glue_data via --tasks.
    _TASK_ALIASES = {
        'mrpc': 'MRPC',
        'mnli': 'MNLI',
        'cola': 'CoLA',
        'sst-2': 'SST',
    }

    def __init__(self, save_path):
        """Store the target directory, `<save_path>/glue`."""
        self.save_path = save_path + '/glue'

    def download(self, task_name):
        """Fetch the helper script and run it for `task_name`.

        Args:
            task_name: one of 'mrpc', 'mnli', 'cola' or 'sst-2'.

        Raises:
            ValueError: if `task_name` is not one of the supported names.
        """
        # Validate up front with a real exception: an `assert` disappears
        # under `python -O`, and nothing should be created or downloaded for
        # an unsupported task.
        try:
            task_name = self._TASK_ALIASES[task_name]
        except KeyError:
            raise ValueError(
                'unsupported GLUE task %r; expected one of %s'
                % (task_name, sorted(self._TASK_ALIASES))) from None
        mkdir(self.save_path)
        # NOTE(security): this downloads a script from a gist (pinned to a
        # fixed revision in the URL) and then imports and executes it.
        wget.download(
            'https://gist.githubusercontent.com/roclark/9ab385e980c5bdb9e15ecad5963848e0/raw/c9dcc44a6e1336d2411e3333c25bcfd507c39c81/download_glue_data.py',
            out=self.save_path,
        )
        sys.path.append(self.save_path)
        try:
            import download_glue_data
            download_glue_data.main(
                ['--data_dir', self.save_path, '--tasks', task_name])
        finally:
            # Undo the temporary sys.path entry even when the download fails.
            sys.path.pop()
47 | 


--------------------------------------------------------------------------------
/CAME/data/NVIDIAPretrainedWeightDownloader.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
 2 | # Licensed under the Apache License, Version 2.0 (the "License");
 3 | # you may not use this file except in compliance with the License.
 4 | # You may obtain a copy of the License at
 5 | #
 6 | #     http://www.apache.org/licenses/LICENSE-2.0
 7 | #
 8 | # Unless required by applicable law or agreed to in writing, software
 9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | import os
15 | 
class NVIDIAPretrainedWeightDownloader:
    """Placeholder downloader for NVIDIA pretrained weights.

    Only the target directory ``<save_path>/nvidia_pretrained_weights`` is
    prepared; the download itself is not implemented.
    """

    def __init__(self, save_path):
        self.save_path = save_path + '/nvidia_pretrained_weights'

        # exist_ok avoids the race of a separate exists() check.
        os.makedirs(self.save_path, exist_ok=True)

    def download(self):
        """Always raise: there is no download implementation yet.

        :raises NotImplementedError: unconditionally. A real exception is used
            instead of ``assert False`` so the call cannot turn into a silent
            no-op under ``python -O``.
        """
        raise NotImplementedError(
            'NVIDIAPretrainedWeightDownloader not implemented yet.')


--------------------------------------------------------------------------------
/CAME/data/SquadDownloader.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
 2 | # Licensed under the Apache License, Version 2.0 (the "License");
 3 | # you may not use this file except in compliance with the License.
 4 | # You may obtain a copy of the License at
 5 | #
 6 | #     http://www.apache.org/licenses/LICENSE-2.0
 7 | #
 8 | # Unless required by applicable law or agreed to in writing, software
 9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | import bz2
15 | import os
16 | import urllib.request
17 | import sys
18 | 
class SquadDownloader:
    """Download the SQuAD v1.1 / v2.0 data files and evaluation scripts.

    Files are stored under ``<save_path>/squad/v1.1`` and
    ``<save_path>/squad/v2.0``.
    """

    def __init__(self, save_path):
        self.save_path = save_path + '/squad'

        # Create the root and both version sub-directories up front.
        for subdir in ('', '/v1.1', '/v2.0'):
            os.makedirs(self.save_path + subdir, exist_ok=True)

        # Source URL -> destination path relative to self.save_path.
        self.download_urls = {
            'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json' : 'v1.1/train-v1.1.json',
            'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json' : 'v1.1/dev-v1.1.json',
            'https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/' : 'v1.1/evaluate-v1.1.py',
            'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json' : 'v2.0/train-v2.0.json',
            'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json' : 'v2.0/dev-v2.0.json',
            'https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/' : 'v2.0/evaluate-v2.0.py',
        }

    def download(self):
        """Fetch every entry of ``self.download_urls`` that is not on disk yet.

        Each file is streamed to ``<target>.part`` and renamed into place only
        after the transfer completed, so an interrupted download is never
        mistaken for a finished one on the next run.
        """
        for url, relative_path in self.download_urls.items():
            target = self.save_path + '/' + relative_path

            print('Downloading:', url)
            if os.path.isfile(target):
                print('** Download file already exists, skipping download')
                continue

            partial = target + '.part'
            try:
                # Both the response and the file are closed by the with-block.
                with urllib.request.urlopen(url) as response, \
                        open(partial, "wb") as handle:
                    # Copy in 1 MiB chunks instead of buffering the whole body.
                    for chunk in iter(lambda: response.read(1 << 20), b''):
                        handle.write(chunk)
                os.replace(partial, target)
            finally:
                # Only still present when the transfer or rename failed.
                if os.path.exists(partial):
                    os.remove(partial)
53 | 
54 | 
55 | 


--------------------------------------------------------------------------------
/CAME/data/WikiDownloader.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
 2 | # Licensed under the Apache License, Version 2.0 (the "License");
 3 | # you may not use this file except in compliance with the License.
 4 | # You may obtain a copy of the License at
 5 | #
 6 | #     http://www.apache.org/licenses/LICENSE-2.0
 7 | #
 8 | # Unless required by applicable law or agreed to in writing, software
 9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | import bz2
15 | import os
16 | import urllib.request
17 | import subprocess
18 | import sys
19 | import subprocess
20 | 
class WikiDownloader:
    """Download and decompress a Wikipedia article dump for one language.

    Files are stored under ``<save_path>/wikicorpus_<language>``.
    """

    def __init__(self, language, save_path):
        self.save_path = save_path + '/wikicorpus_' + language

        os.makedirs(self.save_path, exist_ok=True)

        self.language = language
        # Use a mirror from https://dumps.wikimedia.org/mirrors.html if the below links do not work
        self.download_urls = {
            'en' : 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
            'zh' : 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
        }

        self.output_files = {
            'en' : 'wikicorpus_en.xml.bz2',
            'zh' : 'wikicorpus_zh.xml.bz2'
        }

    def download(self):
        """Download the dump with ``wget`` (unless present) and unpack it.

        :raises NotImplementedError: if ``self.language`` has no download URL.
        :raises RuntimeError: if ``wget`` exits with a non-zero status.
        :raises subprocess.CalledProcessError: if ``bzip2`` fails.
        """
        # A real exception instead of ``assert False``: asserts are stripped
        # under ``python -O``, which would make this a silent no-op.
        if self.language not in self.download_urls:
            raise NotImplementedError(
                'WikiDownloader not implemented for this language yet.')

        url = self.download_urls[self.language]
        filename = self.output_files[self.language]
        archive = self.save_path + '/' + filename

        print('Downloading:', url)
        if os.path.isfile(archive):
            print('** Download file already exists, skipping download')
        else:
            cmd = ['wget', url, '--output-document={}'.format(archive)]
            print('Running:', cmd)
            status = subprocess.run(cmd)
            if status.returncode != 0:
                # --output-document creates the file even when the transfer
                # fails; remove it so the next run does not skip the download.
                if os.path.exists(archive):
                    os.remove(archive)
                raise RuntimeError('Wiki download not successful')

        # Always unzipping since this is relatively fast and will overwrite.
        # -f is what makes bzip2 overwrite a previously extracted file; the
        # argument list (no shell) keeps paths with spaces or shell
        # metacharacters intact.
        print('Unzipping:', filename)
        subprocess.run(['bzip2', '-dkf', archive], check=True)
62 | 


--------------------------------------------------------------------------------
/CAME/data/WikicorpusTextFormatting.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
 2 | # Licensed under the Apache License, Version 2.0 (the "License");
 3 | # you may not use this file except in compliance with the License.
 4 | # You may obtain a copy of the License at
 5 | #
 6 | #     http://www.apache.org/licenses/LICENSE-2.0
 7 | #
 8 | # Unless required by applicable law or agreed to in writing, software
 9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | import glob
15 | import os
16 | 
class WikicorpusTextFormatting:
    """Merge extracted ``wiki_*`` files into a one-article-per-line text file."""

    def __init__(self, wiki_path, output_filename, recursive = False):
        """
        :param wiki_path: directory whose immediate sub-directories hold the
            ``wiki_*`` files.
        :param output_filename: path of the merged text file to write.
        :param recursive: forwarded to ``glob.glob`` when listing ``wiki_*``
            files. NOTE(review): per the glob documentation this flag only
            affects patterns containing ``**``, which this pattern does not.
        """
        self.wiki_path = wiki_path
        self.recursive = recursive
        self.output_filename = output_filename

    # This puts one article per line
    def merge(self):
        """Write every article found under ``wiki_path`` to the output file.

        Each article becomes one line (its lines joined with single spaces)
        followed by an empty line. Files are read and written as UTF-8
        explicitly so the result does not depend on the platform locale.
        """
        with open(self.output_filename, mode='w', newline='\n',
                  encoding='utf-8') as ofile:
            for dirname in glob.glob(self.wiki_path + '/*/', recursive=False):
                for filename in glob.glob(dirname + 'wiki_*', recursive=self.recursive):
                    print(filename)
                    self._merge_file(filename, ofile)

    @staticmethod
    def _merge_file(filename, ofile):
        """Append the articles of one ``wiki_*`` file to ``ofile``."""
        article_lines = []
        article_open = False

        with open(filename, mode='r', newline='\n', encoding='utf-8') as file:
            for line in file:
                if '<doc id=' in line:
                    article_open = True
                elif '</doc>' in line:
                    article_open = False
                    # The first collected line is dropped (presumably the
                    # article title - confirm against the extractor output);
                    # empty lines are skipped.
                    for oline in article_lines[1:]:
                        if oline != '\n':
                            ofile.write(oline.rstrip() + " ")
                    ofile.write("\n\n")
                    article_lines = []
                elif article_open:
                    article_lines.append(line)


--------------------------------------------------------------------------------
/CAME/data/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
 2 | # Licensed under the Apache License, Version 2.0 (the "License");
 3 | # you may not use this file except in compliance with the License.
 4 | # You may obtain a copy of the License at
 5 | #
 6 | #     http://www.apache.org/licenses/LICENSE-2.0
 7 | #
 8 | # Unless required by applicable law or agreed to in writing, software
 9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 


--------------------------------------------------------------------------------
/CAME/data/__pycache__/TextSharding.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/CAME/data/__pycache__/TextSharding.cpython-36.pyc


--------------------------------------------------------------------------------
/CAME/data/create_datasets_from_start.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
# Usage: create_datasets_from_start.sh [wiki_only|wiki_books]
# Downloads, formats, shards and converts the pre-training corpora to HDF5.
# BERT_PREP_WORKING_DIR must be set in the environment; it is used below to
# locate the vocabulary file.
to_download=${1:-"wiki_only"}

BERT_PREP=/workspace/bert/data/bertPrep.py
VOCAB_FILE="$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"

#Download
if [ "$to_download" = "wiki_books" ] ; then
    python3 "$BERT_PREP" --action download --dataset bookscorpus
fi

python3 "$BERT_PREP" --action download --dataset wikicorpus_en
python3 "$BERT_PREP" --action download --dataset google_pretrained_weights  # Includes vocab
python3 "$BERT_PREP" --action download --dataset squad
python3 "$BERT_PREP" --action download --dataset mrpc
python3 "$BERT_PREP" --action download --dataset sst-2

# Properly format the text files
if [ "$to_download" = "wiki_books" ] ; then
    python3 "$BERT_PREP" --action text_formatting --dataset bookscorpus
fi
python3 "$BERT_PREP" --action text_formatting --dataset wikicorpus_en

# Select the dataset name used by the sharding and HDF5 steps
if [ "$to_download" = "wiki_books" ] ; then
    DATASET="books_wiki_en_corpus"
else
    DATASET="wikicorpus_en"
fi

# Shard the text files
python3 "$BERT_PREP" --action sharding --dataset "$DATASET"

# Create HDF5 files Phase 1
python3 "$BERT_PREP" --action create_hdf5_files --dataset "$DATASET" --max_seq_length 128 \
--max_predictions_per_seq 20 --vocab_file "$VOCAB_FILE" --do_lower_case 1

# Create HDF5 files Phase 2
python3 "$BERT_PREP" --action create_hdf5_files --dataset "$DATASET" --max_seq_length 512 \
--max_predictions_per_seq 80 --vocab_file "$VOCAB_FILE" --do_lower_case 1
52 | 


--------------------------------------------------------------------------------
/CAME/data/shard.py:
--------------------------------------------------------------------------------
 1 | import TextSharding
 2 | 
 3 | # Segmentation is here because all datasets look the same in one article/book/whatever per line format, and
 4 | # it seemed unnecessarily complicated to add an additional preprocessing step to call just for this.
 5 | # Different languages (e.g., Chinese simplified/traditional) may require translation and
 6 | # other packages to be called from here -- just add a conditional branch for those extra steps
 7 | segmenter = TextSharding.NLTKSegmenter()
 8 | sharding = TextSharding.Sharding(['/home/ma-user/work/Old_BERT/data/origin/wiki_sliced/wiki_00', '/home/ma-user/work/Old_BERT/data/origin/wiki_sliced/wiki_01'], '/home/ma-user/work/Old_BERT/data/origin/wiki_sliced', 256, 256, 0.1)
 9 | 
10 | sharding.load_articles()
11 | sharding.segment_articles_into_sentences(segmenter)
12 | sharding.distribute_articles_over_shards()
13 | sharding.write_shards_to_disk()
14 | 


--------------------------------------------------------------------------------
/CAME/data/wikiextractor/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will upload a Python Package using Twine when a release is created
 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
 3 | 
 4 | name: Upload Python Package
 5 | 
 6 | on:
 7 |   release:
 8 |     types: [created]
 9 | 
10 | jobs:
11 |   deploy:
12 | 
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |     - uses: actions/checkout@v2
17 |     - name: Set up Python
18 |       uses: actions/setup-python@v2
19 |       with:
20 |         python-version: '3.x'
21 |     - name: Install dependencies
22 |       run: |
23 |         python -m pip install --upgrade pip
24 |         pip install setuptools wheel twine
25 |     - name: Build and publish
26 |       env:
27 |         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
28 |         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
29 |       run: |
30 |         python setup.py sdist bdist_wheel
31 |         twine upload dist/*
32 | 


--------------------------------------------------------------------------------
/CAME/data/wikiextractor/.gitignore:
--------------------------------------------------------------------------------
 1 | *.py[cod]
 2 | 
 3 | # C extensions
 4 | *.so
 5 | 
 6 | # Packages
 7 | *.egg
 8 | *.egg-info
 9 | dist
10 | build
11 | eggs
12 | parts
13 | bin
14 | var
15 | sdist
16 | develop-eggs
17 | .installed.cfg
18 | lib
19 | lib64
20 | 
21 | # Installer logs
22 | pip-log.txt
23 | 
24 | # Unit test / coverage reports
25 | .coverage
26 | .tox
27 | nosetests.xml
28 | 
29 | # Translations
30 | *.mo
31 | 
32 | # Mr Developer
33 | .mr.developer.cfg
34 | .project
35 | .pydevproject
36 | 
37 | /docs/_build
38 | .idea
39 | *.iml
40 | 
41 | .travis-solo
42 | G*
43 | *.db
44 | *.mdb
45 | 
46 | # Vim
47 | [._]*.s[a-w][a-z]
48 | [._]s[a-w][a-z]
49 | *.un~
50 | Session.vim
51 | .netrwhist
52 | *~
53 | 


--------------------------------------------------------------------------------
/CAME/data/wikiextractor/extract.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # NOTES
 4 | #
 5 | # - Must expand templates to avoid a large loss of content.
 6 | # - Text will not (redundantly) contain the title string.
 7 | # - Keep sections. Section title will be marked by "Section::::".
 8 | # - Keep lists. List bullets will be marked by "BULLET::::".
 9 | # - Keep tables. They're mostly garbage but can be removed later (remove "^!*").
10 | # - Remove disambiguation pages. Right now there is no use for them.
11 | 
# Usage: extract.sh INPUT PROCESSES TEMPLATES OUTPUT
INPUT=$1
PROCESSES=$2
TEMPLATES=$3
OUTPUT=$4

# "python -m" takes a module name, not a file name, so there must be no ".py"
# suffix. Arguments are quoted so paths containing spaces survive.
python -m wikiextractor.WikiExtractor "$INPUT" \
       --json \
       --processes "$PROCESSES" \
       --templates "$TEMPLATES" \
       --output "$OUTPUT" \
       --bytes 1M \
       --compress \
       --links \
       --sections \
       --lists \
       --keep_tables \
       --min_text_length 0 \
       --filter_disambig_pages
30 | 


--------------------------------------------------------------------------------
/CAME/data/wikiextractor/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | import re
 3 | 
 4 | from wikiextractor.WikiExtractor import __version__
 5 | 
 6 | 
 7 | def get_version(version):
 8 |     if re.match(r'^\d+\.\d+$', version):
 9 |         return version + '.0'
10 |     return version
11 | 
# Read the long description as UTF-8 explicitly; the locale default encoding
# can fail on non-ASCII characters in README.md on some platforms.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setup(
    name='wikiextractor',
    version=get_version(__version__),
    author='Giuseppe Attardi',
    author_email='attardi@gmail.com',
    description='A tool for extracting plain text from Wikipedia dumps',
    long_description=long_description,
    long_description_content_type="text/markdown",
    license='GNU Affero General Public License',
    install_requires=[],
    url="https://github.com/attardi/wikiextractor",
    packages=find_packages(include=["wikiextractor"]),
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Intended Audience :: Developers',
        'Topic :: Text Processing :: Linguistic',
        'License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)',
        'Programming Language :: Python :: 3'
    ],
    # Command-line entry points installed with the package.
    entry_points={
        "console_scripts": [
            "wikiextractor = wikiextractor.WikiExtractor:main",
            "extractPage = wikiextractor.extractPage:main",
        ]
    },
    python_requires='>=3.6',
)
42 | 


--------------------------------------------------------------------------------
/CAME/data/wikiextractor/wikiextractor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/CAME/data/wikiextractor/wikiextractor/__init__.py


--------------------------------------------------------------------------------
/CAME/data/wikiextractor/wikiextractor/clean.py:
--------------------------------------------------------------------------------
 1 | # =============================================================================
 2 | #  Copyright (c) 2020. Giuseppe Attardi (attardi@di.unipi.it).
 3 | # =============================================================================
 4 | #  This file is part of Tanl.
 5 | #
 6 | #  Tanl is free software; you can redistribute it and/or modify it
 7 | #  under the terms of the GNU Affero General Public License, version 3,
 8 | #  as published by the Free Software Foundation.
 9 | #
10 | #  Tanl is distributed in the hope that it will be useful,
11 | #  but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | #  GNU Affero General Public License for more details.
14 | #
15 | #  You should have received a copy of the GNU Affero General Public License
16 | #  along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 | # =============================================================================
18 | 
19 | from wikiextractor.extract import Extractor, ignoreTag, resetIgnoredTags
20 | 
21 | 
def clean_markup(markup, keep_links=False, ignore_headers=True):
    """
    Clean Wikimarkup to produce plaintext.

    :param markup: the Wikimarkup text to clean.
    :param keep_links: Set to True to keep internal and external links
    :param ignore_headers: if set to True, the output list will not contain
    headers (entries starting with ``'## '``), only paragraphs.

    Returns a list of paragraphs (unicode strings).
    """

    if not keep_links:
        ignoreTag('a')

    try:
        extractor = Extractor(0, '', [])

        # returns a list of strings
        paragraphs = extractor.clean_text(markup,
                                          mark_headers=True,
                                          expand_templates=False,
                                          escape_doc=True)
    finally:
        # Always restore the module-level ignored-tag state, even when
        # cleaning raises, so one failure does not affect later calls.
        resetIgnoredTags()

    if ignore_headers:
        # Build a real list: ``filter`` is a lazy one-shot iterator on
        # Python 3, which contradicts the documented return type.
        paragraphs = [s for s in paragraphs if not s.startswith('## ')]

    return paragraphs
49 | 


--------------------------------------------------------------------------------
/CAME/memory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/CAME/memory.png


--------------------------------------------------------------------------------
/CAME/processors/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/CAME/processors/__init__.py


--------------------------------------------------------------------------------
/CAME/requirements.txt:
--------------------------------------------------------------------------------
 1 | # progress bars in model download and training scripts
 2 | tqdm
 3 | # Accessing files from S3 directly.
 4 | boto3==1.15
 5 | # Used for downloading models over HTTP
 6 | requests
 7 | six
 8 | ipdb
 9 | #Data processing
10 | h5py
11 | nltk
12 | progressbar
13 | #Others
14 | 


--------------------------------------------------------------------------------
/CAME/run_came_pretraining.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # num_eval_examples too large will cause 'unable to write to file' https://github.com/pytorch/pytorch/issues/2926
 4 | 
 5 | python -m torch.distributed.launch --nproc_per_node=8 \
 6 |     /home/ma-user/work/Old_BERT/run_pretraining.py \
 7 |     --seed=12439 \
 8 |     --do_train \
 9 |     --do_eval \
10 | 		--optimizer=came \
11 |     --config_file=/home/ma-user/work/Old_BERT/bert_large_config.json \
12 |     --output_dir=/cache/results \
13 |      --fp16 \
14 |     --allreduce_post_accumulation \
15 |     --allreduce_post_accumulation_fp16 \
16 |     --gradient_accumulation_steps=256 \
17 | 	 --bert_model=bert-large-uncased \
18 |     --log_freq=1 \
19 |     --train_batch_size=4096 \
20 | 	--dev_batch_size=64 \
21 |     --learning_rate=0.00024 \
22 |     --warmup_proportion=0.2 \
23 | 	 --num_steps_per_checkpoint=5 \
24 |     --input_dir=/cache/data/train_data \
25 |     --dev_dir=/cache/data/dev_data \
26 |     --phase2 \
27 |     --max_seq_length=128 \
28 |     --max_predictions_per_seq=20 \
29 |     --max_steps=20000 \
30 |     --init_checkpoint=None \
31 |     --phase1_end_step=0 
32 | 
33 | 


--------------------------------------------------------------------------------
/CAME/run_validation.sh:
--------------------------------------------------------------------------------
 1 | python -m torch.distributed.launch --nproc_per_node=8 \
 2 |     /home/ma-user/work/Old_BERT/validation.py \
 3 |     --seed=12439 \
 4 |     --do_train \
 5 |     --config_file=/home/ma-user/work/Old_BERT/bert_large_config.json \
 6 |     --output_dir=/cache/results \
 7 |      --fp16 \
 8 | 		--optimizer=SM3 \
 9 |     --allreduce_post_accumulation \
10 |     --allreduce_post_accumulation_fp16 \
11 |     --gradient_accumulation_steps=256 \
12 | 	 --bert_model=bert-large-uncased \
13 | 	 --init_checkpoint=/cache/ckpt_9989.pt \
14 |     --log_freq=1 \
15 |     --train_batch_size=4096 \
16 | 		--dev_batch_size=64 \
17 |     --learning_rate=0.1 \
18 |     --warmup_proportion=0.1 \
19 | 	 --num_steps_per_checkpoint=5 \
20 |     --input_dir=/cache/data/train_data \
21 | 	--dev_dir=/cache/data/dev_data \
22 |     --phase2 \
23 |     --max_seq_length=128 \
24 |     --max_predictions_per_seq=20 \
25 |     --max_steps=20000 \
26 |     --init_checkpoint=None \
27 |     --phase1_end_step=0
28 | 


--------------------------------------------------------------------------------
/CAME/scripts/data_download.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
# Usage: data_download.sh [DATA_DIR]
# DATA_DIR defaults to /workspace/bert/data. It is quoted everywhere below so
# a path containing spaces is handled correctly.
DATA_DIR=${1:-/workspace/bert/data}

# Download vocab files from pretrained model
cd vocab && python3 download_models.py && rm *.zip && rm ./*/*.ckpt.*

# Download SQUAD
cd "$DATA_DIR/squad" && . squad_download.sh

# Download SWAG
git clone https://github.com/rowanz/swagaf.git "$DATA_DIR/swag"

# Download GLUE
cd "$DATA_DIR/glue" && . download_mrpc.sh

# WIKI Download
cd "$DATA_DIR/wikipedia_corpus" && . download_wikipedia.sh

# Bookcorpus  Download
cd "$DATA_DIR/bookcorpus" && . download_bookcorpus.sh

cd "$DATA_DIR"
# NOTE(review): the create_datasets_from_start.sh in CAME/data reads a single
# argument (wiki_only|wiki_books); the two-argument calls below may target an
# older version of that script - confirm.
# Create HDF5 files for WIKI
bash create_datasets_from_start.sh wikipedia_corpus ./wikipedia_corpus/wikipedia_corpus.txt \
  && rm -r ./wikipedia_corpus/final_*

# Create HDF5 files for Bookcorpus
bash create_datasets_from_start.sh bookcorpus ./bookcorpus/bookcorpus.txt \
  && rm -r ./bookcorpus/final_*

# Create HDF5 files for inter sequence-pair mixed Wikipedia and Bookcorpus
bash merge_datasets_after_creation.sh merged_wiki+books wikipedia_corpus/hdf5_shards,bookcorpus/hdf5_shards 1024
47 | 


--------------------------------------------------------------------------------
/CAME/scripts/docker/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
# Build the "bert" image from the current directory, always pulling the base
# image and bypassing the layer cache.
docker build \
  --network=host \
  --rm \
  --pull \
  --no-cache \
  -t bert \
  .
3 | 


--------------------------------------------------------------------------------
/CAME/scripts/docker/launch.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | CMD=${1:-/bin/bash}
 4 | NV_VISIBLE_DEVICES=${2:-"all"}
 5 | DOCKER_BRIDGE=${3:-"host"}
 6 | 
 7 | docker run -it --rm \
 8 |   --gpus device=$NV_VISIBLE_DEVICES \
 9 |   --net=$DOCKER_BRIDGE \
10 |   --shm-size=1g \
11 |   --ulimit memlock=-1 \
12 |   --ulimit stack=67108864 \
13 |   -e LD_LIBRARY_PATH='/workspace/install/lib/' \
14 |   -v $PWD:/workspace/bert \
15 |   -v $PWD/results:/results \
16 |   bert $CMD
17 | 


--------------------------------------------------------------------------------
/CAME/scripts/run_swag.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
# Usage: run_swag.sh INIT_CHECKPOINT [MODE] [MAX_STEPS] [BATCH_SIZE]
#                    [LEARNING_RATE] [PRECISION] [NUM_GPU] [EPOCHS]
SWAG_DIR=/workspace/bert/data/swag
OUT_DIR=/results/SWAG

mkdir -p "$OUT_DIR"

echo "Container nvidia build = " $NVIDIA_BUILD_ID

init_checkpoint=${1}
mode=${2:-"train"}
max_steps=${3:-"-1.0"} # if < 0, has no effect
batch_size=${4:-"12"}
learning_rate=${5:-"5e-6"}
precision=${6:-"fp32"}
num_gpu=${7:-"8"}
epochs=${8:-"2"}

# Anything other than training runs on a single GPU.
if [ "$mode" != "train" ] ; then
  num_gpu=1
fi

use_fp16=""
if [ "$precision" = "fp16" ] ; then
  echo "fp16 activated!"
  use_fp16="--fp16"
fi

# The "-m" flag belongs to the distributed launcher only. With one GPU the
# script is run directly; "python -m run_swag.py" is not a valid invocation.
if [ "$num_gpu" = "1" ] ; then
  mpi_command=""
else
  mpi_command="-m torch.distributed.launch --nproc_per_node=$num_gpu"
fi

CMD="python $mpi_command run_swag.py "
CMD+="--init_checkpoint=$init_checkpoint "
if [ "$mode" = "train" ] ; then
  CMD+="--do_train "
  CMD+="--train_batch_size=$batch_size "
else
  CMD+="--do_eval "
  CMD+="--eval_batch_size=$batch_size "
fi
CMD+="--do_lower_case "
CMD+="--data_dir $SWAG_DIR/data/ "
CMD+="--bert_model bert-large-uncased "
CMD+="--max_seq_length 128 "
CMD+="--learning_rate $learning_rate "
CMD+="--num_train_epochs $epochs "
CMD+="--max_steps $max_steps "
CMD+="--output_dir $OUT_DIR "
CMD+="$use_fp16"

LOGFILE=$OUT_DIR/logfile
# $CMD is intentionally unquoted so it splits into the command and its args.
$CMD |& tee "$LOGFILE"

# Turn progress-bar carriage returns and "[A" cursor-up sequences into line
# breaks. The carriage return is written as the \r escape (GNU sed) instead
# of a raw control character embedded in the script.
sed -r 's/\r|(\[A)/\n/g' "$LOGFILE" > "$LOGFILE.edit"

# Last reported iteration speed, e.g. "1.23it/s" or "0.81s/it".
throughput=$(grep -E 'Iteration.*[0-9.]+(s/it|it/s)' "$LOGFILE.edit" | tail -1 | grep -Eo '[0-9.]+(s/it|it/s)')

echo "throughput: $throughput"
77 | 


--------------------------------------------------------------------------------
/CAME/startup_came.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Startup script to run on the cloud
 3 | """
 4 | 
 5 | import moxing
 6 | import os
 7 | import argparse
 8 | import logging
 9 | 
10 | # install libraries
11 | 
os.environ["NUMBA_NUM_THREADS"] = '1'


def _pip_install(requirement):
    """Install one requirement with pip; report a failure but keep going."""
    status = os.system('pip install ' + requirement)
    if status != 0:
        # Installation stays best-effort, but a failure is no longer silent.
        print('pip install failed for', requirement, '- status:', status)
    return status


for _requirement in ('setuptools==59.0.1', 'torchmetrics==0.7.1'):
    _pip_install(_requirement)
print('Install torch finished...')

for _requirement in (
        'pyarrow==2.0.0',
        'tqdm',
        'h5py',
        'onnxruntime==1.0.0',
        'boto3==1.15.0',
        'torch-optimizer==0.0.1a16',
        'torch-SM3',
):
    _pip_install(_requirement)

logging.info("finish install tqdm and h5py")


# Environment report only: import problems are printed, not fatal.
try:
    import torch
    print('Import torch success...')
    print('torch version: ', torch.__version__)
    print('cuda status: ', torch.cuda.is_available())
    import apex
    print('Import apex success...')
    import amp_C
    print('Import amp_C success...')
    import apex_C
    print('Import apex_C success...')
except Exception as e:
    print('Some failure...', e)


# The parsed options are only printed; the training script below is started
# without them.
parser = argparse.ArgumentParser()
parser.add_argument('--data_url', type=str)
parser.add_argument('--train_url', type=str)
parser.add_argument('--batch_size', type=int)
parser.add_argument('--learning_rate', type=float)
parser.add_argument('--max_steps', type=int)
parser.add_argument('--gradient_accumulation_steps', type=int)


args, unparsed = parser.parse_known_args()
print(args, unparsed)

# download data

moxing.file.copy_parallel('/pretraining_data/train_data','/cache/data/train_data' )
moxing.file.copy_parallel('/pretraining_data/dev_all_data','/cache/data/dev_data' )

# run program
# NOTE(review): this path is under .../work/BERT/ while run_came_pretraining.sh
# itself refers to .../work/Old_BERT/ - confirm the deployed layout.
_train_status = os.system('bash /home/ma-user/work/BERT/run_came_pretraining.sh')
if _train_status != 0:
    # Surface a failed training run instead of exiting successfully.
    raise SystemExit(
        'run_came_pretraining.sh failed with status {}'.format(_train_status))
65 | 


--------------------------------------------------------------------------------
/CAME/triton/evaluate.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
# End-to-end accuracy check: export a checkpoint for Triton, start the
# server container, run the SQuAD client against it and score the output.
#
# Positional arguments (all optional):
#   $1  bert_model       model size tag used in the model name (default "large")
#   $2  precision        precision tag (default "fp32")
#   $3  init_checkpoint  checkpoint to export
#   $4  EXPORT_FORMAT    deployer export format flag (default "ts-script")

export TRITON_MODEL_OVERWRITE=True
# Not exported: child scripts only see this value when it is passed explicitly.
NV_VISIBLE_DEVICES=0

bert_model=${1:-"large"}
precision=${2:-"fp32"}
init_checkpoint=${3:-"/workspace/bert/checkpoints/bert_qa.pt"}
EXPORT_FORMAT=${4:-"ts-script"}

MODEL_NAME="bert_${bert_model}_${precision}"
BERT_DIR="/workspace/bert"
VOCAB_FILE="/workspace/bert/vocab/vocab"
PREDICT_FILE="/workspace/bert/data/squad/v1.1/dev-v1.1.json"
SQUAD_DIR="/workspace/bert/data/squad/v1.1"
OUT_DIR="/results"
BATCH_SIZE="8"
# Create common bridge for client and server
BRIDGE_NAME="tritonnet"
docker network create ${BRIDGE_NAME}

# Arguments 4..11 of export_model.sh, in order: batch size, BERT dir, export
# format, precision, model version, model name, dynamic batching delay,
# engine count.
EXPORT_MODEL_ARGS="${BATCH_SIZE} ${BERT_DIR} ${EXPORT_FORMAT} ${precision} 1 ${MODEL_NAME} 0 1"

# Clean up: stop the server container and remove the bridge network.
cleanup() {
    docker kill trt_server_cont
    docker network rm ${BRIDGE_NAME}
}
# The same handler is registered for normal exit and for SIGTERM.
trap cleanup EXIT
trap cleanup SIGTERM

./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${EXPORT_MODEL_ARGS} ${TRITON_MODEL_OVERWRITE}

# Start Server
echo Starting server...
# NOTE(review): launch_triton_server.sh as shown in this repository reads only
# its first argument (the bridge) and the NV_VISIBLE_DEVICES environment
# variable; the --NV_VISIBLE_DEVICES=... argument appears to be unused there.
SERVER_ID=$( ./triton/launch_triton_server.sh ${BRIDGE_NAME} --NV_VISIBLE_DEVICES=$NV_VISIBLE_DEVICES )
# SERVER_IP is computed but not referenced again in this script.
SERVER_IP=$( docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' ${SERVER_ID} )

# Called without a host argument, so the wait script polls its default host.
./triton/wait_for_triton_server.sh

CMD="python triton/run_squad_client.py \
    --model_name ${MODEL_NAME} \
    --do_lower_case \
    --vocab_file ${VOCAB_FILE} \
    --output_dir ${OUT_DIR} \
    --predict_file ${PREDICT_FILE} \
    --batch_size ${BATCH_SIZE}"

# Run the client inside the project container.
bash scripts/docker/launch.sh "${CMD}"

# Score the predictions written by the client against the dev set.
bash scripts/docker/launch.sh "python ${SQUAD_DIR}/evaluate-v1.1.py ${PREDICT_FILE} ${OUT_DIR}/predictions.json"
65 | 


--------------------------------------------------------------------------------
/CAME/triton/export_model.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License. 
15 | 
# Build a deployer command line that exports a BERT QA checkpoint into
# /results/triton_models, then run it through scripts/docker/launch.sh.
#
# Positional arguments (all optional, defaults shown on each line):
#   $1 GPU selection, $2 docker network, $3 checkpoint, $4 max batch size,
#   $5 BERT directory (must contain bert_config.json), $6 export format flag,
#   $7 precision, $8 model version, $9 model name, $10 dynamic batching delay,
#   $11 engine count, $12 overwrite flag.
NV_VISIBLE_DEVICES=${1:-"0"}
DOCKER_BRIDGE=${2:-"host"}
checkpoint=${3:-"/workspace/bert/checkpoints/bert_qa.pt"}
batch_size=${4:-"8"}
BERT_DIR=${5:-"/workspace/bert"}
EXPORT_FORMAT=${6:-"ts-script"}
precision=${7:-"fp16"}
triton_model_version=${8:-1}
triton_model_name=${9:-"bertQA-ts-script"}
triton_dyn_batching_delay=${10:-0}
triton_engine_count=${11:-1}
# Assigned here but not referenced anywhere else in this script.
triton_model_overwrite=${12:-"False"}

PREDICT_FILE="/workspace/bert/data/squad/v1.1/dev-v1.1.json"

DEPLOYER="deployer.py"

# Deployer options; EXPORT_FORMAT is passed through as a bare --<format> flag.
CMD="python triton/${DEPLOYER} \
    --${EXPORT_FORMAT} \
    --save-dir /results/triton_models \
    --triton-model-name ${triton_model_name} \
    --triton-model-version ${triton_model_version} \
    --triton-max-batch-size ${batch_size} \
    --triton-dyn-batching-delay ${triton_dyn_batching_delay} \
    --triton-engine-count ${triton_engine_count} "

# Everything after the bare "--" is a second group of options describing the
# model (checkpoint, config, vocabulary, data).
CMD+="-- --checkpoint ${checkpoint} \
    --config_file ${BERT_DIR}/bert_config.json \
    --vocab_file /workspace/bert/vocab/vocab \
    --predict_file ${PREDICT_FILE} \
    --do_lower_case \
    --batch_size=${batch_size} "

# Only fp16 adds a flag; any other precision value leaves the command as is.
if [[ $precision == "fp16" ]]; then
    CMD+="--fp16 "
fi

bash scripts/docker/launch.sh "${CMD}" ${NV_VISIBLE_DEVICES} ${DOCKER_BRIDGE}
54 | 


--------------------------------------------------------------------------------
/CAME/triton/launch_triton_server.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License. 
15 | 
16 | DOCKER_BRIDGE=${1:-"bridge"}
17 | NV_VISIBLE_DEVICES=${NV_VISIBLE_DEVICES:-"0"}
18 | 
19 | # Start TRITON server in detached state
20 | docker run -d --rm \
21 |    --gpus device=${NV_VISIBLE_DEVICES} \
22 |    --shm-size=1g \
23 |    --ulimit memlock=-1 \
24 |    --ulimit stack=67108864 \
25 |    --network=${DOCKER_BRIDGE} \
26 |    -p 8000:8000 \
27 |    -p 8001:8001 \
28 |    -p 8002:8002 \
29 |    --name trt_server_cont \
30 |    -v $PWD/results/triton_models:/models \
31 |    nvcr.io/nvidia/tritonserver:20.06-v1-py3 trtserver --model-store=/models --log-verbose=1
32 | 


--------------------------------------------------------------------------------
/CAME/triton/profiling_data_int64/input__0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/CAME/triton/profiling_data_int64/input__0


--------------------------------------------------------------------------------
/CAME/triton/run_perf_client.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License. 
15 | 
# Run perf_client against a Triton server and write a CSV report under
# /results/perf_client/<model name>/.
#
# Positional arguments (all optional, defaults shown on each line):
#   $1 model name, $2 model version, $3 precision, $4 batch size,
#   $5 latency limit, $6 max client threads, $7 max concurrency,
#   $8 server hostname (without port), $9 docker network, $10 results id
#   appended to the CSV name, $11 profiling data directory,
#   $12 GPU selection used when this script launches the server itself.
MODEL_NAME=${1:-"bert"}
MODEL_VERSION=${2:-1}
precision=${3:-"fp32"}
BATCH_SIZE=${4:-1}
MAX_LATENCY=${5:-500}
MAX_CLIENT_THREADS=${6:-10}
MAX_CONCURRENCY=${7:-50}
SERVER_HOSTNAME=${8:-"localhost"}
DOCKER_BRIDGE=${9:-"host"}
RESULTS_ID=${10:-""}
PROFILING_DATA=${11:-"triton/profiling_data_int64"}
NV_VISIBLE_DEVICES=${12:-"0"}

if [[ $SERVER_HOSTNAME == *":"* ]]; then
  echo "ERROR! Do not include the port when passing the Server Hostname. These scripts require that the TRITON HTTP endpoint is on Port 8000 and the gRPC endpoint is on Port 8001. Exiting..."
  exit 1
fi

# Only a local server is launched (and later killed) by this script; a remote
# host is assumed to be running already.
if [ "$SERVER_HOSTNAME" = "localhost" ]
then
    if [ ! "$(docker inspect -f "{{.State.Running}}" trt_server_cont)" = "true" ] ; then

        echo "Launching TRITON server"
        # launch_triton_server.sh takes the GPU selection from the
        # NV_VISIBLE_DEVICES environment variable and reads only its first
        # positional argument, so the value has to travel via the environment;
        # passing it as an extra argument left the server on the default GPU.
        NV_VISIBLE_DEVICES=$NV_VISIBLE_DEVICES bash triton/launch_triton_server.sh ${DOCKER_BRIDGE}
        SERVER_LAUNCHED=true

        function cleanup_server {
            docker kill trt_server_cont
        }

        # Ensure we cleanup the server on exit
        # trap "exit" INT TERM
        trap cleanup_server EXIT
    fi
fi

# Wait until server is up. curl on the health of the server and sleep until its ready
bash triton/wait_for_triton_server.sh $SERVER_HOSTNAME

TIMESTAMP=$(date "+%y%m%d_%H%M")

# Create model directory on host (directory /results is mounted)
bash scripts/docker/launch.sh "mkdir -p /results/perf_client/${MODEL_NAME}"
# A non-empty results id becomes a "_<id>" infix of the CSV file name.
if [ -n "${RESULTS_ID}" ];
then
    RESULTS_ID="_${RESULTS_ID}"
fi

OUTPUT_FILE_CSV="/results/perf_client/${MODEL_NAME}/results${RESULTS_ID}_${TIMESTAMP}.csv"

ARGS="\
   --max-threads ${MAX_CLIENT_THREADS} \
   -m ${MODEL_NAME} \
   -x ${MODEL_VERSION} \
   -p 3000 \
   -d \
   -v \
   -i gRPC \
   -u ${SERVER_HOSTNAME}:8001 \
   -b ${BATCH_SIZE} \
   -l ${MAX_LATENCY} \
   -c ${MAX_CONCURRENCY} \
   -f ${OUTPUT_FILE_CSV} \
   --input-data ${PROFILING_DATA}"

# Print one option per line for readability.
echo "Using args:  $(echo "$ARGS" | sed -e 's/   -/\n-/g')"
bash scripts/docker/launch.sh "/workspace/install/bin/perf_client $ARGS" all $DOCKER_BRIDGE
83 | 


--------------------------------------------------------------------------------
/CAME/triton/wait_for_triton_server.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License. 
15 | 
# Block until the server at $1 (default "localhost") answers HTTP 200 on both
# its liveness and readiness endpoints on port 8000. There is no timeout: the
# loop polls once per second for as long as either check fails.
SERVER_URI=${1:-"localhost"}

echo "Waiting for TRITON Server to be ready at http://$SERVER_URI:8000..."

# Each command prints only the HTTP status code (-w %{http_code}); the response
# body is discarded and every request is limited to one second (-m 1).
live_command="curl -i -m 1 -L -s -o /dev/null -w %{http_code} http://$SERVER_URI:8000/api/health/live"
ready_command="curl -i -m 1 -L -s -o /dev/null -w %{http_code} http://$SERVER_URI:8000/api/health/ready"

current_status=$($live_command)
echo $current_status

# Loop until the liveness check returns 200 and the readiness check returns
# 200 as well. The readiness request is only issued once liveness has passed.
while [[ ${current_status} != "200" ]] || [[ $($ready_command) != "200" ]]; do

   printf "."
   sleep 1
   current_status=$($live_command)
done

echo "TRITON Server is ready!"
35 | 


--------------------------------------------------------------------------------
/CAME/utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
 2 | # Licensed under the Apache License, Version 2.0 (the "License");
 3 | # you may not use this file except in compliance with the License.
 4 | # You may obtain a copy of the License at
 5 | #
 6 | #     http://www.apache.org/licenses/LICENSE-2.0
 7 | #
 8 | # Unless required by applicable law or agreed to in writing, software
 9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | import torch
15 | import torch.distributed as dist
16 | 
17 | from pathlib import Path
18 | 
19 | 
def get_rank():
    """Return the rank reported by torch.distributed.

    Falls back to 0 when the distributed package is unavailable or no
    process group has been initialized.
    """
    distributed_active = dist.is_available() and dist.is_initialized()
    return dist.get_rank() if distributed_active else 0
26 | 
27 | 
def get_world_size():
    """Return the world size reported by torch.distributed.

    Falls back to 1 when the distributed package is unavailable or no
    process group has been initialized.
    """
    distributed_active = dist.is_available() and dist.is_initialized()
    return dist.get_world_size() if distributed_active else 1
34 | 
35 | 
def is_main_process():
    """Return True when this process has rank 0."""
    rank = get_rank()
    return rank == 0
38 | 
39 | 
def barrier():
    """Call dist.barrier(); do nothing outside an initialized distributed run."""
    if not dist.is_available():
        return
    if not dist.is_initialized():
        return
    dist.barrier()
43 | 
44 | 
def format_step(step):
    """Render a step descriptor as a human-readable prefix.

    A string is returned unchanged. Otherwise ``step`` is treated as a
    sequence whose first three entries are labelled, in order, as training
    epoch, training iteration and validation iteration; missing entries are
    omitted and entries beyond the third are ignored.
    """
    if isinstance(step, str):
        return step
    templates = (
        "Training Epoch: {} ",
        "Training Iteration: {} ",
        "Validation Iteration: {} ",
    )
    pieces = []
    for position, template in enumerate(templates):
        if len(step) > position:
            pieces.append(template.format(step[position]))
    return "".join(pieces)
56 | 
57 | 
def mkdir(path):
    """Create ``path`` and any missing parents; an existing directory is fine."""
    target = Path(path)
    target.mkdir(exist_ok=True, parents=True)
60 | 
61 | 
def mkdir_by_main_process(path):
    """Create ``path`` on the rank-0 process only, then call barrier() on every process."""
    main_process = is_main_process()
    if main_process:
        mkdir(path)
    barrier()
66 | 


--------------------------------------------------------------------------------
/CeMAT/CeMAT_maskPredict/LICENSE:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/CeMAT/CeMAT_maskPredict/LICENSE


--------------------------------------------------------------------------------
/CeMAT/CeMAT_maskPredict/__init__.py:
--------------------------------------------------------------------------------
1 | from .criterions import *
2 | from .models import *
3 | from .tasks import *
4 | from .data import *


--------------------------------------------------------------------------------
/CeMAT/CeMAT_maskPredict/criterions/__init__.py:
--------------------------------------------------------------------------------
1 | from .label_smoothed_length_cross_entropy import *
2 | 


--------------------------------------------------------------------------------
/CeMAT/CeMAT_maskPredict/data/__init__.py:
--------------------------------------------------------------------------------
1 | from .language_pair_self_dataset_mask import *


--------------------------------------------------------------------------------
/CeMAT/CeMAT_maskPredict/meters.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2017-present, Facebook, Inc.
 2 | # All rights reserved.
 3 | #
 4 | # This source code is licensed under the license found in the LICENSE file in
 5 | # the root directory of this source tree. An additional grant of patent rights
 6 | # can be found in the PATENTS file in the same directory.
 7 | 
 8 | import time
 9 | 
10 | 
class AverageMeter(object):
    """Computes and stores the average and current value.

    Attributes:
        val: the value passed to the most recent ``update`` call.
        sum: weighted sum of all values seen since the last ``reset``.
        count: total weight seen since the last ``reset``.
        avg: ``sum / count``; keeps its previous value while ``count`` is 0.
    """
    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics back to zero."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running average."""
        self.val = val
        self.sum += val * n
        self.count += n
        # A zero total weight (e.g. update(val, n=0) on a fresh meter) would
        # divide by zero; leave the average untouched in that case.
        if self.count != 0:
            self.avg = self.sum / self.count
27 | 
28 | 
class TimeMeter(object):
    """Tracks how many events happen per second of elapsed time."""
    def __init__(self, init=0):
        self.reset(init)

    def reset(self, init=0):
        """Restart the clock; ``init`` is an offset added to the elapsed time."""
        self.n = 0
        self.init = init
        self.start = time.time()

    def update(self, val=1):
        """Add ``val`` events to the counter."""
        self.n += val

    @property
    def elapsed_time(self):
        """The ``init`` offset plus the seconds passed since the last reset."""
        running_for = time.time() - self.start
        return self.init + running_for

    @property
    def avg(self):
        """Events counted so far divided by the elapsed time."""
        seconds = self.elapsed_time
        return self.n / seconds
49 | 
50 | 
class StopwatchMeter(object):
    """Computes the sum/avg duration of some event in seconds.

    Attributes:
        sum: total seconds accumulated by completed start/stop pairs.
        n: total number of events reported through ``stop``.
        start_time: timestamp of the pending ``start`` call, or None.
    """
    def __init__(self):
        self.reset()

    def start(self):
        """Begin timing an event."""
        self.start_time = time.time()

    def stop(self, n=1):
        """Finish the pending event and count it as ``n`` occurrences.

        Does nothing when no ``start`` call is pending.
        """
        if self.start_time is not None:
            delta = time.time() - self.start_time
            self.sum += delta
            self.n += n
            self.start_time = None

    def reset(self):
        """Discard all accumulated time and any pending start."""
        self.sum = 0
        self.n = 0
        self.start_time = None

    @property
    def avg(self):
        """Average seconds per event; ``sum`` itself when no event is counted.

        Reading the average before any ``stop`` call used to raise
        ZeroDivisionError; it now yields the accumulated sum instead (0 for a
        fresh meter).
        """
        if self.n == 0:
            return self.sum
        return self.sum / self.n
74 | 


--------------------------------------------------------------------------------
/CeMAT/CeMAT_maskPredict/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .bert_seq2seq import *


--------------------------------------------------------------------------------
/CeMAT/CeMAT_maskPredict/strategies/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2017-present, Facebook, Inc.
 2 | # All rights reserved.
 3 | #
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
 5 | # can be found in the PATENTS file in the same directory.
 6 | 
 7 | import argparse
 8 | import importlib
 9 | import os
10 | 
11 | from .decoding_strategy import DecodingStrategy
12 | 
13 | 
14 | STRATEGY_REGISTRY = {}
15 | STRATEGY_CLASS_NAMES = set()
16 | 
17 | 
def setup_strategy(args):
    """Instantiate the strategy registered under ``args.decoding_strategy``.

    The registered class is called with ``args`` as its only argument. An
    unregistered name raises KeyError.
    """
    strategy_cls = STRATEGY_REGISTRY[args.decoding_strategy]
    return strategy_cls(args)
20 | 
21 | 
def register_strategy(name):
    """Class decorator that records a decoding strategy under ``name``.

    The decorated class must extend DecodingStrategy, and neither ``name`` nor
    the class name may have been registered before; each violation raises
    ValueError. The class itself is returned unchanged.
    """
    def decorate(cls):
        if name in STRATEGY_REGISTRY:
            message = 'Cannot register duplicate strategy ({})'.format(name)
            raise ValueError(message)
        is_strategy = issubclass(cls, DecodingStrategy)
        if not is_strategy:
            raise ValueError('Strategy ({}: {}) must extend DecodingStrategy'.format(name, cls.__name__))
        cls_name = cls.__name__
        if cls_name in STRATEGY_CLASS_NAMES:
            raise ValueError('Cannot register strategy with duplicate class name ({})'.format(cls_name))
        # Both registries are updated only after every check has passed.
        STRATEGY_CLASS_NAMES.add(cls_name)
        STRATEGY_REGISTRY[name] = cls
        return cls

    return decorate
35 | 
36 | 
# automatically import any Python files in the strategies/ directory
# Importing a module runs its @register_strategy decorators, which is how
# strategies end up in STRATEGY_REGISTRY. Files whose name starts with '_'
# (including this __init__.py) are skipped.
# The package path is hard-coded, so this only resolves when the package is
# importable under the name 'CeMAT_maskPredict'.
for file in os.listdir(os.path.dirname(__file__)):
    if file.endswith('.py') and not file.startswith('_'):
        # Module name = file name up to the first occurrence of '.py'.
        strategy_name = file[:file.find('.py')]
        importlib.import_module('CeMAT_maskPredict.strategies.' + strategy_name)
42 | 


--------------------------------------------------------------------------------
/CeMAT/CeMAT_maskPredict/strategies/decoding_strategy.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2017-present, Facebook, Inc.
 2 | # All rights reserved.
 3 | #
 4 | # This source code is licensed under the license found in the LICENSE file in
 5 | # the root directory of this source tree. An additional grant of patent rights
 6 | # can be found in the PATENTS file in the same directory.
 7 | 
 8 | 
 9 | class DecodingStrategy(object):
10 |     
11 |     def generate(model, encoder_out, tgt_tokens, tgt_dict):
12 |         pass
13 | 
14 | 


--------------------------------------------------------------------------------
/CeMAT/CeMAT_maskPredict/strategies/left_to_right.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2017-present, Facebook, Inc.
 2 | # All rights reserved.
 3 | #
 4 | # This source code is licensed under the license found in the LICENSE file in
 5 | # the root directory of this source tree. An additional grant of patent rights
 6 | # can be found in the PATENTS file in the same directory.
 7 | 
 8 | import torch.nn.functional as F
 9 | 
10 | from . import register_strategy
11 | from .easy_first import EasyFirst
12 | from .strategy_utils import duplicate_encoder_out, generate_step_with_prob, assign_single_value_long, assign_single_value_byte, assign_multi_value_long, convert_tokens
13 | 
14 | 
@register_strategy('left_to_right')
class LeftToRight(EasyFirst):
    """Beam decoding strategy registered under the name 'left_to_right'.

    It reuses the beam bookkeeping of EasyFirst (``beam_size`` and
    ``select_best``) and only changes which positions may be filled at each
    step: step ``p`` allows positions 0..p.
    """

    def __init__(self, args):
        super().__init__(args)

    def generate(self, model, encoder_out, tokens, tgt_dict):
        """Run one decoding step per target position and return the top beam.

        Args:
            model: object exposing ``decoder(tokens, encoder_out)``.
            encoder_out: encoder output dict; it is expanded in place so that
                every sentence is repeated once per beam.
            tokens: 2-D tensor of initial target tokens (batch, length).
            tgt_dict: dictionary providing ``mask()``, the mask token index.

        Returns:
            A pair ``(tokens, lprobs)`` holding beam 0 of every sentence and
            its score.
        """
        bsz, seq_len = tokens.size()
        duplicate_encoder_out(encoder_out, bsz, self.beam_size)

        # Give every sentence beam_size identical hypotheses. Only the first
        # one starts with score 0; the others start at -inf.
        beams = tokens.unsqueeze(1).repeat(1, self.beam_size, 1)
        scores = beams.new(bsz, self.beam_size).float().fill_(float('-inf'))
        scores[:, 0] = 0

        # Debug aid:
        # for batch in range(bsz):
        #     for beam in range(self.beam_size):
        #         print("Initialization: ", convert_tokens(tgt_dict, beams[batch, beam]))
        # print()

        for position in range(seq_len):
            # merge beam with batch for the decoder call
            merged = beams.view(bsz * self.beam_size, seq_len)
            decoder_out = model.decoder(merged, encoder_out)
            candidates = self.generate_candidates(decoder_out, merged, tgt_dict.mask(), position)

            # separate beam from batch again before ranking
            beams = merged.view(bsz, self.beam_size, seq_len)
            candidates = candidates.view(bsz, self.beam_size, seq_len, -1)
            beams, scores = self.select_best(beams, scores, candidates)

            # Debug aid:
            # for batch in range(bsz):
            #     for beam in range(self.beam_size):
            #         print("Prediction: ", convert_tokens(tgt_dict, beams[batch, beam]))
            # print()

        return beams[:, 0, :], scores[:, 0]

    def generate_candidates(self, decoder_out, tokens, mask, position):
        """Return log-probabilities of the tokens allowed at this step.

        A candidate keeps a non-zero probability only if its position still
        holds the mask token, it is not the mask token itself, and its
        position is not beyond ``position``. Everything else becomes
        log(0) = -inf.
        """
        probs = F.softmax(decoder_out[0], dim=-1)
        still_masked = tokens.eq(mask).float().unsqueeze(-1)
        probs = probs * still_masked
        # never predict the mask token itself
        probs[:, :, mask] = 0
        # positions to the right of the current one are not available yet
        probs[:, position + 1:, :] = 0
        return probs.log()
58 | 
59 | 


--------------------------------------------------------------------------------
/CeMAT/CeMAT_maskPredict/strategies/strategy_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2017-present, Facebook, Inc.
 2 | # All rights reserved.
 3 | #
 4 | # This source code is licensed under the license found in the LICENSE file in
 5 | # the root directory of this source tree. An additional grant of patent rights
 6 | # can be found in the PATENTS file in the same directory.
 7 | 
 8 | import torch
 9 | import torch.nn.functional as F
10 | 
11 | 
def duplicate_encoder_out(encoder_out, bsz, beam_size):
    """Repeat every batch entry of the encoder output ``beam_size`` times, in place.

    ``encoder_out['encoder_out']`` is treated as a 3-D tensor whose second
    dimension is the batch; ``encoder_out['encoder_padding_mask']`` (if not
    None) as a 2-D tensor whose first dimension is the batch. After the call
    the copies of one entry sit next to each other along the batch dimension.
    Nothing is returned.
    """
    states = encoder_out['encoder_out']
    channels = states.size(-1)
    tiled_states = states.unsqueeze(2).repeat(1, 1, beam_size, 1)
    encoder_out['encoder_out'] = tiled_states.view(-1, bsz * beam_size, channels)

    padding_mask = encoder_out['encoder_padding_mask']
    if padding_mask is not None:
        tiled_mask = padding_mask.unsqueeze(1).repeat(1, beam_size, 1)
        encoder_out['encoder_padding_mask'] = tiled_mask.view(bsz * beam_size, -1)
16 | 
17 | 
def generate_step_with_prob(out):
    """Softmax ``out[0]`` over its last dimension and pick the best entry.

    Returns:
        A triple ``(idx, max_probs, probs)``: the argmax indices, the
        probability of each argmax, and the full probability tensor.
    """
    probs = F.softmax(out[0], dim=-1)
    best = probs.max(dim=-1)
    return best[1], best[0], probs
22 | 
23 | 
def assign_single_value_byte(x, i, y):
    """Write ``y`` into every element of ``x`` whose entry in mask ``i`` is non-zero (in place)."""
    flat_x = x.view(-1)
    selected = i.view(-1).nonzero()
    flat_x[selected] = y
26 | 
27 | 
def assign_multi_value_byte(x, i, y):
    """Copy ``y`` into ``x`` at every element whose entry in mask ``i`` is non-zero (in place)."""
    selected = i.view(-1).nonzero()
    replacement = y.view(-1)[selected]
    flat_x = x.view(-1)
    flat_x[selected] = replacement
30 | 
31 | 
def assign_single_value_long(x, i, y):
    """Write ``y`` into ``x`` at the per-row column indices ``i`` (in place).

    ``x`` is 2-D; row ``r`` of ``i`` lists the columns of row ``r`` of ``x``
    to overwrite.
    """
    rows, width = x.size()
    # Turn per-row column indices into indices of the flattened tensor.
    row_offsets = torch.arange(0, rows * width, width, device=i.device).unsqueeze(1)
    flat_positions = (i + row_offsets).view(-1)
    x.view(-1)[flat_positions] = y
36 | 
37 | 
def assign_multi_value_long(x, i, y):
    """Copy ``y`` into ``x`` at the per-row column indices ``i`` (in place).

    ``x`` is 2-D; row ``r`` of ``i`` lists the columns of row ``r`` that are
    overwritten with the values ``y`` holds at the same positions.
    """
    rows, width = x.size()
    # Turn per-row column indices into indices of the flattened tensors.
    row_offsets = torch.arange(0, rows * width, width, device=i.device).unsqueeze(1)
    flat_positions = (i + row_offsets).view(-1)
    replacement = y.view(-1)[flat_positions]
    x.view(-1)[flat_positions] = replacement
42 | 
43 | 
def convert_tokens(dictionary, tokens):
    """Look up every token in ``dictionary`` and join the results with single spaces."""
    words = []
    for token in tokens:
        words.append(dictionary[token])
    return ' '.join(words)
46 | 
47 | 
48 | 


--------------------------------------------------------------------------------
/CeMAT/CeMAT_maskPredict/task_NAT_cemat.sh:
--------------------------------------------------------------------------------
 1 | DATA_PATH=
 2 | task=translation_self_from_pt
 3 | SRC=
 4 | TGT=
 5 | langs='ar-en,be-en,bg-en,de-en,el-en,en-af,en-cs,en-es,en-fr,en-gu,en-he,en-ja,en-kk,en-lt,en-mt,en-ro,en-ru,en-tr,en-zh,eo-en,et-en,fi-en,hi-en,it-en,ka-en,ko-en,lv-en,mn-en,ms-en,my-en,sr-en,vi-en'
 6 | ARCH=bert_transformer_seq2seq_big
 7 | freq=8
 8 | patience=80
 9 | valid_subset='valid'
10 | SAVE_DIR=
11 | PRETRAIN=
12 | 
13 | python train.py ${DATA_PATH} --fp16 \
14 |   --user-dir CeMAT_maskPredict \
15 |   --encoder-normalize-before --decoder-normalize-before \
16 |   --encoder-learned-pos --decoder-learned-pos \
17 |   --task ${task} --from-pt \
18 |   --source-lang ${SRC} --target-lang ${TGT} --langs ${langs} --add-lang-token --share-dict \
19 |   --arch ${ARCH}  --share-all-embeddings \
20 |   --criterion label_smoothed_length_cross_entropy --label-smoothing 0.1 \
21 |   --optimizer adam --adam-betas '(0.9, 0.999)' --adam-eps 1e-6 \
22 |   --lr 0.0005 --warmup-init-lr 1e-7 --min-lr 1e-9 --lr-scheduler inverse_sqrt --warmup-updates 10000 \
23 |   --dropout 0.3 --weight-decay 0.01 \
24 |   --max-tokens 4096 --update-freq  ${freq} \
25 |   --max-source-positions 10000 --max-target-positions 10000 --max-update 300000 --seed 0 \
26 |   --restore-file ${PRETRAIN} --reset-optimizer --reset-meters --reset-dataloader --reset-lr-scheduler \
27 |   --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 \
28 |   --log-format simple --log-interval 100 \
29 |   --ddp-backend no_c10d \
30 |   --save-dir ${SAVE_DIR} --patience ${patience} --num-workers 4 \
31 |   --distributed-no-spawn \
32 |   --valid-subset ${valid_subset}


--------------------------------------------------------------------------------
/CeMAT/CeMAT_maskPredict/task_infer_nat.sh:
--------------------------------------------------------------------------------
# Inference launcher: decode the test subset with the mask_predict strategy,
# split the log into hypotheses and references, and score them with sacrebleu.
#
# The variables left empty below (DATA_PATH, SRC, TGT, PRETRAIN) have to be
# filled in before running. SAVE_DIR is assigned but not used in this script.
DATA_PATH=
task=translation_self_from_pt
SRC=
TGT=
# Comma-separated language pairs passed to --langs.
langs='ar-en,be-en,bg-en,de-en,el-en,en-af,en-cs,en-es,en-fr,en-gu,en-he,en-ja,en-kk,en-lt,en-mt,en-ro,en-ru,en-tr,en-zh,eo-en,et-en,fi-en,hi-en,it-en,ka-en,ko-en,lv-en,mn-en,ms-en,my-en,sr-en,vi-en'
SAVE_DIR=
PRETRAIN=

# The full generation log is shown on the terminal and saved to infer.txt.
python generate_cmlm.py  ${DATA_PATH} --fp16 \
    --user-dir CeMAT_maskPredict \
    --path ${PRETRAIN} \
    --task ${task} --decoding-strategy mask_predict \
    --source-lang ${SRC} --target-lang ${TGT} --langs ${langs} --add-lang-token --share-dict \
    --gen-subset test \
    --max-sentences 20 --decoding-iterations 10 --remove-bpe | tee infer.txt

# Hypotheses: take the lines starting with "H", drop the "H-" prefix, sort by
# the remaining id, keep the second tab-separated field and strip the
# "[<TGT>]" language tag (with or without a trailing space).
grep ^H  infer.txt \
| sed 's/^H\-//' \
| sort -V \
| cut -f 2 \
| sed 's/\['$TGT'\] //g' \
| sed 's/\['$TGT'\]//g' \
> infer.sys

# References: the same pipeline applied to the lines starting with "T-".
grep ^T-  infer.txt \
| sed 's/^T\-//' \
| sort -V \
| cut -f 2 \
| sed 's/\['$TGT'\] //g' \
| sed 's/\['$TGT'\]//g' \
> infer.ref
  
# Score infer.sys against infer.ref without additional tokenization.
sacrebleu --tokenize 'none' -w 2 infer.ref < infer.sys


--------------------------------------------------------------------------------
/CeMAT/CeMAT_maskPredict/tasks/__init__.py:
--------------------------------------------------------------------------------
1 | from .translation_self_from_cemat import *
2 | 


--------------------------------------------------------------------------------
/CeMAT/CeMAT_plugins/__init__.py:
--------------------------------------------------------------------------------
1 | from .criterions import *
2 | from .models import *
3 | from .tasks import *
4 | from .data import *


--------------------------------------------------------------------------------
/CeMAT/CeMAT_plugins/criterions/__init__.py:
--------------------------------------------------------------------------------
1 | from .label_smoothed_cross_entropy_with_maskdecode import *
2 | 


--------------------------------------------------------------------------------
/CeMAT/CeMAT_plugins/data/__init__.py:
--------------------------------------------------------------------------------
1 | from .concat_pair_dataset import *
2 | from .ddenoising_pair_dataset_dyna_replace import *
3 | from .language_pair_dataset import *
4 | 


--------------------------------------------------------------------------------
/CeMAT/CeMAT_plugins/data/cemat_dataset.py:
--------------------------------------------------------------------------------
 1 | # 2022 - Added code for CeMAT
 2 | #        Huawei Technologies Co., Ltd. <lipengfei111@huawei.com>
 3 | # Copyright 2022 Huawei Technologies Co., Ltd.
 4 | #
 5 | # Copyright (c) Facebook, Inc. and its affiliates.
 6 | #
 7 | # This source code is licensed under the MIT license found in the
 8 | # LICENSE file in the root directory of this source tree.
 9 | #
10 | 
11 | import numpy as np
12 | import torch.utils.data
13 | from fairseq.data import data_utils,FairseqDataset
14 | 
class CematDataset(FairseqDataset):
    """A dataset that provides helpers for batching."""

    def __init__(self):
        super(CematDataset, self).__init__()

    def filter_indices_by_size(self, indices, max_sizes):
        """
        Filter a list of sample indices. Remove those that are longer than
        specified in *max_sizes*.

        Only the first entry of *max_sizes* is applied, and it is compared
        against the first column of ``self.sizes``; any further entries are
        ignored.

        WARNING: don't update, override method in child classes

        Args:
            indices (np.array): original array of sample indices
            max_sizes (int or list[int] or tuple[int]): max sample size;
                when a list or tuple is given only its first element is used

        Returns:
            np.array: filtered sample array
            list: list of removed indices
        """
        # A bare number is accepted as documented; indexing it with [0] would
        # raise TypeError. Anything else is reduced to its first element.
        if not isinstance(max_sizes, (int, float)):
            max_sizes = max_sizes[0]

        if isinstance(max_sizes, (int, float)):
            sizes = getattr(self, "sizes", None)
            if isinstance(sizes, np.ndarray):
                # 2-D size table: column 0 holds the length that is checked.
                lengths = sizes[:, 0][indices]
                ignored = indices[lengths > max_sizes].tolist()
                indices = indices[lengths <= max_sizes]
                return indices, ignored
            if isinstance(sizes, list) and len(sizes) == 1:
                lengths = sizes[0][indices]
                ignored = indices[lengths > max_sizes].tolist()
                indices = indices[lengths <= max_sizes]
                return indices, ignored

        # No usable size table (or a non-numeric limit): fall back to the
        # per-sample size lookup.
        indices, ignored = data_utils._filter_by_size_dynamic(
            indices, self.size, max_sizes
        )
        return indices, ignored
58 | 


--------------------------------------------------------------------------------
/CeMAT/CeMAT_plugins/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .transformer import *
2 | 


--------------------------------------------------------------------------------
/CeMAT/CeMAT_plugins/models/fairseq_encoder.py:
--------------------------------------------------------------------------------
 1 | # 2022 - Added code for CeMAT
 2 | #        Huawei Technologies Co., Ltd. <lipengfei111@huawei.com>
 3 | # Copyright 2022 Huawei Technologies Co., Ltd.
 4 | #
 5 | # Copyright (c) Facebook, Inc. and its affiliates.
 6 | #
 7 | # This source code is licensed under the MIT license found in the
 8 | # LICENSE file in the root directory of this source tree.
 9 | #
10 | 
11 | from typing import Dict, List, NamedTuple, Optional,Tuple
12 | import torch
13 | import torch.nn as nn
14 | from torch import Tensor
15 | from fairseq import utils
16 | from fairseq.models import FairseqEncoder
17 | 
18 | EncoderOut = NamedTuple(  # immutable record type with the six typed fields listed below
19 |     "EncoderOut",
20 |     [
21 |         ("encoder_out", Tensor),  # T x B x C
22 |         ("encoder_padding_mask", Optional[Tensor]),  # B x T
23 |         ("encoder_embedding", Optional[Tensor]),  # B x T x C
24 |         ("encoder_states", Optional[List[Tensor]]),  # List[T x B x C]
25 |         ("src_tokens", Optional[Tensor]),  # B x T
26 |         ("src_lengths", Optional[Tensor]),  # B x 1
27 |     ],
28 | )
29 | 
30 | 
31 | class CematEncoder(FairseqEncoder):
32 |     """FairseqEncoder subclass that adds get_normalized_probs on top of the base encoder."""
33 | 
34 |     def __init__(self, dictionary):
35 |         super().__init__(dictionary)
36 |         self.dictionary = dictionary  # also stored directly on this instance
37 |         self.onnx_trace = False  # forwarded to utils.softmax / utils.log_softmax below
38 | 
39 |     def get_normalized_probs(
40 |         self,
41 |         net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
42 |         log_probs: bool,
43 |         sample: Optional[Dict[str, Tensor]] = None,
44 |     ):
45 |         """Return softmax (log-softmax when *log_probs* is true) over the last dim of net_output[0].
46 |         If the instance has a non-None ``adaptive_softmax``, its log-probs are used instead."""
47 |         if hasattr(self, "adaptive_softmax") and self.adaptive_softmax is not None:
48 |             if sample is not None:
49 |                 assert "source" in sample
50 |                 source = sample["source"]  # handed to the adaptive softmax as its target
51 |             else:
52 |                 source = None
53 |             out = self.adaptive_softmax.get_log_prob(net_output[0], target=source)
54 |             return out.exp_() if not log_probs else out  # exp_() is applied in place when plain probabilities are requested
55 | 
56 |         logits = net_output[0]
57 |         if log_probs:
58 |             return utils.log_softmax(logits, dim=-1, onnx_trace=self.onnx_trace)
59 |         else:
60 |             return utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace)
61 | 


--------------------------------------------------------------------------------
/CeMAT/CeMAT_plugins/task_NMT_cemat.sh:
--------------------------------------------------------------------------------
 1 | DATA_PATH=  # data directory; first positional argument of fairseq-train below
 2 | task=translation_from_pretrained_cemat
 3 | SRC=  # source language code (--source-lang)
 4 | TGT=  # target language code (--target-lang)
 5 | langs='ar-en,be-en,bg-en,de-en,el-en,en-af,en-cs,en-es,en-fr,en-gu,en-he,en-ja,en-kk,en-lt,en-mt,en-ro,en-ru,en-tr,en-zh,eo-en,et-en,fi-en,hi-en,it-en,ka-en,ko-en,lv-en,mn-en,ms-en,my-en,sr-en,vi-en'
 6 | ARCH=cemat_transformer_big
 7 | freq=16  # value passed as --update-freq
 8 | patience=35  # value passed as --patience
 9 | valid_subset='valid'
10 | SAVE_DIR=  # checkpoint output directory (--save-dir)
11 | PRETRAIN=  # checkpoint given to --restore-file; optimizer, meters, dataloader and LR scheduler are reset
12 | # Train with --eval-bleu; the best checkpoint is selected by the (maximized) bleu metric.
13 | fairseq-train ${DATA_PATH} --fp16 \
14 |   --user-dir CeMAT_plugins \
15 |   --encoder-normalize-before --decoder-normalize-before --layernorm-embedding \
16 |   --task ${task} \
17 |   --source-lang ${SRC} --target-lang ${TGT} --langs ${langs} --add-lang-token --share-dict \
18 |   --arch ${ARCH} \
19 |   --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
20 |   --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
21 |   --lr 0.0005 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
22 |   --dropout 0.3 --attention-dropout 0.1 \
23 |   --max-tokens 4096 --update-freq ${freq} --seed 222 \
24 |   --log-format simple --skip-invalid-size-inputs-valid-test \
25 |   --keep-interval-updates 20 --log-interval 10 \
26 |   --validate-interval 1 \
27 |   --restore-file ${PRETRAIN} --reset-optimizer --reset-meters --reset-dataloader --reset-lr-scheduler \
28 |   --eval-bleu \
29 |   --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \
30 |   --eval-bleu-detok moses \
31 |   --eval-bleu-remove-bpe \
32 |   --eval-bleu-print-samples \
33 |   --best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
34 |   --ddp-backend c10d \
35 |   --save-dir ${SAVE_DIR} --num-workers 4 --patience ${patience} \
36 |   --valid-subset ${valid_subset}


--------------------------------------------------------------------------------
/CeMAT/CeMAT_plugins/task_infer_nmt.sh:
--------------------------------------------------------------------------------
 1 | DATA_PATH=  # data directory; first positional argument of fairseq-generate below
 2 | task=translation_from_pretrained_cemat
 3 | SRC=  # source language code (-s)
 4 | TGT=  # target language code (-t); also the [tag] stripped from the outputs below
 5 | langs='ar-en,be-en,bg-en,de-en,el-en,en-af,en-cs,en-es,en-fr,en-gu,en-he,en-ja,en-kk,en-lt,en-mt,en-ro,en-ru,en-tr,en-zh,eo-en,et-en,fi-en,hi-en,it-en,ka-en,ko-en,lv-en,mn-en,ms-en,my-en,sr-en,vi-en'
 6 | SAVE_DIR=  # NOTE(review): not referenced anywhere in this script
 7 | PRETRAIN=  # model checkpoint passed to --path
 8 | # Generate with beam 5 and keep a copy of the full output in infer.txt.
 9 | fairseq-generate  ${DATA_PATH} --fp16 \
10 |   --user-dir CeMAT_plugins \
11 |   --path ${PRETRAIN} \
12 |   --task ${task} \
13 |   -s ${SRC} -t ${TGT} --langs ${langs} --add-lang-token --share-dict \
14 |   --batch-size 128 --beam 5 --sacrebleu --remove-bpe | tee infer.txt
15 | # Hypotheses: lines starting with H, "H-" prefix removed, version-sorted, 3rd tab field, "[TGT] " tag removed.
16 | grep ^H infer.txt \
17 | | sed 's/^H\-//' \
18 | | sort -V \
19 | | cut -f 3 \
20 | | sed 's/\['$TGT'\] //g' \
21 | > infer.sys
22 | # References: same pipeline for lines starting with T, taking the 2nd tab field.
23 | grep ^T infer.txt \
24 | | sed 's/^T\-//' \
25 | | sort -V \
26 | | cut -f 2 \
27 | | sed 's/\['$TGT'\] //g' \
28 | > infer.ref
29 | 
30 | # Score infer.sys (stdin) against infer.ref with sacrebleu tokenization disabled.
31 | sacrebleu --tokenize 'none' -w 2 infer.ref < infer.sys


--------------------------------------------------------------------------------
/CeMAT/CeMAT_plugins/task_pt_cemat.sh:
--------------------------------------------------------------------------------
 1 | DATA_PATH=  # data directory; first positional argument of fairseq-train below
 2 | task=cemat_pretraining
 3 | langs='ar-en,be-en,bg-en,de-en,el-en,en-af,en-cs,en-es,en-fr,en-gu,en-he,en-ja,en-kk,en-lt,en-mt,en-ro,en-ru,en-tr,en-zh,eo-en,et-en,fi-en,hi-en,it-en,ka-en,ko-en,lv-en,mn-en,ms-en,my-en,sr-en,vi-en'
 4 | word_trans='word_trans2id.dict'
 5 | ARCH=cemat_transformer_big
 6 | # with 32 GPUs (presumably the setup the update frequency below was chosen for -- confirm).
 7 | freq=8
 8 | patience=10
 9 | valid_subset='valid'
10 | SAVE_DIR=  # checkpoint output directory (--save-dir)
11 | # Pre-training run; note the command's final line ends with a continuation backslash followed by an empty line.
12 | fairseq-train ${DATA_PATH} --fp16 \
13 |   --user-dir CeMAT_plugins \
14 |   --encoder-normalize-before --decoder-normalize-before --layernorm-embedding \
15 |   --task ${task} \
16 |   --langs ${langs} --add-lang-token --share-dict --shuffle-lang-pair --multilang-sampling-alpha 0.7 \
17 |   --trans-dict ${word_trans} \
18 |   --arch ${ARCH} --bi_self_att --plus-encoder-loss --encoder-loss-lambda 0.3 \
19 |   --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
20 |   --criterion label_smoothed_cross_entropy_with_maskdecode --label-smoothing 0.1 \
21 |   --lr 0.0005 --lr-scheduler polynomial_decay --warmup-updates 10000 --total-num-update 1200000 \
22 |   --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.0001 \
23 |   --max-tokens 4096 --update-freq ${freq} --seed 222 \
24 |   --log-format simple --skip-invalid-size-inputs-valid-test \
25 |   --ddp-backend c10d \
26 |   --keep-interval-updates 20 --log-interval 10 \
27 |   --validate-interval 1 \
28 |   --save-dir ${SAVE_DIR} --num-workers 4 --patience ${patience} \
29 |   --valid-subset ${valid_subset} \
30 | 


--------------------------------------------------------------------------------
/CeMAT/CeMAT_plugins/tasks/__init__.py:
--------------------------------------------------------------------------------
1 | from .cemat_pretraining import *
2 | from .translation_from_pretrained_cemat import *
3 | 
4 | 


--------------------------------------------------------------------------------
/CeMAT/License:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/CeMAT/cemat_scripts/create_trans/example_extract_alignedpairs.sh:
--------------------------------------------------------------------------------
 1 | # Vocabulary file path (--vocab-path).
 2 | vocab_path=
 3 | # Bilingual (multilingual) word translation dictionary path (--trans-path).
 4 | wordTrans_path=
 5 | # Data path, BPE format (--data-path); prefix and langs are forwarded as --prefix / --langs.
 6 | data_path=
 7 | prefix=
 8 | langs=
 9 | # Output path (--output-path).
10 | out_path=
11 | # Run the extractor with the --add-mask and --merge switches enabled.
12 | python extract_aligned_pairs.py --vocab-path $vocab_path --trans-path $wordTrans_path --data-path $data_path --output-path $out_path --prefix $prefix --langs $langs --add-mask --merge


--------------------------------------------------------------------------------
/CeMAT/cemat_scripts/process/preprocess_NMT.sh:
--------------------------------------------------------------------------------
 1 | SRC=  # source language code
 2 | TGT=  # target language code
 3 | DATA=  # input directory; files are read as $DATA/<split>.<lang>
 4 | OUTPATH=  # directory for tokenized / BPE-encoded intermediate files
 5 | DEST=  # output directory of fairseq-preprocess (--destdir)
 6 | # Directory holding the BPE codes and vocabulary (see BPE_CODES / FULL_VOCAB below).
 7 | MODEL=
 8 | mkdir -p ${OUTPATH}
 9 | mkdir -p ${DEST}
10 | 
11 | N_THREADS=8
12 | # jieba is only used by the zh branch below; NOTE(review): this install runs on every invocation.
13 | pip install jieba
14 | 
15 | FASTBPE_DIR=  # NOTE(review): not referenced below
16 | FASTBPE=  # NOTE(review): not referenced below
17 | BPEROOT=  # directory containing apply_bpe.py
18 | 
19 | # Moses decoder checkout; the helper script paths below are relative to it.
20 | MOSES=
21 | REPLACE_UNICODE_PUNCT=$MOSES/scripts/tokenizer/replace-unicode-punctuation.perl
22 | NORM_PUNC=$MOSES/scripts/tokenizer/normalize-punctuation.perl
23 | REM_NON_PRINT_CHAR=$MOSES/scripts/tokenizer/remove-non-printing-char.perl
24 | TOKENIZER=$MOSES/scripts/tokenizer/tokenizer.perl
25 | INPUT_FROM_SGM=$MOSES/scripts/ems/support/input-from-sgm.perl
26 | CLEAN=$MOSES/scripts/training/clean-corpus-n.perl
27 | NORMALIZE_ROMANIAN=$MOSES/scripts/tokenizer/ro/normalise-romanian.py  # NOTE(review): a .py file run through `perl` in the ro branch -- confirm it carries a python #! line
28 | REMOVE_DIACRITICS=$MOSES/scripts/tokenizer/ro/remove-diacritics.py  # NOTE(review): same as above
29 | JA_SCRIPT=$MOSES/scripts/tokenizer/ja/kytea.py
30 | JA_MODEL=$MOSES/scripts/tokenizer/ja/ja-0.4.7-1.mod
31 | 
32 | # BPE / vocab files
33 | BPE_CODES=$MODEL/codes
34 | FULL_VOCAB=$MODEL/vocab
35 | # For every split and language: normalize + tokenize (language-specific branch), then apply BPE.
36 | for split in "train" "valid" "test";
37 | do
38 |     for lang in $SRC $TGT;
39 |     do
40 |         Data_TRAIN=$DATA/$split.$lang
41 |         Data_TRAIN_TOK=$OUTPATH/$split.tok.$lang
42 |         Data_TRAIN_BPE=$OUTPATH/$split.spm.$lang  # named ".spm" although it is produced by apply_bpe.py below
43 |         echo $Data_TRAIN "TOKENIZER:====>>" $Data_TRAIN_TOK
44 |         if [ "$lang" == "ro" ]; then 
45 |             cat $Data_TRAIN | perl $NORM_PUNC -l $lang | perl $REM_NON_PRINT_CHAR | perl $NORMALIZE_ROMANIAN | perl $REMOVE_DIACRITICS | perl $TOKENIZER -l $lang -a -threads $N_THREADS > $Data_TRAIN_TOK
46 |         elif [ "$lang" == "ja" ]; then 
47 |             cat $Data_TRAIN | perl $NORM_PUNC -l $lang | perl $REM_NON_PRINT_CHAR | python ${JA_SCRIPT} -m ${JA_MODEL}             > $Data_TRAIN_TOK
48 |         elif [ "$lang" == "zh" ]; then 
49 |             cat $Data_TRAIN | perl $NORM_PUNC -l $lang | perl $REM_NON_PRINT_CHAR | python -m jieba -d                             > $Data_TRAIN_TOK
50 |         else
51 |             cat $Data_TRAIN | perl $NORM_PUNC -l $lang | perl $REM_NON_PRINT_CHAR | perl $TOKENIZER -l $lang -a -threads $N_THREADS > $Data_TRAIN_TOK
52 |         fi
53 | 
54 |         echo $Data_TRAIN_TOK "====>>" $Data_TRAIN_BPE
55 |         python $BPEROOT/apply_bpe.py -c $BPE_CODES < $Data_TRAIN_TOK > $Data_TRAIN_BPE
56 |     done
57 |     if [ "$split" == "train" ]; then 
58 |         echo "clean by ratio."
59 |         perl $CLEAN -ratio 1.5 $OUTPATH/$split.spm $SRC $TGT $OUTPATH/$split.spm.clean 1 250  # only the train split is cleaned; valid/test are binarized as is
60 |     fi
61 | done
62 | 
63 | # Binarize the dataset; source and target both use the same dictionary file ($FULL_VOCAB).
64 | fairseq-preprocess \
65 |   --source-lang ${SRC} \
66 |   --target-lang ${TGT} \
67 |   --trainpref ${OUTPATH}/train.spm.clean \
68 |   --validpref ${OUTPATH}/valid.spm \
69 |   --testpref ${OUTPATH}/test.spm \
70 |   --destdir ${DEST}/ \
71 |   --thresholdtgt 0 \
72 |   --thresholdsrc 0 \
73 |   --srcdict $FULL_VOCAB \
74 |   --tgtdict $FULL_VOCAB \
75 |   --workers 70
76 | 


--------------------------------------------------------------------------------
/DynaBERT/dynabert_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/DynaBERT/dynabert_overview.png


--------------------------------------------------------------------------------
/DynaBERT/requirements.txt:
--------------------------------------------------------------------------------
 1 | # progress bars in model download and training scripts
 2 | tqdm
 3 | # Accessing files from S3 directly.
 4 | boto3
 5 | # Used for downloading models over HTTP
 6 | requests
 7 | 
 8 | torch==1.0.0
 9 | # python==3.6  (interpreter version requirement; "python" is not a pip-installable package)
10 | seaborn


--------------------------------------------------------------------------------
/DynaBERT/transformers/configuration_roberta.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ RoBERTa configuration """
17 | 
18 | from __future__ import (absolute_import, division, print_function,
19 |                         unicode_literals)
20 | 
21 | import logging
22 | 
23 | from .configuration_bert import BertConfig
24 | 
25 | logger = logging.getLogger(__name__)
26 | 
27 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {  # model shortcut name -> URL of its config JSON
28 |     'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json",
29 |     'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json",
30 |     'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json",
31 | }
32 | 
33 | 
34 | class RobertaConfig(BertConfig):
35 |     """BertConfig subclass whose only override is the RoBERTa config archive map."""
36 |     pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP


--------------------------------------------------------------------------------
/DynaBERT/transformers/data/__init__.py:
--------------------------------------------------------------------------------
1 | from .processors import InputExample, InputFeatures, DataProcessor
2 | from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
3 | 
4 | from .metrics import is_sklearn_available
5 | if is_sklearn_available():
6 |     from .metrics import glue_compute_metrics
7 | 


--------------------------------------------------------------------------------
/DynaBERT/transformers/data/processors/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import InputExample, InputFeatures, DataProcessor
2 | from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
3 | 
4 | 


--------------------------------------------------------------------------------
/HyperText/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | Copyright (c) 2021  Huawei Technologies Co., Ltd.
 3 | 
 4 | Permission is hereby granted, free of charge, to any person obtaining
 5 | a copy of this software and associated documentation files (the
 6 | "Software"), to deal in the Software without restriction, including
 7 | without limitation the rights to use, copy, modify, merge, publish,
 8 | distribute, sublicense, and/or sell copies of the Software, and to
 9 | permit persons to whom the Software is furnished to do so, subject to
10 | the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be
13 | included in all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/HyperText/README.md:
--------------------------------------------------------------------------------
 1 | #  HyperText 
 2 | Natural language data exhibit tree-like hierarchical structures such as the hypernym-hyponym relations in WordNet. Considering that hyperbolic space is naturally suitable for modeling tree-like hierarchical data, we propose a new model named HyperText for efficient text classification by endowing FastText with hyperbolic geometry. Empirically, we show that HyperText outperforms FastText on a range of text classification tasks with far fewer parameters.  
 3 | 
 4 | ![avatar](./hypertext_model_architecture.png)
 5 | 
 6 | For more details about the techniques of HyperText,  please refer to our paper:  
 7 | 
 8 | [HyperText: Endowing FastText with Hyperbolic Geometry](https://arxiv.org/abs/2010.16143 "HyperText: Endowing FastText with Hyperbolic Geometry")
 9 | 
10 | #  Release Notes
11 | 
12 | First version: 2021.01.16
13 | 
14 | # Installation
15 | Run the command below to install the environment (using Python 3):  
16 | ```python
17 | 
18 | pip install -r requirements.txt  
19 | 
20 | ```
21 | 
22 | # Train and Evaluation  
23 | * Data Preprocessing  
24 |   Please refer to our paper for details.
25 | 
26 | * Train & Evaluation
27 | 
28 | ```python
29 | 
30 | python main.py --datasetdir $data_path --outputdir $output_path --dropout $dropout --require_improvement $early_stopping_steps --num_epochs $max_epoch --batch_size $batch_size --max_length $max_sequence_length --learning_rate $learning_rate --embed_dim $embedding_dimension --bucket $hash_bucket_size --wordNgrams $word_ngram --eval_per_batchs $evaluation_frequency --min_freq $minimum_word_frequency --lr_decay_rate $learning_rate_decay
31 | 
32 | ```
33 | 
34 | # Examples  
35 | 
36 | * TNEWS Dataset
37 | 
38 | 
39 | ```python
40 | 
41 | python main.py --datasetdir ./data/tnews_public --outputdir ./output --dropout 0.0 --require_improvement 6000 --num_epochs 50 --batch_size 32 --max_length 40 --learning_rate 1.1e-2 --embed_dim 200 --bucket 1500000 --wordNgrams 2 --eval_per_batchs 100 --min_freq 1 --lr_decay_rate 0.96  
42 | 
43 | ```
44 | 
45 | * IFLYTEK Dataset
46 | 
47 | ```python
48 | 
49 | python main.py --datasetdir ./data/iflytek_public --outputdir ./output --dropout 0.0 --require_improvement 2500 --num_epochs 50 --batch_size 32 --max_length 1000 --learning_rate 1.3e-2 --embed_dim 100 --bucket 2000000 --wordNgrams 2 --eval_per_batchs 50 --min_freq 1 --lr_decay_rate 0.94
50 | ```
51 | 
52 | 


--------------------------------------------------------------------------------
/HyperText/__init__.py:
--------------------------------------------------------------------------------
 1 | #-*- coding:utf-8 -*-
 2 | #The MIT License (MIT)
 3 | #Copyright (c) 2021 Huawei Technologies Co., Ltd.
 4 | 
 5 | # Permission is hereby granted, free of charge, to any person obtaining
 6 | # a copy of this software and associated documentation files (the
 7 | # "Software"), to deal in the Software without restriction, including
 8 | # without limitation the rights to use, copy, modify, merge, publish,
 9 | # distribute, sublicense, and/or sell copies of the Software, and to
10 | # permit persons to whom the Software is furnished to do so, subject to
11 | # the following conditions:
12 | 
13 | # The above copyright notice and this permission notice shall be
14 | # included in all copies or substantial portions of the Software.
15 | 
16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 | 
24 | 


--------------------------------------------------------------------------------
/HyperText/hyperbolic/__init__.py:
--------------------------------------------------------------------------------
1 | from .poincare import PoincareBall
2 | from .math_utils import *
3 | from .mobius_linear import *


--------------------------------------------------------------------------------
/HyperText/hyperbolic/math_utils.py:
--------------------------------------------------------------------------------
 1 | #-*- coding:utf-8 -*-
 2 | #The MIT License (MIT)
 3 | #Copyright (c) 2021 Huawei Technologies Co., Ltd.
 4 | 
 5 | # Permission is hereby granted, free of charge, to any person obtaining
 6 | # a copy of this software and associated documentation files (the
 7 | # "Software"), to deal in the Software without restriction, including
 8 | # without limitation the rights to use, copy, modify, merge, publish,
 9 | # distribute, sublicense, and/or sell copies of the Software, and to
10 | # permit persons to whom the Software is furnished to do so, subject to
11 | # the following conditions:
12 | 
13 | # The above copyright notice and this permission notice shall be
14 | # included in all copies or substantial portions of the Software.
15 | 
16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 | 
24 | """Hyperbolic Math utils functions."""
25 | 
26 | import torch
27 | 
28 | eps = 1e-15  # margin subtracted from the +/-1 clamp bounds in Artanh.forward
29 | 
30 | def artanh(x):  # functional entry point: applies the custom autograd Function Artanh to x
31 |     return Artanh.apply(x)
32 | 
33 | class Artanh(torch.autograd.Function):
34 |     @staticmethod
35 |     def forward(ctx, x):
36 |         x = x.clamp(-1 + eps, 1 - eps)
37 |         ctx.save_for_backward(x)
38 |         out = (torch.log(1 + x.double()).sub(torch.log(1 - x.double()))).mul(0.5)
39 |         return out.to(x.dtype)
40 | 
41 |     @staticmethod
42 |     def backward(ctx, grad_output):
43 |         x, = ctx.saved_tensors
44 |         return grad_output / (1 - x ** 2)
45 | 
46 | 


--------------------------------------------------------------------------------
/HyperText/hyperbolic/mobius_linear.py:
--------------------------------------------------------------------------------
 1 | #-*- coding:utf-8 -*-
 2 | #The MIT License (MIT)
 3 | #Copyright (c) 2021 Huawei Technologies Co., Ltd.
 4 | 
 5 | # Permission is hereby granted, free of charge, to any person obtaining
 6 | # a copy of this software and associated documentation files (the
 7 | # "Software"), to deal in the Software without restriction, including
 8 | # without limitation the rights to use, copy, modify, merge, publish,
 9 | # distribute, sublicense, and/or sell copies of the Software, and to
10 | # permit persons to whom the Software is furnished to do so, subject to
11 | # the following conditions:
12 | 
13 | # The above copyright notice and this permission notice shall be
14 | # included in all copies or substantial portions of the Software.
15 | 
16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 | 
24 | import torch
25 | import torch.nn.init as init
26 | import math
27 | 
28 | class MobiusLinear(torch.nn.Module):
29 |     """
30 |         Mobius linear layer: manifold.mobius_matvec with a learned weight, optionally followed by manifold.mobius_add of a learned bias.
31 |     """
32 |     def __init__(self, manifold, in_features, out_features, c, use_bias=True):
33 |         super(MobiusLinear, self).__init__()
34 |         self.use_bias = use_bias
35 |         self.in_features = in_features
36 |         self.out_features = out_features
37 |         self.c = c  # handed unchanged to the manifold calls in forward; presumably the curvature (see extra_repr) -- confirm
38 |         self.manifold = manifold  # must provide mobius_matvec, proj, proj_tan0, expmap0 and mobius_add
39 |         self.bias = torch.nn.Parameter(torch.zeros(out_features))  # created even when use_bias is False
40 | 
41 |         self.weight = torch.nn.Parameter(torch.randn(out_features, in_features))  # overwritten by reset_parameters() right below
42 |         self.reset_parameters()
43 | 
44 |     @torch.no_grad()
45 |     def reset_parameters(self):  # Xavier-uniform weight with gain sqrt(2); bias set to zero
46 |         init.xavier_uniform_(self.weight, gain=math.sqrt(2))
47 |         init.constant_(self.bias, 0.0)
48 | 
49 |     def forward(self, x):  # returns the projected matvec result, plus the projected bias when use_bias is set
50 |         mv = self.manifold.mobius_matvec(self.weight, x, self.c)
51 |         res = self.manifold.proj(mv, self.c)
52 |         if self.use_bias:
53 |             bias = self.manifold.proj_tan0(self.bias.view(1, -1))  # bias reshaped to a single row first
54 |             hyp_bias = self.manifold.expmap0(bias, self.c)
55 |             hyp_bias = self.manifold.proj(hyp_bias, self.c)
56 |             res = self.manifold.mobius_add(res, hyp_bias, c=self.c)
57 |             res = self.manifold.proj(res, self.c)  # proj is re-applied after every manifold operation in this method
58 |         return res
59 | 
60 |     def extra_repr(self):  # summary string of the two feature sizes and c, shown when the module is printed
61 |         return 'in_features_size={}, out_features_size={}, curvalture={}'.format(
62 |             self.in_features, self.out_features, self.c
63 |         )
64 | 


--------------------------------------------------------------------------------
/HyperText/hypertext_model_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/HyperText/hypertext_model_architecture.png


--------------------------------------------------------------------------------
/HyperText/models/__init__.py:
--------------------------------------------------------------------------------
 1 | #-*- coding:utf-8 -*-
 2 | #The MIT License (MIT)
 3 | #Copyright (c) 2021 Huawei Technologies Co., Ltd.
 4 | 
 5 | # Permission is hereby granted, free of charge, to any person obtaining
 6 | # a copy of this software and associated documentation files (the
 7 | # "Software"), to deal in the Software without restriction, including
 8 | # without limitation the rights to use, copy, modify, merge, publish,
 9 | # distribute, sublicense, and/or sell copies of the Software, and to
10 | # permit persons to whom the Software is furnished to do so, subject to
11 | # the following conditions:
12 | 
13 | # The above copyright notice and this permission notice shall be
14 | # included in all copies or substantial portions of the Software.
15 | 
16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 | 


--------------------------------------------------------------------------------
/HyperText/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | scikit-learn
3 | torch==1.3.0


--------------------------------------------------------------------------------
/JABER-PyTorch/NEZHA_PyTorch/README.md:
--------------------------------------------------------------------------------
 1 | # NEZHA pytorch version
 2 | We only provide fine-tuning code for the sentence classification task in this repository. For MRC and sequence labelling tasks, please see [CLUE](https://github.com/CLUEbenchmark/CLUE).
 3 | 
 4 | ### requirements
 5 | 
 6 | - pytorch==1.1.0
 7 | - python==3.5
 8 | 
 9 | ### download NEZHA-pytorch models
10 | 
11 | 1. Download models from :
12 | 2. Put pretrained models in pytorch_nezha/pretrained_models/
13 | 
14 | ### Run fine-tuning task
15 | ```shell
16 | sh run_classifier.sh
17 | ```
18 | 
19 | 
20 | 
21 | 
22 | 
23 | 


--------------------------------------------------------------------------------
/JABER-PyTorch/alue_predictions/README.md:
--------------------------------------------------------------------------------
1 | alue_predictions
2 | 


--------------------------------------------------------------------------------
/JABER-PyTorch/alue_test_submission/README.md:
--------------------------------------------------------------------------------
1 | alue_test_submission
2 | 


--------------------------------------------------------------------------------
/JABER-PyTorch/pretrained_models/README.md:
--------------------------------------------------------------------------------
1 | pretrained_models
2 | 


--------------------------------------------------------------------------------
/JABER-PyTorch/raw_datasets/toy.mq2q.dev.tsv:
--------------------------------------------------------------------------------
1 | 0	كيف يمكنني التوقف عن الإلحاح؟	كيف تتوقف عن كونك جبانا؟
2 | 1	ما معنى حياتنا؟	ما معنى الحياة؟


--------------------------------------------------------------------------------
/NEZHA-Gen-TensorFlow/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | NEZHA-Gen-TensorFlow
 3 | =============
 4 | We provide two GPT models pretrained by Huawei Noah's Ark Lab. One is Yuefu (乐府), a Chinese Classical Poetry generation model. The other is a Chinese GPT model pretrained on Chinese Wikipedia and news corpora.
 5 | 
 6 | 
 7 | Release Notes
 8 | =============
 9 | First version: 2020/07/22
10 | 
11 | Yuefu updated: 2020/09/24
12 | 
13 | Environment
14 | ============
15 | The scripts have been tested successfully with TensorFlow 1.13 and Python 3.6.
16 | 
17 | The python package ``fire`` is required. You may need to install the ``fire`` package with the command:
18 | 
19 | ```
20 | pip3 install fire
21 | ```
22 | 
23 | Usage of Yuefu (乐府)
24 | ====================
25 | 
26 | Step 1: Download the folder named ``models_yuefu`` from the link below and move the folder to the same directory as the scripts. Rename the folder to ``models``.
27 | 
28 | Step 2: Run the script ``poetry.py`` with the command to see a demo output:
29 | 
30 | ```
31 | python3 poetry.py
32 | ```
33 | 
34 | The open-sourced Yuefu model is for academic research only.
35 | For business applications, please refer to the [Huawei Cloud API](https://support.huaweicloud.com/api-nlp/nlp_03_0070.html).
36 | 
37 | 
38 | Usage of GPT
39 | ====================
40 | 
41 | Step 1: Download the folder named ``models_gpt`` from the link below and move the folder to the same directory as the scripts. Rename the folder to ``models``.
42 | 
43 | Step 2: Run the script ``interactive_conditional_generation.py`` with the command:
44 | 
45 | ```
46 | python3 interactive_conditional_generation.py
47 | ```
48 | 
49 | Step 3: Type in Chinese characters as the initial words and press ENTER to start generating sentences.
50 | 
51 | Model download 
52 | ===========================
53 | * Yuefu
54 |     * [Google Drive](https://drive.google.com/drive/folders/1B5-jxUlzhoKwFVMQ-nkqqbmJQgr1lRAp?usp=sharing) 
55 |     * [Baidu Netdisk](https://pan.baidu.com/s/1me6_BGYHbWFdTi80vRQ2Lg)(code: ytim)
56 | 
57 | * Chinese GPT
58 |     * [Google Drive](https://drive.google.com/drive/folders/1i4f_8LhaVDNjnGlLXNJ0rNgBP0E4L6V0?usp=sharing) 
59 |     * [Baidu Netdisk](https://pan.baidu.com/s/1Bgle8TpcxHyuUz_jAXOBWw)(code: rb5m)
60 | 
61 | 


--------------------------------------------------------------------------------
/NEZHA-PyTorch/README.md:
--------------------------------------------------------------------------------
 1 | # NEZHA pytorch version
 2 | We only provide fine-tuning code for the sentence classification task in this repository. For MRC and sequence labelling tasks, please see [CLUE](https://github.com/CLUEbenchmark/CLUE).
 3 | 
 4 | ### requirements
 5 | 
 6 | - pytorch==1.1.0
 7 | - python==3.5
 8 | 
 9 | ### download NEZHA-pytorch models
10 | 
11 | 1. Download models from :
12 | 2. Put pretrained models in pytorch_nezha/pretrained_models/
13 | 
14 | ### Run fine-tuning task
15 | ```shell
16 | sh run_classifier.sh
17 | ```
18 | 
19 | 
20 | 
21 | 
22 | 
23 | 


--------------------------------------------------------------------------------
/NEZHA-PyTorch/pretrained_models/nezha-cn-base/bert_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "attention_probs_dropout_prob": 0.1,
 3 |   "hidden_act": "gelu",
 4 |   "hidden_dropout_prob": 0.1,
 5 |   "hidden_size": 768,
 6 |   "initializer_range": 0.02,
 7 |   "intermediate_size": 3072,
 8 |   "max_position_embeddings": 512,
 9 |   "max_relative_position": 64,
10 |   "num_attention_heads": 12,
11 |   "num_hidden_layers": 12,
12 |   "type_vocab_size": 2,
13 |   "vocab_size": 21128,
14 |   "use_relative_position": true
15 | }
16 | 


--------------------------------------------------------------------------------
/NEZHA-PyTorch/pretrained_models/nezha-en-base/bert_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "attention_probs_dropout_prob": 0.1,
 3 |   "hidden_act": "gelu",
 4 |   "hidden_dropout_prob": 0.1,
 5 |   "hidden_size": 768,
 6 |   "initializer_range": 0.02,
 7 |   "intermediate_size": 3072,
 8 |   "max_position_embeddings": 512,
 9 |   "max_relative_position": 127,
10 |   "num_attention_heads": 12,
11 |   "num_hidden_layers": 12,
12 |   "type_vocab_size": 2,
13 |   "vocab_size": 28996,
14 |   "use_relative_position": true
15 |  }
16 | 


--------------------------------------------------------------------------------
/NEZHA-PyTorch/run_classifier.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #########################################################################
 3 | # run_classifier.sh: train and evaluate on a sentence classification task
 4 | #########################################################################
 5 | 
 6 | CUDA_VISIBLE_DEVICES=0 python run_sequence_classifier.py \
 7 |   --task_name=text-clf \
 8 |   --do_train \
 9 |   --do_eval \
10 |   --data_dir=data/chnsenti/ \
11 |   --bert_model=pretrained_models/nezha-cn-base/ \
12 |   --max_seq_length=128 \
13 |   --train_batch_size=16 \
14 |   --eval_batch_size=16 \
15 |   --learning_rate=3e-5  \
16 |   --num_train_epochs=10.0 \
17 |   --output_dir=output/0414chnsenti/
18 | 


--------------------------------------------------------------------------------
/NEZHA-TensorFlow/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # How to Contribute
 2 | 
 3 | BERT needs to maintain permanent compatibility with the pre-trained model files,
 4 | so we do not plan to make any major changes to this library (other than what was
 5 | promised in the README). However, we can accept small patches related to
 6 | re-factoring and documentation. To submit contributions, there are just a few
 7 | small guidelines you need to follow.
 8 | 
 9 | ## Contributor License Agreement
10 | 
11 | Contributions to this project must be accompanied by a Contributor License
12 | Agreement. You (or your employer) retain the copyright to your contribution;
13 | this simply gives us permission to use and redistribute your contributions as
14 | part of the project. Head over to <https://cla.developers.google.com/> to see
15 | your current agreements on file or to sign a new one.
16 | 
17 | You generally only need to submit a CLA once, so if you've already submitted one
18 | (even if it was for a different project), you probably don't need to do it
19 | again.
20 | 
21 | ## Code reviews
22 | 
23 | All submissions, including submissions by project members, require review. We
24 | use GitHub pull requests for this purpose. Consult
25 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
26 | information on using pull requests.
27 | 
28 | ## Community Guidelines
29 | 
30 | This project follows
31 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
32 | 


--------------------------------------------------------------------------------
/NEZHA-TensorFlow/Dockerfile:
--------------------------------------------------------------------------------
 1 | ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:19.06-py3
 2 | 
 3 | FROM tensorrtserver_client as trt
 4 | 
 5 | FROM ${FROM_IMAGE_NAME}
 6 | 
 7 | RUN apt-get update && apt-get install -y pbzip2 pv bzip2
 8 | 
 9 | RUN pip install toposort networkx pytest nltk tqdm html2text progressbar
10 | 
11 | WORKDIR /workspace
12 | RUN git clone https://github.com/openai/gradient-checkpointing.git
13 | RUN git clone https://github.com/attardi/wikiextractor.git
14 | RUN git clone https://github.com/soskek/bookcorpus.git
15 | 
16 | # Copy the perf_client over
17 | COPY --from=trt /workspace/build/perf_client /workspace/build/perf_client
18 | 
19 | # Copy the python wheel and install with pip
20 | COPY --from=trt /workspace/build/dist/dist/tensorrtserver*.whl /tmp/
21 | RUN pip install /tmp/tensorrtserver*.whl && rm /tmp/tensorrtserver*.whl
22 | 
23 | 
24 | WORKDIR /workspace/bert
25 | COPY . .
26 | 
27 | ENV PYTHONPATH=/workspace/bert
28 | 


--------------------------------------------------------------------------------
/NEZHA-TensorFlow/NOTICE:
--------------------------------------------------------------------------------
1 | BERT TensorFlow
2 | 
3 | This repository includes software from https://github.com/google-research/bert
4 | This repository includes software from https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT
5 | licensed under the Apache License, Version 2.0 (the "License")


--------------------------------------------------------------------------------
/NEZHA-TensorFlow/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Google AI Language Team Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | 


--------------------------------------------------------------------------------
/NEZHA-TensorFlow/data/pretrain-toy/tf_examples_00.tfrecord:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/NEZHA-TensorFlow/data/pretrain-toy/tf_examples_00.tfrecord


--------------------------------------------------------------------------------
/NEZHA-TensorFlow/data/pretrain-toy/tf_examples_01.tfrecord:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/NEZHA-TensorFlow/data/pretrain-toy/tf_examples_01.tfrecord


--------------------------------------------------------------------------------
/NEZHA-TensorFlow/fp16_utils.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Google AI Language Team Authors.
 3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | import tensorflow as tf
17 | import numpy as np
18 | 
19 | 
20 | def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
21 |                                     initializer=None, regularizer=None,
22 |                                     trainable=True,
23 |                                     *args, **kwargs):
24 |     """Custom variable getter that forces trainable variables to be stored in
25 |        float32 precision and then casts them to the training precision.
26 |     """
27 |     storage_dtype = tf.float32 if trainable else dtype
28 |     variable = getter(name, shape, dtype=storage_dtype,
29 |                       initializer=initializer, regularizer=regularizer,
30 |                       trainable=trainable,
31 |                       *args, **kwargs)
32 |     if trainable and dtype != tf.float32:
33 |         variable = tf.cast(variable, dtype)
34 |     return variable
35 | 
36 | 


--------------------------------------------------------------------------------
/NEZHA-TensorFlow/gpu_environment.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import tensorflow as tf
17 | import numpy as np
18 | 
19 | def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
20 |                                     initializer=None, regularizer=None,
21 |                                     trainable=True,
22 |                                     *args, **kwargs):
23 |     """Custom variable getter that forces trainable variables to be stored in
24 |        float32 precision and then casts them to the training precision.
25 |     """
26 |     storage_dtype = tf.float32 if trainable else dtype
27 |     variable = getter(name, shape, dtype=storage_dtype,
28 |                       initializer=initializer, regularizer=regularizer,
29 |                       trainable=trainable,
30 |                       *args, **kwargs)
31 |     if trainable and dtype != tf.float32:
32 |         variable = tf.cast(variable, dtype)
33 |     return variable
34 | 
35 | def get_custom_getter(compute_type):
36 |     return float32_variable_storage_getter if compute_type == tf.float16 else None
37 | 


--------------------------------------------------------------------------------
/NEZHA-TensorFlow/nezha/bert_base_rel_config_vocab_100503.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "attention_probs_dropout_prob": 0.1,
 3 |   "hidden_act": "gelu",
 4 |   "hidden_dropout_prob": 0.1,
 5 |   "hidden_size": 768,
 6 |   "initializer_range": 0.02,
 7 |   "intermediate_size": 3072,
 8 |   "max_position_embeddings": 512,
 9 |   "num_attention_heads": 12,
10 |   "num_hidden_layers": 12,
11 |   "type_vocab_size": 2,
12 |   "vocab_size": 100503,
13 |   "use_relative_position": true
14 | }
15 | 


--------------------------------------------------------------------------------
/NEZHA-TensorFlow/nezha/bert_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "attention_probs_dropout_prob": 0.1, 
 3 |   "directionality": "bidi", 
 4 |   "hidden_act": "gelu", 
 5 |   "hidden_dropout_prob": 0.1, 
 6 |   "hidden_size": 768, 
 7 |   "initializer_range": 0.02, 
 8 |   "intermediate_size": 3072, 
 9 |   "max_position_embeddings": 512, 
10 |   "num_attention_heads": 12, 
11 |   "num_hidden_layers": 12, 
12 |   "pooler_fc_size": 768, 
13 |   "pooler_num_attention_heads": 12, 
14 |   "pooler_num_fc_layers": 3, 
15 |   "pooler_size_per_head": 128, 
16 |   "pooler_type": "first_token_transform", 
17 |   "type_vocab_size": 2,
18 |   "vocab_size": 21128,
19 |   "use_relative_position": true
20 | }
21 | 


--------------------------------------------------------------------------------
/NEZHA-TensorFlow/optimization_test.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Google AI Language Team Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | from __future__ import absolute_import
16 | from __future__ import division
17 | from __future__ import print_function
18 | 
19 | import optimization
20 | import tensorflow as tf
21 | 
22 | 
23 | class OptimizationTest(tf.test.TestCase):
24 | 
25 |   def test_adam(self):
26 |     with self.test_session() as sess:
27 |       w = tf.get_variable(
28 |           "w",
29 |           shape=[3],
30 |           initializer=tf.constant_initializer([0.1, -0.2, -0.1]))
31 |       x = tf.constant([0.4, 0.2, -0.5])
32 |       loss = tf.reduce_mean(tf.square(x - w))
33 |       tvars = tf.trainable_variables()
34 |       grads = tf.gradients(loss, tvars)
35 |       global_step = tf.train.get_or_create_global_step()
36 |       optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2)
37 |       train_op = optimizer.apply_gradients(zip(grads, tvars), global_step)
38 |       init_op = tf.group(tf.global_variables_initializer(),
39 |                          tf.local_variables_initializer())
40 |       sess.run(init_op)
41 |       for _ in range(100):
42 |         sess.run(train_op)
43 |       w_np = sess.run(w)
44 |       self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2)
45 | 
46 | 
47 | if __name__ == "__main__":
48 |   tf.test.main()
49 | 


--------------------------------------------------------------------------------
/NEZHA-TensorFlow/scripts/run_clf.sh:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #################
 3 | #Created on Fri Jul 12 11:05:22 2019
 4 | #start codes for running clf(lcqmc/chnsenti/xnli) tasks.
 5 | # task_name is 'lcqmc' for lcqmc task, 'xnli' for xnli task and 'text_clf' for chnsenti
 6 | #read_tf_events is to find the best eval ckpt and do predict
 7 | ##################
 8 | 
 9 | CUDA_VISIBLE_DEVICES=1 python ../run_classifier.py \
10 |   --task_name=lcqmc \
11 |   --do_train=true \
12 |   --do_eval=true \
13 |   --do_train_and_eval=true \
14 |   --data_dir=../data/lcqmc/ \
15 |   --save_checkpoints_steps=50 \
16 |   --vocab_file=../nezha/vocab.txt \
17 |   --bert_config_file=../nezha/bert_config.json \
18 |   --init_checkpoint=../nezha/model.ckpt \
19 |   --max_seq_length=128 \
20 |   --train_batch_size=32 \
21 |   --eval_batch_size=32 \
22 |   --num_train_epochs=5 \
23 |   --output_dir=../output/lcqmc/
24 | 
25 | # Find the best eval checkpoint from the tf events and run prediction with it.
26 | python ../read_tf_events.py \
27 |   --task_name=lcqmc \
28 |   --task_data_dir=../data/lcqmc/ \
29 |   --max_seq_length=128 \
30 |   --predict_batch_size=32 \
31 |   --pretrained_model_dir=../nezha/ \
32 |   --task_output_dir=../output/lcqmc/


--------------------------------------------------------------------------------
/NEZHA-TensorFlow/scripts/run_clf_predict.sh:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #####################
 3 | #Created on Fri Jul 12 11:05:22 2019
 4 | #start codes for running clf(lcqmc/chnsenti/xnli) predict tasks
 5 | #####################
 6 | 
 7 | python ../run_classifier.py \
 8 |   --task_name=$1 \
 9 |   --do_predict=true \
10 |   --data_dir=$2 \
11 |   --vocab_file=$3/vocab.txt \
12 |   --bert_config_file=$3/bert_config.json \
13 |   --init_checkpoint=$4 \
14 |   --max_seq_length=$5 \
15 |   --predict_batch_size=$6 \
16 |   --output_dir=$7/
17 | 


--------------------------------------------------------------------------------
/NEZHA-TensorFlow/scripts/run_ner_predict.sh:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | 
 4 | python bert-downstream-code/run_classifier_ner.py \
 5 |   --task_name=$1 \
 6 |   --do_predict=true \
 7 |   --data_dir=$2 \
 8 |   --vocab_file=$3/vocab.txt \
 9 |   --bert_config_file=$3/bert_config.json \
10 |   --init_checkpoint=$4 \
11 |   --max_seq_length=$5 \
12 |   --predict_batch_size=$6 \
13 |   --output_dir=$7/
14 | 


--------------------------------------------------------------------------------
/NEZHA-TensorFlow/scripts/run_pretraining.sh:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #################
 3 | #Run pretraining.
 4 | ##################
 5 | 
 6 | 
 7 | mpiexec --allow-run-as-root --bind-to socket -np 2 python run_pretraining.py \
 8 |   --input_file=./data/pretrain-toy/*.tfrecord \
 9 |   --output_dir=./nezha/ \
10 |   --do_train=True \
11 |   --do_eval=True \
12 |   --bert_config_file=./nezha/bert_config.json \
13 |   --train_batch_size=32 \
14 |   --max_seq_length=128 \
15 |   --max_predictions_per_seq=20 \
16 |   --num_train_steps=200 \
17 |   --num_warmup_steps=10 \
18 |   --learning_rate=2e-5 \
19 |   --horovod  
20 | 


--------------------------------------------------------------------------------
/NEZHA-TensorFlow/scripts/run_reading.sh:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | ############
 3 | #Created on Fri Jul 12 11:05:22 2019
 4 | #script for squad-like task fine-tuning
 5 | #############
 6 | 
 7 | 
 8 | CUDA_VISIBLE_DEVICES=1 python ../run_squad.py \
 9 |   --vocab_file=../nezha/vocab.txt \
10 |   --bert_config_file=../nezha/bert_config.json \
11 |   --init_checkpoint=../nezha/model.ckpt \
12 |   --do_train=True \
13 |   --train_file=../data/cmrc/new_cmrc2018_train.json \
14 |   --do_predict=True \
15 |   --predict_file=../data/cmrc/new_cmrc2018_dev.json \
16 |   --train_batch_size=4 \
17 |   --learning_rate=3e-5 \
18 |   --num_train_epochs=1.0 \
19 |   --max_seq_length=512 \
20 |   --doc_stride=128 \
21 |   --do_lower_case=False \
22 |   --output_dir=../output/cmrc/
23 | python ../cmrc2018_evaluate.py ../data/cmrc/new_cmrc2018_dev.json ../output/cmrc/dev_predictions.json ../output/cmrc/result_metric.txt
24 | 


--------------------------------------------------------------------------------
/NEZHA-TensorFlow/scripts/run_seq_labelling.sh:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | ##########
 3 | #Created on Fri Jul 12 11:05:22 2019
 4 | #start codes for running ner task.
 5 | #Note that read_tf_events.py is to read evaluation results from tf events file.
 6 | ###########
 7 | 
 8 | CUDA_VISIBLE_DEVICES=1 python ../run_classifier_ner.py \
 9 |   --task_name=ner \
10 |   --do_train=true \
11 |   --do_eval=true \
12 |   --do_train_and_eval=true \
13 |   --data_dir=../data/peoples-daily-ner \
14 |   --save_checkpoints_steps=100 \
15 |   --vocab_file=../nezha/vocab.txt \
16 |   --bert_config_file=../nezha/bert_config.json \
17 |   --init_checkpoint=../nezha/model.ckpt \
18 |   --max_seq_length=256 \
19 |   --train_batch_size=16 \
20 |   --eval_batch_size=16 \
21 |   --num_train_epochs=10 \
22 |   --output_dir=../output/peoples-daily-ner/
23 | # Read the evaluation results from the tf events file.
24 | python ../read_tf_events.py \
25 |   --task_name=ner \
26 |   --task_data_dir=../data/peoples-daily-ner \
27 |   --max_seq_length=256 \
28 |   --predict_batch_size=16 \
29 |   --pretrained_model_dir=../nezha/ \
30 |   --task_output_dir=../output/peoples-daily-ner/
31 | 


--------------------------------------------------------------------------------
/NEZHA-TensorFlow/scripts/run_seq_labelling_predict.sh:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | ################
 3 | #Created on Fri Jul 12 11:05:22 2019
 4 | #start codes for running ds tasks prediction
 5 | #################
 6 | 
 7 | python ../run_classifier_ner.py \
 8 |   --task_name=$1 \
 9 |   --do_predict=true \
10 |   --data_dir=$2 \
11 |   --vocab_file=$3/vocab.txt \
12 |   --bert_config_file=$3/bert_config.json \
13 |   --init_checkpoint=$4 \
14 |   --max_seq_length=$5 \
15 |   --predict_batch_size=$6 \
16 |   --output_dir=$7/
17 | 


--------------------------------------------------------------------------------
/Noah_WuKong/README.md:
--------------------------------------------------------------------------------
 1 | # WukongOpenSource
 2 | 
 3 | Code for paper _“Wukong: 100 Million Large-scale Chinese Cross-modal Pre-training Dataset and A Foundation Framework”_ ([arXiv:2202.06767](https://arxiv.org/abs/2202.06767))
 4 | 
 5 | ## Code structure
 6 | 
 7 | ```
 8 | .
 9 | ├── configs/...                     # contains configs for model loading
10 | ├── data
11 | │   ├── __init__.py
12 | │   ├── datasets.py                 # definition of datasets, e.g., ImageNet
13 | │   ├── res
14 | │   │   ├── classnames.json         # definition of classification names
15 | │   │   └── prompts.txt             # prompts for ensemble
16 | │   └── tokenizer
17 | │       ├── __init__.py
18 | │       ├── res
19 | │       │   └── vocab.txt           # vocabulary file for tokenization
20 | │       ├── simple_tokenizer.py     # implementation of Chinese tokenization
21 | │       └── utils.py
22 | ├── main.py                         # main script for model evaluation
23 | ├── model
24 | │   ├── __init__.py
25 | │   ├── builder.py
26 | │   ├── language
27 | │   │   ├── __init__.py
28 | │   │   └── transformer.py          # module of text encoder
29 | │   ├── modules.py                  # some other modules
30 | │   ├── utils.py
31 | │   ├── vision
32 | │   │   ├── __init__.py
33 | │   │   ├── swin_transformer.py     # module of vision encoder [swin-transformer]
34 | │   │   └── vision_transformer.py   # module of vision encoder [vit]
35 | │   └── wukong.py                   # model backbone
36 | ├── README.md
37 | ├── requirements.txt
38 | └── utils.py
39 | ```
40 | 
41 | ## Download models
42 | 
43 | Benchmarks of our pretrained multi-modal models can be found at [Noah-Wukong Benchmark](https://wukong-dataset.github.io/wukong-dataset/benchmark.html)
44 | 
45 | ## Evaluate on ImageNet
46 | 
47 | Below is an example of evaluating the Wukong_ViT-L model.
48 | 
49 | ```shell
50 | python main.py \
51 |   --config="configs/wukong_vit_l/wukong_vit_l.py" \
52 |   --checkpoint="/cache/ckpt/wukong_vit_l.ckpt" \
53 |   --data_dir="/cache/data/ILSVRC/"
54 | ```
55 | 
56 | ## Reference
57 | 
58 | Jiaxi Gu, Xiaojun Meng, Guansong Lu, Lu Hou, Minzhe Niu, Xiaodan Liang, Lewei Yao, Runhui Huang, Wei Zhang, Xin Jiang, Chunjing Xu, Hang Xu.
59 | [Wukong: 100 Million Large-scale Chinese Cross-modal Pre-training Dataset and A Foundation Framework](https://arxiv.org/abs/2202.06767).
60 | ```
61 | @misc{gu2022wukong,
62 |   title = {Wukong: 100 Million Large-scale Chinese Cross-modal Pre-training Dataset and A Foundation Framework},
63 |   author = {Gu, Jiaxi and Meng, Xiaojun and Lu, Guansong and Hou, Lu and Niu, Minzhe and Liang, Xiaodan and Yao, Lewei and Huang, Runhui and Zhang, Wei and Jiang, Xin and Xu, Chunjing and Xu, Hang},
64 |   url = {https://arxiv.org/abs/2202.06767},
65 |   year = {2022}
66 | }
67 | ```


--------------------------------------------------------------------------------
/Noah_WuKong/configs/wukong_swin/wukong_swin.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright (c) 2022, Huawei Technologies Co., Ltd. All rights reserved.
 5 | #
 6 | # Licensed under the Apache License, Version 2.0 (the "License");
 7 | # you may not use this file except in compliance with the License.
 8 | # You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | model = dict(
18 |     type='Wukong',
19 |     pretrained='',
20 |     embed_dim=256,
21 |     visual=dict(
22 |         type='SwinTransformer',
23 |         name='swin_large_patch4_window7_224',
24 |         input_resolution=224,
25 |         embed_dim=192,
26 |         depths=[2, 2, 18, 2],
27 |         num_heads=[6, 12, 24, 48],
28 |         window_size=7,
29 |         patch_size=4,
30 |         num_classes=21841,
31 |         token_reduction=dict(num_tokens=12)),
32 |     text=dict(
33 |         type='TextTransformer',
34 |         context_length=32,
35 |         vocab_size=21128,
36 |         width=768,
37 |         heads=12,
38 |         layers=12),
39 |     is_token_wise=True
40 | )


--------------------------------------------------------------------------------
/Noah_WuKong/configs/wukong_swin/wukong_swin_f.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright (c) 2022, Huawei Technologies Co., Ltd. All rights reserved.
 5 | #
 6 | # Licensed under the Apache License, Version 2.0 (the "License");
 7 | # you may not use this file except in compliance with the License.
 8 | # You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | model = dict(
18 |     type='Wukong',
19 |     pretrained='',
20 |     embed_dim=256,
21 |     visual=dict(
22 |         type='SwinTransformer',
23 |         name='swin_large_patch4_window7_224',
24 |         input_resolution=224,
25 |         embed_dim=192,
26 |         depths=[2, 2, 18, 2],
27 |         num_heads=[6, 12, 24, 48],
28 |         window_size=7,
29 |         patch_size=4,
30 |         num_classes=21841),
31 |     text=dict(
32 |         type='TextTransformer',
33 |         context_length=32,
34 |         vocab_size=21128,
35 |         width=768,
36 |         heads=12,
37 |         layers=12),
38 |     is_token_wise=True
39 | )


--------------------------------------------------------------------------------
/Noah_WuKong/configs/wukong_swin/wukong_swin_g.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright (c) 2022, Huawei Technologies Co., Ltd. All rights reserved.
 5 | #
 6 | # Licensed under the Apache License, Version 2.0 (the "License");
 7 | # you may not use this file except in compliance with the License.
 8 | # You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
# Wukong configuration pairing a Swin image tower with a text transformer,
# with ``is_token_wise`` switched off and a 768-wide joint embedding.
model = {
    'type': 'Wukong',
    'pretrained': '',
    'embed_dim': 768,
    # Image tower settings.
    'visual': {
        'type': 'SwinTransformer',
        'name': 'swin_large_patch4_window7_224',
        'input_resolution': 224,
        'embed_dim': 192,
        'depths': [2, 2, 18, 2],
        'num_heads': [6, 12, 24, 48],
        'window_size': 7,
        'patch_size': 4,
        'num_classes': 21841,
    },
    # Text tower settings.
    'text': {
        'type': 'TextTransformer',
        'context_length': 32,
        'vocab_size': 21128,
        'width': 768,
        'heads': 12,
        'layers': 12,
    },
    'is_token_wise': False,
}


--------------------------------------------------------------------------------
/Noah_WuKong/configs/wukong_vit_b/wukong_vit_b.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright (c) 2022, Huawei Technologies Co., Ltd. All rights reserved.
 5 | #
 6 | # Licensed under the Apache License, Version 2.0 (the "License");
 7 | # you may not use this file except in compliance with the License.
 8 | # You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
# Wukong configuration with a 12-layer VisionTransformer image tower that
# additionally carries a ``token_reduction`` sub-config (12 tokens).
model = {
    'type': 'Wukong',
    'pretrained': '',
    'embed_dim': 256,
    # Image tower settings.
    'visual': {
        'type': 'VisionTransformer',
        'input_resolution': 224,
        'layers': 12,
        'width': 768,
        'patch_size': 32,
        'token_reduction': {'num_tokens': 12},
    },
    # Text tower settings.
    'text': {
        'type': 'TextTransformer',
        'context_length': 32,
        'vocab_size': 21128,
        'width': 512,
        'heads': 8,
        'layers': 12,
    },
    'is_token_wise': True,
}


--------------------------------------------------------------------------------
/Noah_WuKong/configs/wukong_vit_b/wukong_vit_b_f.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright (c) 2022, Huawei Technologies Co., Ltd. All rights reserved.
 5 | #
 6 | # Licensed under the Apache License, Version 2.0 (the "License");
 7 | # you may not use this file except in compliance with the License.
 8 | # You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
# Wukong configuration with a 12-layer VisionTransformer image tower and no
# ``token_reduction`` entry; ``is_token_wise`` is enabled.
model = {
    'type': 'Wukong',
    'pretrained': '',
    'embed_dim': 256,
    # Image tower settings.
    'visual': {
        'type': 'VisionTransformer',
        'input_resolution': 224,
        'layers': 12,
        'width': 768,
        'patch_size': 32,
    },
    # Text tower settings.
    'text': {
        'type': 'TextTransformer',
        'context_length': 32,
        'vocab_size': 21128,
        'width': 512,
        'heads': 8,
        'layers': 12,
    },
    'is_token_wise': True,
}


--------------------------------------------------------------------------------
/Noah_WuKong/configs/wukong_vit_b/wukong_vit_b_g.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright (c) 2022, Huawei Technologies Co., Ltd. All rights reserved.
 5 | #
 6 | # Licensed under the Apache License, Version 2.0 (the "License");
 7 | # you may not use this file except in compliance with the License.
 8 | # You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
# Wukong configuration with a 12-layer VisionTransformer image tower, a
# 512-wide joint embedding and ``is_token_wise`` switched off.
model = {
    'type': 'Wukong',
    'pretrained': '',
    'embed_dim': 512,
    # Image tower settings.
    'visual': {
        'type': 'VisionTransformer',
        'input_resolution': 224,
        'layers': 12,
        'width': 768,
        'patch_size': 32,
    },
    # Text tower settings.
    'text': {
        'type': 'TextTransformer',
        'context_length': 32,
        'vocab_size': 21128,
        'width': 512,
        'heads': 8,
        'layers': 12,
    },
    'is_token_wise': False,
}
36 | 


--------------------------------------------------------------------------------
/Noah_WuKong/configs/wukong_vit_l/wukong_vit_l.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright (c) 2022, Huawei Technologies Co., Ltd. All rights reserved.
 5 | #
 6 | # Licensed under the Apache License, Version 2.0 (the "License");
 7 | # you may not use this file except in compliance with the License.
 8 | # You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
# Wukong configuration with a 24-layer VisionTransformer image tower that
# additionally carries a ``token_reduction`` sub-config (24 tokens).
model = {
    'type': 'Wukong',
    'pretrained': '',
    'embed_dim': 256,
    # Image tower settings.
    'visual': {
        'type': 'VisionTransformer',
        'input_resolution': 224,
        'layers': 24,
        'width': 1024,
        'patch_size': 14,
        'token_reduction': {'num_tokens': 24},
    },
    # Text tower settings.
    'text': {
        'type': 'TextTransformer',
        'context_length': 32,
        'vocab_size': 21128,
        'width': 768,
        'heads': 12,
        'layers': 12,
    },
    'is_token_wise': True,
}


--------------------------------------------------------------------------------
/Noah_WuKong/configs/wukong_vit_l/wukong_vit_l_f.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright (c) 2022, Huawei Technologies Co., Ltd. All rights reserved.
 5 | #
 6 | # Licensed under the Apache License, Version 2.0 (the "License");
 7 | # you may not use this file except in compliance with the License.
 8 | # You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
# Wukong configuration with a 24-layer VisionTransformer image tower and no
# ``token_reduction`` entry; ``is_token_wise`` is enabled.
model = {
    'type': 'Wukong',
    'pretrained': '',
    'embed_dim': 256,
    # Image tower settings.
    'visual': {
        'type': 'VisionTransformer',
        'input_resolution': 224,
        'layers': 24,
        'width': 1024,
        'patch_size': 14,
    },
    # Text tower settings.
    'text': {
        'type': 'TextTransformer',
        'context_length': 32,
        'vocab_size': 21128,
        'width': 768,
        'heads': 12,
        'layers': 12,
    },
    'is_token_wise': True,
}


--------------------------------------------------------------------------------
/Noah_WuKong/configs/wukong_vit_l/wukong_vit_l_g.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright (c) 2022, Huawei Technologies Co., Ltd. All rights reserved.
 5 | #
 6 | # Licensed under the Apache License, Version 2.0 (the "License");
 7 | # you may not use this file except in compliance with the License.
 8 | # You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
# Wukong configuration with a 24-layer VisionTransformer image tower, a
# 768-wide joint embedding and ``is_token_wise`` switched off.
model = {
    'type': 'Wukong',
    'pretrained': '',
    'embed_dim': 768,
    # Image tower settings.
    'visual': {
        'type': 'VisionTransformer',
        'input_resolution': 224,
        'layers': 24,
        'width': 1024,
        'patch_size': 14,
    },
    # Text tower settings.
    'text': {
        'type': 'TextTransformer',
        'context_length': 32,
        'vocab_size': 21128,
        'width': 768,
        'heads': 12,
        'layers': 12,
    },
    'is_token_wise': False,
}


--------------------------------------------------------------------------------
/Noah_WuKong/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/Noah_WuKong/data/__init__.py


--------------------------------------------------------------------------------
/Noah_WuKong/data/res/prompts.txt:
--------------------------------------------------------------------------------
 1 | {}的照片。
 2 | 质量差的{}的照片。
 3 | 许多{}的照片。
 4 | {}的雕塑。
 5 | 难以看到{}的照片。
 6 | {}的低分辨率照片。
 7 | {}的渲染。
 8 | 涂鸦{}。
 9 | {}的糟糕照片。
10 | {}的裁剪照片。
11 | {}的纹身。
12 | {}的刺绣照片。
13 | 很难看到{}的照片。
14 | {}的明亮照片。
15 | 一张干净的{}的照片。
16 | 一张包含{}的照片。
17 | {}的深色照片。
18 | {}的手绘画。
19 | 我的{}的照片。
20 | 不自然的{}的照片。
21 | 一张酷的{}的照片。
22 | {}的特写照片。
23 | {}的黑白照片。
24 | 一幅{}的画。
25 | 一幅{}的绘画。
26 | 一张{}的像素照片。
27 | {}的雕像。
28 | 一张{}的明亮照片。
29 | {}的裁剪照片。
30 | 人造的{}的照片。
31 | 一张关于{}的照片。
32 | 损坏的{}的jpeg照片。
33 | {}的模糊照片。
34 | {}的相片。
35 | 一张{}的好照片。
36 | {}的渲染照。
37 | 视频游戏中的{}。
38 | 一张{}的照片。
39 | {}的涂鸦。
40 | {}的近距离照片。
41 | {}的折纸。
42 | {}在视频游戏中。
43 | {}的草图。
44 | {}的涂鸦照。
45 | {}的折纸形状。
46 | 低分辨率的{}的照片。
47 | 玩具{}。
48 | {}的副本。
49 | {}的干净的照片。
50 | 一张大{}的照片。
51 | {}的重现。
52 | 一张漂亮的{}的照片。
53 | 一张奇怪的{}的照片。
54 | 模糊的{}的照片。
55 | 卡通{}。
56 | {}的艺术作品。
57 | {}的素描。
58 | 刺绣{}。
59 | {}的像素照。
60 | {}的拍照。
61 | {}的损坏的照片。
62 | 高质量的{}的照片。
63 | 毛绒玩具{}。
64 | 漂亮的{}的照片。
65 | 小{}的照片。
66 | 照片是奇怪的{}。
67 | 漫画{}。
68 | {}的艺术照。
69 | {}的图形。
70 | 大{}的照片。
71 | 黑白的{}的照片。
72 | {}毛绒玩具。
73 | 一张{}的深色照片。
74 | {}的摄影图。
75 | {}的涂鸦照。
76 | 玩具形状的{}。
77 | 拍了{}的照片。
78 | 酷酷的{}的照片。
79 | 照片里的小{}。
80 | {}的刺青。


--------------------------------------------------------------------------------
/Noah_WuKong/data/tokenizer/__init__.py:
--------------------------------------------------------------------------------
1 | from .simple_tokenizer import SimpleTokenizer


--------------------------------------------------------------------------------
/Noah_WuKong/model/__init__.py:
--------------------------------------------------------------------------------
1 | from .wukong import Wukong
2 | from .builder import build_model


--------------------------------------------------------------------------------
/Noah_WuKong/model/builder.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright (c) 2022, Huawei Technologies Co., Ltd. All rights reserved.
 5 | #
 6 | # Licensed under the Apache License, Version 2.0 (the "License");
 7 | # you may not use this file except in compliance with the License.
 8 | # You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | from mmcv import Registry
18 | from mmcv import build_from_cfg
19 | 
# Module-level mmcv registry, named 'model', that build_model() resolves
# configs against.
MODELS = Registry('model')


def build_model(config):
    """Instantiate a model from ``config`` via the ``MODELS`` registry.

    Args:
        config: Configuration object forwarded unchanged to mmcv's
            ``build_from_cfg`` (presumably a dict with a ``type`` key, like
            the ``model`` dicts in the config files -- confirm with callers).

    Returns:
        Whatever ``build_from_cfg(config, MODELS)`` returns.
    """
    built = build_from_cfg(config, MODELS)
    return built
25 | 


--------------------------------------------------------------------------------
/Noah_WuKong/model/language/__init__.py:
--------------------------------------------------------------------------------
1 | from .transformer import Transformer, TextTransformer
2 | 


--------------------------------------------------------------------------------
/Noah_WuKong/model/vision/__init__.py:
--------------------------------------------------------------------------------
1 | from .vision_transformer import VisionTransformer
2 | from .swin_transformer import SwinTransformer
3 | 


--------------------------------------------------------------------------------
/Noah_WuKong/requirements.txt:
--------------------------------------------------------------------------------
1 | mmcv
2 | timm
3 | torch
4 | torchvision


--------------------------------------------------------------------------------
/Noah_Wukong-MindSpore/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/Noah_Wukong-MindSpore/src/__init__.py


--------------------------------------------------------------------------------
/Noah_Wukong-MindSpore/src/config/wukong_vit_b_32.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   visual:
 3 |     type: VisionTransformer
 4 |     input_resolution: 224
 5 |     layers: 12
 6 |     width: 768
 7 |     patch_size: 32
 8 |     output_dim: 256
 9 |     token_learner:
10 |       num_tokens: 12
11 |       num_groups: 8
12 |       dropout_rate: 0.0
13 |   text:
14 |     type: TextTransformer
15 |     context_length: 32
16 |     vocab_size: 21128
17 |     width: 512
18 |     heads: 8
19 |     layers: 12
20 |     output_dim: 256
21 | eval: filip
22 | 


--------------------------------------------------------------------------------
/Noah_Wukong-MindSpore/src/config/wukong_vit_b_32_clip.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   visual:
 3 |     type: VisionTransformer
 4 |     input_resolution: 224
 5 |     layers: 12
 6 |     width: 768
 7 |     patch_size: 32
 8 |     output_dim: 512
 9 |     return_full_embed: False
10 |   text:
11 |     type: TextTransformer
12 |     context_length: 32
13 |     vocab_size: 21128
14 |     width: 512
15 |     heads: 8
16 |     layers: 12
17 |     output_dim: 512
18 |     return_full_embed: False
19 | eval: clip
20 | 


--------------------------------------------------------------------------------
/Noah_Wukong-MindSpore/src/config/wukong_vit_b_32_filip.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   visual:
 3 |     type: VisionTransformer
 4 |     input_resolution: 224
 5 |     layers: 12
 6 |     width: 768
 7 |     patch_size: 32
 8 |     output_dim: 256
 9 |   text:
10 |     type: TextTransformer
11 |     context_length: 32
12 |     vocab_size: 21128
13 |     width: 512
14 |     heads: 8
15 |     layers: 12
16 |     output_dim: 256
17 | eval: filip
18 | 


--------------------------------------------------------------------------------
/Noah_Wukong-MindSpore/src/config/wukong_vit_l_14.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   visual:
 3 |     type: VisionTransformer
 4 |     input_resolution: 224
 5 |     layers: 24
 6 |     width: 1024
 7 |     patch_size: 14
 8 |     output_dim: 256
 9 |     token_learner:
10 |       num_tokens: 24
11 |       num_groups: 8
12 |       dropout_rate: 0.0
13 |   text:
14 |     type: TextTransformer
15 |     context_length: 32
16 |     vocab_size: 21128
17 |     width: 768
18 |     heads: 12
19 |     layers: 12
20 |     output_dim: 256
21 | eval: filip
22 | 


--------------------------------------------------------------------------------
/Noah_Wukong-MindSpore/src/config/wukong_vit_l_14_clip.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   visual:
 3 |     type: VisionTransformer
 4 |     input_resolution: 224
 5 |     layers: 24
 6 |     width: 1024
 7 |     patch_size: 14
 8 |     output_dim: 768
 9 |     return_full_embed: False
10 |   text:
11 |     type: TextTransformer
12 |     context_length: 32
13 |     vocab_size: 21128
14 |     width: 768
15 |     heads: 12
16 |     layers: 12
17 |     output_dim: 768
18 |     return_full_embed: False
19 | eval: clip
20 | 


--------------------------------------------------------------------------------
/Noah_Wukong-MindSpore/src/config/wukong_vit_l_14_filip.yaml:
--------------------------------------------------------------------------------
 1 | model:
 2 |   visual:
 3 |     type: VisionTransformer
 4 |     input_resolution: 224
 5 |     layers: 24
 6 |     width: 1024
 7 |     patch_size: 14
 8 |     output_dim: 256
 9 |   text:
10 |     type: TextTransformer
11 |     context_length: 32
12 |     vocab_size: 21128
13 |     width: 768
14 |     heads: 12
15 |     layers: 12
16 |     output_dim: 256
17 | eval: filip
18 | 


--------------------------------------------------------------------------------
/Noah_Wukong-MindSpore/src/dataset/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2022 Huawei Technologies Co., Ltd
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 | from .dataset import get_dataset
16 | 
17 | 
18 | __all__ = ['get_dataset']
19 | 


--------------------------------------------------------------------------------
/Noah_Wukong-MindSpore/src/dataset/dataset.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2022 Huawei Technologies Co., Ltd
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 | from mindspore import dtype as mstype
16 | import mindspore.dataset as ds
17 | from mindspore.dataset.vision import Inter
18 | import mindspore.dataset.vision as C
19 | import mindspore.dataset.transforms as C2
20 | 
21 | 
def get_wukong_dataset(dataset_path, columns_list, num_parallel_workers, shuffle, num_shards, shard_id, batch_size):
    """Open ``dataset_path`` as a ``MindDataset`` and batch it.

    Args:
        dataset_path: Location handed to ``ds.MindDataset``.
        columns_list: Forwarded as ``columns_list``.
        num_parallel_workers: Forwarded as ``num_parallel_workers``.
        shuffle: Forwarded as ``shuffle``.
        num_shards: Forwarded as ``num_shards``.
        shard_id: Forwarded as ``shard_id``.
        batch_size: Batch size applied with ``.batch``.

    Returns:
        The batched dataset object.
    """
    reader_options = {
        'columns_list': columns_list,
        'num_parallel_workers': num_parallel_workers,
        'shuffle': shuffle,
        'num_shards': num_shards,
        'shard_id': shard_id,
    }
    reader = ds.MindDataset(dataset_path, **reader_options)
    return reader.batch(batch_size)
31 | 
32 | 
def get_dataset(dataset_path, batch_size):
    """Build a batched image-folder dataset with a fixed preprocessing chain.

    Images are decoded, normalized, resized to 224 with bicubic
    interpolation, center-cropped to 224, converted from HWC to CHW layout
    and cast to float32, in that order.

    Args:
        dataset_path: Root directory handed to ``ds.ImageFolderDataset``.
        batch_size: Batch size applied with ``.batch``.

    Returns:
        The batched dataset object.
    """
    # Per-channel statistics; multiplied by 255 below before being handed
    # to Normalize.
    channel_mean = (0.48145466, 0.4578275, 0.40821073)
    channel_std = (0.26862954, 0.26130258, 0.27577711)
    mean_255 = tuple(value * 255 for value in channel_mean)
    std_255 = tuple(value * 255 for value in channel_std)

    folder_ds = ds.ImageFolderDataset(dataset_path, num_parallel_workers=4)
    image_ops = [
        C.Decode(),
        C.Normalize(mean=mean_255, std=std_255),
        C.Resize(224, Inter.BICUBIC),
        C.CenterCrop(224),
        C.HWC2CHW(),
        C2.TypeCast(mstype.float32),
    ]
    folder_ds = folder_ds.map(
        image_ops,
        input_columns=["image"],
        output_columns=None,
        column_order=["image", "label"],
    )
    return folder_ds.batch(batch_size)
49 | 


--------------------------------------------------------------------------------
/Noah_Wukong-MindSpore/src/model/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2022 Huawei Technologies Co., Ltd
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 | from .visual_encoder import VisualTransformer, ClipVisualTransformer
16 | from .text_encoder import BERT_Wukong
17 | from .matrics import FilipTemplateEncoder, ClipTemplateEncoder, FilipEval, ClipEval
18 | 
19 | __all__ = ['VisualTransformer', 'ClipVisualTransformer', 'BERT_Wukong', 'FilipTemplateEncoder',
20 |            'ClipTemplateEncoder', 'FilipEval', 'ClipEval']
21 | 


--------------------------------------------------------------------------------
/Noah_Wukong-MindSpore/src/model/token_learner.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2022 Huawei Technologies Co., Ltd
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 | import mindspore.nn as nn
16 | import mindspore.ops as ops
17 | 
18 | 
class TokenLearnerModule(nn.Cell):
    """Pool a spatial feature map into ``num_tokens`` attention-weighted tokens.

    Args:
        in_channels: Channel count of the input's last axis.
        num_tokens: Number of attention maps, and hence output tokens.
        num_groups: ``group`` value for the two grouped 1x1 convolutions.
        dropout_rate: ``1.0 - dropout_rate`` is passed to ``nn.Dropout``
            (presumably a keep probability in the targeted MindSpore
            version -- confirm).
    """

    def __init__(self, in_channels, num_tokens, num_groups, dropout_rate):
        super().__init__()
        self.in_channels = in_channels
        self.num_tokens = num_tokens
        self.num_groups = num_groups

        self.norm = nn.LayerNorm([in_channels])
        # Grouped 1x1 conv followed by a 1x1 conv producing one map per token.
        grouped_conv = nn.Conv2d(in_channels, in_channels, 1, group=num_groups)
        token_conv = nn.Conv2d(in_channels, num_tokens, 1)
        self.attention_maps = nn.SequentialCell([grouped_conv, token_conv])
        self.feat_conv = nn.Conv2d(in_channels, in_channels, 1, group=num_groups)
        self.softmax = nn.Softmax()
        self.dropout = nn.Dropout(1.0 - dropout_rate)

    def construct(self, x):
        """Return ``(tokens, attention)`` for a 4-D input ``x``.

        ``x`` is unpacked as ``(batch, height, width, channels)``. The
        attention tensor is reshaped to ``(batch, num_tokens, height * width)``
        and the tokens are the matmul of that tensor with the flattened
        features, followed by dropout.
        """
        batch, height, width, _ = x.shape
        num_positions = height * width

        # Attention branch: normalize, move channels to axis 1 for the
        # convolutions, then flatten the spatial axes and apply softmax
        # (default axis of nn.Softmax).
        attn = self.norm(x).transpose(0, 3, 1, 2)
        attn = self.attention_maps(attn).transpose(0, 2, 3, 1)
        attn = attn.reshape(batch, num_positions, -1).transpose(0, 2, 1)
        attn = self.softmax(attn)

        # Feature branch: grouped 1x1 conv on the un-normalized input,
        # flattened to (batch, height * width, channels).
        values = self.feat_conv(x.transpose(0, 3, 1, 2)).transpose(0, 2, 3, 1)
        values = values.reshape(batch, num_positions, -1)

        tokens = self.dropout(ops.matmul(attn, values))
        return tokens, attn
55 | 


--------------------------------------------------------------------------------
/Noah_Wukong-MindSpore/src/tools/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2022 Huawei Technologies Co., Ltd
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 | from .template_generate import generate_zh_template
16 | from .model_utils import load_visual_model, load_text_model
17 | from .simple_tokenizer import set_tokenizer_lang, tokenize
18 | 
19 | 
20 | __all__ = ['generate_zh_template', 'load_visual_model',
21 |            'load_text_model', 'set_tokenizer_lang', 'tokenize']
22 | 


--------------------------------------------------------------------------------
/Noah_Wukong-MindSpore/src/tools/template_generate.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2022 Huawei Technologies Co., Ltd
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 | import os
16 | from .simple_tokenizer import set_tokenizer_lang, tokenize
17 | 
18 | 
def generate_zh_template(label_list):
    """Tokenize every (label, prompt template) combination.

    Prompt templates are read, one per line, from ``zh_templates.txt``
    located next to this module. Each ``{}`` placeholder in a template is
    replaced by the label text.

    Args:
        label_list: Sized sequence of label strings.

    Returns:
        The result of ``tokenize`` reshaped to
        ``(len(label_list), number_of_templates, -1)``; entries are ordered
        label-major, matching the reshape.
    """
    # 32 is the second argument to the tokenizer setup (presumably the
    # context length -- confirm against simple_tokenizer).
    set_tokenizer_lang('zh', 32)
    template_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'zh_templates.txt'
    )

    # Read with an explicit UTF-8 encoding (the templates are Chinese text,
    # and the platform default encoding is not guaranteed to be UTF-8) and
    # close the file deterministically instead of leaking the handle.
    with open(template_path, 'r', encoding='utf-8') as template_file:
        templates = [line.strip() for line in template_file]

    num_prompts = len(templates)
    num_labels = len(label_list)
    template_list = [
        template.replace('{}', label)
        for label in label_list
        for template in templates
    ]
    token = tokenize(template_list).reshape((num_labels, num_prompts, -1))
    return token
37 | 


--------------------------------------------------------------------------------
/PanGu-Bot/Readme.md:
--------------------------------------------------------------------------------
To be completed.
2 | 


--------------------------------------------------------------------------------
/PanGu-α/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | 


--------------------------------------------------------------------------------
/PanGu-α/.idea/PanGu-Alpha.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module type="PYTHON_MODULE" version="4">
 3 |   <component name="NewModuleRootManager">
 4 |     <content url="file://$MODULE_DIR$" />
 5 |     <orderEntry type="inheritedJdk" />
 6 |     <orderEntry type="sourceFolder" forTests="false" />
 7 |   </component>
 8 |   <component name="PyDocumentationSettings">
 9 |     <option name="format" value="GOOGLE" />
10 |     <option name="myDocStringFormat" value="Google" />
11 |   </component>
12 |   <component name="TestRunnerService">
13 |     <option name="PROJECT_TEST_RUNNER" value="pytest" />
14 |   </component>
15 | </module>


--------------------------------------------------------------------------------
/PanGu-α/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
 1 | <component name="InspectionProjectProfileManager">
 2 |   <profile version="1.0">
 3 |     <option name="myName" value="Project Default" />
 4 |     <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
 5 |       <option name="ignoredPackages">
 6 |         <value>
 7 |           <list size="7">
 8 |             <item index="0" class="java.lang.String" itemvalue="protobuf" />
 9 |             <item index="1" class="java.lang.String" itemvalue="jsonlines" />
10 |             <item index="2" class="java.lang.String" itemvalue="regex" />
11 |             <item index="3" class="java.lang.String" itemvalue="boto3" />
12 |             <item index="4" class="java.lang.String" itemvalue="pycocotools" />
13 |             <item index="5" class="java.lang.String" itemvalue="tensorflow" />
14 |             <item index="6" class="java.lang.String" itemvalue="six" />
15 |           </list>
16 |         </value>
17 |       </option>
18 |     </inspection_tool>
19 |   </profile>
20 | </component>


--------------------------------------------------------------------------------
/PanGu-α/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 | <component name="InspectionProjectProfileManager">
2 |   <settings>
3 |     <option name="USE_PROJECT_PROFILE" value="false" />
4 |     <version value="1.0" />
5 |   </settings>
6 | </component>


--------------------------------------------------------------------------------
/PanGu-α/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
4 | </project>


--------------------------------------------------------------------------------
/PanGu-α/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/PanGu-Alpha.iml" filepath="$PROJECT_DIR$/.idea/PanGu-Alpha.iml" />
6 |     </modules>
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/PanGu-α/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$" vcs="Git" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/PanGu-α/PANGU-α.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/PANGU-α.pdf


--------------------------------------------------------------------------------
/PanGu-α/docs/13B.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/docs/13B.png


--------------------------------------------------------------------------------
/PanGu-α/docs/2.6B.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/docs/2.6B.png


--------------------------------------------------------------------------------
/PanGu-α/docs/Pipline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/docs/Pipline.png


--------------------------------------------------------------------------------
/PanGu-α/docs/dataset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/docs/dataset.png


--------------------------------------------------------------------------------
/PanGu-α/docs/logos.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/docs/logos.png


--------------------------------------------------------------------------------
/PanGu-α/docs/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/docs/model.png


--------------------------------------------------------------------------------
/PanGu-α/docs/task.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/docs/task.png


--------------------------------------------------------------------------------
/PanGu-α/docs/微信交流群2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/docs/微信交流群2.png


--------------------------------------------------------------------------------
/PanGu-α/docs/鹏程.盘古微信交流群.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/docs/鹏程.盘古微信交流群.png


--------------------------------------------------------------------------------
/PanGu-α/generate.py:
--------------------------------------------------------------------------------
 1 | """
 2 | TopK for text generation
 3 | """
 4 | 
 5 | import numpy as np
 6 | import mindspore.common.dtype as mstype
 7 | from mindspore.common.tensor import Tensor
 8 | 
 9 | def generate(model, origin_inputs, seq_length, end_token=50256):
10 |     """
11 |     TopK for text generation
12 | 
13 |     Inputs:
14 |         model: the model for inferencing
15 |         origin_inputs: the original inputs based on which the model will continue writing
16 |         seq_length: seq_length for the model
17 |         end_token: end of sentence token id
18 | 
19 |     Returns:
20 |         outputs: the ids for the generated text
21 |     """
22 |     TOPK = 3
23 |     seq_length = seq_length
24 |     bs, valid_length = origin_inputs.shape
25 |     pad_length = seq_length - origin_inputs.shape[-1]
26 |     input_ids = np.pad(origin_inputs, ((0, 0), (0, pad_length)), 'constant', constant_values=(0, 0))
27 |     print("input_ids is ", input_ids)
28 |     while valid_length < seq_length:
29 |         inputs = Tensor(input_ids, mstype.int32)
30 |         probs, p_args = model.predict(inputs)
31 |         probs = probs.asnumpy()[valid_length-1, :]
32 |         p_args = p_args.asnumpy()[valid_length-1, :]
33 | 
34 |         p = probs
35 |         p = p / sum(p)
36 |         target_index = np.random.choice(len(p), p=p)
37 |         if p_args[target_index] == end_token or valid_length == seq_length-1:
38 |             outputs = input_ids
39 |             break
40 |         input_ids[0][valid_length] = p_args[target_index]
41 |         valid_length += 1
42 |     length = np.sum(outputs != 0)
43 |     outputs = outputs[0][:length]
44 |     return outputs
45 | 
46 | 


--------------------------------------------------------------------------------
/PanGu-α/scripts/run_distribute_predict.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | execute_path=$(pwd)
 3 | script_self=$(readlink -f "$0")
 4 | self_path=$(dirname "${script_self}")
 5 | export RANK_SIZE=$1
 6 | export RANK_TABLE_FILE=$2
 7 | export STRATEGY=$3
 8 | export TOKENIZER=$4
 9 | export CKPT_PATH=$5
10 | export CKPT_NAME=$6
11 | export MODE=$7
12 | 
13 | for((i=0;i<$RANK_SIZE;i++));
14 | do
15 |   rm -rf ${execute_path}/device_$i/
16 |   mkdir ${execute_path}/device_$i/
17 |   cd ${execute_path}/device_$i/ || exit
18 |   export RANK_ID=$i
19 |   export DEVICE_ID=$i
20 |   python -s ${self_path}/../run_pangu_alpha_predict.py --strategy_load_ckpt_path=$STRATEGY --tokenizer_path=$TOKENIZER --load_ckpt_path=$CKPT_PATH \
21 |                   --load_ckpt_name=$CKPT_NAME --mode=$MODE  >train_deep$i.log 2>&1 &
22 | done
23 | 


--------------------------------------------------------------------------------
/PanGu-α/scripts/run_distribute_train.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | execute_path=$(pwd)
 4 | script_self=$(readlink -f "$0")
 5 | self_path=$(dirname "${script_self}")
 6 | export RANK_SIZE=$1
 7 | export DATASET=$2
 8 | export RANK_TABLE_FILE=$3
 9 | export MODE=$4
10 | for((i=0;i<$RANK_SIZE;i++));
11 | do
12 |   rm -rf ${execute_path}/device_$i/
13 |   mkdir ${execute_path}/device_$i/
14 |   cd ${execute_path}/device_$i/ || exit
15 |   export RANK_ID=$i
16 |   export DEVICE_ID=$i
17 |   python -s ${self_path}/../run_pangu_alpha_train.py --data_url=$DATASET --mode=$MODE >train_deep$i.log 2>&1 &
18 | done
19 | 


--------------------------------------------------------------------------------
/PanGu-α/serving_demo/PanGu-Alpha-serving-demo.avi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/serving_demo/PanGu-Alpha-serving-demo.avi


--------------------------------------------------------------------------------
/PanGu-α/strategy_load_ckpt/pangu_alpha_13B_cktp_strategy.ckpt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/strategy_load_ckpt/pangu_alpha_13B_cktp_strategy.ckpt


--------------------------------------------------------------------------------
/PanGu-α/strategy_load_ckpt/pangu_alpha_2.6B_ckpt_strategy.ckpt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/strategy_load_ckpt/pangu_alpha_2.6B_ckpt_strategy.ckpt


--------------------------------------------------------------------------------
/PanGu-α/tokenizer/vocab.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/tokenizer/vocab.model


--------------------------------------------------------------------------------
/TernaryBERT-MindSpore/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/TernaryBERT-MindSpore/__init__.py


--------------------------------------------------------------------------------
/TernaryBERT-MindSpore/mindspore_hub_conf.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Huawei Technologies Co., Ltd
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 | 
16 | """Bert hub interface for bert base"""
17 | 
18 | from src.tinybert_model import BertModel
19 | from src.tinybert_model import BertConfig
20 | import mindspore.common.dtype as mstype
21 | 
# Module-level network configuration that create_network() passes to BertModel.
# The meaning of each field is defined by BertConfig in src.tinybert_model,
# which is not visible from this file.
tinybert_student_net_cfg = BertConfig(
    seq_length=128,  # create_network() accepts a seq_length keyword that replaces this value
    vocab_size=30522,
    hidden_size=768,
    num_hidden_layers=6,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    use_relative_positions=False,
    dtype=mstype.float32,
    compute_type=mstype.float32,
    # NOTE(review): the fields below look like the quantization settings
    # (2-bit embeddings and weights) -- confirm against BertConfig.
    do_quant=True,
    embedding_bits=2,
    weight_bits=2,
    weight_clip_value=3.0,
    cls_dropout_prob=0.1,
    activation_init=2.5,
    is_lgt_fit=False
)
46 | 
47 | 
def create_network(name, *args, **kwargs):
    """
    Create tinybert network.

    Args:
        name: network name; only "ternarybert" is supported.
        *args: extra positional arguments forwarded to BertModel after the
            config and the is_training flag.
        **kwargs: optional ``seq_length`` (overrides the configured sequence
            length for this call) and ``is_training`` (defaults to False).

    Returns:
        A BertModel built from the module-level student configuration.

    Raises:
        NotImplementedError: if ``name`` is not "ternarybert".
    """
    import copy

    if name != "ternarybert":
        raise NotImplementedError(f"{name} is not implemented in the repo")

    # Work on a copy so a per-call seq_length override does not leak into the
    # shared module-level configuration and silently affect later calls.
    net_cfg = copy.copy(tinybert_student_net_cfg)
    if "seq_length" in kwargs:
        net_cfg.seq_length = kwargs["seq_length"]
    is_training = kwargs.get("is_training", False)
    return BertModel(net_cfg, is_training, *args)
58 | 


--------------------------------------------------------------------------------
/TernaryBERT-MindSpore/scripts/run_eval.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright 2021 Huawei Technologies Co., Ltd
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | # http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 | 
# Run eval.py on device 0 for the sts-b task; stdout goes to log.txt and the
# MindSpore glog output goes to ./ms_log.
mkdir -p ms_log
PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
CUR_DIR=$(pwd)
export GLOG_log_dir="${CUR_DIR}/ms_log"
export GLOG_logtostderr=0
# The script path is quoted so a checkout path containing spaces still works.
python "${PROJECT_DIR}/../eval.py" \
    --task_name=sts-b \
    --device_id=0 \
    --model_dir="" \
    --data_dir="" > log.txt


--------------------------------------------------------------------------------
/TernaryBERT-MindSpore/scripts/run_train.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright 2021 Huawei Technologies Co., Ltd
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | # http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 | 
# Run train.py on device 0 for the sts-b task; stdout goes to log.txt and the
# MindSpore glog output goes to ./ms_log.
mkdir -p ms_log
PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
CUR_DIR=$(pwd)
export GLOG_log_dir="${CUR_DIR}/ms_log"
export GLOG_logtostderr=0
# The script path is quoted so a checkout path containing spaces still works.
python "${PROJECT_DIR}/../train.py" \
    --task_name=sts-b \
    --device_id=0 \
    --teacher_model_dir="" \
    --student_model_dir="" \
    --data_dir="" > log.txt


--------------------------------------------------------------------------------
/TernaryBERT-MindSpore/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/TernaryBERT-MindSpore/src/__init__.py


--------------------------------------------------------------------------------
/TernaryBERT-MindSpore/src/dataset.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Huawei Technologies Co., Ltd
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 | 
16 | """create tinybert dataset"""
17 | 
18 | from enum import Enum
19 | import mindspore.common.dtype as mstype
20 | import mindspore.dataset.engine.datasets as de
21 | import mindspore.dataset.transforms.c_transforms as C
22 | 
23 | 
class DataType(Enum):
    """Enumerate supported dataset format"""
    # NOTE(review): create_dataset in this file selects the format through a
    # plain string ('tfrecord' / 'mindrecord') and does not reference this enum.
    TFRECORD = 1
    MINDRECORD = 2
28 | 
29 | 
def create_dataset(batch_size=32, device_num=1, rank=0, do_shuffle="true", data_dir=None,
                   data_type='tfrecord', seq_length=128, task_type=mstype.int32, drop_remainder=True):
    """
    Build the batched dataset read from TFRecord or MindRecord files.

    Args:
        batch_size: number of rows per batch.
        device_num: number of shards the data is split into.
        rank: shard id read by this process.
        do_shuffle: the string "true" enables shuffling; anything else disables it.
        data_dir: a single file path or a list of file paths.
        data_type: 'mindrecord' selects MindDataset, any other value TFRecordDataset.
        seq_length: every column is sliced to its first seq_length elements.
        task_type: labels are cast to int32 when this equals 'classification',
            otherwise to float32.
        drop_remainder: forwarded to the batch operation.

    Returns:
        The mapped and batched dataset.
    """
    files = data_dir if isinstance(data_dir, list) else [data_dir]
    columns = ["input_ids", "input_mask", "segment_ids", "label_ids"]
    shuffle = do_shuffle == "true"

    if data_type == 'mindrecord':
        ds = de.MindDataset(files, columns_list=columns, shuffle=shuffle,
                            num_shards=device_num, shard_id=rank)
    else:
        ds = de.TFRecordDataset(files, columns_list=columns, shuffle=shuffle,
                                num_shards=device_num, shard_id=rank,
                                shard_equal_rows=(device_num == 1))

    # An additional buffer shuffle is applied only in the single-device case.
    if shuffle and device_num == 1:
        ds = ds.shuffle(10000)

    to_int32 = C.TypeCast(mstype.int32)
    truncate = C.Slice(slice(0, seq_length, 1))
    if task_type == 'classification':
        label_type = mstype.int32
    else:
        label_type = mstype.float32

    # The three id/mask columns share the same cast-then-slice pipeline.
    for column in ("segment_ids", "input_mask", "input_ids"):
        ds = ds.map(operations=[to_int32, truncate], input_columns=[column])
    ds = ds.map(operations=[C.TypeCast(label_type), truncate], input_columns=["label_ids"])

    return ds.batch(batch_size, drop_remainder=drop_remainder)
63 | 


--------------------------------------------------------------------------------
/TernaryBERT/README.md:
--------------------------------------------------------------------------------
 1 | # TernaryBERT
 2 | 
 3 | This directory contains code for [TernaryBERT: Distillation-aware Ultra-low Bit BERT](https://arxiv.org/abs/2009.12812).
 4 | <br />
 5 | <img src="main.png"/>
 6 | <br />
 7 | ## Envs
 8 | ```
 9 | conda create -n myenv python=3.6
10 | conda activate myenv
11 | pip install -r requirements.txt
12 | ```
13 | 
14 | 
15 | ## GLUE
16 | ### Data
17 | Download GLUE
18 | https://github.com/nyu-mll/GLUE-baselines
19 | 
20 | ```
21 | ls data/mnli
22 | ```
23 | shows:
24 | ```
25 | dev_matched.tsv  dev_mismatched.tsv  train.tsv
26 | ```
27 | 
28 | ### Model
29 | Prepare a fine-tuned BERT base model on MNLI:
30 | ```
31 | ls models/mnli
32 | ```
33 | shows
34 | ```
35 | config.json  pytorch_model.bin  vocab.txt
36 | ```
37 | 
38 | ### Distillation-aware quantization training on MNLI
39 | ```
40 | python quant_task_glue.py \
41 |             --data_dir data \
42 |             --model_dir models \
43 |             --task_name mnli \
44 |             --output_dir output \
45 |             --learning_rate 2e-5 \
46 |             --num_train_epochs 3 \
47 |             --weight_bits 2 \
48 |             --input_bits 8 \
49 |             --pred_distill \
50 |             --intermediate_distill \
51 |             --save_fp_model \
52 |             --save_quantized_model
53 | ```
54 | More details of arguments are in quant_task_glue.py
55 | 
56 | ## SQuAD
57 | 
58 | ### Data
59 | Download data
60 | https://rajpurkar.github.io/SQuAD-explorer/
61 | 
62 | ### Model
63 | Prepare a fine-tuned BERT base model on SQuAD v1.1/v2.0.
64 | 
65 | ### Distillation-aware quantization training
66 | ```
67 | python quant_task_squad.py \
68 |             --data_dir data/squadv2.0 \
69 |             --model_dir models/squadv2.0 \
70 |             --output_dir output \
71 |             --learning_rate 2e-5 \
72 |             --num_train_epochs 3 \
73 |             --version_2_with_negative \
74 |             --weight_bits 2 \
75 |             --input_bits 8 \
76 |             --pred_distill \
77 |             --intermediate_distill \
78 |             --save_fp_model \
79 |             --save_quantized_model
80 | ```
81 | 
82 | ## Reference
83 | 
84 | ```
85 | @inproceedings{zhang-etal-2020-ternarybert,
86 |     title = {TernaryBERT: Distillation-aware Ultra-low Bit BERT},
87 |     author = {Wei Zhang and Lu Hou and Yichun Yin and Lifeng Shang and Xiao Chen and Xin Jiang and Qun Liu},
88 |     booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
89 |     year = {2020},
90 | }
91 | ```
92 | 
93 | 
94 | 
95 | 
96 | 


--------------------------------------------------------------------------------
/TernaryBERT/main.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/TernaryBERT/main.png


--------------------------------------------------------------------------------
/TernaryBERT/requirements.txt:
--------------------------------------------------------------------------------
1 | boto3
2 | requests
3 | scipy
4 | future
5 | Pillow
6 | tensorflow==1.14.0
7 | torch==1.1.0
8 | 


--------------------------------------------------------------------------------
/TernaryBERT/transformer/__init__.py:
--------------------------------------------------------------------------------
1 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
2 | from .modeling import BertForSequenceClassification,BertForQuestionAnswering, CONFIG_NAME, WEIGHTS_NAME
3 | from .configuration import BertConfig
4 | from .optimization import BertAdam
5 | 


--------------------------------------------------------------------------------
/TinyBERT-MindSpore/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/TinyBERT-MindSpore/__init__.py


--------------------------------------------------------------------------------
/TinyBERT-MindSpore/mindspore_hub_conf.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2020 Huawei Technologies Co., Ltd
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 | '''
16 | Bert hub interface for bert base and bert nezha
17 | '''
18 | from src.tinybert_model import TinyBertModel
19 | from src.tinybert_model import BertConfig
20 | import mindspore.common.dtype as mstype
21 | 
# Module-level network configuration that create_network() passes to
# TinyBertModel. The meaning of each field is defined by BertConfig in
# src.tinybert_model, which is not visible from this file.
tinybert_student_net_cfg = BertConfig(
    seq_length=128,  # create_network() accepts a seq_length keyword that replaces this value
    vocab_size=30522,
    hidden_size=384,
    num_hidden_layers=4,
    num_attention_heads=12,
    intermediate_size=1536,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    use_relative_positions=False,
    dtype=mstype.float32,
    compute_type=mstype.float16
)
39 | 
def create_network(name, *args, **kwargs):
    '''
    Create tinybert network.

    Args:
        name: network name; only "tinybert" is supported.
        *args: extra positional arguments forwarded to TinyBertModel after the
            config and the is_training flag.
        **kwargs: optional ``seq_length`` (overrides the configured sequence
            length for this call) and ``is_training`` (defaults to False).

    Returns:
        A TinyBertModel built from the module-level student configuration.

    Raises:
        NotImplementedError: if ``name`` is not "tinybert".
    '''
    import copy

    if name != "tinybert":
        raise NotImplementedError(f"{name} is not implemented in the repo")

    # Work on a copy so a per-call seq_length override does not leak into the
    # shared module-level configuration and silently affect later calls.
    net_cfg = copy.copy(tinybert_student_net_cfg)
    if "seq_length" in kwargs:
        net_cfg.seq_length = kwargs["seq_length"]
    is_training = kwargs.get("is_training", False)
    return TinyBertModel(net_cfg, is_training, *args)
50 | 


--------------------------------------------------------------------------------
/TinyBERT-MindSpore/scripts/run_distributed_gd_ascend.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright 2020 Huawei Technologies Co., Ltd
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | # http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 | 
echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash scripts/run_distributed_gd_ascend.sh DEVICE_NUM EPOCH_SIZE RANK_TABLE_FILE"
echo "for example: bash scripts/run_distributed_gd_ascend.sh 8 40 /path/hccl.json"
echo "It is better to use absolute path."
echo "running....... please see details by LOG{}/log.txt"
echo "=============================================================================================================="

EPOCH_SIZE="$2"

PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
export RANK_TABLE_FILE="$3"
export RANK_SIZE="$1"
# Split the logical cores evenly so every rank can be pinned to its own range.
cores=$(grep -c "processor" /proc/cpuinfo)
echo "the number of logical core" "$cores"
avg_core_per_rank=$((cores / RANK_SIZE))
core_gap=$((avg_core_per_rank - 1))
echo "avg_core_per_rank" "$avg_core_per_rank"
echo "core_gap" "$core_gap"
for((i=0;i<RANK_SIZE;i++))
do
    start=$((i * avg_core_per_rank))
    export DEVICE_ID=$i
    export RANK_ID=$i
    export DEPLOY_MODE=0
    export GE_USE_STATIC_MEMORY=1
    end=$((start + core_gap))
    # CPU range handed to taskset, e.g. "0-11".
    cmdopt="${start}-${end}"

    # Each rank runs from its own LOG<i> directory holding a copy of the
    # *.py files found in the directory the script was started from.
    rm -rf "LOG$i"
    mkdir "./LOG$i"
    cp  *.py "./LOG$i"
    cd "./LOG$i" || exit
    echo "start training for rank $i, device $DEVICE_ID"
    mkdir -p ms_log
    CUR_DIR=$(pwd)
    export GLOG_log_dir="${CUR_DIR}/ms_log"
    export GLOG_logtostderr=0
    env > env.log
    taskset -c "$cmdopt" python "${PROJECT_DIR}/../run_general_distill.py"  \
    --distribute="true" \
    --device_target="Ascend" \
    --epoch_size="$EPOCH_SIZE" \
    --device_id="$DEVICE_ID" \
    --device_num="$RANK_SIZE" \
    --enable_data_sink="true" \
    --data_sink_steps=100 \
    --save_ckpt_step=10000 \
    --max_ckpt_num=1 \
    --load_teacher_ckpt_path="" \
    --data_dir="" \
    --schema_dir="" \
    --dataset_type="tfrecord" > log.txt 2>&1 &
    cd ../
done
72 | 


--------------------------------------------------------------------------------
/TinyBERT-MindSpore/scripts/run_distributed_gd_gpu.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright 2020 Huawei Technologies Co., Ltd
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | # http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 | 
echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash run_distributed_gd_gpu.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR TEACHER_CKPT_PATH"
echo "for example: bash run_distributed_gd_gpu.sh 8 3 /path/data/ /path/datasetSchema.json /path/bert_base.ckpt"
echo "It is better to use absolute path."
echo "=============================================================================================================="

RANK_SIZE="$1"
EPOCH_SIZE="$2"
DATA_DIR="$3"
SCHEMA_DIR="$4"
TEACHER_CKPT_PATH="$5"

PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)

# Arguments are quoted so data, schema and checkpoint paths containing spaces
# reach run_general_distill.py as single arguments.
mpirun --allow-run-as-root -n "$RANK_SIZE" --output-filename log_output --merge-stderr-to-stdout \
	python "${PROJECT_DIR}/../run_general_distill.py"  \
	--distribute="true" \
	--device_target="GPU" \
	--epoch_size="$EPOCH_SIZE" \
	--save_ckpt_path="" \
	--data_dir="$DATA_DIR" \
	--schema_dir="$SCHEMA_DIR" \
	--dataset_type="tfrecord" \
	--enable_data_sink="false" \
	--load_teacher_ckpt_path="$TEACHER_CKPT_PATH" > log.txt 2>&1 &
43 | 


--------------------------------------------------------------------------------
/TinyBERT-MindSpore/scripts/run_standalone_gd.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright 2020 Huawei Technologies Co., Ltd
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | # http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 | 
echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash scripts/run_standalone_gd.sh"
echo "for example: bash scripts/run_standalone_gd.sh"
echo "running....... please see details by log.txt"
echo "=============================================================================================================="


# MindSpore glog output goes to ./ms_log; the training output goes to log.txt.
mkdir -p ms_log
PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
CUR_DIR=$(pwd)
export GLOG_log_dir="${CUR_DIR}/ms_log"
export GLOG_logtostderr=0
# The script path is quoted so a checkout path containing spaces still works.
python "${PROJECT_DIR}/../run_general_distill.py"  \
    --distribute="false" \
    --device_target="Ascend" \
    --epoch_size=3 \
    --device_id=0 \
    --enable_data_sink="true" \
    --data_sink_steps=100 \
    --save_ckpt_step=100 \
    --max_ckpt_num=1 \
    --save_ckpt_path="" \
    --load_teacher_ckpt_path="" \
    --data_dir="" \
    --schema_dir="" \
    --dataset_type="tfrecord" > log.txt 2>&1 &
44 | 


--------------------------------------------------------------------------------
/TinyBERT-MindSpore/scripts/run_standalone_td.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright 2020 Huawei Technologies Co., Ltd
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | # http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================
16 | 
# Usage banner: the script takes no command-line arguments; every setting is
# hard-coded in the python invocation below.
echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash scripts/run_standalone_td.sh"
echo "for example: bash scripts/run_standalone_td.sh"
echo "=============================================================================================================="

# GLOG files are written to ./ms_log, relative to the caller's working directory.
mkdir -p ms_log
# Absolute directory of this script, so the Python entry point is found
# regardless of where the script is launched from.
PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
CUR_DIR=$(pwd)
export GLOG_log_dir="${CUR_DIR}/ms_log"
export GLOG_logtostderr=0
# Start task distillation in the background; stdout and stderr both go to
# ./log.txt. The empty "" arguments (task name, checkpoint and data paths) look
# like placeholders that should be filled in before running (TODO confirm
# against run_task_distill.py).
# The script path is quoted so a checkout directory containing spaces works.
python "${PROJECT_DIR}/../run_task_distill.py" \
    --device_target="Ascend" \
    --device_id=0 \
    --do_train="true" \
    --do_eval="true" \
    --td_phase1_epoch_size=10 \
    --td_phase2_epoch_size=3 \
    --task_name="" \
    --do_shuffle="true" \
    --enable_data_sink="true" \
    --data_sink_steps=100 \
    --save_ckpt_step=100 \
    --max_ckpt_num=1 \
    --load_teacher_ckpt_path="" \
    --load_gd_ckpt_path="" \
    --load_td1_ckpt_path="" \
    --train_data_dir="" \
    --eval_data_dir="" \
    --schema_dir="" \
    --dataset_type="tfrecord" > log.txt 2>&1 &
48 | 
49 | 


--------------------------------------------------------------------------------
/TinyBERT-MindSpore/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/TinyBERT-MindSpore/src/__init__.py


--------------------------------------------------------------------------------
/TinyBERT-MindSpore/src/assessment_method.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2020 Huawei Technologies Co., Ltd
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 | 
16 | """assessment methods"""
17 | 
18 | import numpy as np
19 | 
class Accuracy:
    """Running accuracy counters, accumulated one batch at a time.

    ``acc_num`` holds the number of predictions that matched their label and
    ``total_num`` the number of labels seen so far. The class only keeps the
    two counters; it does not compute the ratio itself.
    """

    def __init__(self):
        self.total_num = 0
        self.acc_num = 0

    def update(self, logits, labels):
        """Add one batch to the counters.

        Args:
            logits: object exposing ``asnumpy()``; the arg-max over its last
                axis is taken as the predicted class id.
            labels: object exposing ``asnumpy()``; flattened to 1-D before
                being compared with the predictions.
        """
        flat_labels = np.reshape(labels.asnumpy(), -1)
        predicted_ids = np.argmax(logits.asnumpy(), axis=-1)
        self.acc_num += np.sum(flat_labels == predicted_ids)
        self.total_num += len(flat_labels)
33 | 
class F1():
    """Accumulate true-positive / false-positive / false-negative counts.

    A prediction or label counts as "positive" when its id is one of
    ``positive_labels``. The counters ``TP``, ``FP`` and ``FN`` are running
    totals over every ``update`` call; this class prints precision and recall
    but does not compute the F1 value itself.

    Args:
        positive_labels: iterable of label ids treated as positive. Defaults
            to ``(2, 3, 4, 5, 6, 7)``, the ids that were previously hard-coded.
    """
    def __init__(self, positive_labels=(2, 3, 4, 5, 6, 7)):
        self.TP = 0
        self.FP = 0
        self.FN = 0
        self.positive_labels = tuple(positive_labels)

    def update(self, logits, labels):
        """Add one batch to the counters and print running precision/recall.

        Args:
            logits: object exposing ``asnumpy()``; the arg-max over its last
                axis is taken as the predicted label id.
            labels: object exposing ``asnumpy()``; flattened to 1-D.
        """
        labels = np.reshape(labels.asnumpy(), -1)
        logit_id = np.reshape(np.argmax(logits.asnumpy(), axis=-1), -1)
        pos_eva = np.isin(logit_id, self.positive_labels)
        pos_label = np.isin(labels, self.positive_labels)
        self.TP += np.sum(pos_eva & pos_label)
        self.FP += np.sum(pos_eva & (~pos_label))
        self.FN += np.sum((~pos_eva) & pos_label)
        # Guard the ratios: until a positive prediction (or positive label) has
        # been seen the denominator is zero, which would otherwise trigger a
        # NumPy RuntimeWarning and print nan.
        predicted_pos = self.TP + self.FP
        actual_pos = self.TP + self.FN
        precision = self.TP / predicted_pos if predicted_pos else 0.0
        recall = self.TP / actual_pos if actual_pos else 0.0
        print("-----------------precision is ", precision)
        print("-----------------recall is ", recall)
55 | 


--------------------------------------------------------------------------------
/TinyBERT-MindSpore/src/gd_config.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2020 Huawei Technologies Co., Ltd
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 | """
16 | network config setting, will be used in dataset.py, run_general_distill.py and run_task_distill.py
17 | """
18 | import mindspore.common.dtype as mstype
19 | from easydict import EasyDict as edict
20 | from .tinybert_model import BertConfig
21 | 
# Settings shared across the distillation code (the module docstring lists
# dataset.py, run_general_distill.py and run_task_distill.py as consumers).
common_cfg = edict({
    'batch_size': 32,
    # Loss-scaling settings: initial value 2**16 = 65536. Presumably fed to a
    # dynamic loss-scale manager for float16 training — TODO confirm in the
    # training scripts.
    'loss_scale_value': 2 ** 16,
    'scale_factor': 2,
    'scale_window': 1000,
    # Hyper-parameters grouped under the AdamWeightDecay optimizer name.
    'AdamWeightDecay': edict({
        'learning_rate': 5e-5,
        'end_learning_rate': 1e-14,
        'power': 1.0,
        'weight_decay': 1e-4,
        'eps': 1e-6,
        # Returns True for parameters whose lower-cased name contains neither
        # "layernorm" nor "bias"; presumably selects the parameters that
        # receive weight decay — verify against the optimizer setup.
        'decay_filter': lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower(),
    }),
})
36 | '''
37 | Including two kinds of network: \
38 | teacher network: The BERT-base network.
39 | student network: The network which is inherited from teacher network.
40 | '''
# Teacher network configuration: 12 layers, hidden size 768, 12 attention
# heads and intermediate size 3072 (described as the BERT-base network in the
# note preceding this definition).
# dtype is float32 while compute_type is float16 — presumably a
# mixed-precision setup; confirm against BertConfig in tinybert_model.py.
bert_teacher_net_cfg = BertConfig(
    seq_length=128,
    vocab_size=30522,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    use_relative_positions=False,
    dtype=mstype.float32,
    compute_type=mstype.float16
)
# Student network configuration. Compared with bert_teacher_net_cfg it uses
# 4 layers instead of 12, and halves hidden_size (384 vs. 768) and
# intermediate_size (1536 vs. 3072); every other field, including the 12
# attention heads, has the same value as the teacher.
bert_student_net_cfg = BertConfig(
    seq_length=128,
    vocab_size=30522,
    hidden_size=384,
    num_hidden_layers=4,
    num_attention_heads=12,
    intermediate_size=1536,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    use_relative_positions=False,
    dtype=mstype.float32,
    compute_type=mstype.float16
)
75 | 


--------------------------------------------------------------------------------
/TinyBERT/requirements.txt:
--------------------------------------------------------------------------------
 1 | # progress bars in model download and training scripts
 2 | tqdm
 3 | # Accessing files from S3 directly.
 4 | boto3
 5 | # Used for downloading models over HTTP
 6 | requests
 7 | 
 8 | torch>=1.0.1
 9 | scipy>=0.14.0
10 | seaborn


--------------------------------------------------------------------------------
/TinyBERT/tinybert_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/TinyBERT/tinybert_overview.png


--------------------------------------------------------------------------------
/TinyBERT/transformer/__init__.py:
--------------------------------------------------------------------------------
 1 | __version__ = "0.6.2"
 2 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
 3 | 
 4 | 
 5 | from .modeling import (BertConfig, BertModel, BertForPreTraining,
 6 |                        BertForMaskedLM, BertForNextSentencePrediction,
 7 |                        TinyBertForSequenceClassification,
 8 |                        load_tf_weights_in_bert)
 9 | 
10 | from .optimization import BertAdam
11 | 
12 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path, WEIGHTS_NAME, CONFIG_NAME
13 | 


--------------------------------------------------------------------------------