├── AutoTinyBERT ├── AutoTinyBERT_overview.PNG ├── LICENSE ├── README.md ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE ├── generate_data.py ├── inference_time_evaluation.py ├── latency_predictor.py ├── pre_training.py ├── searcher.py ├── submodel_extractor.py ├── superbert_run_en_classifier.py ├── transformer │ ├── __init__.py │ ├── file_utils.py │ ├── modeling_base.py │ ├── modeling_extractor.py │ ├── modeling_super_kd.py │ ├── optimization.py │ └── tokenization.py └── utils.py ├── BBPE ├── LICENSE ├── README.md ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.txt ├── bbpe │ ├── byteVocab.txt │ ├── charNumVocab.txt │ ├── charVocab.txt │ ├── cn_wiki_sample.txt │ ├── fastBPE-master │ │ ├── bpe_postprocessing.py │ │ ├── fastBPE │ │ │ ├── fastBPE.hpp │ │ │ ├── fastBPE.pyx │ │ │ └── main.cc │ │ └── vocab_byteTo16base.py │ ├── genByteVocab.py │ ├── genNum.py │ ├── map_freq.py │ ├── mergeVocab.py │ ├── protectList.txt │ ├── text2utf-8-mt-byte.sh │ ├── text2utf-8-mt-char.sh │ ├── tokenization.py │ ├── utf-8-mt-byte.py │ └── utf-8-mt-char.py └── example.png ├── BinaryBERT ├── LICENSE ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE ├── __init__.py ├── assets │ └── model.png ├── helper.py ├── kd_learner_glue.py ├── kd_learner_squad.py ├── quant_task_distill_glue.py ├── quant_task_distill_squad.py ├── readme.md ├── requirements.txt ├── scripts │ ├── terarny_glue.sh │ ├── terarny_squad.sh │ ├── tws_glue.sh │ └── tws_squad.sh ├── transformer │ ├── __init__.py │ ├── binary_model_init.py │ ├── configuration_bert.py │ ├── configuration_utils.py │ ├── file_utils.py │ ├── modeling.py │ ├── modeling_dynabert.py │ ├── modeling_dynabert_binary.py │ ├── modeling_dynabert_quant.py │ ├── modeling_utils.py │ ├── optimization.py │ ├── tokenization.py │ └── utils_quant.py ├── utils_glue.py └── utils_squad.py ├── CAME ├── .DS_Store ├── Dockerfile ├── LICENSE ├── NOTICE ├── README.md ├── adafactor.py ├── bert-large-uncased-vocab.txt ├── bert_config.json ├── bert_large_config.json ├── bert_pretrain.png 
├── bind.sh ├── bind_pyt.py ├── came.py ├── came_pcode.png ├── configurations.yml ├── create_data.sh ├── create_pretraining_data.py ├── data │ ├── BooksDownloader.py │ ├── BookscorpusTextFormatting.py │ ├── Downloader.py │ ├── GLUEDownloader.py │ ├── GooglePretrainedWeightDownloader.py │ ├── NVIDIAPretrainedWeightDownloader.py │ ├── SquadDownloader.py │ ├── TextSharding.py │ ├── WikiDownloader.py │ ├── WikicorpusTextFormatting.py │ ├── __init__.py │ ├── __pycache__ │ │ └── TextSharding.cpython-36.pyc │ ├── bertPrep.py │ ├── create_datasets_from_start.sh │ ├── shard.py │ ├── squad │ │ └── squad_download.sh │ └── wikiextractor │ │ ├── .github │ │ └── workflows │ │ │ └── python-publish.yml │ │ ├── .gitignore │ │ ├── LICENSE │ │ ├── README.md │ │ ├── extract.sh │ │ ├── setup.py │ │ └── wikiextractor │ │ ├── WikiExtractor.py │ │ ├── __init__.py │ │ ├── cirrus-extract.py │ │ ├── clean.py │ │ ├── extract.py │ │ └── extractPage.py ├── extract_features.py ├── file_utils.py ├── inference.py ├── memory.png ├── modeling.py ├── optimization.py ├── processors │ ├── __init__.py │ └── glue.py ├── requirements.txt ├── run.sub ├── run_came_pretraining.sh ├── run_glue.py ├── run_pretraining.py ├── run_squad.py ├── run_swag.py ├── run_validation.sh ├── schedulers.py ├── scripts │ ├── configs │ │ ├── glue_config.sh │ │ ├── pretrain_config.sh │ │ └── squad_config.sh │ ├── data_download.sh │ ├── docker │ │ ├── build.sh │ │ └── launch.sh │ ├── run_glue.sh │ ├── run_pretraining.sh │ ├── run_squad.sh │ └── run_swag.sh ├── start_data.py ├── startup_came.py ├── tokenization.py ├── triton │ ├── LICENSE │ ├── README.md │ ├── client.py │ ├── deployer.py │ ├── deployer_lib.py │ ├── evaluate.sh │ ├── export_model.sh │ ├── generate_figures.sh │ ├── launch_triton_server.sh │ ├── profiling_data_int64 │ │ ├── input__0 │ │ ├── input__1 │ │ └── input__2 │ ├── run_perf_client.sh │ ├── run_squad_client.py │ └── wait_for_triton_server.sh ├── utils.py ├── v1.1 │ ├── dev-v1.1.json │ ├── evaluate-v1.1.py │ 
└── train-v1.1.json └── vocab │ └── vocab ├── CeMAT ├── CeMAT_maskPredict │ ├── LICENSE │ ├── __init__.py │ ├── cemat_nat_options.py │ ├── checkpoint_utils.py │ ├── criterions │ │ ├── __init__.py │ │ └── label_smoothed_length_cross_entropy.py │ ├── data │ │ ├── __init__.py │ │ └── language_pair_self_dataset_mask.py │ ├── generate_cmlm.py │ ├── meters.py │ ├── models │ │ ├── __init__.py │ │ ├── bert_seq2seq.py │ │ └── cemat_model.py │ ├── pybleu.py │ ├── strategies │ │ ├── __init__.py │ │ ├── decoding_strategy.py │ │ ├── easy_first.py │ │ ├── left_to_right.py │ │ ├── mask_predict.py │ │ └── strategy_utils.py │ ├── task_NAT_cemat.sh │ ├── task_infer_nat.sh │ ├── tasks │ │ ├── __init__.py │ │ └── translation_self_from_cemat.py │ └── train.py ├── CeMAT_plugins │ ├── __init__.py │ ├── checkpoint_utils.py │ ├── criterions │ │ ├── __init__.py │ │ └── label_smoothed_cross_entropy_with_maskdecode.py │ ├── data │ │ ├── __init__.py │ │ ├── cemat_dataset.py │ │ ├── concat_pair_dataset.py │ │ ├── ddenoising_pair_dataset_dyna_replace.py │ │ └── language_pair_dataset.py │ ├── models │ │ ├── __init__.py │ │ ├── cemat_model.py │ │ ├── fairseq_encoder.py │ │ └── transformer.py │ ├── task_NMT_cemat.sh │ ├── task_infer_nmt.sh │ ├── task_pt_cemat.sh │ └── tasks │ │ ├── __init__.py │ │ ├── cemat_pretraining.py │ │ └── translation_from_pretrained_cemat.py ├── License ├── README.md ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.md └── cemat_scripts │ ├── create_trans │ ├── example_extract_alignedpairs.sh │ └── extract_aligned_pairs.py │ └── process │ ├── preprocess_Mono.sh │ ├── preprocess_NMT.sh │ └── preprocess_Para.sh ├── DynaBERT ├── LICENSE ├── README.md ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.txt ├── dynabert_overview.png ├── eval_glue.py ├── requirements.txt ├── run_glue.py └── transformers │ ├── __init__.py │ ├── __main__.py │ ├── configuration_bert.py │ ├── configuration_roberta.py │ ├── configuration_utils.py │ ├── data │ ├── __init__.py │ ├── metrics │ │ └── __init__.py │ └── 
processors │ │ ├── __init__.py │ │ ├── glue.py │ │ └── utils.py │ ├── file_utils.py │ ├── modeling_bert.py │ ├── modeling_roberta.py │ ├── modeling_utils.py │ ├── optimization.py │ ├── tokenization_bert.py │ ├── tokenization_gpt2.py │ ├── tokenization_roberta.py │ └── tokenization_utils.py ├── HyperText ├── LICENSE ├── README.md ├── __init__.py ├── hyperbolic │ ├── __init__.py │ ├── euclidean.py │ ├── math_utils.py │ ├── mobius_linear.py │ └── poincare.py ├── hypertext_model_architecture.png ├── main.py ├── models │ ├── Config.py │ ├── HyperText.py │ └── __init__.py ├── radam_optimizer.py ├── requirements.txt ├── train.py └── utils.py ├── JABER-PyTorch ├── LICENSE ├── NEZHA_PyTorch │ ├── LICENSE │ ├── README.md │ ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.txt │ ├── convert_tf_checkpoint_to_pytorch.py │ ├── file_utils.py │ ├── modeling_nezha.py │ ├── optimization.py │ └── tools │ │ ├── file_utils.py │ │ ├── official_tokenization.py │ │ ├── pytorch_optimization.py │ │ └── utils.py ├── README.md ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.txt ├── alue_predictions │ └── README.md ├── alue_test_submission │ └── README.md ├── compute_metrics.py ├── generate_data.py ├── pretrained_models │ └── README.md ├── processors.py ├── raw_datasets │ └── toy.mq2q.dev.tsv ├── requirements.txt ├── run_alue.py ├── run_alue.sh └── tokenizationBBPE.py ├── NEZHA-Gen-TensorFlow ├── LICENSE ├── README.md ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE ├── interactive_conditional_generation.py ├── poetry.py └── tokenization.py ├── NEZHA-PyTorch ├── LICENSE ├── README.md ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.txt ├── convert_tf_checkpoint_to_pytorch.py ├── data │ ├── chnsenti │ │ ├── dev.tsv │ │ └── train.tsv │ └── mrpc │ │ ├── dev.tsv │ │ └── train.tsv ├── file_utils.py ├── modeling_nezha.py ├── optimization.py ├── pretrained_models │ ├── nezha-cn-base │ │ ├── bert_config.json │ │ └── vocab.txt │ └── nezha-en-base │ │ ├── bert_config.json │ │ └── vocab.txt ├── run_classifier.sh ├── 
run_sequence_classifier.py └── tools │ ├── file_utils.py │ ├── official_tokenization.py │ ├── pytorch_optimization.py │ └── utils.py ├── NEZHA-TensorFlow ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── NOTICE ├── README.md ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.txt ├── __init__.py ├── data │ └── pretrain-toy │ │ ├── tf_examples_00.tfrecord │ │ └── tf_examples_01.tfrecord ├── extract_features.py ├── fp16_utils.py ├── fused_layer_norm.py ├── gpu_environment.py ├── modeling.py ├── modeling_ori.py ├── modeling_test.py ├── multilingual.md ├── nezha │ ├── bert_base_rel_config_vocab_100503.json │ ├── bert_config.json │ └── vocab.txt ├── optimization.py ├── optimization_test.py ├── read_tf_events.py ├── run_classifier.py ├── run_classifier_ner.py ├── run_classifier_with_tfhub.py ├── run_pretraining.py ├── run_squad.py ├── run_squad_trtis_client.py ├── sample_text.txt ├── scripts │ ├── run_clf.sh │ ├── run_clf_predict.sh │ ├── run_ner_predict.sh │ ├── run_pretraining.sh │ ├── run_reading.sh │ ├── run_seq_labelling.sh │ └── run_seq_labelling_predict.sh ├── tf_metrics.py ├── tokenization.py ├── tokenization_test.py └── utils │ ├── create_glue_data.py │ ├── create_pretraining_data.py │ ├── create_squad_data.py │ └── utils.py ├── Noah_WuKong ├── LICENSE ├── README.md ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE.txt ├── configs │ ├── wukong_swin │ │ ├── wukong_swin.py │ │ ├── wukong_swin_f.py │ │ └── wukong_swin_g.py │ ├── wukong_vit_b │ │ ├── wukong_vit_b.py │ │ ├── wukong_vit_b_f.py │ │ └── wukong_vit_b_g.py │ └── wukong_vit_l │ │ ├── wukong_vit_l.py │ │ ├── wukong_vit_l_f.py │ │ └── wukong_vit_l_g.py ├── data │ ├── __init__.py │ ├── datasets.py │ ├── res │ │ ├── classnames.json │ │ └── prompts.txt │ └── tokenizer │ │ ├── __init__.py │ │ ├── res │ │ └── vocab.txt │ │ └── simple_tokenizer.py ├── main.py ├── model │ ├── __init__.py │ ├── builder.py │ ├── language │ │ ├── __init__.py │ │ └── transformer.py │ ├── modules.py │ ├── utils.py │ ├── vision │ │ ├── __init__.py │ 
│ ├── swin_transformer.py │ │ └── vision_transformer.py │ └── wukong.py └── requirements.txt ├── Noah_Wukong-MindSpore ├── README.md ├── README_CN.md ├── eval.py └── src │ ├── __init__.py │ ├── config │ ├── wukong_vit_b_32.yaml │ ├── wukong_vit_b_32_clip.yaml │ ├── wukong_vit_b_32_filip.yaml │ ├── wukong_vit_l_14.yaml │ ├── wukong_vit_l_14_clip.yaml │ └── wukong_vit_l_14_filip.yaml │ ├── dataset │ ├── __init__.py │ ├── dataset.py │ ├── generate_dataset.py │ └── wukong_download.py │ ├── model │ ├── __init__.py │ ├── matrics.py │ ├── text_encoder.py │ ├── token_learner.py │ └── visual_encoder.py │ └── tools │ ├── __init__.py │ ├── model_utils.py │ ├── simple_tokenizer.py │ ├── template_generate.py │ └── utils.py ├── PMLM ├── LICENSE ├── README.md ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE ├── create_pretraining_data.py ├── en_vocab.txt ├── interactive_conditional_samples_sincos_acrostic.py ├── modeling.py └── tokenization.py ├── PanGu-Bot └── Readme.md ├── PanGu-α ├── .idea │ ├── .gitignore │ ├── PanGu-Alpha.iml │ ├── inspectionProfiles │ │ ├── Project_Default.xml │ │ └── profiles_settings.xml │ ├── misc.xml │ ├── modules.xml │ └── vcs.xml ├── LICENSE ├── PANGU-α.pdf ├── README.md ├── dataset.py ├── docs │ ├── 13B.png │ ├── 2.6B.png │ ├── Pipline.png │ ├── dataset.png │ ├── logos.png │ ├── model.png │ ├── task.png │ ├── 微信交流群2.png │ └── 鹏程.盘古微信交流群.png ├── generate.py ├── pangu_alpha.py ├── pangu_alpha_config.py ├── pangu_alpha_predict.py ├── pangu_alpha_train.py ├── pangu_alpha_wrapcell.py ├── run_pangu_alpha_predict.py ├── run_pangu_alpha_train.py ├── scripts │ ├── run_distribute_predict.sh │ └── run_distribute_train.sh ├── serving_demo │ └── PanGu-Alpha-serving-demo.avi ├── strategy_load_ckpt │ ├── pangu_alpha_13B_cktp_strategy.ckpt │ └── pangu_alpha_2.6B_ckpt_strategy.ckpt ├── tokenization_jieba.py ├── tokenizer │ ├── vocab.model │ └── vocab.vocab └── utils.py ├── README.md ├── TernaryBERT-MindSpore ├── LICENSE ├── README.md ├── __init__.py ├── eval.py ├── 
mindspore_hub_conf.py ├── scripts │ ├── run_eval.sh │ └── run_train.sh ├── src │ ├── __init__.py │ ├── assessment_method.py │ ├── cell_wrapper.py │ ├── config.py │ ├── dataset.py │ ├── quant.py │ ├── tinybert_model.py │ └── utils.py └── train.py ├── TernaryBERT ├── LICENSE ├── README.md ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE ├── main.png ├── quant_task_glue.py ├── quant_task_squad.py ├── requirements.txt ├── transformer │ ├── __init__.py │ ├── configuration.py │ ├── file_utils.py │ ├── modeling.py │ ├── modeling_quant.py │ ├── optimization.py │ ├── tokenization.py │ └── utils_quant.py ├── utils_glue.py └── utils_squad.py ├── TinyBERT-MindSpore ├── LICENSE ├── README.md ├── __init__.py ├── mindspore_hub_conf.py ├── run_general_distill.py ├── run_task_distill.py ├── scripts │ ├── run_distributed_gd_ascend.sh │ ├── run_distributed_gd_gpu.sh │ ├── run_standalone_gd.sh │ └── run_standalone_td.sh └── src │ ├── __init__.py │ ├── assessment_method.py │ ├── dataset.py │ ├── gd_config.py │ ├── td_config.py │ ├── tinybert_for_gd_td.py │ ├── tinybert_model.py │ └── utils.py └── TinyBERT ├── LICENSE ├── README.md ├── THIRD PARTY OPEN SOURCE SOFTWARE NOTICE ├── data_augmentation.py ├── general_distill.py ├── pregenerate_training_data.py ├── requirements.txt ├── task_distill.py ├── tinybert_overview.png └── transformer ├── __init__.py ├── file_utils.py ├── modeling.py ├── optimization.py └── tokenization.py /AutoTinyBERT/AutoTinyBERT_overview.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/AutoTinyBERT/AutoTinyBERT_overview.PNG -------------------------------------------------------------------------------- /AutoTinyBERT/submodel_extractor.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 Huawei Technologies Co., Ltd. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import os 17 | import json 18 | import torch 19 | import argparse 20 | 21 | from transformer.modeling_extractor import SuperBertModel 22 | 23 | 24 | def main(): 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("--model", 27 | default=None, 28 | type=str, 29 | required=True) 30 | parser.add_argument('--arch', 31 | type=str, 32 | required=True) 33 | parser.add_argument('--output', 34 | type=str, 35 | required=True) 36 | parser.add_argument('--kd', action='store_true') 37 | 38 | args = parser.parse_args() 39 | 40 | model = SuperBertModel.from_pretrained(args.model) 41 | size = 0 42 | for n, p in model.named_parameters(): 43 | size += p.nelement() 44 | print('n: {}#@#p: {}'.format(n, p.nelement())) 45 | 46 | print('the model size is : {}'.format(size)) 47 | 48 | arch = json.loads(json.dumps(eval(args.arch))) 49 | 50 | print('kd: {}'.format(args.kd)) 51 | 52 | kd = True if args.kd else False 53 | model.module.set_sample_config(arch, kd) if hasattr(model, 'module') \ 54 | else model.set_sample_config(arch, kd) 55 | 56 | size = 0 57 | for n, p in model.named_parameters(): 58 | size += p.nelement() 59 | print('n: {}#@#p: {}'.format(n, p.nelement())) 60 | 61 | print('the extracted model size is : {}'.format(size)) 62 | 63 | model_to_save = model.module if hasattr(model, 'module') else model 64 | 65 | model_output = os.path.join(args.output, 'pytorch_model.bin') 66 
| torch.save(model_to_save.state_dict(), model_output) 67 | 68 | 69 | if __name__ == "__main__": 70 | main() 71 | 72 | -------------------------------------------------------------------------------- /AutoTinyBERT/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, Facebook, Inc. and its affiliates. All Rights Reserved 2 | 3 | __version__ = "0.6.1" 4 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer 5 | 6 | from .optimization import BertAdam 7 | from .optimization import AdamW, get_linear_schedule_with_warmup 8 | 9 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path, WEIGHTS_NAME, CONFIG_NAME 10 | -------------------------------------------------------------------------------- /AutoTinyBERT/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 Huawei Technologies Co., Ltd. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import random 17 | 18 | 19 | def sample_arch_4_kd(layer_numbers, hidden_sizes, ffn_sizes, qkv_sizes, 20 | reset_rand_seed=False, rand_seed=0): 21 | 22 | if reset_rand_seed: 23 | random.seed(rand_seed) 24 | 25 | config = dict() 26 | 27 | layer_num = random.choice(layer_numbers) 28 | 29 | config['sample_layer_num'] = layer_num 30 | config['sample_hidden_size'] = random.choice(hidden_sizes) 31 | config['sample_intermediate_sizes'] = [random.choice(ffn_sizes)] * layer_num 32 | config['sample_num_attention_heads'] = [12] * layer_num 33 | config['sample_qkv_sizes'] = [random.choice(qkv_sizes)] * layer_num 34 | return config 35 | 36 | 37 | def sample_arch_4_mlm(layer_numbers, hidden_sizes, ffn_sizes, 38 | head_numbers, reset_rand_seed=False, rand_seed=0): 39 | 40 | if reset_rand_seed: 41 | random.seed(rand_seed) 42 | 43 | config = dict() 44 | 45 | layer_num = random.choice(layer_numbers) 46 | head_num = random.choice(head_numbers) 47 | 48 | config['sample_layer_num'] = layer_num 49 | config['sample_hidden_size'] = random.choice(hidden_sizes) 50 | config['sample_intermediate_sizes'] = [random.choice(ffn_sizes)] * layer_num 51 | config['sample_num_attention_heads'] = [head_num] * layer_num 52 | config['sample_qkv_sizes'] = [head_num * 64] * layer_num 53 | return config 54 | 55 | -------------------------------------------------------------------------------- /BBPE/bbpe/fastBPE-master/bpe_postprocessing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import re 3 | import sys 4 | import six 5 | import unicodedata 6 | import collections 7 | import base64 8 | 9 | count = 0 10 | output = open(sys.argv[1], "w", encoding = "utf-8") 11 | b16 = {} 12 | byteVocab = {} 13 | 14 | def getChinese(context): 15 | # context = context.decode("utf-8") # convert context from str to unicode 16 | filtrate = re.compile(u'[^\u4E00-\u9FA5]') # non-Chinese unicode range 17 | context = filtrate.sub(r'', context) # remove 
all non-Chinese characters 18 | # context = context.encode("utf-8") # convert unicode back to str 19 | return context 20 | 21 | for i in range(10): 22 | b16[i] = str(i) 23 | 24 | b16[10] = 'A' 25 | b16[11] = 'B' 26 | b16[12] = 'C' 27 | b16[13] = 'D' 28 | b16[14] = 'E' 29 | b16[15] = 'F' 30 | 31 | b256tob16 = {} 32 | def base16decode(s): 33 | result = 0 34 | for c in s: 35 | result = result * 16 + b16[c] 36 | return result 37 | 38 | def base16encode(n): 39 | result = '' 40 | n = int(n) 41 | while n > 0: 42 | n = int(n) 43 | result = b16[n%16] + result 44 | n /= 16 45 | n = int(n) 46 | return result 47 | 48 | def base256encode(n): 49 | return chr(n) 50 | result = '' 51 | while n > 0: 52 | n = int(n) 53 | result = chr(n%256) + result 54 | n /= 256 55 | return result 56 | for i in range(256): 57 | b256tob16[str(base256encode(i))] = i 58 | for line in sys.stdin: 59 | print(line) 60 | line = line.split('\t') #bytes(line.strip(), encoding="utf-8") 61 | # output.writelines("{}\t".format(line[0])) 62 | print(line) 63 | pair = line[0].split(' ') 64 | output.writelines("{}\n".format(pair[0]+pair[1])) 65 | -------------------------------------------------------------------------------- /BBPE/bbpe/fastBPE-master/fastBPE/fastBPE.pyx: -------------------------------------------------------------------------------- 1 | # distutils: language = c++ 2 | 3 | from libcpp.vector cimport vector 4 | from libcpp.string cimport string 5 | 6 | cdef extern from "fastBPE.hpp" namespace "fastBPE": 7 | cdef cppclass BPEApplyer: 8 | BPEApplyer(const string& codes_path, const string& vocab_path) 9 | vector[string] apply(vector[string]& sentences) 10 | 11 | cdef class fastBPE: 12 | cdef BPEApplyer* c_obj 13 | 14 | def __dealloc__(self): 15 | del self.c_obj 16 | 17 | def __init__(self, codes_path, vocab_path=""): 18 | self.c_obj = new BPEApplyer(codes_path.encode(), vocab_path.encode()) 19 | 20 | def apply(self, sentences): 21 | cdef vector[string] s = [x.encode() for x in sentences] 22 | cdef 
vector[string] res = self.c_obj.apply(s) 23 | return [x.decode() for x in res] 24 | -------------------------------------------------------------------------------- /BBPE/bbpe/fastBPE-master/fastBPE/main.cc: -------------------------------------------------------------------------------- 1 | #include "fastBPE.hpp" 2 | 3 | using namespace std; 4 | using namespace fastBPE; 5 | 6 | void printUsage() { 7 | cerr 8 | << "usage: fastbpe \n\n" 9 | << "The commands supported by fastBPE are:\n\n" 10 | << "getvocab input1 [input2] extract the vocabulary from one " 11 | "or two text files\n" 12 | << "learnbpe nCodes input1 [input2] learn BPE codes from one or two " 13 | "text files\n" 14 | << "applybpe output input codes [vocab] apply BPE codes to a text file\n" 15 | << "applybpe_stream codes [vocab] apply BPE codes to stdin and output to stdout\n" 16 | << endl; 17 | } 18 | 19 | 20 | int main(int argc, char **argv) { 21 | if (argc < 2) { 22 | printUsage(); 23 | exit(EXIT_FAILURE); 24 | } 25 | string command = argv[1]; 26 | if (command == "getvocab") { 27 | assert(argc == 3 || argc == 4); 28 | getvocab(argv[2], argc == 4 ? argv[3] : ""); 29 | } else if (command == "learnbpe") { 30 | assert(argc == 4 || argc == 5); 31 | learnbpe(stoi(argv[2]), argv[3], argc == 5 ? argv[4] : ""); 32 | } else if (command == "applybpe") { 33 | assert(argc == 5 || argc == 6); 34 | applybpe(argv[2], argv[3], argv[4], argc == 6 ? argv[5] : ""); 35 | } else if (command == "applybpe_stream") { 36 | assert(argc == 3 || argc == 4); 37 | applybpe_stream(argv[2], argc == 4 ? 
argv[3] : ""); 38 | } else { 39 | printUsage(); 40 | exit(EXIT_FAILURE); 41 | } 42 | return 0; 43 | } 44 | -------------------------------------------------------------------------------- /BBPE/bbpe/genByteVocab.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import re 3 | import sys 4 | import unicodedata 5 | import collections 6 | import base64 7 | 8 | output = open("byteVocab.txt", "w") 9 | 10 | corpus = open("cn_wiki_sample.txt", "r") 11 | 12 | vocab = {} 13 | 14 | def getChinese(context): 15 | # context = context.decode("utf-8") # convert context from str to unicode 16 | filtrate = re.compile(u'[^\u4E00-\u9FA5]') # non-Chinese unicode range 17 | context = filtrate.sub(r'', context) # remove all non-Chinese characters 18 | # context = context.encode("utf-8") # convert unicode back to str 19 | return context 20 | 21 | i = 0 22 | 23 | for line in corpus: 24 | line = line.strip() 25 | # print(line) 26 | tokens = line #.split() 27 | print(tokens) 28 | 29 | for token in tokens: #range(len(tokens)): 30 | # token = tokens[i] 31 | print(token) 32 | if len(getChinese(token)) > 0 and token not in vocab: 33 | vocab[token] = i #int(tokens[1]) 34 | i += 1 35 | if i>= 512: break 36 | if i >= 512: break 37 | 38 | mergedVocab = sorted(vocab.items(), key=lambda item:item[1], reverse=False) 39 | 40 | for item in mergedVocab: 41 | output.writelines("{}\t{}\n".format(item[1], item[0])) 42 | -------------------------------------------------------------------------------- /BBPE/bbpe/genNum.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import re 3 | import sys 4 | import unicodedata 5 | import collections 6 | import base64 7 | 8 | def base256encode(n): 9 | return chr(n) 10 | result = '' 11 | while n > 0: 12 | n = int(n) 13 | result = chr(n%256) + result 14 | n /= 256 15 | return result 16 | 17 | charvocab = open("charVocab.txt", "r") 18 | 19 | 
vocab = {} 20 | for line in charvocab: 21 | line = line.strip() 22 | # print(line) 23 | tokens = line.split('\t') 24 | # print("tokens: " + tokens[0] + " " + tokens[1] + "\n") 25 | tk = tokens[0]#(str(base16encode((b256tob16[tokens[0]])))) 26 | #vocab[tk+'\t'+tk] = int(tokens[1]) 27 | vocab[tk] = int(tokens[1]) 28 | 29 | for i in range(1000): 30 | # print(str(i).encode("utf-8")) 31 | token = (str(base64.b16encode(str(i).encode("utf-8")))[2:-1]) 32 | if token not in vocab: vocab[token] = 1 33 | 34 | mergedVocab = sorted(vocab.items(), key=lambda item:item[1], reverse=True) 35 | 36 | output = open('charNumVocab.txt', 'w') 37 | for item in mergedVocab: 38 | output.writelines("{}\t{}\n".format(item[0], item[1])) 39 | output.writelines("##{}\t{}\n".format(item[0], item[1])) 40 | 41 | for i in range(10): 42 | token = (str(base64.b16encode(('00'+str(i)).encode("utf-8")))[2:-1]) 43 | output.writelines("##{}\t{}\n".format(token, 1)) 44 | 45 | for i in range(100): 46 | if i < 10: continue 47 | token = (str(base64.b16encode(('0'+str(i)).encode("utf-8")))[2:-1]) 48 | output.writelines("##{}\t{}\n".format(token, 1)) 49 | -------------------------------------------------------------------------------- /BBPE/bbpe/mergeVocab.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import re 3 | import sys 4 | import unicodedata 5 | import collections 6 | import base64 7 | b16 = {} 8 | 9 | for i in range(10): 10 | b16[i] = str(i) 11 | 12 | b16[10] = 'A' 13 | b16[11] = 'B' 14 | b16[12] = 'C' 15 | b16[13] = 'D' 16 | b16[14] = 'E' 17 | b16[15] = 'F' 18 | 19 | b256tob16 = {} 20 | def base16decode(s): 21 | result = 0 22 | for c in s: 23 | result = result * 16 + b16[c] 24 | return result 25 | 26 | def base16encode(n): 27 | result = '' 28 | while n > 0: 29 | n = int(n) 30 | result = b16[n%16] + result 31 | n /= 16 32 | n = int(n) 33 | return result 34 | 35 | def base256encode(n): 36 | return chr(n) 37 | 38 | for i in range(256): 
39 | b256tob16[str(base256encode(i))] = i 40 | vocab = {} 41 | 42 | byteVocab = open(sys.argv[1], 'r') 43 | Vocab = open(sys.argv[2], 'r') 44 | 45 | for line in byteVocab: 46 | line = line.strip() 47 | # print(line) 48 | tokens = line.split('\t') 49 | # print("tokens: " + tokens[0] + " " + tokens[1] + "\n") 50 | tk = tokens[0]#(str(base16encode((b256tob16[tokens[0]])))) 51 | #vocab[tk+'\t'+tk] = int(tokens[1]) 52 | vocab[tk] = int(tokens[1]) 53 | 54 | numVocab = {} 55 | 56 | for i in range(10): 57 | # print(str(i).encode("utf-8")) 58 | token = (str(base64.b16encode(str(i).encode("utf-8")))[2:-1]) 59 | if token not in numVocab: numVocab[token] = 1 60 | print(numVocab) 61 | for line in Vocab: 62 | tokens = line.strip().split('\t') 63 | if tokens[0] in byteVocab: continue 64 | isNum = False 65 | # print(tokens[0]) 66 | # print(int(len((tokens[0]))/2)) 67 | 68 | for i in range(int(len((tokens[0]))/2)): 69 | # print(tokens[0][2*i:2*i+2]) 70 | if i == 0 and tokens[0][0:2] == '##': 71 | # print('##') 72 | continue 73 | if tokens[0][2*i:2*i+2] not in numVocab: break 74 | if i == int(len(tokens[0])/2)-1: 75 | isNum = True 76 | if isNum: 77 | print(tokens[0]) 78 | continue 79 | # print(line) 80 | # if tokens[0] not in vocab: 81 | #vocab[tokens[0]+'\t'+tokens[1]] = int(tokens[2]) 82 | vocab[tokens[0]] = int(tokens[1]) 83 | 84 | 85 | mergedVocab = sorted(vocab.items(), key=lambda item:item[1], reverse=True) 86 | 87 | output = open(sys.argv[3], 'w') 88 | for item in mergedVocab: 89 | output.writelines("{}\t{}\n".format(item[0], item[1])) 90 | -------------------------------------------------------------------------------- /BBPE/bbpe/protectList.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | =SYMBOL= 8 | =NUMBER= 9 | =QUANTIFIER= 10 | =DATE= 11 | =TIME= 12 | =RANGE= 13 | 14 | 15 | -------------------------------------------------------------------------------- /BBPE/bbpe/text2utf-8-mt-byte.sh: 
-------------------------------------------------------------------------------- 1 | set -e 2 | 3 | TEXT_DIR=./MTData/ #./wiki_others-hebin-yafu/wiki_others-hebin-yafu/ 4 | NUM=0 5 | for TEXT_FILE in ${TEXT_DIR}/*; do 6 | NUM=$((NUM+1)) 7 | echo $NUM 8 | cat $TEXT_FILE | python3 utf-8-mt-byte.py MTData_byte/$(basename "$TEXT_FILE") & 9 | if (($NUM>60)) 10 | then 11 | wait 12 | NUM=0 13 | fi 14 | done 15 | 16 | -------------------------------------------------------------------------------- /BBPE/bbpe/text2utf-8-mt-char.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | TEXT_DIR=./MTData/ #./wiki_others-hebin-yafu/wiki_others-hebin-yafu/ 4 | NUM=0 5 | for TEXT_FILE in ${TEXT_DIR}/*; do 6 | NUM=$((NUM+1)) 7 | echo $NUM 8 | cat $TEXT_FILE | python3 utf-8-mt-char.py MTData_utf8/$(basename "$TEXT_FILE") & 9 | if (($NUM>50)) 10 | then 11 | wait 12 | NUM=0 13 | fi 14 | done 15 | 16 | -------------------------------------------------------------------------------- /BBPE/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/BBPE/example.png -------------------------------------------------------------------------------- /BinaryBERT/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/BinaryBERT/__init__.py -------------------------------------------------------------------------------- /BinaryBERT/assets/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/BinaryBERT/assets/model.png -------------------------------------------------------------------------------- 
/BinaryBERT/helper.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 Huawei Technologies Co., Ltd. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | import logging 16 | import os 17 | import string 18 | import random 19 | import torch 20 | 21 | def generate_job_id(): 22 | return ''.join(random.sample(string.ascii_letters+string.digits, 5)) 23 | 24 | def init_logging(log_path): 25 | 26 | if not os.path.isdir(os.path.dirname(log_path)): 27 | print("Log path does not exist. Create a new one.") 28 | os.makedirs(os.path.dirname(log_path)) 29 | if os.path.exists(log_path): 30 | print("%s already exists. replace it with current experiment." 
% log_path) 31 | os.system('rm %s' % log_path) 32 | 33 | logger = logging.getLogger() 34 | logger.setLevel(logging.INFO) 35 | logFormatter = logging.Formatter('%(asctime)s [%(levelname)s]: %(message)s') 36 | 37 | fileHandler = logging.FileHandler(log_path) 38 | fileHandler.setFormatter(logFormatter) 39 | logger.addHandler(fileHandler) 40 | 41 | consoleHandler = logging.StreamHandler() 42 | consoleHandler.setFormatter(logFormatter) 43 | logger.addHandler(consoleHandler) 44 | 45 | def print_args(args): 46 | for k, v in zip(args.keys(), args.values()): 47 | logging.info("{0}: {1}".format(k, v)) 48 | 49 | def soft_cross_entropy(predicts, targets): 50 | student_likelihood = torch.nn.functional.log_softmax(predicts, dim=-1) 51 | targets_prob = torch.nn.functional.softmax(targets, dim=-1) 52 | return (- targets_prob * student_likelihood).mean() 53 | 54 | 55 | def visualize_clip(clip_dict): 56 | # assert len(clip_dict) > 0, 'empty clip_dict, possibly not learnable_scalling.' 57 | logging.info("Visualizing learnable clipping vals...") 58 | for n, p in clip_dict.items(): 59 | if p.nelement() == 2: 60 | # PACT clip val has two elements 61 | logging.info("PACT clip_val: %s: (%.4f, %.4f)" % (n, p[0].item(), p[1].item())) 62 | elif p.nelement() == 1: 63 | # LSQ step size has only one element 64 | logging.info("LSQ step_size: %s: %.4f" % (n, p.item())) 65 | 66 | 67 | def result_to_file(result, file_name): 68 | with open(file_name, "a") as writer: 69 | logging.info("***** Eval results *****") 70 | for key in sorted(result.keys()): 71 | if result[key]>0.0: 72 | logging.info(" %s = %s", key, str(result[key])) 73 | writer.write("%s = %s\n" % (key, str(result[key]))) 74 | -------------------------------------------------------------------------------- /BinaryBERT/readme.md: -------------------------------------------------------------------------------- 1 | # BinaryBERT: Pushing the Limit of BERT Quantization 2 | This repository contains the implementation of our paper 3 | 
"BinaryBERT: Pushing the Limit of BERT Quantization" 4 | in ACL 2021. 5 | The overall workflow of training BinaryBERT is shown below. 6 | We first train a half-sized ternary BERT model, and then apply **ternary weight splitting** 7 | to initialize the full-sized BinaryBERT. We then fine-tune BinaryBERT for further refinement. 8 | ![BinaryBERT](./assets/model.png) 9 | 10 | ## Dependencies 11 | ```bash 12 | pip install -r requirements.txt 13 | ``` 14 | 15 | ## Datasets 16 | 17 | We train and test BinaryBERT on GLUE and SQuAD benchmarks. Both datasets are available online: 18 | - **GLUE**: https://github.com/nyu-mll/GLUE-baselines 19 | - **SQuAD**: https://rajpurkar.github.io/SQuAD-explorer/ 20 | 21 | For data augmentation on GLUE, please follow the instructions in [TinyBERT](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/TinyBERT). 22 | 23 | ## Execution 24 | Our experiments are based on the fine-tuned full-precision DynaBERT, 25 | which can be found [here](https://drive.google.com/file/d/1pYApaDcse5QIB6lZagWO0uElAavFazpA/view?usp=sharing). 26 | Complete running scripts and more detailed tips are provided in `./scripts`. 27 | There are two steps for execution, and we illustrate them 28 | by training BinaryBERT with 4-bit activations on MRPC. 29 | 30 | ### Step one: Train a half-sized ternary BERT 31 | This corresponds to `scripts/terarny_glue.sh`. For example: 32 | ```bash 33 | sh scripts/terarny_glue.sh mrpc data/mrpc/ models/dynabert_model/mrpc/width_0.5_depth_1.0/ models/dynabert_model/mrpc/width_0.5_depth_1.0/ 2 4 34 | ``` 35 | 36 | ### Step two: Apply TWS and finetune BinaryBERT 37 | This corresponds to `scripts/tws_glue.sh`. Based on the model checkpoint of ternary BERT, execute: 38 | ```bash 39 | sh scripts/tws_glue.sh mrpc data/mrpc/ models/dynabert_model/mrpc/width_0.5_depth_1.0/ output/Ternary_W2A8/mrpc/kd_stage2/ 1 4 40 | ``` 41 | Go through each script for more details. 
42 | 43 | ## Citation 44 | If you find this repo helpful for your research, please: 45 | ``` 46 | @inproceedings{bai2021binarybert, 47 | title={BinaryBERT: Pushing the Limit of BERT Quantization}, 48 | author={Bai, H. and Zhang, W. and Hou, L. and Shang, L. and Jin, J. and Jiang, X. and Liu, Q. and Lyu, M. and King, I.}, 49 | booktitle={Annual Meeting of the Association for Computational Linguistics}, 50 | year={2021} 51 | } 52 | ``` -------------------------------------------------------------------------------- /BinaryBERT/requirements.txt: -------------------------------------------------------------------------------- 1 | # progress bars in model download and training scripts 2 | tqdm 3 | # Used for downloading models over HTTP 4 | requests 5 | 6 | torch==1.0.0 7 | # python==3.6 (interpreter version; not a pip-installable distribution) 8 | seaborn 9 | # pickle (Python standard library; nothing to install) 10 | # collections (Python standard library; nothing to install) -------------------------------------------------------------------------------- /BinaryBERT/scripts/terarny_glue.sh: -------------------------------------------------------------------------------- 1 | # Step 1: First train a half-sized ternary BERT model from the dynabert model checkpoint 2 | # Tips: 3 | # 1. If trained with data augmentation, please add --aug_train 4 | # 2. 
For activation quantization, use uniform quant for A8, with ACT2FN=gelu; 5 | # use lsq quant for A4, with ACT2FN=relu to ensure non-negativity of LSQ asymmetric quantization 6 | 7 | TASK_NAME=$1 8 | GLUE_DIR=$2 9 | TEACHER_MODEL_DIR=$3 10 | STUDENT_MODEL_DIR=$4 11 | wbits=$5 12 | abits=$6 13 | JOB_ID=Ternary_W${wbits}A${abits} 14 | echo $TASK_NAME 15 | echo $GLUE_DIR 16 | echo $TEACHER_MODEL_DIR 17 | echo $STUDENT_MODEL_DIR 18 | echo $wbits 19 | echo $abits 20 | echo $JOB_ID 21 | 22 | if [ "$abits" = 4 ] # '=' is the portable test operator; '==' fails when the script is run with sh (dash) 23 | then 24 | act_quan_method=lsq 25 | ACT2FN=relu 26 | else 27 | act_quan_method=uniform 28 | ACT2FN=gelu 29 | fi 30 | 31 | export CUDA_VISIBLE_DEVICES=5 32 | python quant_task_distill_glue.py \ 33 | --data_dir ${GLUE_DIR} \ 34 | --job_id ${JOB_ID} \ 35 | --batch_size 16 \ 36 | --learning_rate 1e-5 \ 37 | --eval_step 1000 \ 38 | --num_train_epochs 2 \ 39 | --ACT2FN ${ACT2FN} \ 40 | --output_dir output/${JOB_ID}/${TASK_NAME} \ 41 | --kd_type two_stage \ 42 | --task_name $TASK_NAME \ 43 | --teacher_model ${TEACHER_MODEL_DIR} \ 44 | --student_model ${STUDENT_MODEL_DIR} \ 45 | --weight_bits ${wbits} \ 46 | --weight_quant_method twn \ 47 | --input_bits ${abits} \ 48 | --input_quant_method ${act_quan_method} \ 49 | --clip_lr 1e-4 \ 50 | --learnable_scaling 51 | -------------------------------------------------------------------------------- /BinaryBERT/scripts/terarny_squad.sh: -------------------------------------------------------------------------------- 1 | # Step 1: First train a half-sized ternary BERT model from the dynabert model checkpoint 2 | # Tips: 3 | # 1. If trained with data augmentation, please add --aug_train 4 | # 2. 
For activation quantization, use uniform quant for A8, with ACT2FN=gelu; 5 | # use lsq quant for A4, with ACT2FN=relu to ensure non-negativity of LSQ asymmetric quantization 6 | 7 | TASK=$1 8 | DATA_DIR=$2 9 | TEACHER_MODEL_DIR=$3 10 | STUDENT_MODEL_DIR=$4 11 | wbits=$5 12 | abits=$6 13 | JOB_ID=Ternary_W${wbits}A${abits} 14 | echo $TASK 15 | echo $DATA_DIR 16 | echo $TEACHER_MODEL_DIR 17 | echo $STUDENT_MODEL_DIR 18 | echo $wbits 19 | echo $abits 20 | echo $JOB_ID 21 | 22 | if [ "$abits" = 4 ] # '=' is the portable test operator; '==' fails when the script is run with sh (dash) 23 | then 24 | act_quan_method=lsq 25 | ACT2FN=relu 26 | else 27 | act_quan_method=uniform 28 | ACT2FN=gelu 29 | fi 30 | 31 | export CUDA_VISIBLE_DEVICES=7 32 | if [ "$TASK" = 1 ] # TASK=1 selects SQuAD v1.1; any other value selects v2.0 33 | then 34 | TASK_NAME=SQuADv1.1 35 | python quant_task_distill_squad.py \ 36 | --data_dir ${DATA_DIR} \ 37 | --job_id ${JOB_ID} \ 38 | --batch_size 4 \ 39 | --learning_rate 2e-5 \ 40 | --eval_step 1000 \ 41 | --num_train_epochs 1 \ 42 | --ACT2FN ${ACT2FN} \ 43 | --output_dir output/${JOB_ID}/${TASK_NAME} \ 44 | --kd_type two_stage \ 45 | --teacher_model ${TEACHER_MODEL_DIR} \ 46 | --student_model ${STUDENT_MODEL_DIR} \ 47 | --weight_bits ${wbits} \ 48 | --weight_quant_method twn \ 49 | --input_bits ${abits} \ 50 | --input_quant_method ${act_quan_method} \ 51 | --clip_lr 1e-3 \ 52 | --learnable_scaling 53 | else 54 | TASK_NAME=SQuADv2.0 55 | python quant_task_distill_squad.py \ 56 | --data_dir ${DATA_DIR} \ 57 | --job_id ${JOB_ID} \ 58 | --batch_size 4 \ 59 | --learning_rate 2e-5 \ 60 | --eval_step 1000 \ 61 | --num_train_epochs 1 \ 62 | --ACT2FN ${ACT2FN} \ 63 | --output_dir output/${JOB_ID}/${TASK_NAME} \ 64 | --kd_type two_stage \ 65 | --teacher_model ${TEACHER_MODEL_DIR} \ 66 | --student_model ${STUDENT_MODEL_DIR} \ 67 | --weight_bits ${wbits} \ 68 | --weight_quant_method twn \ 69 | --input_bits ${abits} \ 70 | --input_quant_method ${act_quan_method} \ 71 | --clip_lr 1e-3 \ 72 | --learnable_scaling \ 73 | --version_2_with_negative 74 | fi 75 | 76 | 
-------------------------------------------------------------------------------- /BinaryBERT/scripts/tws_glue.sh: -------------------------------------------------------------------------------- 1 | # Step 2: Apply ternary weight splitting and finetune BinaryBERT. 2 | # Tips: 3 | # 1. If trained with data augmentation, please add --aug_train 4 | # 2. For activation quantization, use uniform quant for A8, with ACT2FN=gelu; 5 | # use lsq quant for A4, with ACT2FN=relu to ensure non-negativity of LSQ asymmetric quantization 6 | 7 | TASK_NAME=$1 8 | GLUE_DIR=$2 9 | TEACHER_MODEL_DIR=$3 10 | STUDENT_MODEL_DIR=$4 11 | wbits=$5 12 | abits=$6 13 | JOB_ID=Ternary_W${wbits}A${abits} 14 | echo $TASK_NAME 15 | echo $GLUE_DIR 16 | echo $TEACHER_MODEL_DIR 17 | echo $STUDENT_MODEL_DIR 18 | echo $wbits 19 | echo $abits 20 | echo $JOB_ID 21 | 22 | if [ "$abits" = 4 ] # '=' is the portable test operator; '==' fails when the script is run with sh (dash) 23 | then 24 | act_quan_method=lsq 25 | ACT2FN=relu 26 | else 27 | act_quan_method=uniform 28 | ACT2FN=gelu 29 | fi 30 | 31 | export CUDA_VISIBLE_DEVICES=5 32 | python quant_task_distill_glue.py \ 33 | --data_dir ${GLUE_DIR} \ 34 | --job_id ${JOB_ID} \ 35 | --batch_size 16 \ 36 | --learning_rate 5e-5 \ 37 | --eval_step 100 \ 38 | --num_train_epochs 2 \ 39 | --ACT2FN ${ACT2FN} \ 40 | --output_dir output/${JOB_ID}/${TASK_NAME} \ 41 | --kd_type two_stage \ 42 | --task_name $TASK_NAME \ 43 | --teacher_model ${TEACHER_MODEL_DIR} \ 44 | --student_model ${STUDENT_MODEL_DIR} \ 45 | --weight_bits ${wbits} \ 46 | --weight_quant_method bwn \ 47 | --input_bits ${abits} \ 48 | --input_quant_method ${act_quan_method} \ 49 | --clip_lr 1e-4 \ 50 | --learnable_scaling \ 51 | --is_binarybert \ 52 | --split -------------------------------------------------------------------------------- /BinaryBERT/scripts/tws_squad.sh: -------------------------------------------------------------------------------- 1 | # Step 2: Apply ternary weight splitting and finetune BinaryBERT. 2 | # Tips: 3 | # 1. 
If trained with data augmentation, please add --aug_train 4 | # 2. For activation quantization, use uniform quant for A8, with ACT2FN=gelu; 5 | # use lsq quant for A4, with ACT2FN=relu to ensure non-negativity of LSQ asymmetric quantization 6 | 7 | TASK=$1 8 | DATA_DIR=$2 9 | TEACHER_MODEL_DIR=$3 10 | STUDENT_MODEL_DIR=$4 11 | wbits=$5 12 | abits=$6 13 | JOB_ID=Ternary_W${wbits}A${abits} 14 | echo $TASK 15 | echo $DATA_DIR 16 | echo $TEACHER_MODEL_DIR 17 | echo $STUDENT_MODEL_DIR 18 | echo $wbits 19 | echo $abits 20 | echo $JOB_ID 21 | 22 | if [ "$abits" = 4 ] # '=' is the portable test operator; '==' fails when the script is run with sh (dash) 23 | then 24 | act_quan_method=lsq 25 | ACT2FN=relu 26 | else 27 | act_quan_method=uniform 28 | ACT2FN=gelu 29 | fi 30 | 31 | export CUDA_VISIBLE_DEVICES=7 32 | if [ "$TASK" = 1 ] # TASK=1 selects SQuAD v1.1; any other value selects v2.0 33 | then 34 | TASK_NAME=SQuADv1.1 35 | python quant_task_distill_squad.py \ 36 | --data_dir ${DATA_DIR} \ 37 | --job_id ${JOB_ID} \ 38 | --batch_size 4 \ 39 | --learning_rate 2e-5 \ 40 | --eval_step 1000 \ 41 | --num_train_epochs 1 \ 42 | --ACT2FN ${ACT2FN} \ 43 | --output_dir output/${JOB_ID}/${TASK_NAME} \ 44 | --kd_type two_stage \ 45 | --teacher_model ${TEACHER_MODEL_DIR} \ 46 | --student_model ${STUDENT_MODEL_DIR} \ 47 | --weight_bits ${wbits} \ 48 | --weight_quant_method bwn \ 49 | --input_bits ${abits} \ 50 | --input_quant_method ${act_quan_method} \ 51 | --clip_lr 1e-3 \ 52 | --learnable_scaling \ 53 | --is_binarybert \ 54 | --split 55 | else 56 | TASK_NAME=SQuADv2.0 57 | python quant_task_distill_squad.py \ 58 | --data_dir ${DATA_DIR} \ 59 | --job_id ${JOB_ID} \ 60 | --batch_size 4 \ 61 | --learning_rate 2e-5 \ 62 | --eval_step 1000 \ 63 | --num_train_epochs 1 \ 64 | --ACT2FN gelu \ 65 | --output_dir output/${JOB_ID}/${TASK_NAME} \ 66 | --kd_type two_stage \ 67 | --teacher_model ${TEACHER_MODEL_DIR} \ 68 | --student_model ${STUDENT_MODEL_DIR} \ 69 | --weight_bits ${wbits} \ 70 | --weight_quant_method bwn \ 71 | --input_bits ${abits} \ 72 | --input_quant_method uniform \ 73 | --clip_lr 1e-3 \ 74 | --learnable_scaling 
\ 75 | --version_2_with_negative \ 76 | --is_binarybert \ 77 | --split 78 | fi 79 | 80 | -------------------------------------------------------------------------------- /BinaryBERT/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.6.2" 2 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer 3 | 4 | 5 | from .modeling import (BertConfig, BertModel, BertForPreTraining, 6 | BertForMaskedLM, BertForNextSentencePrediction, 7 | TinyBertForSequenceClassification, 8 | load_tf_weights_in_bert) 9 | 10 | from .optimization import BertAdam 11 | 12 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path, WEIGHTS_NAME, CONFIG_NAME 13 | -------------------------------------------------------------------------------- /CAME/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/CAME/.DS_Store -------------------------------------------------------------------------------- /CAME/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 
13 | 14 | ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3 15 | FROM nvcr.io/nvidia/tritonserver:20.06-v1-py3-clientsdk as trt 16 | FROM ${FROM_IMAGE_NAME} 17 | RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract 18 | 19 | ENV BERT_PREP_WORKING_DIR /workspace/bert/data 20 | 21 | WORKDIR /workspace 22 | RUN git clone https://github.com/attardi/wikiextractor.git && cd wikiextractor && git checkout 6408a430fc504a38b04d37ce5e7fc740191dee16 && cd .. 23 | RUN git clone https://github.com/soskek/bookcorpus.git 24 | 25 | # Copy the perf_client over 26 | COPY --from=trt /workspace/install/ /workspace/install/ 27 | ENV LD_LIBRARY_PATH /workspace/install/lib:${LD_LIBRARY_PATH} 28 | 29 | # Install trt python api (-y: image builds are non-interactive, so apt must never prompt) 30 | RUN apt-get install -y libb64-0d 31 | RUN pip install /workspace/install/python/tensorrtserver*.whl 32 | 33 | WORKDIR /workspace/bert 34 | RUN pip install --no-cache-dir \ 35 | tqdm boto3 requests six ipdb h5py nltk progressbar onnxruntime \ 36 | git+https://github.com/NVIDIA/dllogger wget 37 | 38 | RUN apt-get install -y iputils-ping 39 | 40 | COPY . . 41 | -------------------------------------------------------------------------------- /CAME/NOTICE: -------------------------------------------------------------------------------- 1 | BERT PyTorch 2 | 3 | This repository includes software from https://github.com/huggingface/pytorch-pretrained-BERT 4 | licensed under the Apache License 2.0. 5 | -------------------------------------------------------------------------------- /CAME/README.md: -------------------------------------------------------------------------------- 1 | # CAME Optimizer - Pytorch 2 | 3 | This repository provides a script and recipe to train the BERT model with our proposed CAME optimizer in: 4 | 5 | CAME: Confidence-guided Adaptive Memory Efficient Optimization 6 | 7 | This work has been accepted by **ACL2023** main conference. 
8 | 9 | In this work, we studied a confidence-guided strategy to reduce the instability of existing memory efficient optimizers. 10 | Based on this strategy, we proposed CAME to simultaneously achieve two goals: fast convergence as in traditional adaptive methods, and low memory usage as in memory-efficient methods. 11 | 12 | 13 | ## Training 14 | 15 | ### The script including the setting of hyperparameters to pretrain BERT: 16 | bash run_came_pretraining.sh 17 | 18 | ### The startup file corresponding to the script: 19 | startup_came.py 20 | 21 | ### Pytorch implementation: 22 | came.py: the Pytorch implementation of our proposed CAME optimizer. 23 | ![CAME](./came_pcode.png) 24 | 25 | ## Pretraining Results 26 | ![BERT Pretraining](./bert_pretrain.png) 27 | 28 | ## Memory Usage Comparison 29 | ![Memory Cost](./memory.png) 30 | 31 | ## Usage 32 | ``` 33 | from came import CAME 34 | optimizer = CAME(model.parameters(), lr=2e-4, weight_decay=1e-2, betas=(0.9, 0.999, 0.9999), eps=(1e-30, 1e-16)) 35 | ``` 36 | ## Citation 37 | -------------------------------------------------------------------------------- /CAME/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 768, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 3072, 8 | "max_position_embeddings": 512, 9 | "num_attention_heads": 12, 10 | "num_hidden_layers": 12, 11 | "type_vocab_size": 2, 12 | "vocab_size": 30528 13 | } 14 | -------------------------------------------------------------------------------- /CAME/bert_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 1024, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 4096, 8 | "max_position_embeddings": 512, 9 
| "num_attention_heads": 16, 10 | "num_hidden_layers": 24, 11 | "type_vocab_size": 2, 12 | "vocab_size": 30522 13 | } 14 | -------------------------------------------------------------------------------- /CAME/bert_pretrain.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/CAME/bert_pretrain.png -------------------------------------------------------------------------------- /CAME/came_pcode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/CAME/came_pcode.png -------------------------------------------------------------------------------- /CAME/create_data.sh: -------------------------------------------------------------------------------- 1 | python /home/ma-user/work/Old_BERT/create_pretraining_data.py \ 2 | --input_file=/cache/data/book/book_corpus_2.txt \ 3 | --output_file=/cache/data/book/book_corpus_2.hdf5 \ 4 | --vocab_file=/home/ma-user/work/Old_BERT/bert-large-uncased-vocab.txt \ 5 | --bert_model=bert-large-uncased \ 6 | --max_seq_length=128 \ 7 | --max_predictions_per_seq=20 \ 8 | --dupe_factor=5 \ 9 | --masked_lm_prob=0.15 10 | -------------------------------------------------------------------------------- /CAME/data/BooksDownloader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 
4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | import subprocess 15 | 16 | class BooksDownloader: 17 | def __init__(self, save_path): 18 | self.save_path = save_path 19 | pass 20 | 21 | 22 | def download(self): 23 | bookscorpus_download_command = 'python3 /workspace/bookcorpus/download_files.py --list /workspace/bookcorpus/url_list.jsonl --out' 24 | bookscorpus_download_command += ' ' + self.save_path + '/bookscorpus' 25 | bookscorpus_download_command += ' --trash-bad-count' 26 | bookscorpus_download_process = subprocess.run(bookscorpus_download_command, shell=True, check=True) 27 | -------------------------------------------------------------------------------- /CAME/data/BookscorpusTextFormatting.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 
13 | 14 | import glob 15 | import os 16 | 17 | class BookscorpusTextFormatting: 18 | def __init__(self, books_path, output_filename, recursive = False): 19 | self.books_path = books_path 20 | self.recursive = recursive 21 | self.output_filename = output_filename 22 | 23 | 24 | # This puts one book per line 25 | def merge(self): 26 | with open(self.output_filename, mode='w', newline='\n') as ofile: 27 | for filename in glob.glob(self.books_path + '/' + '*.txt', recursive=True): 28 | with open(filename, mode='r', encoding='utf-8-sig', newline='\n') as file: 29 | for line in file: 30 | if line.strip() != '': 31 | ofile.write(line.strip() + ' ') 32 | ofile.write("\n\n") -------------------------------------------------------------------------------- /CAME/data/GLUEDownloader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 
13 | 14 | import sys 15 | import wget 16 | 17 | from pathlib import Path 18 | 19 | 20 | def mkdir(path): 21 | Path(path).mkdir(parents=True, exist_ok=True) 22 | 23 | 24 | class GLUEDownloader: 25 | 26 | def __init__(self, save_path): 27 | self.save_path = save_path + '/glue' 28 | 29 | def download(self, task_name): 30 | mkdir(self.save_path) 31 | if task_name in {'mrpc', 'mnli'}: 32 | task_name = task_name.upper() 33 | elif task_name == 'cola': 34 | task_name = 'CoLA' 35 | else: # SST-2 36 | assert task_name == 'sst-2' 37 | task_name = 'SST' 38 | wget.download( 39 | 'https://gist.githubusercontent.com/roclark/9ab385e980c5bdb9e15ecad5963848e0/raw/c9dcc44a6e1336d2411e3333c25bcfd507c39c81/download_glue_data.py', 40 | out=self.save_path, 41 | ) 42 | sys.path.append(self.save_path) 43 | import download_glue_data 44 | download_glue_data.main( 45 | ['--data_dir', self.save_path, '--tasks', task_name]) 46 | sys.path.pop() 47 | -------------------------------------------------------------------------------- /CAME/data/NVIDIAPretrainedWeightDownloader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 
13 | 14 | import os 15 | 16 | class NVIDIAPretrainedWeightDownloader: 17 | def __init__(self, save_path): 18 | self.save_path = save_path + '/nvidia_pretrained_weights' 19 | 20 | if not os.path.exists(self.save_path): 21 | os.makedirs(self.save_path) 22 | 23 | pass 24 | 25 | 26 | def download(self): 27 | assert False, 'NVIDIAPretrainedWeightDownloader not implemented yet.' -------------------------------------------------------------------------------- /CAME/data/SquadDownloader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 
13 | 14 | import bz2 15 | import os 16 | import urllib.request 17 | import sys 18 | 19 | class SquadDownloader: 20 | def __init__(self, save_path): 21 | self.save_path = save_path + '/squad' 22 | 23 | if not os.path.exists(self.save_path): 24 | os.makedirs(self.save_path) 25 | 26 | if not os.path.exists(self.save_path + '/v1.1'): 27 | os.makedirs(self.save_path + '/v1.1') 28 | 29 | if not os.path.exists(self.save_path + '/v2.0'): 30 | os.makedirs(self.save_path + '/v2.0') 31 | 32 | self.download_urls = { 33 | 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json' : 'v1.1/train-v1.1.json', 34 | 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json' : 'v1.1/dev-v1.1.json', 35 | 'https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/' : 'v1.1/evaluate-v1.1.py', 36 | 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json' : 'v2.0/train-v2.0.json', 37 | 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json' : 'v2.0/dev-v2.0.json', 38 | 'https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/' : 'v2.0/evaluate-v2.0.py', 39 | } 40 | 41 | def download(self): 42 | for item in self.download_urls: 43 | url = item 44 | file = self.download_urls[item] 45 | 46 | print('Downloading:', url) 47 | if os.path.isfile(self.save_path + '/' + file): 48 | print('** Download file already exists, skipping download') 49 | else: 50 | response = urllib.request.urlopen(url) 51 | with open(self.save_path + '/' + file, "wb") as handle: 52 | handle.write(response.read()) 53 | 54 | 55 | -------------------------------------------------------------------------------- /CAME/data/WikiDownloader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 
2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | import bz2 15 | import os 16 | import urllib.request 17 | import subprocess 18 | import sys 19 | import subprocess 20 | 21 | class WikiDownloader: 22 | def __init__(self, language, save_path): 23 | self.save_path = save_path + '/wikicorpus_' + language 24 | 25 | if not os.path.exists(self.save_path): 26 | os.makedirs(self.save_path) 27 | 28 | self.language = language 29 | # Use a mirror from https://dumps.wikimedia.org/mirrors.html if the below links do not work 30 | self.download_urls = { 31 | 'en' : 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2', 32 | 'zh' : 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2' 33 | } 34 | 35 | self.output_files = { 36 | 'en' : 'wikicorpus_en.xml.bz2', 37 | 'zh' : 'wikicorpus_zh.xml.bz2' 38 | } 39 | 40 | 41 | def download(self): 42 | if self.language in self.download_urls: 43 | url = self.download_urls[self.language] 44 | filename = self.output_files[self.language] 45 | 46 | print('Downloading:', url) 47 | if os.path.isfile(self.save_path + '/' + filename): 48 | print('** Download file already exists, skipping download') 49 | else: 50 | cmd = ['wget', url, '--output-document={}'.format(self.save_path + '/' + filename)] 51 | print('Running:', cmd) 52 | status = subprocess.run(cmd) 53 | if status.returncode != 0: 54 | raise RuntimeError('Wiki download not successful') 55 | 56 | # Always 
unzipping since this is relatively fast and will overwrite 57 | print('Unzipping:', self.output_files[self.language]) 58 | subprocess.run('bzip2 -dk ' + self.save_path + '/' + filename, shell=True, check=True) 59 | 60 | else: 61 | assert False, 'WikiDownloader not implemented for this language yet.' 62 | -------------------------------------------------------------------------------- /CAME/data/WikicorpusTextFormatting.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 
13 | 14 | import glob 15 | import os 16 | 17 | class WikicorpusTextFormatting: 18 | def __init__(self, wiki_path, output_filename, recursive = False): 19 | self.wiki_path = wiki_path 20 | self.recursive = recursive 21 | self.output_filename = output_filename 22 | 23 | 24 | # This puts one article per line 25 | def merge(self): 26 | with open(self.output_filename, mode='w', newline='\n') as ofile: 27 | for dirname in glob.glob(self.wiki_path + '/*/', recursive=False): 28 | for filename in glob.glob(dirname + 'wiki_*', recursive=self.recursive): 29 | print(filename) 30 | article_lines = [] 31 | article_open = False 32 | 33 | with open(filename, mode='r', newline='\n') as file: 34 | for line in file: 35 | if '' in line: 38 | article_open = False 39 | for oline in article_lines[1:]: 40 | if oline != '\n': 41 | ofile.write(oline.rstrip() + " ") 42 | ofile.write("\n\n") 43 | article_lines = [] 44 | else: 45 | if article_open: 46 | article_lines.append(line) -------------------------------------------------------------------------------- /CAME/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 
13 | -------------------------------------------------------------------------------- /CAME/data/__pycache__/TextSharding.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/CAME/data/__pycache__/TextSharding.cpython-36.pyc -------------------------------------------------------------------------------- /CAME/data/create_datasets_from_start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
# Usage: create_datasets_from_start.sh [wiki_only|wiki_books]
# Downloads the corpora, formats and shards the text, then builds the HDF5
# pre-training files for sequence lengths 128 (phase 1) and 512 (phase 2).
to_download=${1:-"wiki_only"}

# Every step below is one invocation of this tool.
BERT_PREP=/workspace/bert/data/bertPrep.py
# Vocabulary that comes with the google_pretrained_weights download below.
VOCAB_FILE="$BERT_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"

#Download
if [ "$to_download" = "wiki_books" ] ; then
    python3 "$BERT_PREP" --action download --dataset bookscorpus
fi

python3 "$BERT_PREP" --action download --dataset wikicorpus_en
python3 "$BERT_PREP" --action download --dataset google_pretrained_weights  # Includes vocab
python3 "$BERT_PREP" --action download --dataset squad
python3 "$BERT_PREP" --action download --dataset mrpc
python3 "$BERT_PREP" --action download --dataset sst-2

# Properly format the text files
if [ "$to_download" = "wiki_books" ] ; then
    python3 "$BERT_PREP" --action text_formatting --dataset bookscorpus
fi
python3 "$BERT_PREP" --action text_formatting --dataset wikicorpus_en

# Pick the dataset name used by the sharding and HDF5 steps
if [ "$to_download" = "wiki_books" ] ; then
    DATASET="books_wiki_en_corpus"
else
    DATASET="wikicorpus_en"
fi

# Shard the text files
python3 "$BERT_PREP" --action sharding --dataset "$DATASET"

# Create HDF5 files Phase 1
python3 "$BERT_PREP" --action create_hdf5_files --dataset "$DATASET" --max_seq_length 128 \
    --max_predictions_per_seq 20 --vocab_file "$VOCAB_FILE" --do_lower_case 1

# Create HDF5 files Phase 2
python3 "$BERT_PREP" --action create_hdf5_files --dataset "$DATASET" --max_seq_length 512 \
    --max_predictions_per_seq 80 --vocab_file "$VOCAB_FILE" --do_lower_case 1
-------------------------------------------------------------------------------- 1 | import TextSharding 2 | 3 | # Segmentation is here because all datasets look the same in one article/book/whatever per line format, and 4 | # it seemed unnecessarily complicated to add an additional preprocessing step to call just for this. 5 | # Different languages (e.g., Chinese simplified/traditional) may require translation and 6 | # other packages to be called from here -- just add a conditional branch for those extra steps 7 | segmenter = TextSharding.NLTKSegmenter() 8 | sharding = TextSharding.Sharding(['/home/ma-user/work/Old_BERT/data/origin/wiki_sliced/wiki_00', '/home/ma-user/work/Old_BERT/data/origin/wiki_sliced/wiki_01'], '/home/ma-user/work/Old_BERT/data/origin/wiki_sliced', 256, 256, 0.1) 9 | 10 | sharding.load_articles() 11 | sharding.segment_articles_into_sentences(segmenter) 12 | sharding.distribute_articles_over_shards() 13 | sharding.write_shards_to_disk() 14 | -------------------------------------------------------------------------------- /CAME/data/wikiextractor/.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ 
secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist bdist_wheel 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /CAME/data/wikiextractor/.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | 29 | # Translations 30 | *.mo 31 | 32 | # Mr Developer 33 | .mr.developer.cfg 34 | .project 35 | .pydevproject 36 | 37 | /docs/_build 38 | .idea 39 | *.iml 40 | 41 | .travis-solo 42 | G* 43 | *.db 44 | *.mdb 45 | 46 | # Vim 47 | [._]*.s[a-w][a-z] 48 | [._]s[a-w][a-z] 49 | *.un~ 50 | Session.vim 51 | .netrwhist 52 | *~ 53 | -------------------------------------------------------------------------------- /CAME/data/wikiextractor/extract.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # NOTES 4 | # 5 | # - Must expand templates to avoid a large loss of content. 6 | # - Text will not (redundantly) contain the title string. 7 | # - Keep sections. Section title will be marked by "Section::::". 8 | # - Keep lists. List bullets will be marked by "BULLET::::". 9 | # - Keep tables. They're mostly garbage but can be removed later (remove "^!*"). 10 | # - Remove disambiguation pages. Right now there is no use for them. 
11 | 12 | INPUT=$1 13 | PROCESSES=$2 14 | TEMPLATES=$3 15 | OUTPUT=$4 16 | 17 | python -m wikiextractor.WikiExtractor.py $INPUT \ 18 | --json \ 19 | --processes $PROCESSES \ 20 | --templates $TEMPLATES \ 21 | --output $OUTPUT \ 22 | --bytes 1M \ 23 | --compress \ 24 | --links \ 25 | --sections \ 26 | --lists \ 27 | --keep_tables \ 28 | --min_text_length 0 \ 29 | --filter_disambig_pages 30 | -------------------------------------------------------------------------------- /CAME/data/wikiextractor/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import re 3 | 4 | from wikiextractor.WikiExtractor import __version__ 5 | 6 | 7 | def get_version(version): 8 | if re.match(r'^\d+\.\d+$', version): 9 | return version + '.0' 10 | return version 11 | 12 | with open("README.md", "r") as fh: 13 | long_description = fh.read() 14 | 15 | setup( 16 | name='wikiextractor', 17 | version=get_version(__version__), 18 | author='Giuseppe Attardi', 19 | author_email='attardi@gmail.com', 20 | description='A tool for extracting plain text from Wikipedia dumps', 21 | long_description=long_description, 22 | long_description_content_type="text/markdown", 23 | license='GNU Affero General Public License', 24 | install_requires=[], 25 | url="https://github.com/attardi/wikiextractor", 26 | packages=find_packages(include=["wikiextractor"]), 27 | classifiers=[ 28 | 'Development Status :: 5 - Production/Stable', 29 | 'Intended Audience :: Developers', 30 | 'Topic :: Text Processing :: Linguistic', 31 | 'License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)', 32 | 'Programming Language :: Python :: 3' 33 | ], 34 | entry_points={ 35 | "console_scripts": [ 36 | "wikiextractor = wikiextractor.WikiExtractor:main", 37 | "extractPage = wikiextractor.extractPage:main", 38 | ] 39 | }, 40 | python_requires='>=3.6', 41 | ) 42 | 
# =============================================================================
#  Copyright (c) 2020. Giuseppe Attardi (attardi@di.unipi.it).
# =============================================================================
#  This file is part of Tanl.
#
#  Tanl is free software; you can redistribute it and/or modify it
#  under the terms of the GNU Affero General Public License, version 3,
#  as published by the Free Software Foundation.
#
#  Tanl is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Affero General Public License for more details.
#
#  You should have received a copy of the GNU Affero General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
# =============================================================================

from wikiextractor.extract import Extractor, ignoreTag, resetIgnoredTags


def clean_markup(markup, keep_links=False, ignore_headers=True):
    """
    Clean Wikimarkup to produce plaintext.

    :param markup: the Wikimarkup text to clean
    :param keep_links: Set to True to keep internal and external links
    :param ignore_headers: if set to True, the output list will not contain
    headers (strings starting with '## '), only the remaining paragraphs

    Returns a list of paragraphs (unicode strings).
    """

    if not keep_links:
        ignoreTag('a')

    try:
        extractor = Extractor(0, '', [])

        # returns a list of strings
        paragraphs = extractor.clean_text(markup,
                                          mark_headers=True,
                                          expand_templates=False,
                                          escape_doc=True)
    finally:
        # NOTE(review): ignoreTag() appears to change shared state that
        # resetIgnoredTags() restores; undo it even when cleaning fails so a
        # later call does not inherit the ignored tag -- confirm.
        resetIgnoredTags()

    if ignore_headers:
        # Build a real list: filter() would hand back a lazy iterator in
        # Python 3, contradicting the documented return type.
        paragraphs = [s for s in paragraphs if not s.startswith('## ')]

    return paragraphs
4 | boto3==1.15 5 | # Used for downloading models over HTTP 6 | requests 7 | six 8 | ipdb 9 | #Data processing 10 | h5py 11 | nltk 12 | progressbar 13 | #Others 14 | -------------------------------------------------------------------------------- /CAME/run_came_pretraining.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # num_eval_examples too large will cause 'unable to write to file' https://github.com/pytorch/pytorch/issues/2926 4 | 5 | python -m torch.distributed.launch --nproc_per_node=8 \ 6 | /home/ma-user/work/Old_BERT/run_pretraining.py \ 7 | --seed=12439 \ 8 | --do_train \ 9 | --do_eval \ 10 | --optimizer=came \ 11 | --config_file=/home/ma-user/work/Old_BERT/bert_large_config.json \ 12 | --output_dir=/cache/results \ 13 | --fp16 \ 14 | --allreduce_post_accumulation \ 15 | --allreduce_post_accumulation_fp16 \ 16 | --gradient_accumulation_steps=256 \ 17 | --bert_model=bert-large-uncased \ 18 | --log_freq=1 \ 19 | --train_batch_size=4096 \ 20 | --dev_batch_size=64 \ 21 | --learning_rate=0.00024 \ 22 | --warmup_proportion=0.2 \ 23 | --num_steps_per_checkpoint=5 \ 24 | --input_dir=/cache/data/train_data \ 25 | --dev_dir=/cache/data/dev_data \ 26 | --phase2 \ 27 | --max_seq_length=128 \ 28 | --max_predictions_per_seq=20 \ 29 | --max_steps=20000 \ 30 | --init_checkpoint=None \ 31 | --phase1_end_step=0 32 | 33 | -------------------------------------------------------------------------------- /CAME/run_validation.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.launch --nproc_per_node=8 \ 2 | /home/ma-user/work/Old_BERT/validation.py \ 3 | --seed=12439 \ 4 | --do_train \ 5 | --config_file=/home/ma-user/work/Old_BERT/bert_large_config.json \ 6 | --output_dir=/cache/results \ 7 | --fp16 \ 8 | --optimizer=SM3 \ 9 | --allreduce_post_accumulation \ 10 | --allreduce_post_accumulation_fp16 \ 11 | --gradient_accumulation_steps=256 \ 12 | 
--bert_model=bert-large-uncased \ 13 | --init_checkpoint=/cache/ckpt_9989.pt \ 14 | --log_freq=1 \ 15 | --train_batch_size=4096 \ 16 | --dev_batch_size=64 \ 17 | --learning_rate=0.1 \ 18 | --warmup_proportion=0.1 \ 19 | --num_steps_per_checkpoint=5 \ 20 | --input_dir=/cache/data/train_data \ 21 | --dev_dir=/cache/data/dev_data \ 22 | --phase2 \ 23 | --max_seq_length=128 \ 24 | --max_predictions_per_seq=20 \ 25 | --max_steps=20000 \ 26 | --init_checkpoint=None \ 27 | --phase1_end_step=0 28 | -------------------------------------------------------------------------------- /CAME/scripts/data_download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | DATA_DIR=${1:-/workspace/bert/data} 17 | 18 | # Download vocab files from pretrained model 19 | cd vocab && python3 download_models.py && rm *.zip && rm ./*/*.ckpt.* 20 | 21 | # Download SQUAD 22 | cd $DATA_DIR/squad && . squad_download.sh 23 | 24 | # Download SWAG 25 | git clone https://github.com/rowanz/swagaf.git $DATA_DIR/swag 26 | 27 | # Download GLUE 28 | cd $DATA_DIR/glue && . download_mrpc.sh 29 | 30 | # WIKI Download 31 | cd $DATA_DIR/wikipedia_corpus && . download_wikipedia.sh 32 | 33 | # Bookcorpus Download 34 | cd $DATA_DIR/bookcorpus && . 
download_bookcorpus.sh 35 | 36 | cd $DATA_DIR 37 | # Create HDF5 files for WIKI 38 | bash create_datasets_from_start.sh wikipedia_corpus ./wikipedia_corpus/wikipedia_corpus.txt \ 39 | && rm -r ./wikipedia_corpus/final_* \ 40 | 41 | # Create HDF5 files for Bookcorpus 42 | bash create_datasets_from_start.sh bookcorpus ./bookcorpus/bookcorpus.txt \ 43 | && rm -r ./bookcorpus/final_* \ 44 | 45 | # Create HDF5 files for inter sequence-pair mixed Wikipedia and Bookcorpus 46 | bash merge_datasets_after_creation.sh merged_wiki+books wikipedia_corpus/hdf5_shards,bookcorpus/hdf5_shards 1024 47 | -------------------------------------------------------------------------------- /CAME/scripts/docker/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker build --network=host . --rm --pull --no-cache -t bert 3 | -------------------------------------------------------------------------------- /CAME/scripts/docker/launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CMD=${1:-/bin/bash} 4 | NV_VISIBLE_DEVICES=${2:-"all"} 5 | DOCKER_BRIDGE=${3:-"host"} 6 | 7 | docker run -it --rm \ 8 | --gpus device=$NV_VISIBLE_DEVICES \ 9 | --net=$DOCKER_BRIDGE \ 10 | --shm-size=1g \ 11 | --ulimit memlock=-1 \ 12 | --ulimit stack=67108864 \ 13 | -e LD_LIBRARY_PATH='/workspace/install/lib/' \ 14 | -v $PWD:/workspace/bert \ 15 | -v $PWD/results:/results \ 16 | bert $CMD 17 | -------------------------------------------------------------------------------- /CAME/scripts/run_swag.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | SWAG_DIR=/workspace/bert/data/swag 17 | OUT_DIR=/results/SWAG 18 | 19 | mkdir -p $OUT_DIR 20 | 21 | echo "Container nvidia build = " $NVIDIA_BUILD_ID 22 | 23 | init_checkpoint=${1} 24 | mode=${2:-"train"} 25 | max_steps=${3:-"-1.0"} # if < 0, has no effect 26 | batch_size=${4:-"12"} 27 | learning_rate=${5:-"5e-6"} 28 | precision=${6:-"fp32"} 29 | num_gpu=${7:-"8"} 30 | epochs=${8:-"2"} 31 | 32 | if [ "$mode" != "train" ] ; then 33 | num_gpu=1 34 | fi 35 | 36 | use_fp16="" 37 | if [ "$precision" = "fp16" ] ; then 38 | echo "fp16 activated!" 
39 | use_fp16="--fp16" 40 | fi 41 | 42 | if [ "$num_gpu" = "1" ] ; then 43 | mpi_command="" 44 | else 45 | mpi_command="torch.distributed.launch --nproc_per_node=$num_gpu" 46 | fi 47 | 48 | CMD="python -m $mpi_command run_swag.py " 49 | CMD+="--init_checkpoint=$init_checkpoint " 50 | if [ "$mode" = "train" ] ; then 51 | CMD+="--do_train " 52 | CMD+="--train_batch_size=$batch_size " 53 | else 54 | CMD+="--do_eval " 55 | CMD+="--eval_batch_size=$batch_size " 56 | fi 57 | CMD+="--do_lower_case " 58 | CMD+="--data_dir $SWAG_DIR/data/ " 59 | CMD+="--bert_model bert-large-uncased " 60 | CMD+="--max_seq_length 128 " 61 | CMD+="--learning_rate $learning_rate " 62 | CMD+="--num_train_epochs $epochs " 63 | CMD+="--max_steps $max_steps " 64 | CMD+="--output_dir $OUT_DIR " 65 | CMD+="$use_fp16" 66 | 67 | LOGFILE=$OUT_DIR/logfile 68 | $CMD |& tee $LOGFILE 69 | 70 | sed -r 's/ 71 | |(\[A)/\n/g' $LOGFILE > $LOGFILE.edit 72 | 73 | throughput=`cat $LOGFILE.edit | grep -E 'Iteration.*[0-9.]+(s/it|it/s)' | tail -1 | egrep -o '[0-9.]+(s/it|it/s)'` 74 | 75 | echo "throughput: $throughput" 76 | 77 | -------------------------------------------------------------------------------- /CAME/startup_came.py: -------------------------------------------------------------------------------- 1 | """ 2 | Startup script to run on the cloud 3 | """ 4 | 5 | import moxing 6 | import os 7 | import argparse 8 | import logging 9 | 10 | # install libraries 11 | 12 | os.environ["NUMBA_NUM_THREADS"] = '1' 13 | 14 | os.system('pip install setuptools==59.0.1') 15 | os.system('pip install torchmetrics==0.7.1') 16 | print('Install torch finished...') 17 | 18 | os.system('pip install pyarrow==2.0.0') 19 | os.system('pip install tqdm') 20 | os.system('pip install h5py') 21 | os.system('pip install onnxruntime==1.0.0') 22 | os.system('pip install boto3==1.15.0') 23 | os.system('pip install torch-optimizer==0.0.1a16') 24 | os.system('pip install torch-SM3') 25 | 26 | logging.info("finish install tqdm and h5py") 27 | 
28 | 29 | try: 30 | import torch 31 | print('Import torch success...') 32 | print('torch version: ', torch.__version__) 33 | print('cuda status: ', torch.cuda.is_available()) 34 | import apex 35 | print('Import apex success...') 36 | import amp_C 37 | print('Import amp_C success...') 38 | import apex_C 39 | print('Import apex_C success...') 40 | except Exception as e: 41 | print('Some failure...', e) 42 | 43 | 44 | parser = argparse.ArgumentParser() 45 | parser.add_argument('--data_url', type=str) 46 | parser.add_argument('--train_url', type=str) 47 | parser.add_argument('--batch_size', type=int) 48 | parser.add_argument('--learning_rate', type=float) 49 | parser.add_argument('--max_steps', type=int) 50 | parser.add_argument('--gradient_accumulation_steps', type=int) 51 | 52 | 53 | args, unparsed = parser.parse_known_args() 54 | print(args, unparsed) 55 | 56 | # download data 57 | 58 | moxing.file.copy_parallel('/pretraining_data/train_data','/cache/data/train_data' ) 59 | moxing.file.copy_parallel('/pretraining_data/dev_all_data','/cache/data/dev_data' ) 60 | 61 | # run program 62 | import os 63 | 64 | os.system('bash /home/ma-user/work/BERT/run_came_pretraining.sh') 65 | -------------------------------------------------------------------------------- /CAME/triton/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | export TRITON_MODEL_OVERWRITE=True 17 | NV_VISIBLE_DEVICES=0 18 | 19 | bert_model=${1:-"large"} 20 | precision=${2:-"fp32"} 21 | init_checkpoint=${3:-"/workspace/bert/checkpoints/bert_qa.pt"} 22 | EXPORT_FORMAT=${4:-"ts-script"} 23 | 24 | MODEL_NAME="bert_${bert_model}_${precision}" 25 | BERT_DIR="/workspace/bert" 26 | VOCAB_FILE="/workspace/bert/vocab/vocab" 27 | PREDICT_FILE="/workspace/bert/data/squad/v1.1/dev-v1.1.json" 28 | SQUAD_DIR="/workspace/bert/data/squad/v1.1" 29 | OUT_DIR="/results" 30 | BATCH_SIZE="8" 31 | # Create common bridge for client and server 32 | BRIDGE_NAME="tritonnet" 33 | docker network create ${BRIDGE_NAME} 34 | 35 | EXPORT_MODEL_ARGS="${BATCH_SIZE} ${BERT_DIR} ${EXPORT_FORMAT} ${precision} 1 ${MODEL_NAME} 0 1" 36 | 37 | # Clean up 38 | cleanup() { 39 | docker kill trt_server_cont 40 | docker network rm ${BRIDGE_NAME} 41 | } 42 | trap cleanup EXIT 43 | trap cleanup SIGTERM 44 | 45 | ./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${EXPORT_MODEL_ARGS} ${TRITON_MODEL_OVERWRITE} 46 | 47 | # Start Server 48 | echo Starting server... 
49 | SERVER_ID=$( ./triton/launch_triton_server.sh ${BRIDGE_NAME} --NV_VISIBLE_DEVICES=$NV_VISIBLE_DEVICES ) 50 | SERVER_IP=$( docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' ${SERVER_ID} ) 51 | 52 | ./triton/wait_for_triton_server.sh 53 | 54 | CMD="python triton/run_squad_client.py \ 55 | --model_name ${MODEL_NAME} \ 56 | --do_lower_case \ 57 | --vocab_file ${VOCAB_FILE} \ 58 | --output_dir ${OUT_DIR} \ 59 | --predict_file ${PREDICT_FILE} \ 60 | --batch_size ${BATCH_SIZE}" 61 | 62 | bash scripts/docker/launch.sh "${CMD}" 63 | 64 | bash scripts/docker/launch.sh "python ${SQUAD_DIR}/evaluate-v1.1.py ${PREDICT_FILE} ${OUT_DIR}/predictions.json" 65 | -------------------------------------------------------------------------------- /CAME/triton/export_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | NV_VISIBLE_DEVICES=${1:-"0"} 17 | DOCKER_BRIDGE=${2:-"host"} 18 | checkpoint=${3:-"/workspace/bert/checkpoints/bert_qa.pt"} 19 | batch_size=${4:-"8"} 20 | BERT_DIR=${5:-"/workspace/bert"} 21 | EXPORT_FORMAT=${6:-"ts-script"} 22 | precision=${7:-"fp16"} 23 | triton_model_version=${8:-1} 24 | triton_model_name=${9:-"bertQA-ts-script"} 25 | triton_dyn_batching_delay=${10:-0} 26 | triton_engine_count=${11:-1} 27 | triton_model_overwrite=${12:-"False"} 28 | 29 | PREDICT_FILE="/workspace/bert/data/squad/v1.1/dev-v1.1.json" 30 | 31 | DEPLOYER="deployer.py" 32 | 33 | CMD="python triton/${DEPLOYER} \ 34 | --${EXPORT_FORMAT} \ 35 | --save-dir /results/triton_models \ 36 | --triton-model-name ${triton_model_name} \ 37 | --triton-model-version ${triton_model_version} \ 38 | --triton-max-batch-size ${batch_size} \ 39 | --triton-dyn-batching-delay ${triton_dyn_batching_delay} \ 40 | --triton-engine-count ${triton_engine_count} " 41 | 42 | CMD+="-- --checkpoint ${checkpoint} \ 43 | --config_file ${BERT_DIR}/bert_config.json \ 44 | --vocab_file /workspace/bert/vocab/vocab \ 45 | --predict_file ${PREDICT_FILE} \ 46 | --do_lower_case \ 47 | --batch_size=${batch_size} " 48 | 49 | if [[ $precision == "fp16" ]]; then 50 | CMD+="--fp16 " 51 | fi 52 | 53 | bash scripts/docker/launch.sh "${CMD}" ${NV_VISIBLE_DEVICES} ${DOCKER_BRIDGE} 54 | -------------------------------------------------------------------------------- /CAME/triton/launch_triton_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | DOCKER_BRIDGE=${1:-"bridge"} 17 | NV_VISIBLE_DEVICES=${NV_VISIBLE_DEVICES:-"0"} 18 | 19 | # Start TRITON server in detached state 20 | docker run -d --rm \ 21 | --gpus device=${NV_VISIBLE_DEVICES} \ 22 | --shm-size=1g \ 23 | --ulimit memlock=-1 \ 24 | --ulimit stack=67108864 \ 25 | --network=${DOCKER_BRIDGE} \ 26 | -p 8000:8000 \ 27 | -p 8001:8001 \ 28 | -p 8002:8002 \ 29 | --name trt_server_cont \ 30 | -v $PWD/results/triton_models:/models \ 31 | nvcr.io/nvidia/tritonserver:20.06-v1-py3 trtserver --model-store=/models --log-verbose=1 32 | -------------------------------------------------------------------------------- /CAME/triton/profiling_data_int64/input__0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/CAME/triton/profiling_data_int64/input__0 -------------------------------------------------------------------------------- /CAME/triton/run_perf_client.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | MODEL_NAME=${1:-"bert"} 17 | MODEL_VERSION=${2:-1} 18 | precision=${3:-"fp32"} 19 | BATCH_SIZE=${4:-1} 20 | MAX_LATENCY=${5:-500} 21 | MAX_CLIENT_THREADS=${6:-10} 22 | MAX_CONCURRENCY=${7:-50} 23 | SERVER_HOSTNAME=${8:-"localhost"} 24 | DOCKER_BRIDGE=${9:-"host"} 25 | RESULTS_ID=${10:-""} 26 | PROFILING_DATA=${11:-"triton/profiling_data_int64"} 27 | NV_VISIBLE_DEVICES=${12:-"0"} 28 | 29 | if [[ $SERVER_HOSTNAME == *":"* ]]; then 30 | echo "ERROR! Do not include the port when passing the Server Hostname. These scripts require that the TRITON HTTP endpoint is on Port 8000 and the gRPC endpoint is on Port 8001. Exiting..." 31 | exit 1 32 | fi 33 | 34 | if [ "$SERVER_HOSTNAME" = "localhost" ] 35 | then 36 | if [ ! "$(docker inspect -f "{{.State.Running}}" trt_server_cont)" = "true" ] ; then 37 | 38 | echo "Launching TRITON server" 39 | bash triton/launch_triton_server.sh ${DOCKER_BRIDGE} --NV_VISIBLE_DEVICES=$NV_VISIBLE_DEVICES 40 | SERVER_LAUNCHED=true 41 | 42 | function cleanup_server { 43 | docker kill trt_server_cont 44 | } 45 | 46 | # Ensure we cleanup the server on exit 47 | # trap "exit" INT TERM 48 | trap cleanup_server EXIT 49 | fi 50 | fi 51 | 52 | # Wait until server is up. curl on the health of the server and sleep until its ready 53 | bash triton/wait_for_triton_server.sh $SERVER_HOSTNAME 54 | 55 | TIMESTAMP=$(date "+%y%m%d_%H%M") 56 | 57 | # Create model directory on host (directory /results is mounted) 58 | bash scripts/docker/launch.sh "mkdir -p /results/perf_client/${MODEL_NAME}" 59 | if [ ! 
-z "${RESULTS_ID}" ]; 60 | then 61 | RESULTS_ID="_${RESULTS_ID}" 62 | fi 63 | 64 | OUTPUT_FILE_CSV="/results/perf_client/${MODEL_NAME}/results${RESULTS_ID}_${TIMESTAMP}.csv" 65 | 66 | ARGS="\ 67 | --max-threads ${MAX_CLIENT_THREADS} \ 68 | -m ${MODEL_NAME} \ 69 | -x ${MODEL_VERSION} \ 70 | -p 3000 \ 71 | -d \ 72 | -v \ 73 | -i gRPC \ 74 | -u ${SERVER_HOSTNAME}:8001 \ 75 | -b ${BATCH_SIZE} \ 76 | -l ${MAX_LATENCY} \ 77 | -c ${MAX_CONCURRENCY} \ 78 | -f ${OUTPUT_FILE_CSV} \ 79 | --input-data ${PROFILING_DATA}" 80 | 81 | echo "Using args: $(echo "$ARGS" | sed -e 's/ -/\n-/g')" 82 | bash scripts/docker/launch.sh "/workspace/install/bin/perf_client $ARGS" all $DOCKER_BRIDGE 83 | -------------------------------------------------------------------------------- /CAME/triton/wait_for_triton_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | SERVER_URI=${1:-"localhost"} 17 | 18 | echo "Waiting for TRITON Server to be ready at http://$SERVER_URI:8000..." 
# Health probes. Each command prints only the HTTP status code of the response
# (-w %{http_code}), discards the body (-o /dev/null) and gives up after one
# second (-m 1). They are stored as strings and executed later via $(...).
live_command="curl -i -m 1 -L -s -o /dev/null -w %{http_code} http://$SERVER_URI:8000/api/health/live"
ready_command="curl -i -m 1 -L -s -o /dev/null -w %{http_code} http://$SERVER_URI:8000/api/health/ready"

current_status=$($live_command)
echo $current_status

# Keep polling until the liveness probe returns 200 AND the readiness probe
# returns 200. The readiness probe is only issued once liveness has passed
# (short-circuit `||`). NOTE: there is no overall timeout, so this loops
# forever if the server never becomes ready.
while [[ ${current_status} != "200" ]] || [[ $($ready_command) != "200" ]]; do

   # Progress indicator: one dot per one-second retry.
   printf "."
   sleep 1
   current_status=$($live_command)
done

echo "TRITON Server is ready!"
def _distributed_active():
    """Return True when torch.distributed is available and a process group is initialized."""
    return dist.is_available() and dist.is_initialized()


def get_rank():
    """Return the rank of this process, or 0 when no process group is active."""
    return dist.get_rank() if _distributed_active() else 0


def get_world_size():
    """Return the number of processes in the group, or 1 when no process group is active."""
    return dist.get_world_size() if _distributed_active() else 1


def is_main_process():
    """Return True for the rank-0 process (and for any non-distributed run)."""
    return get_rank() == 0


def barrier():
    """Synchronize all processes; a no-op when no process group is active."""
    if not _distributed_active():
        return
    dist.barrier()


def format_step(step):
    """Render a step descriptor as a human-readable label.

    A string is returned unchanged. Otherwise *step* is treated as a sized,
    indexable sequence whose first three entries are labelled, in order, as
    training epoch, training iteration and validation iteration; entries past
    the third are ignored.
    """
    if isinstance(step, str):
        return step
    templates = (
        "Training Epoch: {} ",
        "Training Iteration: {} ",
        "Validation Iteration: {} ",
    )
    shown = min(len(step), len(templates))
    return "".join(templates[i].format(step[i]) for i in range(shown))


def mkdir(path):
    """Create *path* with any missing parents; an existing directory is not an error."""
    target = Path(path)
    target.mkdir(parents=True, exist_ok=True)


def mkdir_by_main_process(path):
    """Create *path* on the main process only, then make every process wait at a barrier."""
    if is_main_process():
        mkdir(path)
    barrier()
class AverageMeter(object):
    """Track the latest value and the weighted running average of a quantity."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all accumulated statistics."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record *val* as observed *n* times.

        Args:
            val: the new observation (also stored as ``self.val``).
            n: weight of the observation, typically a batch size.
        """
        self.val = val
        self.sum += val * n
        self.count += n
        # A zero-weight update on an empty meter used to divide by zero;
        # in that case the average simply keeps its previous value.
        if self.count > 0:
            self.avg = self.sum / self.count


class TimeMeter(object):
    """Compute the average number of events per second since the last reset."""

    def __init__(self, init=0):
        self.reset(init)

    def reset(self, init=0):
        """Restart the clock; *init* is a number of seconds counted as already elapsed."""
        self.init = init
        self.start = time.time()
        self.n = 0

    def update(self, val=1):
        """Count *val* additional events."""
        self.n += val

    @property
    def avg(self):
        """Events per second over the elapsed time."""
        return self.n / self.elapsed_time

    @property
    def elapsed_time(self):
        """Seconds elapsed since reset, plus the initial offset."""
        return self.init + (time.time() - self.start)


class StopwatchMeter(object):
    """Accumulate the total and average duration, in seconds, of a repeated event."""

    def __init__(self):
        self.reset()

    def start(self):
        """Mark the beginning of a timed interval."""
        self.start_time = time.time()

    def stop(self, n=1):
        """Close the current interval and credit it to *n* events.

        Calling ``stop`` without a preceding ``start`` is a deliberate no-op.
        """
        if self.start_time is not None:
            delta = time.time() - self.start_time
            self.sum += delta
            self.n += n
            self.start_time = None

    def reset(self):
        """Clear the accumulated time and event count."""
        self.sum = 0
        self.n = 0
        self.start_time = None

    @property
    def avg(self):
        """Average seconds per event; equals the (zero) sum when nothing was recorded."""
        # Guard against ZeroDivisionError when avg is read (e.g. for logging)
        # before any interval has been stopped.
        return self.sum / self.n if self.n > 0 else self.sum
# Maps the name given to @register_strategy to the registered strategy class.
STRATEGY_REGISTRY = {}
# Class names already registered; used to reject two strategies sharing a class name.
STRATEGY_CLASS_NAMES = set()


def setup_strategy(args):
    """Instantiate the strategy registered under ``args.decoding_strategy``.

    The registered class is called with ``args`` as its only argument.
    Raises KeyError when no strategy is registered under that name.
    """
    return STRATEGY_REGISTRY[args.decoding_strategy](args)


def register_strategy(name):
    """Return a class decorator that registers a DecodingStrategy subclass as *name*.

    The decorator returns the class unchanged. It raises ValueError when the
    name is already taken, when the class does not extend DecodingStrategy,
    or when another registered class has the same ``__name__``.
    """
    def register_strategy_cls(cls):
        if name in STRATEGY_REGISTRY:
            raise ValueError('Cannot register duplicate strategy ({})'.format(name))
        if not issubclass(cls, DecodingStrategy):
            raise ValueError('Strategy ({}: {}) must extend DecodingStrategy'.format(name, cls.__name__))
        if cls.__name__ in STRATEGY_CLASS_NAMES:
            raise ValueError('Cannot register strategy with duplicate class name ({})'.format(cls.__name__))
        STRATEGY_REGISTRY[name] = cls
        STRATEGY_CLASS_NAMES.add(cls.__name__)
        return cls

    return register_strategy_cls


# automatically import any Python files in the strategies/ directory
# Importing each module runs its @register_strategy decorators, which is what
# fills the registry; files starting with "_" (such as this one) are skipped.
# NOTE(review): the package path is hard-coded, so this presumably requires the
# package to be importable as `CeMAT_maskPredict` - confirm before renaming it.
for file in os.listdir(os.path.dirname(__file__)):
    if file.endswith('.py') and not file.startswith('_'):
        # Module name is the file name up to the first ".py".
        strategy_name = file[:file.find('.py')]
        importlib.import_module('CeMAT_maskPredict.strategies.' + strategy_name)
class DecodingStrategy(object):
    """Base class that every registered decoding strategy must extend.

    Subclasses override :meth:`generate`; this base implementation does nothing.
    """

    def generate(self, model, encoder_out, tgt_tokens, tgt_dict):
        """Decode target tokens; the base implementation is a stub returning None.

        The method previously lacked ``self``, so calling it on an instance with
        the same four arguments that subclasses accept raised TypeError. The
        signature now matches the subclass overrides.

        Args:
            model: the model used for decoding.
            encoder_out: encoder output passed on to the decoder.
            tgt_tokens: initial target tokens.
            tgt_dict: target dictionary.
        """
        pass
def generate_candidates(self, decoder_out, tokens, mask, position):
    """Return log-probabilities of the tokens that may be filled in at this step.

    A softmax is taken over the last dimension of ``decoder_out[0]``. Every
    entry is then zeroed unless it belongs to a position that (a) currently
    holds the ``mask`` token and (b) lies at or before ``position``; the
    ``mask`` token itself is never a candidate. Zeroed entries become ``-inf``
    after the final log.
    """
    probs = F.softmax(decoder_out[0], dim=-1)
    # 1.0 where the position is still masked, 0.0 elsewhere; broadcast over the vocabulary.
    still_masked = tokens.eq(mask).float().unsqueeze(-1)
    probs = probs * still_masked
    # Left-to-right decoding: nothing to the right of the current position may be filled yet.
    probs[:, position + 1:, :] = 0
    # Predicting the mask token again is not allowed.
    probs[:, :, mask] = 0
    return probs.log()
def duplicate_encoder_out(encoder_out, bsz, beam_size):
    """Repeat every batch entry of the encoder output ``beam_size`` times, in place.

    ``encoder_out['encoder_out']`` is expanded along its second dimension to
    ``bsz * beam_size`` entries; a non-None ``encoder_out['encoder_padding_mask']``
    is expanded along its first dimension. Copies of the same batch entry are
    adjacent. The dict is modified and nothing is returned.
    """
    states = encoder_out['encoder_out']
    hidden = states.size(-1)
    encoder_out['encoder_out'] = (
        states.unsqueeze(2)
        .repeat(1, 1, beam_size, 1)
        .view(-1, bsz * beam_size, hidden)
    )
    padding = encoder_out['encoder_padding_mask']
    if padding is None:
        return
    encoder_out['encoder_padding_mask'] = (
        padding.unsqueeze(1)
        .repeat(1, beam_size, 1)
        .view(bsz * beam_size, -1)
    )


def generate_step_with_prob(out):
    """Return ``(argmax indices, max probabilities, full softmax)`` for ``out[0]``."""
    distribution = F.softmax(out[0], dim=-1)
    best = distribution.max(dim=-1)
    return best[1], best[0], distribution


def assign_single_value_byte(x, i, y):
    """Set ``x`` to the value ``y`` wherever the same-shaped mask ``i`` is non-zero (in place)."""
    selected = i.view(-1).nonzero()
    x.view(-1)[selected] = y


def assign_multi_value_byte(x, i, y):
    """Copy ``y`` into ``x`` wherever the same-shaped mask ``i`` is non-zero (in place)."""
    selected = i.view(-1).nonzero()
    x.view(-1)[selected] = y.view(-1)[selected]


def assign_single_value_long(x, i, y):
    """Set ``x[row, i[row, k]] = y`` for every row and every index column ``k`` (in place)."""
    rows, width = x.size()
    row_offsets = torch.arange(0, rows * width, width, device=i.device).unsqueeze(1)
    flat = (i + row_offsets).view(-1)
    x.view(-1)[flat] = y


def assign_multi_value_long(x, i, y):
    """Copy ``y[row, i[row, k]]`` into ``x[row, i[row, k]]`` for every row and ``k`` (in place)."""
    rows, width = x.size()
    row_offsets = torch.arange(0, rows * width, width, device=i.device).unsqueeze(1)
    flat = (i + row_offsets).view(-1)
    x.view(-1)[flat] = y.view(-1)[flat]


def convert_tokens(dictionary, tokens):
    """Look up each token in ``dictionary`` and join the results with single spaces."""
    return ' '.join(dictionary[tok] for tok in tokens)
# Generate translations with generate_cmlm.py (mask_predict decoding) and score them
# with sacreBLEU.
#
# Fill in the empty variables before running:
#   DATA_PATH  data directory passed to generate_cmlm.py
#   SRC / TGT  source / target language codes
#   PRETRAIN   checkpoint passed to --path
# NOTE: SAVE_DIR is declared but not referenced in this script.
DATA_PATH=
task=translation_self_from_pt
SRC=
TGT=
langs='ar-en,be-en,bg-en,de-en,el-en,en-af,en-cs,en-es,en-fr,en-gu,en-he,en-ja,en-kk,en-lt,en-mt,en-ro,en-ru,en-tr,en-zh,eo-en,et-en,fi-en,hi-en,it-en,ka-en,ko-en,lv-en,mn-en,ms-en,my-en,sr-en,vi-en'
SAVE_DIR=
PRETRAIN=

# Decode the test subset; the full log is shown on screen and saved to infer.txt.
python generate_cmlm.py ${DATA_PATH} --fp16 \
    --user-dir CeMAT_maskPredict \
    --path ${PRETRAIN} \
    --task ${task} --decoding-strategy mask_predict \
    --source-lang ${SRC} --target-lang ${TGT} --langs ${langs} --add-lang-token --share-dict \
    --gen-subset test \
    --max-sentences 20 --decoding-iterations 10 --remove-bpe | tee infer.txt

# Hypotheses: take the lines starting with "H", strip the "H-" prefix, sort by the
# remaining leading id (version sort), keep the second tab-separated field and
# remove the "[TGT]" language token (with or without a trailing space).
grep ^H infer.txt \
| sed 's/^H\-//' \
| sort -V \
| cut -f 2 \
| sed 's/\['$TGT'\] //g' \
| sed 's/\['$TGT'\]//g' \
> infer.sys

# References: the same processing applied to the lines starting with "T-".
grep ^T- infer.txt \
| sed 's/^T\-//' \
| sort -V \
| cut -f 2 \
| sed 's/\['$TGT'\] //g' \
| sed 's/\['$TGT'\]//g' \
> infer.ref

# Score infer.sys against infer.ref without re-tokenizing, reporting 2 decimals.
sacrebleu --tokenize 'none' -w 2 infer.ref < infer.sys
class CematDataset(FairseqDataset):
    """A dataset that provides helpers for batching."""

    def __init__(self):
        super(CematDataset, self).__init__()

    def filter_indices_by_size(self, indices, max_sizes):
        """
        Filter a list of sample indices. Remove those that are longer than
        specified in *max_sizes*.

        Only the first entry of a list/tuple *max_sizes* is used, and it is
        compared against the first size column of each sample.

        WARNING: don't update, override method in child classes

        Args:
            indices (np.array): original array of sample indices
            max_sizes (int or list[int] or tuple[int]): max sample size,
                can be defined separately for src and tgt (then list or tuple)

        Returns:
            np.array: filtered sample array
            list: list of removed indices
        """
        # A plain number is a documented input, but indexing it with [0]
        # raised TypeError. Only sequences are reduced to their first entry.
        if not isinstance(max_sizes, (float, int)):
            max_sizes = max_sizes[0]

        if isinstance(max_sizes, (float, int)):
            if hasattr(self, "sizes") and isinstance(self.sizes, np.ndarray):
                # Fast path: vectorized comparison on the first size column.
                lengths = self.sizes[:, 0][indices]
                ignored = indices[lengths > max_sizes].tolist()
                indices = indices[lengths <= max_sizes]
            elif (
                hasattr(self, "sizes")
                and isinstance(self.sizes, list)
                and len(self.sizes) == 1
            ):
                lengths = self.sizes[0][indices]
                ignored = indices[lengths > max_sizes].tolist()
                indices = indices[lengths <= max_sizes]
            else:
                # No precomputed size array: ask the dataset for each sample's size.
                indices, ignored = data_utils._filter_by_size_dynamic(
                    indices, self.size, max_sizes
                )
        else:
            indices, ignored = data_utils._filter_by_size_dynamic(
                indices, self.size, max_sizes
            )
        return indices, ignored
class CematEncoder(FairseqEncoder):
    """Encoder base class that can also turn its own output into (log-)probabilities."""

    def __init__(self, dictionary):
        super().__init__(dictionary)
        self.dictionary = dictionary
        self.onnx_trace = False

    def get_normalized_probs(
        self,
        net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
        log_probs: bool,
        sample: Optional[Dict[str, Tensor]] = None,
    ):
        """Normalize the first element of *net_output* over its last dimension.

        When the instance has a non-None ``adaptive_softmax`` attribute, that
        module computes the log-probabilities, using ``sample["source"]`` as
        the target if a sample is given. Otherwise a plain (log-)softmax is
        applied. Returns log-probabilities when *log_probs* is true, else
        probabilities.
        """
        logits = net_output[0]

        if hasattr(self, "adaptive_softmax") and self.adaptive_softmax is not None:
            if sample is None:
                source = None
            else:
                assert "source" in sample
                source = sample["source"]
            log_p = self.adaptive_softmax.get_log_prob(logits, target=source)
            if log_probs:
                return log_p
            # In-place exponentiation turns log-probabilities into probabilities.
            return log_p.exp_()

        if not log_probs:
            return utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace)
        return utils.log_softmax(logits, dim=-1, onnx_trace=self.onnx_trace)
# Generate translations with fairseq-generate (beam search) and score them with sacreBLEU.
#
# Fill in the empty variables before running:
#   DATA_PATH  data directory passed to fairseq-generate
#   SRC / TGT  source / target language codes
#   PRETRAIN   checkpoint passed to --path
# NOTE: SAVE_DIR is declared but not referenced in this script.
DATA_PATH=
task=translation_from_pretrained_cemat
SRC=
TGT=
langs='ar-en,be-en,bg-en,de-en,el-en,en-af,en-cs,en-es,en-fr,en-gu,en-he,en-ja,en-kk,en-lt,en-mt,en-ro,en-ru,en-tr,en-zh,eo-en,et-en,fi-en,hi-en,it-en,ka-en,ko-en,lv-en,mn-en,ms-en,my-en,sr-en,vi-en'
SAVE_DIR=
PRETRAIN=

# Decode; the full log is shown on screen and saved to infer.txt.
fairseq-generate ${DATA_PATH} --fp16 \
    --user-dir CeMAT_plugins \
    --path ${PRETRAIN} \
    --task ${task} \
    -s ${SRC} -t ${TGT} --langs ${langs} --add-lang-token --share-dict \
    --batch-size 128 --beam 5 --sacrebleu --remove-bpe | tee infer.txt

# Hypotheses: lines starting with "H", "H-" prefix stripped, sorted by the remaining
# leading id (version sort); the third tab-separated field is kept (presumably the
# text that follows the id and score columns - confirm against the fairseq-generate
# output format) and the "[TGT] " language token is removed.
grep ^H infer.txt \
| sed 's/^H\-//' \
| sort -V \
| cut -f 3 \
| sed 's/\['$TGT'\] //g' \
> infer.sys

# References: lines starting with "T"; here the second tab-separated field is kept.
grep ^T infer.txt \
| sed 's/^T\-//' \
| sort -V \
| cut -f 2 \
| sed 's/\['$TGT'\] //g' \
> infer.ref


# Score infer.sys against infer.ref without re-tokenizing, reporting 2 decimals.
sacrebleu --tokenize 'none' -w 2 infer.ref < infer.sys
# Training-loop settings for CeMAT pre-training.
#   freq          value passed to --update-freq
#   patience      value passed to --patience
#   valid_subset  value passed to --valid-subset
#   SAVE_DIR      checkpoint directory passed to --save-dir (fill in before running)
freq=8
patience=10
valid_subset='valid'
SAVE_DIR=

# Launch pre-training. DATA_PATH, task, langs, word_trans and ARCH are expected to
# have been set earlier in this script.
# NOTE: the command ends with a trailing line-continuation backslash; keep the
# blank line after it so that no later line is swallowed into the command.
fairseq-train ${DATA_PATH} --fp16 \
    --user-dir CeMAT_plugins \
    --encoder-normalize-before --decoder-normalize-before --layernorm-embedding \
    --task ${task} \
    --langs ${langs} --add-lang-token --share-dict --shuffle-lang-pair --multilang-sampling-alpha 0.7 \
    --trans-dict ${word_trans} \
    --arch ${ARCH} --bi_self_att --plus-encoder-loss --encoder-loss-lambda 0.3 \
    --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
    --criterion label_smoothed_cross_entropy_with_maskdecode --label-smoothing 0.1 \
    --lr 0.0005 --lr-scheduler polynomial_decay --warmup-updates 10000 --total-num-update 1200000 \
    --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.0001 \
    --max-tokens 4096 --update-freq ${freq} --seed 222 \
    --log-format simple --skip-invalid-size-inputs-valid-test \
    --ddp-backend c10d \
    --keep-interval-updates 20 --log-interval 10 \
    --validate-interval 1 \
    --save-dir ${SAVE_DIR} --num-workers 4 --patience ${patience} \
    --valid-subset ${valid_subset} \

4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /CeMAT/cemat_scripts/create_trans/example_extract_alignedpairs.sh: -------------------------------------------------------------------------------- 1 | # vocab file path. 
# Tokenize, BPE-encode, clean and binarize a parallel corpus for NMT fine-tuning.
#
# Fill in the empty variables before running:
#   SRC / TGT  source / target language codes
#   DATA       directory holding {train,valid,test}.<lang>
#   OUTPATH    directory for the tokenized and BPE-encoded text
#   DEST       directory for the binarized fairseq dataset
#   MODEL      directory holding the BPE `codes` file and the `vocab` dictionary
SRC=
TGT=
DATA=
OUTPATH=
DEST=
# Vocabulary / model path.
MODEL=
mkdir -p ${OUTPATH}
mkdir -p ${DEST}

N_THREADS=8
# Only needed for Chinese (zh) tokenization.
pip install jieba

# NOTE: FASTBPE_DIR and FASTBPE are declared but not referenced in this script.
FASTBPE_DIR=
FASTBPE=
# Directory containing apply_bpe.py.
BPEROOT=

# Moses decoder path
MOSES=
REPLACE_UNICODE_PUNCT=$MOSES/scripts/tokenizer/replace-unicode-punctuation.perl
NORM_PUNC=$MOSES/scripts/tokenizer/normalize-punctuation.perl
REM_NON_PRINT_CHAR=$MOSES/scripts/tokenizer/remove-non-printing-char.perl
TOKENIZER=$MOSES/scripts/tokenizer/tokenizer.perl
INPUT_FROM_SGM=$MOSES/scripts/ems/support/input-from-sgm.perl
CLEAN=$MOSES/scripts/training/clean-corpus-n.perl
NORMALIZE_ROMANIAN=$MOSES/scripts/tokenizer/ro/normalise-romanian.py
REMOVE_DIACRITICS=$MOSES/scripts/tokenizer/ro/remove-diacritics.py
JA_SCRIPT=$MOSES/scripts/tokenizer/ja/kytea.py
JA_MODEL=$MOSES/scripts/tokenizer/ja/ja-0.4.7-1.mod

# BPE / vocab files
BPE_CODES=$MODEL/codes
FULL_VOCAB=$MODEL/vocab

for split in "train" "valid" "test";
do
    for lang in $SRC $TGT;
    do
        Data_TRAIN=$DATA/$split.$lang
        Data_TRAIN_TOK=$OUTPATH/$split.tok.$lang
        Data_TRAIN_BPE=$OUTPATH/$split.spm.$lang
        echo $Data_TRAIN "TOKENIZER:====>>" $Data_TRAIN_TOK
        if [ "$lang" == "ro" ]; then
            # The two Romanian helpers are Python scripts (.py); they were
            # previously run with `perl`, which cannot execute them.
            cat $Data_TRAIN | perl $NORM_PUNC -l $lang | perl $REM_NON_PRINT_CHAR | python $NORMALIZE_ROMANIAN | python $REMOVE_DIACRITICS | perl $TOKENIZER -l $lang -a -threads $N_THREADS > $Data_TRAIN_TOK
        elif [ "$lang" == "ja" ]; then
            cat $Data_TRAIN | perl $NORM_PUNC -l $lang | perl $REM_NON_PRINT_CHAR | python ${JA_SCRIPT} -m ${JA_MODEL} > $Data_TRAIN_TOK
        elif [ "$lang" == "zh" ]; then
            cat $Data_TRAIN | perl $NORM_PUNC -l $lang | perl $REM_NON_PRINT_CHAR | python -m jieba -d > $Data_TRAIN_TOK
        else
            cat $Data_TRAIN | perl $NORM_PUNC -l $lang | perl $REM_NON_PRINT_CHAR | perl $TOKENIZER -l $lang -a -threads $N_THREADS > $Data_TRAIN_TOK
        fi

        echo $Data_TRAIN_TOK "====>>" $Data_TRAIN_BPE
        python $BPEROOT/apply_bpe.py -c $BPE_CODES < $Data_TRAIN_TOK > $Data_TRAIN_BPE
    done
    # Clean only the training split, once both language sides have been written:
    # drop pairs whose length ratio exceeds 1.5 or whose length is outside 1..250.
    if [ "$split" == "train" ]; then
        echo "clean by ratio."
        perl $CLEAN -ratio 1.5 $OUTPATH/$split.spm $SRC $TGT $OUTPATH/$split.spm.clean 1 250
    fi
done

# Binarize the dataset
fairseq-preprocess \
    --source-lang ${SRC} \
    --target-lang ${TGT} \
    --trainpref ${OUTPATH}/train.spm.clean \
    --validpref ${OUTPATH}/valid.spm \
    --testpref ${OUTPATH}/test.spm \
    --destdir ${DEST}/ \
    --thresholdtgt 0 \
    --thresholdsrc 0 \
    --srcdict $FULL_VOCAB \
    --tgtdict $FULL_VOCAB \
    --workers 70
4 | boto3 5 | # Used for downloading models over HTTP 6 | requests 7 | 8 | torch==1.0.0 9 | python==3.6 10 | seaborn -------------------------------------------------------------------------------- /DynaBERT/transformers/configuration_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """ RoBERTa configuration """ 17 | 18 | from __future__ import (absolute_import, division, print_function, 19 | unicode_literals) 20 | 21 | import logging 22 | 23 | from .configuration_bert import BertConfig 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 28 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", 29 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", 30 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", 31 | } 32 | 33 | 34 | class RobertaConfig(BertConfig): 35 | pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 36 | -------------------------------------------------------------------------------- /DynaBERT/transformers/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .processors import InputExample, InputFeatures, DataProcessor 2 | from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features 3 | 4 | from .metrics import is_sklearn_available 5 | if is_sklearn_available(): 6 | from .metrics import glue_compute_metrics 7 | -------------------------------------------------------------------------------- /DynaBERT/transformers/data/processors/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import InputExample, InputFeatures, DataProcessor 2 | from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features 3 | 4 | -------------------------------------------------------------------------------- /HyperText/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | Copyright (c) 2021 Huawei Technologies Co., Ltd. 
3 | 4 | Permission is hereby granted, free of charge, to any person obtaining 5 | a copy of this software and associated documentation files (the 6 | "Software"), to deal in the Software without restriction, including 7 | without limitation the rights to use, copy, modify, merge, publish, 8 | distribute, sublicense, and/or sell copies of the Software, and to 9 | permit persons to whom the Software is furnished to do so, subject to 10 | the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be 13 | included in all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 19 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 20 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 21 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /HyperText/README.md: -------------------------------------------------------------------------------- 1 | # HyperText 2 | Natural language data exhibit tree-like hierarchical structures such as the hypernym-hyponym relations in WordNet. Considering that hyperbolic space is naturally suitable for modeling tree-like hierarchical data, we propose a new model named HyperText for efficient text classification by endowing FastText with hyperbolic geometry. Empirically, we show that HyperText outperforms FastText on a range of text classification tasks with much reduced parameters.
3 | 4 | ![avatar](./hypertext_model_architecture.png) 5 | 6 | For more details about the techniques of HyperText, please refer to our paper: 7 | 8 | [HyperText: Endowing FastText with Hyperbolic Geometry](https://arxiv.org/abs/2010.16143 "HyperText: Endowing FastText with Hyperbolic Geometry") 9 | 10 | # Release Notes 11 | 12 | First version: 2021.01.16 13 | 14 | # Installation 15 | Run the command below to install the environment (using python3): 16 | ```python 17 | 18 | pip install -r requirements.txt 19 | 20 | ``` 21 | 22 | # Train and Evaluation 23 | * Data Preprocessing 24 | Please refer to our paper for details. 25 | 26 | * Train & Evaluation 27 | 28 | ```python 29 | 30 | python main.py --datasetdir $data_path --outputdir $output_path --dropout $dropout --require_improvement $early_stopping_steps --num_epochs $max_epoch --batch_size $batch_size --max_length $max_sequence_length --learning_rate $learning_rate --embed_dim $embedding_dimension --bucket $hash_bucket_size --wordNgrams $word_ngram --eval_per_batchs $evaluation_frequency --min_freq $minimum_word_frequency --lr_decay_rate $learning_rate_decay 31 | 32 | ``` 33 | 34 | # Examples 35 | 36 | * TNEWS Dataset 37 | 38 | 39 | ```python 40 | 41 | python main.py --datasetdir ./data/tnews_public --outputdir ./output --dropout 0.0 --require_improvement 6000 --num_epochs 50 --batch_size 32 --max_length 40 --learning_rate 1.1e-2 --embed_dim 200 --bucket 1500000 --wordNgrams 2 --eval_per_batchs 100 --min_freq 1 --lr_decay_rate 0.96 42 | 43 | ``` 44 | 45 | * IFLYTEK Dataset 46 | 47 | ```python 48 | 49 | python main.py --datasetdir ./data/iflytek_public --outputdir ./output --dropout 0.0 --require_improvement 2500 --num_epochs 50 --batch_size 32 --max_length 1000 --learning_rate 1.3e-2 --embed_dim 100 --bucket 2000000 --wordNgrams 2 --eval_per_batchs 50 --min_freq 1 --lr_decay_rate 0.94 50 | ``` 51 | 52 | -------------------------------------------------------------------------------- /HyperText/__init__.py:
-------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | #The MIT License (MIT) 3 | #Copyright (c) 2021 Huawei Technologies Co., Ltd. 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining 6 | # a copy of this software and associated documentation files (the 7 | # "Software"), to deal in the Software without restriction, including 8 | # without limitation the rights to use, copy, modify, merge, publish, 9 | # distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so, subject to 11 | # the following conditions: 12 | 13 | # The above copyright notice and this permission notice shall be 14 | # included in all copies or substantial portions of the Software. 15 | 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | -------------------------------------------------------------------------------- /HyperText/hyperbolic/__init__.py: -------------------------------------------------------------------------------- 1 | from .poincare import PoincareBall 2 | from .math_utils import * 3 | from .mobius_linear import * -------------------------------------------------------------------------------- /HyperText/hyperbolic/math_utils.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | #The MIT License (MIT) 3 | #Copyright (c) 2021 Huawei Technologies Co., Ltd. 
4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining 6 | # a copy of this software and associated documentation files (the 7 | # "Software"), to deal in the Software without restriction, including 8 | # without limitation the rights to use, copy, modify, merge, publish, 9 | # distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so, subject to 11 | # the following conditions: 12 | 13 | # The above copyright notice and this permission notice shall be 14 | # included in all copies or substantial portions of the Software. 15 | 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
"""Hyperbolic Math utils functions."""

import torch

# Requested clamp margin for artanh. 1 - 1e-15 is only representable in
# float64; for lower-precision dtypes Artanh.forward widens the margin to the
# dtype's own machine epsilon so the clamp actually takes effect.
eps = 1e-15


def artanh(x):
    """Return the element-wise inverse hyperbolic tangent of ``x``.

    Functional wrapper that applies :class:`Artanh`.

    Args:
        x: tensor of values; entries are clamped into the open interval
            (-1, 1) before the function is evaluated.

    Returns:
        Tensor with the same dtype as ``x``.
    """
    return Artanh.apply(x)


class Artanh(torch.autograd.Function):
    """Inverse hyperbolic tangent with a hand-written backward pass.

    The forward value ``0.5 * (log(1 + x) - log(1 - x))`` is evaluated in
    float64 and cast back to the input dtype.
    """

    @staticmethod
    def forward(ctx, x):
        """Clamp ``x`` into (-1, 1), save it, and return ``artanh(x)``."""
        # With a fixed margin of 1e-15, ``1 - eps`` rounds to exactly 1.0 in
        # float32/float16, so the clamp did nothing and inputs of +/-1
        # produced +/-inf here and a division by zero in backward(). Use the
        # larger of ``eps`` and the dtype's machine epsilon instead; for
        # float64 this is still ``eps``.
        if x.is_floating_point():
            margin = max(eps, torch.finfo(x.dtype).eps)
        else:
            margin = eps
        x = x.clamp(-1 + margin, 1 - margin)
        ctx.save_for_backward(x)
        x64 = x.double()
        out = (torch.log(1 + x64).sub(torch.log(1 - x64))).mul(0.5)
        return out.to(x.dtype)

    @staticmethod
    def backward(ctx, grad_output):
        """Return ``grad_output / (1 - x**2)`` using the clamped input."""
        x, = ctx.saved_tensors
        return grad_output / (1 - x ** 2)

# [dump text sharing this physical line, kept verbatim] -------------------------------------------------------------------------------- /HyperText/hyperbolic/mobius_linear.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | #The MIT License (MIT) 3 | #Copyright (c) 2021 Huawei Technologies Co., Ltd. 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining 6 | # a copy of this software and associated documentation files (the 7 | # "Software"), to deal in the Software without restriction, including 8 | # without limitation the rights to use, copy, modify, merge, publish, 9 | # distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so, subject to 11 | # the following conditions: 12 | 13 | # The above copyright notice and this permission notice shall be 14 | # included in all copies or substantial portions of the Software. 15 | 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | # NONINFRINGEMENT.
# [dump text sharing this physical line, kept verbatim] IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 |

import math

import torch
import torch.nn.init as init


class MobiusLinear(torch.nn.Module):
    """Mobius linear layer.

    Applies a weight matrix and an optional bias through the operations
    exposed by ``manifold`` rather than with ordinary matrix arithmetic.

    Args:
        manifold: object providing ``mobius_matvec``, ``proj``,
            ``proj_tan0``, ``expmap0`` and ``mobius_add`` (presumably a
            ``PoincareBall`` -- confirm against the caller).
        in_features: size of the last dimension of the weight matrix.
        out_features: size of the first dimension of the weight matrix and
            length of the bias vector.
        c: curvature value forwarded to the manifold calls.
        use_bias: when false, ``forward`` skips the bias step.
    """

    def __init__(self, manifold, in_features, out_features, c, use_bias=True):
        super(MobiusLinear, self).__init__()
        self.use_bias = use_bias
        self.in_features = in_features
        self.out_features = out_features
        self.c = c
        self.manifold = manifold
        # NOTE: the bias parameter is registered even when use_bias is False;
        # forward() simply ignores it in that case.
        self.bias = torch.nn.Parameter(torch.zeros(out_features))

        self.weight = torch.nn.Parameter(torch.randn(out_features, in_features))
        self.reset_parameters()

    @torch.no_grad()
    def reset_parameters(self):
        """Re-initialise weight (Xavier uniform, gain sqrt(2)) and zero the bias."""
        init.xavier_uniform_(self.weight, gain=math.sqrt(2))
        init.constant_(self.bias, 0.0)

    def forward(self, x):
        """Apply the layer to ``x`` and return the projected result."""
        # Matrix-vector product on the manifold, then project the result.
        mv = self.manifold.mobius_matvec(self.weight, x, self.c)
        res = self.manifold.proj(mv, self.c)
        if self.use_bias:
            # Map the bias through proj_tan0 -> expmap0 -> proj, then combine
            # it with the product via mobius_add and project once more.
            bias = self.manifold.proj_tan0(self.bias.view(1, -1))
            hyp_bias = self.manifold.expmap0(bias, self.c)
            hyp_bias = self.manifold.proj(hyp_bias, self.c)
            res = self.manifold.mobius_add(res, hyp_bias, c=self.c)
            res = self.manifold.proj(res, self.c)
        return res

    def extra_repr(self):
        """Return the layer sizes and curvature shown in the module repr."""
        # The label used to be misspelled "curvalture".
        return 'in_features_size={}, out_features_size={}, curvature={}'.format(
            self.in_features, self.out_features, self.c
        )

# [dump text sharing this physical line, kept verbatim] 64 | -------------------------------------------------------------------------------- /HyperText/hypertext_model_architecture.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/HyperText/hypertext_model_architecture.png -------------------------------------------------------------------------------- /HyperText/models/__init__.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | #The MIT License (MIT) 3 | #Copyright (c) 2021 Huawei Technologies Co., Ltd. 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining 6 | # a copy of this software and associated documentation files (the 7 | # "Software"), to deal in the Software without restriction, including 8 | # without limitation the rights to use, copy, modify, merge, publish, 9 | # distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so, subject to 11 | # the following conditions: 12 | 13 | # The above copyright notice and this permission notice shall be 14 | # included in all copies or substantial portions of the Software. 15 | 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
23 | -------------------------------------------------------------------------------- /HyperText/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scikit-learn 3 | torch==1.3.0 -------------------------------------------------------------------------------- /JABER-PyTorch/NEZHA_PyTorch/README.md: -------------------------------------------------------------------------------- 1 | # NEZHA pytorch version 2 | We only provide fine-tuning codes for sentence classification task in this repository. For MRC and sequential labelling task, please see [CLUE](https://github.com/CLUEbenchmark/CLUE) 3 | 4 | ### requirements 5 | 6 | - pytorch==1.1.0 7 | - python==3.5 8 | 9 | ### download NEZHA-pytorch models 10 | 11 | 1. Download models from : 12 | 2. Put pretrained models in pytorch_nezha/pretrained_models/ 13 | 14 | ### Run fine-tuning task 15 | ```shell 16 | sh run_classifier.sh 17 | ``` 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /JABER-PyTorch/alue_predictions/README.md: -------------------------------------------------------------------------------- 1 | alue_predictions 2 | -------------------------------------------------------------------------------- /JABER-PyTorch/alue_test_submission/README.md: -------------------------------------------------------------------------------- 1 | alue_test_submission 2 | -------------------------------------------------------------------------------- /JABER-PyTorch/pretrained_models/README.md: -------------------------------------------------------------------------------- 1 | pretrained_models 2 | -------------------------------------------------------------------------------- /JABER-PyTorch/raw_datasets/toy.mq2q.dev.tsv: -------------------------------------------------------------------------------- 1 | 0 كيف يمكنني التوقف عن الإلحاح؟ كيف تتوقف عن كونك جبانا؟ 2 | 1 ما معنى حياتنا؟ ما معنى الحياة؟ 
-------------------------------------------------------------------------------- /NEZHA-Gen-TensorFlow/README.md: -------------------------------------------------------------------------------- 1 | 2 | NEZHA-Gen-TensorFlow 3 | ============= 4 | We provide two GPT models pretrained by Huawei Noah's Ark Lab. One is Yuefu (乐府), a Chinese Classical Poetry generation model. The other is a Chinese GPT model pretrained with Chinese Wikipedia and news corpus. 5 | 6 | 7 | Release Notes 8 | ============= 9 | First version: 2020/07/22 10 | 11 | Yuefu updated: 2020/09/24 12 | 13 | Environment 14 | ============ 15 | The scripts are tested successfully with TensorFlow 1.13 and Python 3.6. 16 | 17 | The python package ``fire`` is required. You may need to install the ``fire`` package with the command: 18 | 19 | ``` 20 | pip3 install fire 21 | ``` 22 | 23 | Usage of Yuefu (乐府) 24 | ==================== 25 | 26 | Step 1: Download the folder named ``models_yuefu`` from the link below and move the folder to the same directory as the scripts. Rename the folder to ``models``. 27 | 28 | Step 2: Run the script ``poetry.py`` with the command to see a demo output: 29 | 30 | ``` 31 | python3 poetry.py 32 | ``` 33 | 34 | The open-sourced Yuefu is only for academic research. 35 | Any business application should refer to [Huawei Cloud API](https://support.huaweicloud.com/api-nlp/nlp_03_0070.html). 36 | 37 | 38 | Usage of GPT 39 | ==================== 40 | 41 | Step 1: Download the folder named ``models_gpt`` from the link below and move the folder to the same directory as the scripts. Rename the folder to ``models``. 42 | 43 | Step 2: Run the script ``interactive_conditional_generation.py`` with the command: 44 | 45 | ``` 46 | python3 interactive_conditional_generation.py 47 | ``` 48 | 49 | Step 3: Type in Chinese characters as the initial words and press ENTER to start generating sentences.
50 | 51 | Model download 52 | =========================== 53 | * Yuefu 54 | * [Google Drive](https://drive.google.com/drive/folders/1B5-jxUlzhoKwFVMQ-nkqqbmJQgr1lRAp?usp=sharing) 55 | * [Baidu Netdisk](https://pan.baidu.com/s/1me6_BGYHbWFdTi80vRQ2Lg)(code: ytim) 56 | 57 | * Chinese GPT 58 | * [Google Drive](https://drive.google.com/drive/folders/1i4f_8LhaVDNjnGlLXNJ0rNgBP0E4L6V0?usp=sharing) 59 | * [Baidu Netdisk](https://pan.baidu.com/s/1Bgle8TpcxHyuUz_jAXOBWw)(code:rb5m) 60 | 61 | -------------------------------------------------------------------------------- /NEZHA-PyTorch/README.md: -------------------------------------------------------------------------------- 1 | # NEZHA pytorch version 2 | We only provide fine-tuning codes for sentence classification task in this repository. For MRC and sequential labelling task, please see [CLUE](https://github.com/CLUEbenchmark/CLUE) 3 | 4 | ### requirements 5 | 6 | - pytorch==1.1.0 7 | - python==3.5 8 | 9 | ### download NEZHA-pytorch models 10 | 11 | 1. Download models from : 12 | 2. 
Put pretrained models in pytorch_nezha/pretrained_models/ 13 | 14 | ### Run fine-tuning task 15 | ```shell 16 | sh run_classifier.sh 17 | ``` 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /NEZHA-PyTorch/pretrained_models/nezha-cn-base/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 768, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 3072, 8 | "max_position_embeddings": 512, 9 | "max_relative_position": 64, 10 | "num_attention_heads": 12, 11 | "num_hidden_layers": 12, 12 | "type_vocab_size": 2, 13 | "vocab_size": 21128, 14 | "use_relative_position": true 15 | } 16 | -------------------------------------------------------------------------------- /NEZHA-PyTorch/pretrained_models/nezha-en-base/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 768, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 3072, 8 | "max_position_embeddings": 512, 9 | "max_relative_position": 127, 10 | "num_attention_heads": 12, 11 | "num_hidden_layers": 12, 12 | "type_vocab_size": 2, 13 | "vocab_size": 28996, 14 | "use_relative_position": true 15 | } 16 | -------------------------------------------------------------------------------- /NEZHA-PyTorch/run_classifier.sh: -------------------------------------------------------------------------------- 1 | ######################################################################### 2 | # run_classifier.sh for sentence classification task 3 | ######################################################################### 4 | #!/bin/bash 5 | 6 | CUDA_VISIBLE_DEVICES=0 python run_sequence_classifier.py \ 7 | --task_name=text-clf \ 8 | 
--do_train \ 9 | --do_eval \ 10 | --data_dir=data/chnsenti/ \ 11 | --bert_model=pretrained_models/nezha-cn-base/ \ 12 | --max_seq_length=128 \ 13 | --train_batch_size=16 \ 14 | --eval_batch_size=16 \ 15 | --learning_rate=3e-5 \ 16 | --num_train_epochs=10.0 \ 17 | --output_dir=output/0414chnsenti/ 18 | -------------------------------------------------------------------------------- /NEZHA-TensorFlow/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | BERT needs to maintain permanent compatibility with the pre-trained model files, 4 | so we do not plan to make any major changes to this library (other than what was 5 | promised in the README). However, we can accept small patches related to 6 | re-factoring and documentation. To submit contributions, there are just a few 7 | small guidelines you need to follow. 8 | 9 | ## Contributor License Agreement 10 | 11 | Contributions to this project must be accompanied by a Contributor License 12 | Agreement. You (or your employer) retain the copyright to your contribution; 13 | this simply gives us permission to use and redistribute your contributions as 14 | part of the project. Head over to <https://cla.developers.google.com/> to see 15 | your current agreements on file or to sign a new one. 16 | 17 | You generally only need to submit a CLA once, so if you've already submitted one 18 | (even if it was for a different project), you probably don't need to do it 19 | again. 20 | 21 | ## Code reviews 22 | 23 | All submissions, including submissions by project members, require review. We 24 | use GitHub pull requests for this purpose. Consult 25 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 26 | information on using pull requests. 27 | 28 | ## Community Guidelines 29 | 30 | This project follows 31 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
32 | -------------------------------------------------------------------------------- /NEZHA-TensorFlow/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:19.06-py3 2 | 3 | FROM tensorrtserver_client as trt 4 | 5 | FROM ${FROM_IMAGE_NAME} 6 | 7 | RUN apt-get update && apt-get install -y pbzip2 pv bzip2 8 | 9 | RUN pip install toposort networkx pytest nltk tqdm html2text progressbar 10 | 11 | WORKDIR /workspace 12 | RUN git clone https://github.com/openai/gradient-checkpointing.git 13 | RUN git clone https://github.com/attardi/wikiextractor.git 14 | RUN git clone https://github.com/soskek/bookcorpus.git 15 | 16 | # Copy the perf_client over 17 | COPY --from=trt /workspace/build/perf_client /workspace/build/perf_client 18 | 19 | # Copy the python wheel and install with pip 20 | COPY --from=trt /workspace/build/dist/dist/tensorrtserver*.whl /tmp/ 21 | RUN pip install /tmp/tensorrtserver*.whl && rm /tmp/tensorrtserver*.whl 22 | 23 | 24 | WORKDIR /workspace/bert 25 | COPY . . 26 | 27 | ENV PYTHONPATH=/workspace/bert 28 | -------------------------------------------------------------------------------- /NEZHA-TensorFlow/NOTICE: -------------------------------------------------------------------------------- 1 | BERT TensorFlow 2 | 3 | This repository includes software from https://github.com/google-research/bert 4 | This repository includes software from https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT 5 | licensed under the Apache License, Version 2.0 (the "License") -------------------------------------------------------------------------------- /NEZHA-TensorFlow/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | -------------------------------------------------------------------------------- /NEZHA-TensorFlow/data/pretrain-toy/tf_examples_00.tfrecord: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/NEZHA-TensorFlow/data/pretrain-toy/tf_examples_00.tfrecord -------------------------------------------------------------------------------- /NEZHA-TensorFlow/data/pretrain-toy/tf_examples_01.tfrecord: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/NEZHA-TensorFlow/data/pretrain-toy/tf_examples_01.tfrecord -------------------------------------------------------------------------------- /NEZHA-TensorFlow/fp16_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
# [dump text sharing this physical line, kept verbatim] 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
import tensorflow as tf
import numpy as np


def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
                                    initializer=None, regularizer=None,
                                    trainable=True,
                                    *args, **kwargs):
    """Create a variable via ``getter``, storing trainable ones as float32.

    Trainable variables are requested from ``getter`` with ``tf.float32``
    storage; if the requested ``dtype`` differs from ``tf.float32`` the
    created variable is passed through ``tf.cast`` before being returned.
    Non-trainable variables are requested with ``dtype`` and returned as is.

    Args:
        getter: callable that actually creates the variable.
        name, shape, dtype, initializer, regularizer, trainable: forwarded
            to ``getter`` (``dtype`` is replaced as described above).
        *args, **kwargs: forwarded to ``getter`` unchanged.

    Returns:
        Whatever ``getter`` returns, or its ``tf.cast`` to ``dtype``.
    """
    if trainable:
        stored_as = tf.float32
    else:
        stored_as = dtype

    created = getter(name, shape, *args,
                     dtype=stored_as,
                     initializer=initializer,
                     regularizer=regularizer,
                     trainable=trainable,
                     **kwargs)

    # Only trainable variables whose requested dtype is not float32 are cast.
    if trainable and dtype != tf.float32:
        return tf.cast(created, dtype)
    return created

# [dump text sharing this physical line, kept verbatim] 35 | 36 | -------------------------------------------------------------------------------- /NEZHA-TensorFlow/gpu_environment.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# [dump text sharing this physical line, kept verbatim] 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 |

import tensorflow as tf
import numpy as np


def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
                                    initializer=None, regularizer=None,
                                    trainable=True,
                                    *args, **kwargs):
    """Create a variable via ``getter``, storing trainable ones as float32.

    A trainable variable is created with ``tf.float32`` storage and, when the
    requested ``dtype`` is something other than ``tf.float32``, returned as a
    ``tf.cast`` to that dtype. A non-trainable variable is created with
    ``dtype`` and returned unchanged. All remaining arguments are forwarded
    to ``getter``.
    """
    forwarded = dict(initializer=initializer,
                     regularizer=regularizer,
                     trainable=trainable)
    forwarded.update(kwargs)

    if not trainable:
        return getter(name, shape, *args, dtype=dtype, **forwarded)

    master = getter(name, shape, *args, dtype=tf.float32, **forwarded)
    if dtype != tf.float32:
        master = tf.cast(master, dtype)
    return master


def get_custom_getter(compute_type):
    """Return the float32-storage getter for float16 compute, else ``None``."""
    if compute_type == tf.float16:
        return float32_variable_storage_getter
    return None

# [dump text sharing this physical line, kept verbatim] 37 | -------------------------------------------------------------------------------- /NEZHA-TensorFlow/nezha/bert_base_rel_config_vocab_100503.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 768, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 3072, 8 | "max_position_embeddings": 512, 9 | "num_attention_heads": 12, 10 | "num_hidden_layers": 12, 11 | "type_vocab_size": 2, 12 | "vocab_size": 100503, 13 | "use_relative_position": true 14 | } 15 | -------------------------------------------------------------------------------- /NEZHA-TensorFlow/nezha/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "directionality": "bidi", 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.1, 6 | "hidden_size": 768, 7 | "initializer_range": 0.02, 8 | "intermediate_size": 3072, 9 |
"max_position_embeddings": 512, 10 | "num_attention_heads": 12, 11 | "num_hidden_layers": 12, 12 | "pooler_fc_size": 768, 13 | "pooler_num_attention_heads": 12, 14 | "pooler_num_fc_layers": 3, 15 | "pooler_size_per_head": 128, 16 | "pooler_type": "first_token_transform", 17 | "type_vocab_size": 2, 18 | "vocab_size": 21128, 19 | "use_relative_position": true 20 | } 21 | -------------------------------------------------------------------------------- /NEZHA-TensorFlow/optimization_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import optimization 20 | import tensorflow as tf 21 | 22 | 23 | class OptimizationTest(tf.test.TestCase): 24 | 25 | def test_adam(self): 26 | with self.test_session() as sess: 27 | w = tf.get_variable( 28 | "w", 29 | shape=[3], 30 | initializer=tf.constant_initializer([0.1, -0.2, -0.1])) 31 | x = tf.constant([0.4, 0.2, -0.5]) 32 | loss = tf.reduce_mean(tf.square(x - w)) 33 | tvars = tf.trainable_variables() 34 | grads = tf.gradients(loss, tvars) 35 | global_step = tf.train.get_or_create_global_step() 36 | optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2) 37 | train_op = optimizer.apply_gradients(zip(grads, tvars), global_step) 38 | init_op = tf.group(tf.global_variables_initializer(), 39 | tf.local_variables_initializer()) 40 | sess.run(init_op) 41 | for _ in range(100): 42 | sess.run(train_op) 43 | w_np = sess.run(w) 44 | self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2) 45 | 46 | 47 | if __name__ == "__main__": 48 | tf.test.main() 49 | -------------------------------------------------------------------------------- /NEZHA-TensorFlow/scripts/run_clf.sh: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ################# 3 | #Created on Fri Jul 12 11:05:22 2019 4 | #start codes for running clf(lcqmc/chnsenti/xnli) tasks. 
5 | # task_name is 'lcqmc' for lcqmc task, 'xnli' for xnli task and 'text_clf' for chnsenti 6 | #read_tf_events finds the best eval ckpt and runs prediction 7 | ################## 8 | 9 | CUDA_VISIBLE_DEVICES=1 python ../run_classifier.py \ 10 | --task_name=lcqmc \ 11 | --do_train=true \ 12 | --do_eval=true \ 13 | --do_train_and_eval=true \ 14 | --data_dir=../data/lcqmc/ \ 15 | --save_checkpoints_steps=50 \ 16 | --vocab_file=../nezha/vocab.txt \ 17 | --bert_config_file=../nezha/bert_config.json \ 18 | --init_checkpoint=../nezha/model.ckpt \ 19 | --max_seq_length=128 \ 20 | --train_batch_size=32 \ 21 | --eval_batch_size=32 \ 22 | --num_train_epochs=5 \ 23 | --output_dir=../output/lcqmc/ 24 | 25 | python ../read_tf_events.py \ 26 | --task_name=lcqmc \ 27 | --task_data_dir=../data/lcqmc/ \ 28 | --max_seq_length=128 \ 29 | --predict_batch_size=32 \ 30 | --pretrained_model_dir=../nezha/ \ 31 | --task_output_dir=../output/lcqmc/ \ 32 | -------------------------------------------------------------------------------- /NEZHA-TensorFlow/scripts/run_clf_predict.sh: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ##################### 3 | #Created on Fri Jul 12 11:05:22 2019 4 | #start codes for running clf(lcqmc/chnsenti/xnli) predict tasks 5 | ##################### 6 | 7 | python ../run_classifier.py \ 8 | --task_name=$1 \ 9 | --do_predict=true \ 10 | --data_dir=$2 \ 11 | --vocab_file=$3/vocab.txt \ 12 | --bert_config_file=$3/bert_config.json \ 13 | --init_checkpoint=$4 \ 14 | --max_seq_length=$5 \ 15 | --predict_batch_size=$6 \ 16 | --output_dir=$7/ 17 | -------------------------------------------------------------------------------- /NEZHA-TensorFlow/scripts/run_ner_predict.sh: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | python bert-downstream-code/run_classifier_ner.py \ 5 | --task_name=$1 \ 6 | --do_predict=true \ 7 | --data_dir=$2
\ 8 | --vocab_file=$3/vocab.txt \ 9 | --bert_config_file=$3/bert_config.json \ 10 | --init_checkpoint=$4 \ 11 | --max_seq_length=$5 \ 12 | --predict_batch_size=$6 \ 13 | --output_dir=$7/ 14 | -------------------------------------------------------------------------------- /NEZHA-TensorFlow/scripts/run_pretraining.sh: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ################# 3 | #Run pretraining. 4 | ################## 5 | 6 | 7 | mpiexec --allow-run-as-root --bind-to socket -np 2 python run_pretraining.py \ 8 | --input_file=./data/pretrain-toy/*.tfrecord \ 9 | --output_dir=./nezha/ \ 10 | --do_train=True \ 11 | --do_eval=True \ 12 | --bert_config_file=./nezha/bert_config.json \ 13 | --train_batch_size=32 \ 14 | --max_seq_length=128 \ 15 | --max_predictions_per_seq=20 \ 16 | --num_train_steps=200 \ 17 | --num_warmup_steps=10 \ 18 | --learning_rate=2e-5 \ 19 | --horovod 20 | -------------------------------------------------------------------------------- /NEZHA-TensorFlow/scripts/run_reading.sh: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ############ 3 | #Created on Fri Jul 12 11:05:22 2019 4 | #script for squad-like task fine-tuning 5 | ############# 6 | 7 | 8 | CUDA_VISIBLE_DEVICES=1 python ../run_squad.py \ 9 | --vocab_file=../nezha/vocab.txt \ 10 | --bert_config_file=../nezha/bert_config.json \ 11 | --init_checkpoint=../nezha/model.ckpt \ 12 | --do_train=True \ 13 | --train_file=../data/cmrc/new_cmrc2018_train.json \ 14 | --do_predict=True \ 15 | --predict_file=../data/cmrc/new_cmrc2018_dev.json \ 16 | --train_batch_size=4 \ 17 | --learning_rate=3e-5 \ 18 | --num_train_epochs=1.0 \ 19 | --max_seq_length=512 \ 20 | --doc_stride=128 \ 21 | --do_lower_case=False \ 22 | --output_dir=../output/cmrc/ 23 | python ../cmrc2018_evaluate.py ../data/cmrc/new_cmrc2018_dev.json ../output/cmrc/dev_predictions.json 
../output/cmrc/result_metric.txt 24 | -------------------------------------------------------------------------------- /NEZHA-TensorFlow/scripts/run_seq_labelling.sh: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ########## 3 | #Created on Fri Jul 12 11:05:22 2019 4 | #start codes for running ner task. 5 | #Note that read_tf_events.py reads evaluation results from the tf events file. 6 | ########### 7 | 8 | CUDA_VISIBLE_DEVICES=1 python ../run_classifier_ner.py \ 9 | --task_name=ner \ 10 | --do_train=true \ 11 | --do_eval=true \ 12 | --do_train_and_eval=true \ 13 | --data_dir=../data/peoples-daily-ner \ 14 | --save_checkpoints_steps=100 \ 15 | --vocab_file=../nezha/vocab.txt \ 16 | --bert_config_file=../nezha/bert_config.json \ 17 | --init_checkpoint=../nezha/model.ckpt \ 18 | --max_seq_length=256 \ 19 | --train_batch_size=16 \ 20 | --eval_batch_size=16 \ 21 | --num_train_epochs=10 \ 22 | --output_dir=../output/peoples-daily-ner/ 23 | python ../read_tf_events.py \ 24 | --task_name=ner \ 25 | --task_data_dir=../data/peoples-daily-ner \ 26 | --max_seq_length=256 \ 27 | --predict_batch_size=16 \ 28 | --pretrained_model_dir=../nezha/ \ 29 | --task_output_dir=../output/peoples-daily-ner/ \ 30 | 31 | -------------------------------------------------------------------------------- /NEZHA-TensorFlow/scripts/run_seq_labelling_predict.sh: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ################ 3 | #Created on Fri Jul 12 11:05:22 2019 4 | #start codes for running ds tasks prediction 5 | ################# 6 | 7 | python ../run_classifier_ner.py \ 8 | --task_name=$1 \ 9 | --do_predict=true \ 10 | --data_dir=$2 \ 11 | --vocab_file=$3/vocab.txt \ 12 | --bert_config_file=$3/bert_config.json \ 13 | --init_checkpoint=$4 \ 14 | --max_seq_length=$5 \ 15 | --predict_batch_size=$6 \ 16 | --output_dir=$7/ 17 |
-------------------------------------------------------------------------------- /Noah_WuKong/README.md: -------------------------------------------------------------------------------- 1 | # WukongOpenSource 2 | 3 | Code for the paper _“Wukong: 100 Million Large-scale Chinese Cross-modal Pre-training Dataset and A Foundation Framework”_ ([arXiv:2202.06767](https://arxiv.org/abs/2202.06767)) 4 | 5 | ## Code structure 6 | 7 | ``` 8 | . 9 | ├── configs/... # contains configs for model loading 10 | ├── data 11 | │ ├── __init__.py 12 | │ ├── datasets.py # definition of datasets, e.g., ImageNet 13 | │ ├── res 14 | │ │ ├── classnames.json # definition of classification names 15 | │ │ └── prompts.txt # prompts for ensemble 16 | │ └── tokenizer 17 | │ ├── __init__.py 18 | │ ├── res 19 | │ │ └── vocab.txt # vocabulary file for tokenization 20 | │ ├── simple_tokenizer.py # implementation of Chinese tokenization 21 | │ └── utils.py 22 | ├── main.py # main script for model evaluation 23 | ├── model 24 | │ ├── __init__.py 25 | │ ├── builder.py 26 | │ ├── language 27 | │ │ ├── __init__.py 28 | │ │ └── transformer.py # module of text encoder 29 | │ ├── modules.py # some other modules 30 | │ ├── utils.py 31 | │ ├── vision 32 | │ │ ├── __init__.py 33 | │ │ ├── swin_transformer.py # module of vision encoder [swin-transformer] 34 | │ │ └── vision_transformer.py # module of vision encoder [vit] 35 | │ └── wukong.py # model backbone 36 | ├── README.md 37 | ├── requirements.txt 38 | └── utils.py 39 | ``` 40 | 41 | ## Download models 42 | 43 | Benchmarks of our pretrained multi-modality models can be found on the [Noah-Wukong Benchmark](https://wukong-dataset.github.io/wukong-dataset/benchmark.html) page. 44 | 45 | ## Evaluate on ImageNet 46 | 47 | Below is an example of evaluating the Wukong_ViT-L model.
48 | 49 | ```shell 50 | python main.py \ 51 | --config="configs/wukong_vit_l/wukong_vit_l.py" \ 52 | --checkpoint="/cache/ckpt/wukong_vit_l.ckpt" \ 53 | --data_dir="/cache/data/ILSVRC/" 54 | ``` 55 | 56 | ## Reference 57 | 58 | Jiaxi Gu, Xiaojun Meng, Guansong Lu, Lu Hou, Minzhe Niu, Xiaodan Liang, Lewei Yao, Runhui Huang, Wei Zhang, Xin Jiang, Chunjing Xu, Hang Xu. 59 | [Wukong: 100 Million Large-scale Chinese Cross-modal Pre-training Dataset and A Foundation Framework](https://arxiv.org/abs/2202.06767). 60 | ``` 61 | @misc{gu2022wukong, 62 | title = {Wukong: 100 Million Large-scale Chinese Cross-modal Pre-training Dataset and A Foundation Framework}, 63 | author = {Gu, Jiaxi and Meng, Xiaojun and Lu, Guansong and Hou, Lu and Niu, Minzhe and Liang, Xiaodan and Yao, Lewei and Huang, Runhui and Zhang, Wei and Jiang, Xin and Xu, Chunjing and Xu, Hang}, 64 | url = {https://arxiv.org/abs/2202.06767}, 65 | year = {2022} 66 | } 67 | ``` -------------------------------------------------------------------------------- /Noah_WuKong/configs/wukong_swin/wukong_swin.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright (c) 2022, Huawei Technologies Co., Ltd. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | model = dict( 18 | type='Wukong', 19 | pretrained='', 20 | embed_dim=256, 21 | visual=dict( 22 | type='SwinTransformer', 23 | name='swin_large_patch4_window7_224', 24 | input_resolution=224, 25 | embed_dim=192, 26 | depths=[2, 2, 18, 2], 27 | num_heads=[6, 12, 24, 48], 28 | window_size=7, 29 | patch_size=4, 30 | num_classes=21841, 31 | token_reduction=dict(num_tokens=12)), 32 | text=dict( 33 | type='TextTransformer', 34 | context_length=32, 35 | vocab_size=21128, 36 | width=768, 37 | heads=12, 38 | layers=12), 39 | is_token_wise=True 40 | ) -------------------------------------------------------------------------------- /Noah_WuKong/configs/wukong_swin/wukong_swin_f.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright (c) 2022, Huawei Technologies Co., Ltd. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | model = dict( 18 | type='Wukong', 19 | pretrained='', 20 | embed_dim=256, 21 | visual=dict( 22 | type='SwinTransformer', 23 | name='swin_large_patch4_window7_224', 24 | input_resolution=224, 25 | embed_dim=192, 26 | depths=[2, 2, 18, 2], 27 | num_heads=[6, 12, 24, 48], 28 | window_size=7, 29 | patch_size=4, 30 | num_classes=21841), 31 | text=dict( 32 | type='TextTransformer', 33 | context_length=32, 34 | vocab_size=21128, 35 | width=768, 36 | heads=12, 37 | layers=12), 38 | is_token_wise=True 39 | ) -------------------------------------------------------------------------------- /Noah_WuKong/configs/wukong_swin/wukong_swin_g.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright (c) 2022, Huawei Technologies Co., Ltd. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | model = dict( 18 | type='Wukong', 19 | pretrained='', 20 | embed_dim=768, 21 | visual=dict( 22 | type='SwinTransformer', 23 | name='swin_large_patch4_window7_224', 24 | input_resolution=224, 25 | embed_dim=192, 26 | depths=[2, 2, 18, 2], 27 | num_heads=[6, 12, 24, 48], 28 | window_size=7, 29 | patch_size=4, 30 | num_classes=21841), 31 | text=dict( 32 | type='TextTransformer', 33 | context_length=32, 34 | vocab_size=21128, 35 | width=768, 36 | heads=12, 37 | layers=12), 38 | is_token_wise=False 39 | ) -------------------------------------------------------------------------------- /Noah_WuKong/configs/wukong_vit_b/wukong_vit_b.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright (c) 2022, Huawei Technologies Co., Ltd. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | model = dict( 18 | type='Wukong', 19 | pretrained='', 20 | embed_dim=256, 21 | visual=dict( 22 | type='VisionTransformer', 23 | input_resolution=224, 24 | layers=12, 25 | width=768, 26 | patch_size=32, 27 | token_reduction=dict(num_tokens=12)), 28 | text=dict( 29 | type='TextTransformer', 30 | context_length=32, 31 | vocab_size=21128, 32 | width=512, 33 | heads=8, 34 | layers=12), 35 | is_token_wise=True 36 | ) -------------------------------------------------------------------------------- /Noah_WuKong/configs/wukong_vit_b/wukong_vit_b_f.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright (c) 2022, Huawei Technologies Co., Ltd. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | model = dict( 18 | type='Wukong', 19 | pretrained='', 20 | embed_dim=256, 21 | visual=dict( 22 | type='VisionTransformer', 23 | input_resolution=224, 24 | layers=12, 25 | width=768, 26 | patch_size=32), 27 | text=dict( 28 | type='TextTransformer', 29 | context_length=32, 30 | vocab_size=21128, 31 | width=512, 32 | heads=8, 33 | layers=12), 34 | is_token_wise=True 35 | ) -------------------------------------------------------------------------------- /Noah_WuKong/configs/wukong_vit_b/wukong_vit_b_g.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright (c) 2022, Huawei Technologies Co., Ltd. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | model = dict( 18 | type='Wukong', 19 | pretrained='', 20 | embed_dim=512, 21 | visual=dict( 22 | type='VisionTransformer', 23 | input_resolution=224, 24 | layers=12, 25 | width=768, 26 | patch_size=32), 27 | text=dict( 28 | type='TextTransformer', 29 | context_length=32, 30 | vocab_size=21128, 31 | width=512, 32 | heads=8, 33 | layers=12), 34 | is_token_wise=False 35 | ) 36 | -------------------------------------------------------------------------------- /Noah_WuKong/configs/wukong_vit_l/wukong_vit_l.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright (c) 2022, Huawei Technologies Co., Ltd. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | model = dict( 18 | type='Wukong', 19 | pretrained='', 20 | embed_dim=256, 21 | visual=dict( 22 | type='VisionTransformer', 23 | input_resolution=224, 24 | layers=24, 25 | width=1024, 26 | patch_size=14, 27 | token_reduction=dict(num_tokens=24)), 28 | text=dict( 29 | type='TextTransformer', 30 | context_length=32, 31 | vocab_size=21128, 32 | width=768, 33 | heads=12, 34 | layers=12), 35 | is_token_wise=True 36 | ) -------------------------------------------------------------------------------- /Noah_WuKong/configs/wukong_vit_l/wukong_vit_l_f.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright (c) 2022, Huawei Technologies Co., Ltd. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | model = dict( 18 | type='Wukong', 19 | pretrained='', 20 | embed_dim=256, 21 | visual=dict( 22 | type='VisionTransformer', 23 | input_resolution=224, 24 | layers=24, 25 | width=1024, 26 | patch_size=14), 27 | text=dict( 28 | type='TextTransformer', 29 | context_length=32, 30 | vocab_size=21128, 31 | width=768, 32 | heads=12, 33 | layers=12), 34 | is_token_wise=True 35 | ) -------------------------------------------------------------------------------- /Noah_WuKong/configs/wukong_vit_l/wukong_vit_l_g.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright (c) 2022, Huawei Technologies Co., Ltd. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | model = dict( 18 | type='Wukong', 19 | pretrained='', 20 | embed_dim=768, 21 | visual=dict( 22 | type='VisionTransformer', 23 | input_resolution=224, 24 | layers=24, 25 | width=1024, 26 | patch_size=14), 27 | text=dict( 28 | type='TextTransformer', 29 | context_length=32, 30 | vocab_size=21128, 31 | width=768, 32 | heads=12, 33 | layers=12), 34 | is_token_wise=False 35 | ) -------------------------------------------------------------------------------- /Noah_WuKong/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/Noah_WuKong/data/__init__.py -------------------------------------------------------------------------------- /Noah_WuKong/data/res/prompts.txt: -------------------------------------------------------------------------------- 1 | {}的照片。 2 | 质量差的{}的照片。 3 | 许多{}的照片。 4 | {}的雕塑。 5 | 难以看到{}的照片。 6 | {}的低分辨率照片。 7 | {}的渲染。 8 | 涂鸦{}。 9 | {}的糟糕照片。 10 | {}的裁剪照片。 11 | {}的纹身。 12 | {}的刺绣照片。 13 | 很难看到{}的照片。 14 | {}的明亮照片。 15 | 一张干净的{}的照片。 16 | 一张包含{}的照片。 17 | {}的深色照片。 18 | {}的手绘画。 19 | 我的{}的照片。 20 | 不自然的{}的照片。 21 | 一张酷的{}的照片。 22 | {}的特写照片。 23 | {}的黑白照片。 24 | 一幅{}的画。 25 | 一幅{}的绘画。 26 | 一张{}的像素照片。 27 | {}的雕像。 28 | 一张{}的明亮照片。 29 | {}的裁剪照片。 30 | 人造的{}的照片。 31 | 一张关于{}的照片。 32 | 损坏的{}的jpeg照片。 33 | {}的模糊照片。 34 | {}的相片。 35 | 一张{}的好照片。 36 | {}的渲染照。 37 | 视频游戏中的{}。 38 | 一张{}的照片。 39 | {}的涂鸦。 40 | {}的近距离照片。 41 | {}的折纸。 42 | {}在视频游戏中。 43 | {}的草图。 44 | {}的涂鸦照。 45 | {}的折纸形状。 46 | 低分辨率的{}的照片。 47 | 玩具{}。 48 | {}的副本。 49 | {}的干净的照片。 50 | 一张大{}的照片。 51 | {}的重现。 52 | 一张漂亮的{}的照片。 53 | 一张奇怪的{}的照片。 54 | 模糊的{}的照片。 55 | 卡通{}。 56 | {}的艺术作品。 57 | {}的素描。 58 | 刺绣{}。 59 | {}的像素照。 60 | {}的拍照。 61 | {}的损坏的照片。 62 | 高质量的{}的照片。 63 | 毛绒玩具{}。 64 | 漂亮的{}的照片。 65 | 小{}的照片。 66 | 照片是奇怪的{}。 67 | 漫画{}。 68 | {}的艺术照。 69 | {}的图形。 70 | 大{}的照片。 71 | 黑白的{}的照片。 72 | {}毛绒玩具。 73 | 一张{}的深色照片。 74 | {}的摄影图。 75 | {}的涂鸦照。 76 | 玩具形状的{}。 77 | 拍了{}的照片。 78 | 酷酷的{}的照片。 79 | 
照片里的小{}。 80 | {}的刺青。 -------------------------------------------------------------------------------- /Noah_WuKong/data/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .simple_tokenizer import SimpleTokenizer -------------------------------------------------------------------------------- /Noah_WuKong/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .wukong import Wukong 2 | from .builder import build_model -------------------------------------------------------------------------------- /Noah_WuKong/model/builder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright (c) 2022, Huawei Technologies Co., Ltd. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | from mmcv import Registry 18 | from mmcv import build_from_cfg 19 | 20 | MODELS = Registry('model') 21 | 22 | 23 | def build_model(config): 24 | return build_from_cfg(config, MODELS) 25 | -------------------------------------------------------------------------------- /Noah_WuKong/model/language/__init__.py: -------------------------------------------------------------------------------- 1 | from .transformer import Transformer, TextTransformer 2 | -------------------------------------------------------------------------------- /Noah_WuKong/model/vision/__init__.py: -------------------------------------------------------------------------------- 1 | from .vision_transformer import VisionTransformer 2 | from .swin_transformer import SwinTransformer 3 | -------------------------------------------------------------------------------- /Noah_WuKong/requirements.txt: -------------------------------------------------------------------------------- 1 | mmcv 2 | timm 3 | torch 4 | torchvision -------------------------------------------------------------------------------- /Noah_Wukong-MindSpore/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/Noah_Wukong-MindSpore/src/__init__.py -------------------------------------------------------------------------------- /Noah_Wukong-MindSpore/src/config/wukong_vit_b_32.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | visual: 3 | type: VisionTransformer 4 | input_resolution: 224 5 | layers: 12 6 | width: 768 7 | patch_size: 32 8 | output_dim: 256 9 | token_learner: 10 | num_tokens: 12 11 | num_groups: 8 12 | dropout_rate: 0.0 13 | text: 14 | type: TextTransformer 15 | context_length: 32 16 | vocab_size: 21128 17 | width: 512 18 | heads: 8 19 | layers: 12 20 | output_dim: 256 21 | eval: filip 22 | 
-------------------------------------------------------------------------------- /Noah_Wukong-MindSpore/src/config/wukong_vit_b_32_clip.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | visual: 3 | type: VisionTransformer 4 | input_resolution: 224 5 | layers: 12 6 | width: 768 7 | patch_size: 32 8 | output_dim: 512 9 | return_full_embed: False 10 | text: 11 | type: TextTransformer 12 | context_length: 32 13 | vocab_size: 21128 14 | width: 512 15 | heads: 8 16 | layers: 12 17 | output_dim: 512 18 | return_full_embed: False 19 | eval: clip 20 | -------------------------------------------------------------------------------- /Noah_Wukong-MindSpore/src/config/wukong_vit_b_32_filip.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | visual: 3 | type: VisionTransformer 4 | input_resolution: 224 5 | layers: 12 6 | width: 768 7 | patch_size: 32 8 | output_dim: 256 9 | text: 10 | type: TextTransformer 11 | context_length: 32 12 | vocab_size: 21128 13 | width: 512 14 | heads: 8 15 | layers: 12 16 | output_dim: 256 17 | eval: filip 18 | -------------------------------------------------------------------------------- /Noah_Wukong-MindSpore/src/config/wukong_vit_l_14.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | visual: 3 | type: VisionTransformer 4 | input_resolution: 224 5 | layers: 24 6 | width: 1024 7 | patch_size: 14 8 | output_dim: 256 9 | token_learner: 10 | num_tokens: 24 11 | num_groups: 8 12 | dropout_rate: 0.0 13 | text: 14 | type: TextTransformer 15 | context_length: 32 16 | vocab_size: 21128 17 | width: 768 18 | heads: 12 19 | layers: 12 20 | output_dim: 256 21 | eval: filip 22 | -------------------------------------------------------------------------------- /Noah_Wukong-MindSpore/src/config/wukong_vit_l_14_clip.yaml: -------------------------------------------------------------------------------- 1 | model: 
2 | visual: 3 | type: VisionTransformer 4 | input_resolution: 224 5 | layers: 24 6 | width: 1024 7 | patch_size: 14 8 | output_dim: 768 9 | return_full_embed: False 10 | text: 11 | type: TextTransformer 12 | context_length: 32 13 | vocab_size: 21128 14 | width: 768 15 | heads: 12 16 | layers: 12 17 | output_dim: 768 18 | return_full_embed: False 19 | eval: clip 20 | -------------------------------------------------------------------------------- /Noah_Wukong-MindSpore/src/config/wukong_vit_l_14_filip.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | visual: 3 | type: VisionTransformer 4 | input_resolution: 224 5 | layers: 24 6 | width: 1024 7 | patch_size: 14 8 | output_dim: 256 9 | text: 10 | type: TextTransformer 11 | context_length: 32 12 | vocab_size: 21128 13 | width: 768 14 | heads: 12 15 | layers: 12 16 | output_dim: 256 17 | eval: filip 18 | -------------------------------------------------------------------------------- /Noah_Wukong-MindSpore/src/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================ 15 | from .dataset import get_dataset 16 | 17 | 18 | __all__ = ['get_dataset'] 19 | -------------------------------------------------------------------------------- /Noah_Wukong-MindSpore/src/dataset/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
def get_wukong_dataset(dataset_path, columns_list, num_parallel_workers, shuffle, num_shards, shard_id, batch_size):
    """Open a MindRecord dataset and return it grouped into batches.

    Args:
        dataset_path: location passed straight to ``ds.MindDataset``.
        columns_list: columns to read from the records.
        num_parallel_workers: number of reader workers.
        shuffle: shuffle setting forwarded to the reader.
        num_shards: total number of shards the data is split into.
        shard_id: index of the shard read by this process.
        batch_size: number of rows per batch.

    Returns:
        The batched ``ds.MindDataset`` pipeline.
    """
    reader_options = {
        'columns_list': columns_list,
        'num_parallel_workers': num_parallel_workers,
        'shuffle': shuffle,
        'num_shards': num_shards,
        'shard_id': shard_id,
    }
    return ds.MindDataset(dataset_path, **reader_options).batch(batch_size)


def get_dataset(dataset_path, batch_size):
    """Build a batched image-folder dataset with the fixed preprocessing chain.

    Every image is decoded, normalized, resized to 224 with bicubic
    interpolation, center-cropped to 224, converted from HWC to CHW layout and
    cast to float32. The resulting columns are ordered as ``image``, ``label``.

    Args:
        dataset_path: root directory handed to ``ds.ImageFolderDataset``.
        batch_size: number of rows per batch.

    Returns:
        The batched dataset pipeline.
    """
    # Statistics are written on a 0-1 scale and multiplied by 255 before use;
    # presumably because Normalize sees raw 0-255 pixels here - TODO confirm.
    unit_mean = (0.48145466, 0.4578275, 0.40821073)
    unit_std = (0.26862954, 0.26130258, 0.27577711)
    pixel_mean = tuple(channel * 255 for channel in unit_mean)
    pixel_std = tuple(channel * 255 for channel in unit_std)

    image_ops = [
        C.Decode(),
        C.Normalize(mean=pixel_mean, std=pixel_std),
        C.Resize(224, Inter.BICUBIC),
        C.CenterCrop(224),
        C.HWC2CHW(),
        C2.TypeCast(mstype.float32),
    ]
    folder = ds.ImageFolderDataset(dataset_path, num_parallel_workers=4)
    mapped = folder.map(image_ops, input_columns=["image"], output_columns=None,
                        column_order=["image", "label"])
    return mapped.batch(batch_size)
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | from .visual_encoder import VisualTransformer, ClipVisualTransformer 16 | from .text_encoder import BERT_Wukong 17 | from .matrics import FilipTemplateEncoder, ClipTemplateEncoder, FilipEval, ClipEval 18 | 19 | __all__ = ['VisualTransformer', 'ClipVisualTransformer', 'BERT_Wukong', 'FilipTemplateEncoder', 20 | 'ClipTemplateEncoder', 'FilipEval', 'ClipEval'] 21 | -------------------------------------------------------------------------------- /Noah_Wukong-MindSpore/src/model/token_learner.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
class TokenLearnerModule(nn.Cell):
    """Reduce a grid of features to ``num_tokens`` weighted summaries.

    A two-layer 1x1 convolution stack produces one weight map per output
    token; each token is the weighted sum of the (separately projected)
    features over all grid positions.

    Args:
        in_channels: size of the last axis of the input.
        num_tokens: number of output tokens (channels of the weight maps).
        num_groups: ``group`` argument of the two grouped 1x1 convolutions.
        dropout_rate: the dropout cell is built with ``1.0 - dropout_rate``.
    """

    def __init__(self, in_channels, num_tokens, num_groups, dropout_rate):
        super().__init__()
        self.in_channels = in_channels
        self.num_tokens = num_tokens
        self.num_groups = num_groups
        self.norm = nn.LayerNorm([in_channels])
        grouped_conv = nn.Conv2d(in_channels, in_channels, 1, group=num_groups)
        token_conv = nn.Conv2d(in_channels, num_tokens, 1)
        self.attention_maps = nn.SequentialCell([grouped_conv, token_conv])
        self.feat_conv = nn.Conv2d(in_channels, in_channels, 1, group=num_groups)
        self.softmax = nn.Softmax()
        # NOTE(review): passes 1 - dropout_rate, which looks like the keep
        # probability expected by older nn.Dropout signatures - confirm against
        # the MindSpore version in use.
        self.dropout = nn.Dropout(1.0 - dropout_rate)

    def construct(self, x):
        """Return ``(tokens, weights)`` for a 4-D input unpacked as (bs, h, w, c).

        ``weights`` has shape (bs, num_tokens, h * w) and ``tokens`` is the
        dropout-applied product of ``weights`` with the projected features
        reshaped to (bs, h * w, -1).
        """
        batch, height, width, _ = x.shape
        num_positions = height * width

        # Weight maps: normalize, move axis 3 in front of the spatial axes for
        # the convolutions, then flatten the grid and put tokens on axis 1.
        weights = self.norm(x).transpose(0, 3, 1, 2)
        weights = self.attention_maps(weights)
        weights = weights.transpose(0, 2, 3, 1).reshape(batch, num_positions, -1)
        weights = self.softmax(weights.transpose(0, 2, 1))

        # Feature branch works on the un-normalized input.
        values = self.feat_conv(x.transpose(0, 3, 1, 2))
        values = values.transpose(0, 2, 3, 1).reshape(batch, num_positions, -1)

        tokens = self.dropout(ops.matmul(weights, values))
        return tokens, weights
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | from .template_generate import generate_zh_template 16 | from .model_utils import load_visual_model, load_text_model 17 | from .simple_tokenizer import set_tokenizer_lang, tokenize 18 | 19 | 20 | __all__ = ['generate_zh_template', 'load_visual_model', 21 | 'load_text_model', 'set_tokenizer_lang', 'tokenize'] 22 | -------------------------------------------------------------------------------- /Noah_Wukong-MindSpore/src/tools/template_generate.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
def generate_zh_template(label_list):
    """Tokenize every (label, template) combination.

    Reads one template per line from ``zh_templates.txt`` located next to this
    module, substitutes each label for ``{}`` in each template, and tokenizes
    the resulting sentences.

    Args:
        label_list: iterable of label strings, also measured with ``len``.

    Returns:
        The output of ``tokenize`` reshaped to
        ``(len(label_list), number_of_templates, -1)``; sentences are ordered
        label-major, template-minor.
    """
    set_tokenizer_lang('zh', 32)
    template_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'zh_templates.txt'
    )

    # The context manager closes the handle the original left open. The
    # encoding is pinned so reading does not depend on the platform locale
    # (assumes the template file is stored as UTF-8).
    with open(template_path, 'r', encoding='utf-8') as template_file:
        templates = [line.strip() for line in template_file]

    num_prompts = len(templates)
    num_labels = len(label_list)
    template_list = [
        template.replace('{}', label)
        for label in label_list
        for template in templates
    ]
    token = tokenize(template_list).reshape((num_labels, num_prompts, -1))
    return token
-------------------------------------------------------------------------------- /PanGu-α/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /PanGu-α/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /PanGu-α/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /PanGu-α/PANGU-α.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/PANGU-α.pdf -------------------------------------------------------------------------------- /PanGu-α/docs/13B.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/docs/13B.png -------------------------------------------------------------------------------- /PanGu-α/docs/2.6B.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/docs/2.6B.png -------------------------------------------------------------------------------- /PanGu-α/docs/Pipline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/docs/Pipline.png -------------------------------------------------------------------------------- 
/PanGu-α/docs/dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/docs/dataset.png -------------------------------------------------------------------------------- /PanGu-α/docs/logos.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/docs/logos.png -------------------------------------------------------------------------------- /PanGu-α/docs/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/docs/model.png -------------------------------------------------------------------------------- /PanGu-α/docs/task.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/docs/task.png -------------------------------------------------------------------------------- /PanGu-α/docs/微信交流群2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/docs/微信交流群2.png -------------------------------------------------------------------------------- /PanGu-α/docs/鹏程.盘古微信交流群.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/docs/鹏程.盘古微信交流群.png -------------------------------------------------------------------------------- /PanGu-α/generate.py: 
def generate(model, origin_inputs, seq_length, end_token=50256):
    """
    Sampling-based text generation

    Inputs:
        model: the model for inferencing; ``model.predict`` must return a pair
            ``(probs, p_args)`` whose ``asnumpy()`` results are indexed by
            position, giving candidate probabilities and their token ids
            (presumably the top-k candidates - confirm against the model)
        origin_inputs: 2-D numpy array with the ids the model will continue
            writing from; only row 0 is extended
        seq_length: seq_length for the model
        end_token: end of sentence token id

    Returns:
        outputs: 1-D array with the prompt ids of row 0 followed by the
            generated ids

    Raises:
        ValueError: if the prompt is longer than ``seq_length``
    """
    _, valid_length = origin_inputs.shape
    pad_length = seq_length - valid_length
    if pad_length < 0:
        raise ValueError(
            "origin_inputs has %d tokens, which exceeds seq_length=%d"
            % (valid_length, seq_length))
    input_ids = np.pad(origin_inputs, ((0, 0), (0, pad_length)), 'constant', constant_values=(0, 0))
    print("input_ids is ", input_ids)
    # When the prompt already fills the sequence the loop body never runs and
    # the prompt is returned unchanged (the original hit an unbound variable).
    while valid_length < seq_length:
        inputs = Tensor(input_ids, mstype.int32)
        probs, p_args = model.predict(inputs)
        # Candidates for the next token are read at the last valid position.
        probs = probs.asnumpy()[valid_length-1, :]
        p_args = p_args.asnumpy()[valid_length-1, :]

        # Renormalize so the candidate probabilities sum to one before sampling.
        p = probs / np.sum(probs)
        target_index = np.random.choice(len(p), p=p)
        if p_args[target_index] == end_token or valid_length == seq_length-1:
            break
        input_ids[0][valid_length] = p_args[target_index]
        valid_length += 1
    # Slice by the tracked length instead of counting non-zero ids: counting
    # mis-truncates when a token id is 0 and over-counts for batch sizes > 1.
    return input_ids[0][:valid_length]
export CKPT_NAME=$6 11 | export MODE=$7 12 | 13 | for((i=0;i<$RANK_SIZE;i++)); 14 | do 15 | rm -rf ${execute_path}/device_$i/ 16 | mkdir ${execute_path}/device_$i/ 17 | cd ${execute_path}/device_$i/ || exit 18 | export RANK_ID=$i 19 | export DEVICE_ID=$i 20 | python -s ${self_path}/../run_pangu_alpha_predict.py --strategy_load_ckpt_path=$STRATEGY --tokenizer_path=$TOKENIZER --load_ckpt_path=$CKPT_PATH \ 21 | --load_ckpt_name=$CKPT_NAME --mode=$MODE >train_deep$i.log 2>&1 & 22 | done 23 | -------------------------------------------------------------------------------- /PanGu-α/scripts/run_distribute_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | execute_path=$(pwd) 4 | script_self=$(readlink -f "$0") 5 | self_path=$(dirname "${script_self}") 6 | export RANK_SIZE=$1 7 | export DATASET=$2 8 | export RANK_TABLE_FILE=$3 9 | export MODE=$4 10 | for((i=0;i<$RANK_SIZE;i++)); 11 | do 12 | rm -rf ${execute_path}/device_$i/ 13 | mkdir ${execute_path}/device_$i/ 14 | cd ${execute_path}/device_$i/ || exit 15 | export RANK_ID=$i 16 | export DEVICE_ID=$i 17 | python -s ${self_path}/../run_pangu_alpha_train.py --data_url=$DATASET --mode=$MODE >train_deep$i.log 2>&1 & 18 | done 19 | -------------------------------------------------------------------------------- /PanGu-α/serving_demo/PanGu-Alpha-serving-demo.avi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/serving_demo/PanGu-Alpha-serving-demo.avi -------------------------------------------------------------------------------- /PanGu-α/strategy_load_ckpt/pangu_alpha_13B_cktp_strategy.ckpt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/strategy_load_ckpt/pangu_alpha_13B_cktp_strategy.ckpt -------------------------------------------------------------------------------- /PanGu-α/strategy_load_ckpt/pangu_alpha_2.6B_ckpt_strategy.ckpt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/strategy_load_ckpt/pangu_alpha_2.6B_ckpt_strategy.ckpt -------------------------------------------------------------------------------- /PanGu-α/tokenizer/vocab.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/PanGu-α/tokenizer/vocab.model -------------------------------------------------------------------------------- /TernaryBERT-MindSpore/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/TernaryBERT-MindSpore/__init__.py -------------------------------------------------------------------------------- /TernaryBERT-MindSpore/mindspore_hub_conf.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# Configuration object handed to BertModel by create_network below.
tinybert_student_net_cfg = BertConfig(
    seq_length=128,
    vocab_size=30522,
    hidden_size=768,
    num_hidden_layers=6,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    use_relative_positions=False,
    dtype=mstype.float32,
    compute_type=mstype.float32,
    do_quant=True,
    embedding_bits=2,
    weight_bits=2,
    weight_clip_value=3.0,
    cls_dropout_prob=0.1,
    activation_init=2.5,
    is_lgt_fit=False
)


def create_network(name, *args, **kwargs):
    """
    Create tinybert network.

    Args:
        name: must be "ternarybert"; any other value is rejected.
        *args: extra positional arguments forwarded to BertModel.
        **kwargs: "seq_length" overrides the configured sequence length and
            "is_training" (default False) is passed on to BertModel.

    Raises:
        NotImplementedError: if name is not "ternarybert".
    """
    if name != "ternarybert":
        raise NotImplementedError(f"{name} is not implemented in the repo")

    # NOTE: this writes to the module-level config, so an override stays in
    # effect for later calls as well.
    if "seq_length" in kwargs:
        tinybert_student_net_cfg.seq_length = kwargs["seq_length"]
    training = kwargs.get("is_training", False)
    return BertModel(tinybert_student_net_cfg, training, *args)
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================ 16 | 17 | mkdir -p ms_log 18 | PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd) 19 | CUR_DIR=`pwd` 20 | export GLOG_log_dir=${CUR_DIR}/ms_log 21 | export GLOG_logtostderr=0 22 | python ${PROJECT_DIR}/../eval.py \ 23 | --task_name=sts-b \ 24 | --device_id=0 \ 25 | --model_dir="" \ 26 | --data_dir="" > log.txt -------------------------------------------------------------------------------- /TernaryBERT-MindSpore/scripts/run_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2021 Huawei Technologies Co., Ltd 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ============================================================================ 16 | 17 | mkdir -p ms_log 18 | PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd) 19 | CUR_DIR=`pwd` 20 | export GLOG_log_dir=${CUR_DIR}/ms_log 21 | export GLOG_logtostderr=0 22 | python ${PROJECT_DIR}/../train.py \ 23 | --task_name=sts-b \ 24 | --device_id=0 \ 25 | --teacher_model_dir="" \ 26 | --student_model_dir="" \ 27 | --data_dir="" > log.txt -------------------------------------------------------------------------------- /TernaryBERT-MindSpore/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/TernaryBERT-MindSpore/src/__init__.py -------------------------------------------------------------------------------- /TernaryBERT-MindSpore/src/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
class DataType(Enum):
    """Dataset storage formats named by this module"""
    TFRECORD = 1
    MINDRECORD = 2


def create_dataset(batch_size=32, device_num=1, rank=0, do_shuffle="true", data_dir=None,
                   data_type='tfrecord', seq_length=128, task_type=mstype.int32, drop_remainder=True):
    """create tinybert dataset

    Reads the columns input_ids, input_mask, segment_ids and label_ids, casts
    and slices them to ``seq_length``, and batches the result.

    Args:
        batch_size: rows per batch.
        device_num: number of shards; also gates the extra shuffle below.
        rank: shard index read by this process.
        do_shuffle: shuffling is enabled only for the exact string "true".
        data_dir: a single path or a list of paths.
        data_type: 'mindrecord' selects MindDataset, anything else TFRecordDataset.
        seq_length: upper bound used when slicing every column.
        task_type: labels are cast to int32 when this equals 'classification',
            otherwise to float32.
        drop_remainder: forwarded to ``batch``.
    """
    data_files = data_dir if isinstance(data_dir, list) else [data_dir]
    columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"]
    shuffle = do_shuffle == "true"

    reader_args = dict(columns_list=columns_list, shuffle=shuffle,
                       num_shards=device_num, shard_id=rank)
    if data_type == 'mindrecord':
        ds = de.MindDataset(data_files, **reader_args)
    else:
        ds = de.TFRecordDataset(data_files, shard_equal_rows=(device_num == 1), **reader_args)

    if device_num == 1 and shuffle:
        ds = ds.shuffle(10000)

    to_int32 = C.TypeCast(mstype.int32)
    truncate = C.Slice(slice(0, seq_length, 1))
    # NOTE(review): the default task_type is a dtype but is only compared with
    # the string 'classification', so the default selects float32 labels -
    # confirm this is intended.
    label_type = mstype.float32
    if task_type == 'classification':
        label_type = mstype.int32

    for column in ("segment_ids", "input_mask", "input_ids"):
        ds = ds.map(operations=[to_int32, truncate], input_columns=[column])
    ds = ds.map(operations=[C.TypeCast(label_type), truncate], input_columns=["label_ids"])
    # apply batch operations
    return ds.batch(batch_size, drop_remainder=drop_remainder)
-------------------------------------------------------------------------------- /TernaryBERT/README.md: -------------------------------------------------------------------------------- 1 | # TernaryBERT 2 | 3 | This directory contains code for [TernaryBERT: Distillation-aware Ultra-low Bit BERT](https://arxiv.org/abs/2009.12812). 4 |
5 | 6 |
7 | ## Envs 8 | ``` 9 | conda create -n myenv python=3.6 10 | conda activate myenv 11 | pip install -r requirements.txt 12 | ``` 13 | 14 | 15 | ## GLUE 16 | ### Data 17 | Download GLUE 18 | https://github.com/nyu-mll/GLUE-baselines 19 | 20 | ``` 21 | ls data/mnli 22 | ``` 23 | shows: 24 | ``` 25 | dev_matched.tsv dev_mismatched.tsv train.tsv 26 | ``` 27 | 28 | ### Model 29 | Prepare a fine-tuned BERT base model on MNLI 30 | ``` 31 | ls models/mnli 32 | ``` 33 | shows: 34 | ``` 35 | config.json pytorch_model.bin vocab.txt 36 | ``` 37 | 38 | ### Distillation-aware quantization training on MNLI 39 | ``` 40 | python quant_task_glue.py \ 41 | --data_dir data \ 42 | --model_dir models \ 43 | --task_name mnli \ 44 | --output_dir output \ 45 | --learning_rate 2e-5 \ 46 | --num_train_epochs 3 \ 47 | --weight_bits 2 \ 48 | --input_bits 8 \ 49 | --pred_distill \ 50 | --intermediate_distill \ 51 | --save_fp_model \ 52 | --save_quantized_model 53 | ``` 54 | More details on the arguments are in quant_task_glue.py 55 | 56 | ## SQuAD 57 | 58 | ### Data 59 | Download data 60 | https://rajpurkar.github.io/SQuAD-explorer/ 61 | 62 | ### Model 63 | Prepare a fine-tuned BERT base model on SQuAD v1.1/v2.0 64 | 65 | ### Distillation-aware quantization training 66 | ``` 67 | python quant_task_squad.py \ 68 | --data_dir data/squadv2.0 \ 69 | --model_dir models/squadv2.0 \ 70 | --output_dir output \ 71 | --learning_rate 2e-5 \ 72 | --num_train_epochs 3 \ 73 | --version_2_with_negative \ 74 | --weight_bits 2 \ 75 | --input_bits 8 \ 76 | --pred_distill \ 77 | --intermediate_distill \ 78 | --save_fp_model \ 79 | --save_quantized_model 80 | ``` 81 | 82 | ## Reference 83 | 84 | ``` 85 | @inproceedings{zhang-etal-2020-ternarybert, 86 | title = {TernaryBERT: Distillation-aware Ultra-low Bit BERT}, 87 | author = {Wei Zhang and Lu Hou and Yichun Yin and Lifeng Shang and Xiao Chen and Xin Jiang and Qun Liu}, 88 | booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing 
(EMNLP)}, 89 | year = {2020}, 90 | } 91 | ``` 92 | 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /TernaryBERT/main.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/TernaryBERT/main.png -------------------------------------------------------------------------------- /TernaryBERT/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | requests 3 | scipy 4 | future 5 | Pillow 6 | tensorflow==1.14.0 7 | torch==1.1.0 8 | -------------------------------------------------------------------------------- /TernaryBERT/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer 2 | from .modeling import BertForSequenceClassification,BertForQuestionAnswering, CONFIG_NAME, WEIGHTS_NAME 3 | from .configuration import BertConfig 4 | from .optimization import BertAdam 5 | -------------------------------------------------------------------------------- /TinyBERT-MindSpore/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/TinyBERT-MindSpore/__init__.py -------------------------------------------------------------------------------- /TinyBERT-MindSpore/mindspore_hub_conf.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
# Configuration object handed to TinyBertModel by create_network below.
tinybert_student_net_cfg = BertConfig(
    seq_length=128,
    vocab_size=30522,
    hidden_size=384,
    num_hidden_layers=4,
    num_attention_heads=12,
    intermediate_size=1536,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    use_relative_positions=False,
    dtype=mstype.float32,
    compute_type=mstype.float16
)


def create_network(name, *args, **kwargs):
    '''
    Create tinybert network.

    Args:
        name: must be "tinybert"; any other value is rejected.
        *args: extra positional arguments forwarded to TinyBertModel.
        **kwargs: "seq_length" overrides the configured sequence length and
            "is_training" (default False) is passed on to TinyBertModel.

    Raises:
        NotImplementedError: if name is not "tinybert".
    '''
    if name != "tinybert":
        raise NotImplementedError(f"{name} is not implemented in the repo")

    # NOTE: this writes to the module-level config, so an override stays in
    # effect for later calls as well.
    if "seq_length" in kwargs:
        tinybert_student_net_cfg.seq_length = kwargs["seq_length"]
    training = kwargs.get("is_training", False)
    return TinyBertModel(tinybert_student_net_cfg, training, *args)
please see details by LOG{}/log.txt" 23 | echo "==============================================================================================================" 24 | 25 | EPOCH_SIZE=$2 26 | 27 | PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd) 28 | export RANK_TABLE_FILE=$3 29 | export RANK_SIZE=$1 30 | cores=`cat /proc/cpuinfo|grep "processor" |wc -l` 31 | echo "the number of logical core" $cores 32 | avg_core_per_rank=`expr $cores \/ $RANK_SIZE` 33 | core_gap=`expr $avg_core_per_rank \- 1` 34 | echo "avg_core_per_rank" $avg_core_per_rank 35 | echo "core_gap" $core_gap 36 | for((i=0;i env.log 56 | taskset -c $cmdopt python ${PROJECT_DIR}/../run_general_distill.py \ 57 | --distribute="true" \ 58 | --device_target="Ascend" \ 59 | --epoch_size=$EPOCH_SIZE \ 60 | --device_id=$DEVICE_ID \ 61 | --device_num=$RANK_SIZE \ 62 | --enable_data_sink="true" \ 63 | --data_sink_steps=100 \ 64 | --save_ckpt_step=10000 \ 65 | --max_ckpt_num=1 \ 66 | --load_teacher_ckpt_path="" \ 67 | --data_dir="" \ 68 | --schema_dir="" \ 69 | --dataset_type="tfrecord" > log.txt 2>&1 & 70 | cd ../ 71 | done 72 | -------------------------------------------------------------------------------- /TinyBERT-MindSpore/scripts/run_distributed_gd_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Huawei Technologies Co., Ltd 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
# Launch distributed general distillation on GPU through mpirun.
# Positional arguments: DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR TEACHER_CKPT_PATH

echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash run_distributed_gd_gpu.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR TEACHER_CKPT_PATH"
echo "for example: bash run_distributed_gd_gpu.sh 8 3 /path/data/ /path/datasetSchema.json /path/bert_base.ckpt"
echo "It is better to use absolute path."
echo "=============================================================================================================="

# Fail fast instead of starting a background job with empty arguments.
if [ $# -lt 5 ]; then
    echo "Error: expected 5 arguments, got $#." >&2
    exit 1
fi

RANK_SIZE=$1
EPOCH_SIZE=$2
DATA_DIR=$3
SCHEMA_DIR=$4
TEACHER_CKPT_PATH=$5

# Directory containing this script; the training entry point lives one level up.
PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)

# Expansions are quoted so that paths containing spaces stay a single argument.
# The job runs in the background; its combined output goes to log.txt.
mpirun --allow-run-as-root -n "$RANK_SIZE" --output-filename log_output --merge-stderr-to-stdout \
    python "${PROJECT_DIR}/../run_general_distill.py" \
    --distribute="true" \
    --device_target="GPU" \
    --epoch_size="$EPOCH_SIZE" \
    --save_ckpt_path="" \
    --data_dir="$DATA_DIR" \
    --schema_dir="$SCHEMA_DIR" \
    --dataset_type="tfrecord" \
    --enable_data_sink="false" \
    --load_teacher_ckpt_path="$TEACHER_CKPT_PATH" > log.txt 2>&1 &
# Launch single-device general distillation on Ascend in the background.
# The empty --save_ckpt_path / --load_teacher_ckpt_path / --data_dir /
# --schema_dir values are placeholders in this script.

echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash scripts/run_standalone_gd.sh"
echo "for example: bash scripts/run_standalone_gd.sh"
echo "running....... please see details by log.txt"
echo "=============================================================================================================="


mkdir -p ms_log
# Directory containing this script; the training entry point lives one level up.
PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
CUR_DIR=$(pwd)
# Send glog output to files under ./ms_log rather than stderr.
export GLOG_log_dir="${CUR_DIR}/ms_log"
export GLOG_logtostderr=0
# The script path is quoted so a checkout directory containing spaces still works.
python "${PROJECT_DIR}/../run_general_distill.py" \
    --distribute="false" \
    --device_target="Ascend" \
    --epoch_size=3 \
    --device_id=0 \
    --enable_data_sink="true" \
    --data_sink_steps=100 \
    --save_ckpt_step=100 \
    --max_ckpt_num=1 \
    --save_ckpt_path="" \
    --load_teacher_ckpt_path="" \
    --data_dir="" \
    --schema_dir="" \
    --dataset_type="tfrecord" > log.txt 2>&1 &
# Launch single-device task distillation (train + eval) on Ascend in the background.
# The empty --task_name, checkpoint-path and data-dir values are placeholders
# in this script.

echo "=============================================================================================================="
echo "Please run the script as: "
echo "bash scripts/run_standalone_td.sh"
echo "for example: bash scripts/run_standalone_td.sh"
echo "=============================================================================================================="

mkdir -p ms_log
# Directory containing this script; the training entry point lives one level up.
PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
CUR_DIR=$(pwd)
# Send glog output to files under ./ms_log rather than stderr.
export GLOG_log_dir="${CUR_DIR}/ms_log"
export GLOG_logtostderr=0
# The script path is quoted so a checkout directory containing spaces still works.
python "${PROJECT_DIR}/../run_task_distill.py" \
    --device_target="Ascend" \
    --device_id=0 \
    --do_train="true" \
    --do_eval="true" \
    --td_phase1_epoch_size=10 \
    --td_phase2_epoch_size=3 \
    --task_name="" \
    --do_shuffle="true" \
    --enable_data_sink="true" \
    --data_sink_steps=100 \
    --save_ckpt_step=100 \
    --max_ckpt_num=1 \
    --load_teacher_ckpt_path="" \
    --load_gd_ckpt_path="" \
    --load_td1_ckpt_path="" \
    --train_data_dir="" \
    --eval_data_dir="" \
    --schema_dir="" \
    --dataset_type="tfrecord" > log.txt 2>&1 &
class Accuracy():
    """Running classification accuracy accumulated over batches.

    ``acc_num`` counts correct predictions and ``total_num`` counts the
    labels seen so far; both are updated by ``update``.
    """
    def __init__(self):
        self.acc_num = 0
        self.total_num = 0

    def update(self, logits, labels):
        """Add one batch to the counters.

        Args:
            logits: Object exposing ``asnumpy()``; the predicted class is the
                argmax over the last axis.
            labels: Object exposing ``asnumpy()``; gold class ids, one per
                prediction.
        """
        labels = np.reshape(labels.asnumpy(), -1)
        # Flatten the predictions too so they align element-wise with the
        # flattened labels even when the logits have more than one leading
        # axis (same treatment as F1.update).
        logit_id = np.reshape(np.argmax(logits.asnumpy(), axis=-1), -1)
        self.acc_num += np.sum(labels == logit_id)
        self.total_num += len(labels)


class F1():
    """Running TP / FP / FN counters for a set of "positive" class ids.

    Args:
        positive_labels: Class ids treated as positive when counting.
            Defaults to the ids 2..7 used by the original implementation.
    """
    def __init__(self, positive_labels=(2, 3, 4, 5, 6, 7)):
        self.TP = 0
        self.FP = 0
        self.FN = 0
        self.positive_labels = list(positive_labels)

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Return ``numerator / denominator``, or 0.0 for a zero denominator."""
        return numerator / denominator if denominator else 0.0

    def update(self, logits, labels):
        """Update F1 score counters with one batch and print running precision/recall.

        Args:
            logits: Object exposing ``asnumpy()``; the predicted class is the
                argmax over the last axis.
            labels: Object exposing ``asnumpy()``; gold class ids, one per
                prediction.
        """
        labels = np.reshape(labels.asnumpy(), -1)
        logit_id = np.reshape(np.argmax(logits.asnumpy(), axis=-1), -1)
        pos_eva = np.isin(logit_id, self.positive_labels)
        pos_label = np.isin(labels, self.positive_labels)
        self.TP += np.sum(pos_eva & pos_label)
        self.FP += np.sum(pos_eva & (~pos_label))
        self.FN += np.sum((~pos_eva) & pos_label)
        # Guard the ratios: before any positive prediction/label has been
        # seen the denominators are zero, which previously produced NaN and
        # a RuntimeWarning.
        print("-----------------precision is ", self._safe_ratio(self.TP, self.TP + self.FP))
        print("-----------------recall is ", self._safe_ratio(self.TP, self.TP + self.FN))
39 | student network: The network which is inherited from teacher network. 40 | ''' 41 | bert_teacher_net_cfg = BertConfig( 42 | seq_length=128, 43 | vocab_size=30522, 44 | hidden_size=768, 45 | num_hidden_layers=12, 46 | num_attention_heads=12, 47 | intermediate_size=3072, 48 | hidden_act="gelu", 49 | hidden_dropout_prob=0.1, 50 | attention_probs_dropout_prob=0.1, 51 | max_position_embeddings=512, 52 | type_vocab_size=2, 53 | initializer_range=0.02, 54 | use_relative_positions=False, 55 | dtype=mstype.float32, 56 | compute_type=mstype.float16 57 | ) 58 | bert_student_net_cfg = BertConfig( 59 | seq_length=128, 60 | vocab_size=30522, 61 | hidden_size=384, 62 | num_hidden_layers=4, 63 | num_attention_heads=12, 64 | intermediate_size=1536, 65 | hidden_act="gelu", 66 | hidden_dropout_prob=0.1, 67 | attention_probs_dropout_prob=0.1, 68 | max_position_embeddings=512, 69 | type_vocab_size=2, 70 | initializer_range=0.02, 71 | use_relative_positions=False, 72 | dtype=mstype.float32, 73 | compute_type=mstype.float16 74 | ) 75 | -------------------------------------------------------------------------------- /TinyBERT/requirements.txt: -------------------------------------------------------------------------------- 1 | # progress bars in model download and training scripts 2 | tqdm 3 | # Accessing files from S3 directly. 
4 | boto3 5 | # Used for downloading models over HTTP 6 | requests 7 | 8 | torch>=1.0.1 9 | scipy>=0.14.0 10 | seaborn -------------------------------------------------------------------------------- /TinyBERT/tinybert_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huawei-noah/Pretrained-Language-Model/0598f02d7fc4eaa7b4cbc1e9d7ab18bc875c24f9/TinyBERT/tinybert_overview.png -------------------------------------------------------------------------------- /TinyBERT/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.6.2" 2 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer 3 | 4 | 5 | from .modeling import (BertConfig, BertModel, BertForPreTraining, 6 | BertForMaskedLM, BertForNextSentencePrediction, 7 | TinyBertForSequenceClassification, 8 | load_tf_weights_in_bert) 9 | 10 | from .optimization import BertAdam 11 | 12 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path, WEIGHTS_NAME, CONFIG_NAME 13 | --------------------------------------------------------------------------------