├── .github
│   └── workflows
│       └── cnn_e2e.yml
├── .gitignore
├── Classification
│   └── cnns
│       ├── README.md
│       ├── alexnet_model.py
│       ├── benchmark.sh
│       ├── config.py
│       ├── data
│       │   ├── ILSVRC2012_val_00020287.JPEG
│       │   ├── fish.jpg
│       │   └── tiger.jpg
│       ├── docs
│       │   ├── resnet50_lr_schedule.png
│       │   └── resnet50_validation_acuracy.png
│       ├── evaluate.sh
│       ├── imagenet1000_clsidx_to_labels.py
│       ├── inception_model.py
│       ├── inference.sh
│       ├── job_function_util.py
│       ├── mobilenet_v2_model.py
│       ├── of_cnn_evaluate.py
│       ├── of_cnn_inference.py
│       ├── of_cnn_train_val.py
│       ├── ofrecord_util.py
│       ├── optimizer_util.py
│       ├── resnet2onnx.sh
│       ├── resnet_model.py
│       ├── resnet_to_onnx.py
│       ├── resnext_model.py
│       ├── tools
│       │   ├── README.md
│       │   ├── extract_trainval.sh
│       │   ├── imagenet_2012_validation_synset_labels.txt
│       │   ├── imagenet_lsvrc_2015_synsets.txt
│       │   ├── imagenet_metadata.txt
│       │   ├── imagenet_ofrecord.py
│       │   ├── preprocess_imagenet_validation_data.py
│       │   └── process_bounding_boxes.py
│       ├── train.sh
│       ├── train_fp16.sh
│       ├── train_fp32.sh
│       ├── util.py
│       └── vgg_model.py
├── ClickThroughRate
│   └── WideDeepLearning
│       ├── README.md
│       ├── how_to_make_hf_dataset.md
│       ├── how_to_make_ofrecord_for_wdl.md
│       ├── wdl_test_report.md
│       ├── wdl_train_eval.py
│       ├── wdl_train_eval_test.py
│       └── wdl_train_eval_with_hybrid_embd.py
├── Generative
│   ├── README.md
│   ├── dcgan.py
│   ├── layers.py
│   └── pic
│       ├── 1.png
│       └── 2.png
├── LanguageModeling
│   ├── BERT
│   │   ├── README.md
│   │   ├── bert.py
│   │   ├── classifier.py
│   │   ├── config.py
│   │   ├── convert_tf_ckpt_to_of.py
│   │   ├── pretrain.py
│   │   ├── run_classifier.py
│   │   ├── run_pretraining.py
│   │   ├── run_pretraining_adam.sh
│   │   ├── run_pretraining_lamb.sh
│   │   ├── run_squad.py
│   │   ├── run_squad.sh
│   │   ├── squad.py
│   │   ├── squad_util.py
│   │   ├── tokenization.py
│   │   └── util.py
│   └── GPT
│       ├── LICENSE
│       ├── README.md
│       ├── examples
│       │   ├── distribute_pretrain_2n4d.sh
│       │   ├── distribute_pretrain_4n8d.sh
│       │   ├── distribute_pretrain_4n8d_2x4x4_512_2304x24.sh
│       │   ├── distribute_pretrain_with_container.sh
│       │   ├── lambada_cloze_accuracy.sh
│       │   ├── pretrain.sh
│       │   ├── pretrain_117M.sh
│       │   ├── pretrain_1n8d_2x4x1_16_1536x16.sh
│       │   ├── pretrain_345M.sh
│       │   ├── pretrain_with_container.sh
│       │   └── pretrain_with_profile.sh
│       ├── oneflow_gpt
│       │   ├── __init__.py
│       │   ├── config.py
│       │   ├── data.py
│       │   ├── distribute.py
│       │   ├── model.py
│       │   ├── optimizer.py
│       │   ├── snapshot.py
│       │   ├── third_party
│       │   │   ├── __init__.py
│       │   │   └── data
│       │   │       ├── __init__.py
│       │   │       ├── gpt_dataset.py
│       │   │       └── indexed_dataset.py
│       │   ├── training.py
│       │   └── util.py
│       ├── requirements.txt
│       ├── setup.py
│       ├── tasks
│       │   ├── __init__.py
│       │   ├── main.py
│       │   └── zeroshot_gpt
│       │       ├── __init__.py
│       │       ├── datasets.py
│       │       └── evaluate.py
│       ├── tokenizer
│       │   ├── __init__.py
│       │   ├── gpt2_tokenization.py
│       │   └── tokenizer.py
│       └── tools
│           ├── README.md
│           ├── ansible_inventory
│           ├── compare_loss.py
│           ├── convert_py_model_to_of.py
│           ├── launch_container.py
│           ├── meta.proto
│           ├── meta_pb2.py
│           └── prepare_distribute.sh
├── README.md
└── reports
    ├── bert_fp32_report.md
    └── resnet50_v15_fp32_report.md

/.github/workflows/cnn_e2e.yml:
--------------------------------------------------------------------------------
name: 'resnet e2e test'
on:
  pull_request:
    types: [review_requested]
    branches:
      - "*"
  workflow_dispatch:
    inputs:
      placeholder:
        description: "placeholder, no effect"
        required: false

jobs:
  build:
    name: 'Build and test this repo'
    runs-on: ubuntu-latest
    steps:
      - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event."
19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # IDE 7 | .idea 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | output/ 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # pyenv 80 | .python-version 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # SageMath parsed files 86 | *.sage.py 87 | 88 | # Environments 89 | .env 90 | .venv 91 | env/ 92 | venv/ 93 | ENV/ 94 | env.bak/ 95 | venv.bak/ 96 | 97 | # Spyder project settings 98 | .spyderproject 99 | .spyproject 100 | 101 | # Rope project settings 102 | .ropeproject 103 | 104 | # mkdocs documentation 105 | /site 106 | 107 | # mypy 108 | .mypy_cache/ 109 | -------------------------------------------------------------------------------- /Classification/cnns/alexnet_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | 17 | import oneflow.compatible.single_client as flow 18 | 19 | 20 | def _get_kernel_initializer(data_format="NCHW"): 21 | return flow.variance_scaling_initializer( 22 | distribution="random_normal", data_format=data_format 23 | ) 24 | 25 | 26 | def _get_regularizer(): 27 | return flow.regularizers.l2(0.00005) 28 | 29 | 30 | def _get_bias_initializer(): 31 | return flow.zeros_initializer() 32 | 33 | 34 | def conv2d_layer( 35 | name, 36 | input, 37 | filters, 38 | kernel_size=3, 39 | strides=1, 40 | padding="SAME", 41 | data_format="NCHW", 42 | dilation_rate=1, 43 | activation="Relu", 44 | use_bias=True, 45 | bias_initializer=_get_bias_initializer(), 46 | weight_regularizer=_get_regularizer(), 47 | bias_regularizer=_get_regularizer(), 48 | ): 49 | if isinstance(kernel_size, int): 50 | kernel_size_1 = kernel_size 51 | kernel_size_2 = kernel_size 52 | if isinstance(kernel_size, list): 53 | kernel_size_1 = kernel_size[0] 54 | kernel_size_2 = kernel_size[1] 55 | 56 | weight_initializer = _get_kernel_initializer(data_format) 57 | weight_shape = ( 58 | (filters, input.shape[1], kernel_size_1, kernel_size_2) 59 | if data_format == "NCHW" 60 | else (filters, kernel_size_1, kernel_size_2, input.shape[3]) 61 | ) 62 | weight = flow.get_variable( 63 | name + "-weight", 64 | shape=weight_shape, 65 | dtype=input.dtype, 66 | initializer=weight_initializer, 67 | regularizer=weight_regularizer, 68 | ) 69 | output = flow.nn.conv2d( 70 | input, weight, strides, padding, None, data_format, dilation_rate, name=name 71 | ) 72 | if use_bias: 73 | bias = flow.get_variable( 74 | name + "-bias", 75 | shape=(filters,), 76 | dtype=input.dtype, 77 | initializer=bias_initializer, 78 | regularizer=bias_regularizer, 79 | ) 80 | output = flow.nn.bias_add(output, bias, data_format) 81 | 82 | if activation is not None: 83 | if activation == "Relu": 84 | output = flow.nn.relu(output) 85 | else: 86 | raise NotImplementedError 87 | 88 | return output 89 | 90 | 91 | def alexnet(images, args, trainable=True): 92 | data_format = "NHWC" if args.channel_last else "NCHW" 93 | 94 | conv1 = conv2d_layer( 95 | "conv1", 96 | images, 97 | filters=64, 98 | kernel_size=11, 99 | strides=4, 100 | padding="VALID", 101 | data_format=data_format, 102 | ) 103 | 104 | pool1 = flow.nn.avg_pool2d(conv1, 3, 2, "VALID", data_format, name="pool1") 105 | 106 | conv2 = conv2d_layer( 107 | "conv2", pool1, filters=192, kernel_size=5, data_format=data_format 108 | ) 109 | 110 | pool2 = flow.nn.avg_pool2d(conv2, 3, 2, "VALID", data_format, name="pool2") 111 | 112 | conv3 = conv2d_layer("conv3", pool2, filters=384, data_format=data_format) 113 | 114 | conv4 = conv2d_layer("conv4", conv3, filters=384, data_format=data_format) 115 | 116 | conv5 = conv2d_layer("conv5", conv4, filters=256, data_format=data_format) 117 | 118 | pool5 = flow.nn.avg_pool2d(conv5, 3, 2, "VALID", data_format, name="pool5") 119 | 120 | if len(pool5.shape) > 2: 121 | pool5 = flow.reshape(pool5, shape=(pool5.shape[0], -1)) 122 | 123 | fc1 = flow.layers.dense( 124 | inputs=pool5, 125 | units=4096, 126 | activation=flow.nn.relu, 127 | use_bias=True, 128 | # kernel_initializer=flow.random_uniform_initializer(), 129 | kernel_initializer=_get_kernel_initializer(), 130 | bias_initializer=_get_bias_initializer(), 131 | kernel_regularizer=_get_regularizer(), 132 | bias_regularizer=_get_regularizer(), 133 | trainable=trainable, 134 | name="fc1", 135 | ) 136 | 137 | dropout1 = flow.nn.dropout(fc1, rate=0.5) 138 | 139 | fc2 = flow.layers.dense( 140 | inputs=dropout1, 141 | units=4096, 
142 | activation=flow.nn.relu, 143 | use_bias=True, 144 | kernel_initializer=_get_kernel_initializer(), 145 | bias_initializer=_get_bias_initializer(), 146 | kernel_regularizer=_get_regularizer(), 147 | bias_regularizer=_get_regularizer(), 148 | trainable=trainable, 149 | name="fc2", 150 | ) 151 | 152 | dropout2 = flow.nn.dropout(fc2, rate=0.5) 153 | 154 | fc3 = flow.layers.dense( 155 | inputs=dropout2, 156 | units=1000, 157 | activation=None, 158 | use_bias=False, 159 | kernel_initializer=_get_kernel_initializer(), 160 | kernel_regularizer=_get_regularizer(), 161 | bias_initializer=False, 162 | trainable=trainable, 163 | name="fc3", 164 | ) 165 | 166 | return fc3 167 | -------------------------------------------------------------------------------- /Classification/cnns/benchmark.sh: -------------------------------------------------------------------------------- 1 | BENCH_ROOT=$1 2 | NUM_NODES=$2 3 | GPU_NUM_PER_NODE=$3 4 | BSZ_PER_DEVICE=$4 5 | 6 | if [ -n "$5" ]; then 7 | DATA_ROOT=$5 8 | else 9 | DATA_ROOT=/datasets/ImageNet/OneFlow 10 | fi 11 | 12 | DATA_PART_NUM=44 13 | 14 | rm -rf ./log 15 | mkdir ./log 16 | 17 | NUM_ITERS=300 18 | NUM_EXAMPLES=$(($NUM_NODES * $GPU_NUM_PER_NODE * $BSZ_PER_DEVICE * $NUM_ITERS)) 19 | 20 | export PYTHONUNBUFFERED=1 21 | echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED 22 | export NCCL_LAUNCH_MODE=PARALLEL 23 | echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE 24 | 25 | python3 $BENCH_ROOT/of_cnn_train_val.py \ 26 | --num_examples=$NUM_EXAMPLES \ 27 | --train_data_dir=$DATA_ROOT/train \ 28 | --train_data_part_num=$DATA_PART_NUM \ 29 | --num_nodes=$NUM_NODES \ 30 | --gpu_num_per_node=$GPU_NUM_PER_NODE \ 31 | --optimizer="sgd" \ 32 | --momentum=0.875 \ 33 | --label_smoothing=0.1 \ 34 | --learning_rate=0.001 \ 35 | --loss_print_every_n_iter=100 \ 36 | --batch_size_per_device=$BSZ_PER_DEVICE \ 37 | --val_batch_size_per_device=125 \ 38 | --num_epoch=1 \ 39 | --log_dir=./log \ 40 | --use_fp16 \ 41 | --channel_last=True \ 42 | --pad_output \ 43 | --fuse_bn_relu=True \ 44 | --fuse_bn_add_relu=True \ 45 | --nccl_fusion_threshold_mb=16 \ 46 | --nccl_fusion_max_ops=24 \ 47 | --gpu_image_decoder=True \ 48 | --model="resnet50" 49 | -------------------------------------------------------------------------------- /Classification/cnns/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | 17 | import argparse 18 | from datetime import datetime 19 | 20 | 21 | from optimizer_util import add_optimizer_args 22 | from ofrecord_util import add_ofrecord_args 23 | 24 | 25 | def get_parser(parser=None): 26 | def str_list(x): 27 | return [i.strip() for i in x.split(",")] 28 | 29 | def int_list(x): 30 | return list(map(int, x.split(","))) 31 | 32 | def float_list(x): 33 | return list(map(float, x.split(","))) 34 | 35 | def str2bool(v): 36 | if v.lower() in ("yes", "true", "t", "y", "1"): 37 | return True 38 | elif v.lower() in ("no", "false", "f", "n", "0"): 39 | return False 40 | else: 41 | raise argparse.ArgumentTypeError("Unsupported value encountered.") 42 | 43 | if parser is None: 44 | parser = argparse.ArgumentParser("flags for cnn benchmark") 45 | 46 | parser.add_argument("--dtype", type=str, default="float32", help="float16 float32") 47 | 48 | # resouce 49 | parser.add_argument("--gpu_num_per_node", type=int, default=1) 50 | parser.add_argument( 51 | "--num_nodes", type=int, default=1, help="node/machine number for training" 52 | ) 53 | parser.add_argument( 54 | "--node_ips", 55 | type=str_list, 56 | default=["192.168.1.13", "192.168.1.14"], 57 | help='nodes ip list for training, devided by ",", length >= num_nodes', 58 | ) 59 | parser.add_argument( 60 | "--ctrl_port", type=int, default=50051, help="ctrl_port for multinode job" 61 | ) 62 | 63 | parser.add_argument("--model", type=str, default="resnet50", help="resnet50") 64 | parser.add_argument( 65 | "--use_fp16", 66 | type=str2bool, 67 | nargs="?", 68 | const=True, 69 | help="Whether to use use fp16", 70 | ) 71 | parser.add_argument( 72 | "--use_xla", type=str2bool, nargs="?", const=True, help="Whether to use use xla" 73 | ) 74 | 75 | parser.add_argument( 76 | "--channel_last", 77 | type=str2bool, 78 | nargs="?", 79 | const=False, 80 | help="Whether to use use channel last mode(nhwc)", 81 | ) 82 | parser.add_argument( 83 | "--pad_output", 84 | type=str2bool, 85 | nargs="?", 86 | const=True, 87 | help="Whether to pad the output to number of image channels to 4.", 88 | ) 89 | 90 | # train and validaion 91 | parser.add_argument("--num_epochs", type=int, default=90, help="number of epochs") 92 | parser.add_argument( 93 | "--model_load_dir", type=str, default=None, help="model load directory if need" 94 | ) 95 | parser.add_argument( 96 | "--save_epoch_interval", 97 | type=int, 98 | default=10, 99 | help="Number of iterations between checkpoint saves.", 100 | ) 101 | parser.add_argument( 102 | "--save_last", 103 | action="store_true", 104 | default=False, 105 | help="save model snapshot for last iteration", 106 | ) 107 | parser.add_argument( 108 | "--save_init", 109 | action="store_true", 110 | default=False, 111 | help="save model snapshot for inited", 112 | ) 113 | parser.add_argument("--batch_size_per_device", type=int, default=64) 114 | parser.add_argument("--val_batch_size_per_device", type=int, default=8) 115 | 116 | parser.add_argument( 117 | "--nccl_fusion_threshold_mb", 118 | type=int, 119 | default=0, 120 | help="NCCL fusion threshold megabytes, set to 0 to compatible with previous version of OneFlow.", 121 | ) 122 | parser.add_argument( 123 | "--nccl_fusion_max_ops", 124 | type=int, 125 | default=0, 126 | help="Maximum number of ops of NCCL fusion, set to 0 to compatible with previous version of OneFlow.", 127 | ) 128 | 129 | # fuse bn relu or bn add relu 130 | parser.add_argument( 131 | "--fuse_bn_relu", 132 | type=str2bool, 133 | default=False, 134 | help="Whether to use use fuse batch normalization 
relu. Currently supported in origin/master of OneFlow only.", 135 | ) 136 | parser.add_argument( 137 | "--fuse_bn_add_relu", 138 | type=str2bool, 139 | default=False, 140 | help="Whether to use use fuse batch normalization add relu. Currently supported in origin/master of OneFlow only.", 141 | ) 142 | parser.add_argument( 143 | "--gpu_image_decoder", 144 | type=str2bool, 145 | default=False, 146 | help="Whether to use use ImageDecoderRandomCropResize.", 147 | ) 148 | # inference 149 | parser.add_argument( 150 | "--image_path", type=str, default="test_img/tiger.jpg", help="image path" 151 | ) 152 | 153 | # for data process 154 | parser.add_argument( 155 | "--num_classes", type=int, default=1000, help="num of pic classes" 156 | ) 157 | parser.add_argument( 158 | "--num_examples", type=int, default=1281167, help="train pic number" 159 | ) 160 | parser.add_argument( 161 | "--num_val_examples", type=int, default=50000, help="validation pic number" 162 | ) 163 | parser.add_argument( 164 | "--rgb-mean", 165 | type=float_list, 166 | default=[123.68, 116.779, 103.939], 167 | help="a tuple of size 3 for the mean rgb", 168 | ) 169 | parser.add_argument( 170 | "--rgb-std", 171 | type=float_list, 172 | default=[58.393, 57.12, 57.375], 173 | help="a tuple of size 3 for the std rgb", 174 | ) 175 | parser.add_argument( 176 | "--image-shape", 177 | type=int_list, 178 | default=[3, 224, 224], 179 | help="the image shape feed into the network", 180 | ) 181 | parser.add_argument( 182 | "--label_smoothing", type=float, default=0.1, help="label smoothing factor" 183 | ) 184 | 185 | # snapshot 186 | parser.add_argument( 187 | "--model_save_dir", 188 | type=str, 189 | default="./output/snapshots/model_save-{}".format( 190 | str(datetime.now().strftime("%Y%m%d%H%M%S")) 191 | ), 192 | help="model save directory", 193 | ) 194 | 195 | # log and loss print 196 | parser.add_argument( 197 | "--log_dir", type=str, default="./output", help="log info save directory" 198 | ) 199 | parser.add_argument( 200 | "--loss_print_every_n_iter", 201 | type=int, 202 | default=1, 203 | help="print loss every n iteration", 204 | ) 205 | add_ofrecord_args(parser) 206 | add_optimizer_args(parser) 207 | return parser 208 | 209 | 210 | def print_args(args): 211 | print("=".ljust(66, "=")) 212 | print( 213 | "Running {}: num_gpu_per_node = {}, num_nodes = {}.".format( 214 | args.model, args.gpu_num_per_node, args.num_nodes 215 | ) 216 | ) 217 | print("=".ljust(66, "=")) 218 | for arg in vars(args): 219 | print("{} = {}".format(arg, getattr(args, arg))) 220 | print("-".ljust(66, "-")) 221 | print("Time stamp: {}".format(str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S")))) 222 | 223 | 224 | if __name__ == "__main__": 225 | parser = get_parser() 226 | args = parser.parse_args() 227 | print_args(args) 228 | -------------------------------------------------------------------------------- /Classification/cnns/data/ILSVRC2012_val_00020287.JPEG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/OneFlow-Benchmark/de4698a7acce3cca2eaf1564eaf25d59c9954b86/Classification/cnns/data/ILSVRC2012_val_00020287.JPEG -------------------------------------------------------------------------------- /Classification/cnns/data/fish.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/OneFlow-Benchmark/de4698a7acce3cca2eaf1564eaf25d59c9954b86/Classification/cnns/data/fish.jpg 
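The sample images in this data folder are the inputs used by inference.sh, of_cnn_inference.py, and resnet_to_onnx.py. A minimal, self-contained sketch of the preprocessing they go through before entering the network (mirroring load_image() in of_cnn_inference.py, and assuming the default --rgb-mean, --rgb-std, and --image-shape values from config.py):

```python
import numpy as np
from PIL import Image

# Defaults mirror --rgb-mean / --rgb-std / --image-shape in config.py.
rgb_mean = [123.68, 116.779, 103.939]
rgb_std = [58.393, 57.12, 57.375]

im = Image.open("data/fish.jpg").convert("RGB").resize((224, 224))
arr = (np.asarray(im, dtype="float32") - rgb_mean) / rgb_std  # HWC, RGB order
arr = np.ascontiguousarray(arr.transpose(2, 0, 1)[None], dtype="float32")  # NCHW, batch of 1
print(arr.shape)  # (1, 3, 224, 224)
```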
-------------------------------------------------------------------------------- /Classification/cnns/data/tiger.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/OneFlow-Benchmark/de4698a7acce3cca2eaf1564eaf25d59c9954b86/Classification/cnns/data/tiger.jpg -------------------------------------------------------------------------------- /Classification/cnns/docs/resnet50_lr_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/OneFlow-Benchmark/de4698a7acce3cca2eaf1564eaf25d59c9954b86/Classification/cnns/docs/resnet50_lr_schedule.png -------------------------------------------------------------------------------- /Classification/cnns/docs/resnet50_validation_acuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/OneFlow-Benchmark/de4698a7acce3cca2eaf1564eaf25d59c9954b86/Classification/cnns/docs/resnet50_validation_acuracy.png -------------------------------------------------------------------------------- /Classification/cnns/evaluate.sh: -------------------------------------------------------------------------------- 1 | rm -rf core.* 2 | 3 | # Set up dataset root dir 4 | DATA_ROOT=/dataset/ImageNet/ofrecord 5 | 6 | # Set up model path, e.g. : vgg16_of_best_model_val_top1_721 alexnet_of_best_model_val_top1_54762 7 | MODEL_LOAD_DIR="resnet_v15_of_best_model_val_top1_77318" 8 | 9 | python3 of_cnn_evaluate.py \ 10 | --num_epochs=3 \ 11 | --num_val_examples=50000 \ 12 | --model_load_dir=$MODEL_LOAD_DIR \ 13 | --val_data_dir=$DATA_ROOT/validation \ 14 | --val_data_part_num=256 \ 15 | --num_nodes=1 \ 16 | --node_ips='127.0.0.1' \ 17 | --gpu_num_per_node=4 \ 18 | --val_batch_size_per_device=64 \ 19 | --model="resnet50" 20 | -------------------------------------------------------------------------------- /Classification/cnns/inference.sh: -------------------------------------------------------------------------------- 1 | rm -rf core.* 2 | 3 | # Set up model path, e.g. : vgg16_of_best_model_val_top1_721 alexnet_of_best_model_val_top1_54762 4 | MODEL_LOAD_DIR="resnet_v15_of_best_model_val_top1_77318" 5 | 6 | python3 of_cnn_inference.py \ 7 | --model="resnet50" \ 8 | --image_path="data/fish.jpg" \ 9 | --model_load_dir=$MODEL_LOAD_DIR -------------------------------------------------------------------------------- /Classification/cnns/job_function_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | 17 | import oneflow.compatible.single_client as flow 18 | 19 | 20 | def _default_config(args): 21 | config = flow.function_config() 22 | config.default_logical_view(flow.scope.consistent_view()) 23 | config.default_data_type(flow.float) 24 | if args.use_fp16: 25 | config.enable_auto_mixed_precision(True) 26 | if args.use_xla: 27 | config.use_xla_jit(True) 28 | config.enable_fuse_add_to_output(True) 29 | return config 30 | 31 | 32 | def get_train_config(args): 33 | train_config = _default_config(args) 34 | train_config.cudnn_conv_heuristic_search_algo(False) 35 | 36 | train_config.prune_parallel_cast_ops(True) 37 | train_config.enable_inplace(True) 38 | if args.num_nodes > 1: 39 | train_config.cudnn_conv_heuristic_search_algo(True) 40 | else: 41 | train_config.cudnn_conv_heuristic_search_algo(False) 42 | train_config.enable_fuse_model_update_ops(True) 43 | return train_config 44 | 45 | 46 | def get_val_config(args): 47 | return _default_config(args) 48 | -------------------------------------------------------------------------------- /Classification/cnns/of_cnn_evaluate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | import os 18 | import time 19 | import math 20 | import numpy as np 21 | 22 | import config as configs 23 | 24 | parser = configs.get_parser() 25 | args = parser.parse_args() 26 | configs.print_args(args) 27 | 28 | from util import Snapshot, InitNodes, Metric 29 | import ofrecord_util 30 | from job_function_util import get_train_config, get_val_config 31 | import oneflow.compatible.single_client as flow 32 | import vgg_model 33 | import resnet_model 34 | import resnext_model 35 | import alexnet_model 36 | import mobilenet_v2_model 37 | 38 | 39 | total_device_num = args.num_nodes * args.gpu_num_per_node 40 | val_batch_size = total_device_num * args.val_batch_size_per_device 41 | (C, H, W) = args.image_shape 42 | num_val_steps = int(args.num_val_examples / val_batch_size) 43 | 44 | 45 | model_dict = { 46 | "resnet50": resnet_model.resnet50, 47 | "vgg": vgg_model.vgg16bn, 48 | "alexnet": alexnet_model.alexnet, 49 | "mobilenetv2": mobilenet_v2_model.Mobilenet, 50 | "resnext50": resnext_model.resnext50, 51 | } 52 | 53 | 54 | flow.config.gpu_device_num(args.gpu_num_per_node) 55 | # flow.config.enable_debug_mode(True) 56 | @flow.global_function("predict", get_val_config(args)) 57 | def InferenceNet(): 58 | assert os.path.exists(args.val_data_dir) 59 | print("Loading data from {}".format(args.val_data_dir)) 60 | (labels, images) = ofrecord_util.load_imagenet_for_validation(args) 61 | 62 | logits = model_dict[args.model](images, args) 63 | predictions = flow.nn.softmax(logits) 64 | outputs = {"predictions": predictions, "labels": labels} 65 | return outputs 66 | 67 | 68 | def main(): 69 | InitNodes(args) 70 | assert args.model_load_dir, "Must have model load dir!" 

    flow.env.log_dir(args.log_dir)
    # snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    print("Restoring model from {}.".format(args.model_load_dir))
    flow.load_variables(flow.checkpoint.get(args.model_load_dir))
    metric = Metric(
        desc="validation", calculate_batches=num_val_steps, batch_size=val_batch_size
    )

    for i in range(args.num_epochs):
        for j in range(num_val_steps):
            InferenceNet().async_get(metric.metric_cb(0, j))


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/Classification/cnns/of_cnn_inference.py:
--------------------------------------------------------------------------------
"""
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import os
import numpy as np
from PIL import Image

import config as configs

parser = configs.get_parser()
args = parser.parse_args()
configs.print_args(args)

import oneflow.compatible.single_client as flow
import oneflow.compatible.single_client.typing as tp
from imagenet1000_clsidx_to_labels import clsidx_2_labels

import resnet_model
import resnext_model
import vgg_model
import alexnet_model
import mobilenet_v2_model

model_dict = {
    "resnet50": resnet_model.resnet50,
    "vgg": vgg_model.vgg16bn,
    "alexnet": alexnet_model.alexnet,
    "mobilenetv2": mobilenet_v2_model.Mobilenet,
    "resnext50": resnext_model.resnext50,
}


def load_image(image_path="test_img/ILSVRC2012_val_00020287.JPEG"):
    print(image_path)
    im = Image.open(image_path)
    im = im.resize((224, 224))
    im = im.convert("RGB")  # some images are single-channel; without this conversion they would raise an error
    im = np.array(im).astype("float32")
    im = (im - args.rgb_mean) / args.rgb_std
    im = np.transpose(im, (2, 0, 1))
    im = np.expand_dims(im, axis=0)
    return np.ascontiguousarray(im, "float32")


@flow.global_function("predict", flow.function_config())
def InferenceNet(
    images: tp.Numpy.Placeholder((1, 3, 224, 224), dtype=flow.float)
) -> tp.Numpy:
    logits = model_dict[args.model](images, args)
    predictions = flow.nn.softmax(logits)
    return predictions


def main():
    flow.env.log_dir(args.log_dir)
    assert os.path.isdir(args.model_load_dir)
    flow.load_variables(flow.checkpoint.get(args.model_load_dir))
    image = load_image(args.image_path)
    predictions = InferenceNet(image)
    clsidx = predictions.argmax()
    print(predictions.max(), clsidx_2_labels[clsidx])


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/Classification/cnns/of_cnn_train_val.py:
--------------------------------------------------------------------------------
"""
Copyright 2020 The OneFlow Authors.
All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | import os 17 | import math 18 | import oneflow.compatible.single_client as flow 19 | import ofrecord_util 20 | import optimizer_util 21 | import config as configs 22 | from util import Snapshot, InitNodes, Metric 23 | from job_function_util import get_train_config, get_val_config 24 | import resnet_model 25 | import resnext_model 26 | import vgg_model 27 | import alexnet_model 28 | import inception_model 29 | import mobilenet_v2_model 30 | 31 | parser = configs.get_parser() 32 | args = parser.parse_args() 33 | configs.print_args(args) 34 | 35 | total_device_num = args.num_nodes * args.gpu_num_per_node 36 | train_batch_size = total_device_num * args.batch_size_per_device 37 | val_batch_size = total_device_num * args.val_batch_size_per_device 38 | (C, H, W) = args.image_shape 39 | epoch_size = math.ceil(args.num_examples / train_batch_size) 40 | num_val_steps = int(args.num_val_examples / val_batch_size) 41 | 42 | 43 | model_dict = { 44 | "resnet50": resnet_model.resnet50, 45 | "vgg": vgg_model.vgg16bn, 46 | "alexnet": alexnet_model.alexnet, 47 | "inceptionv3": inception_model.inceptionv3, 48 | "mobilenetv2": mobilenet_v2_model.Mobilenet, 49 | "resnext50": resnext_model.resnext50, 50 | } 51 | 52 | 53 | flow.config.gpu_device_num(args.gpu_num_per_node) 54 | # flow.config.enable_debug_mode(True) 55 | 56 | if args.use_fp16 and args.num_nodes * args.gpu_num_per_node > 1: 57 | flow.config.collective_boxing.nccl_fusion_all_reduce_use_buffer(False) 58 | 59 | if args.nccl_fusion_threshold_mb: 60 | flow.config.collective_boxing.nccl_fusion_threshold_mb( 61 | args.nccl_fusion_threshold_mb 62 | ) 63 | 64 | if args.nccl_fusion_max_ops: 65 | flow.config.collective_boxing.nccl_fusion_max_ops(args.nccl_fusion_max_ops) 66 | 67 | 68 | def label_smoothing(labels, classes, eta, dtype): 69 | assert classes > 0 70 | assert eta >= 0.0 and eta < 1.0 71 | return flow.one_hot( 72 | labels, 73 | depth=classes, 74 | dtype=dtype, 75 | on_value=1 - eta + eta / classes, 76 | off_value=eta / classes, 77 | ) 78 | 79 | 80 | @flow.global_function("train", get_train_config(args)) 81 | def TrainNet(): 82 | if args.train_data_dir: 83 | assert os.path.exists(args.train_data_dir) 84 | print("Loading data from {}".format(args.train_data_dir)) 85 | (labels, images) = ofrecord_util.load_imagenet_for_training(args) 86 | 87 | else: 88 | print("Loading synthetic data.") 89 | (labels, images) = ofrecord_util.load_synthetic(args) 90 | logits = model_dict[args.model](images, args) 91 | if args.label_smoothing > 0: 92 | one_hot_labels = label_smoothing( 93 | labels, args.num_classes, args.label_smoothing, logits.dtype 94 | ) 95 | loss = flow.nn.softmax_cross_entropy_with_logits( 96 | one_hot_labels, logits, name="softmax_loss" 97 | ) 98 | else: 99 | loss = flow.nn.sparse_softmax_cross_entropy_with_logits( 100 | labels, logits, name="softmax_loss" 101 | ) 102 | 103 | loss = flow.math.reduce_mean(loss) 104 | predictions = flow.nn.softmax(logits) 
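    # Worked example for label_smoothing() above: with classes=1000 and eta=0.1,
    # each on-target entry becomes 1 - 0.1 + 0.1/1000 = 0.9001 and each off-target
    # entry becomes 0.1/1000 = 0.0001, so every smoothed label row still sums to 1.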
105 | outputs = {"loss": loss, "predictions": predictions, "labels": labels} 106 | 107 | # set up warmup,learning rate and optimizer 108 | optimizer_util.set_up_optimizer(loss, args) 109 | return outputs 110 | 111 | 112 | @flow.global_function("predict", get_val_config(args)) 113 | def InferenceNet(): 114 | if args.val_data_dir: 115 | assert os.path.exists(args.val_data_dir) 116 | print("Loading data from {}".format(args.val_data_dir)) 117 | (labels, images) = ofrecord_util.load_imagenet_for_validation(args) 118 | 119 | else: 120 | print("Loading synthetic data.") 121 | (labels, images) = ofrecord_util.load_synthetic(args) 122 | 123 | logits = model_dict[args.model](images, args) 124 | predictions = flow.nn.softmax(logits) 125 | outputs = {"predictions": predictions, "labels": labels} 126 | return outputs 127 | 128 | 129 | def main(): 130 | InitNodes(args) 131 | flow.env.log_dir(args.log_dir) 132 | 133 | snapshot = Snapshot(args.model_save_dir, args.model_load_dir, args.save_init) 134 | 135 | print(" {} iter per epoch...".format(epoch_size)) 136 | 137 | for epoch in range(1, args.num_epochs + 1): 138 | metric = Metric( 139 | desc="train", 140 | calculate_batches=args.loss_print_every_n_iter, 141 | batch_size=train_batch_size, 142 | loss_key="loss", 143 | ) 144 | for i in range(epoch_size): 145 | TrainNet().async_get(metric.metric_cb(epoch, i)) 146 | 147 | if args.val_data_dir: 148 | metric = Metric( 149 | desc="validation", 150 | calculate_batches=num_val_steps, 151 | batch_size=val_batch_size, 152 | ) 153 | for i in range(num_val_steps): 154 | InferenceNet().async_get(metric.metric_cb(epoch, i)) 155 | if epoch % args.save_epoch_interval == 0: 156 | snapshot.save("epoch_{}".format(epoch)) 157 | 158 | if args.save_last: 159 | snapshot.save("epoch_{}".format("last")) 160 | 161 | 162 | if __name__ == "__main__": 163 | main() 164 | -------------------------------------------------------------------------------- /Classification/cnns/ofrecord_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | 17 | import oneflow.compatible.single_client as flow 18 | 19 | 20 | def add_ofrecord_args(parser): 21 | parser.add_argument( 22 | "--image_size", type=int, default=224, required=False, help="image size" 23 | ) 24 | parser.add_argument( 25 | "--resize_shorter", 26 | type=int, 27 | default=256, 28 | required=False, 29 | help="resize shorter for validation", 30 | ) 31 | parser.add_argument( 32 | "--train_data_dir", type=str, default=None, help="train dataset directory" 33 | ) 34 | parser.add_argument( 35 | "--train_data_part_num", type=int, default=256, help="train data part num" 36 | ) 37 | parser.add_argument( 38 | "--val_data_dir", type=str, default=None, help="val dataset directory" 39 | ) 40 | parser.add_argument( 41 | "--val_data_part_num", type=int, default=256, help="val data part num" 42 | ) 43 | return parser 44 | 45 | 46 | def load_synthetic(args): 47 | total_device_num = args.num_nodes * args.gpu_num_per_node 48 | batch_size = total_device_num * args.batch_size_per_device 49 | label = flow.data.decode_random( 50 | shape=(), 51 | dtype=flow.int32, 52 | batch_size=batch_size, 53 | initializer=flow.zeros_initializer(flow.int32), 54 | ) 55 | 56 | shape = ( 57 | (args.image_size, args.image_size, 3) 58 | if args.channel_last 59 | else (3, args.image_size, args.image_size) 60 | ) 61 | image = flow.data.decode_random( 62 | shape=shape, dtype=flow.float, batch_size=batch_size 63 | ) 64 | 65 | return label, image 66 | 67 | 68 | def load_imagenet_for_training(args): 69 | total_device_num = args.num_nodes * args.gpu_num_per_node 70 | train_batch_size = total_device_num * args.batch_size_per_device 71 | output_layout = "NHWC" if args.channel_last else "NCHW" 72 | 73 | color_space = "RGB" 74 | ofrecord = flow.data.ofrecord_reader( 75 | args.train_data_dir, 76 | batch_size=train_batch_size, 77 | data_part_num=args.train_data_part_num, 78 | part_name_suffix_length=5, 79 | random_shuffle=True, 80 | shuffle_after_epoch=True, 81 | ) 82 | label = flow.data.OFRecordRawDecoder( 83 | ofrecord, "class/label", shape=(), dtype=flow.int32 84 | ) 85 | if args.gpu_image_decoder: 86 | encoded = flow.data.OFRecordBytesDecoder(ofrecord, "encoded") 87 | image = flow.data.ImageDecoderRandomCropResize( 88 | encoded, target_width=224, target_height=224, num_workers=3, warmup_size=2048 89 | ) 90 | else: 91 | image = flow.data.OFRecordImageDecoderRandomCrop( 92 | ofrecord, "encoded", color_space=color_space # seed=seed, 93 | ) 94 | rsz = flow.image.Resize(image, target_size=[args.image_size, args.image_size]) 95 | image = rsz[0] 96 | 97 | rng = flow.random.CoinFlip(batch_size=train_batch_size) # , seed=seed) 98 | normal = flow.image.CropMirrorNormalize( 99 | image, 100 | mirror_blob=rng, 101 | color_space=color_space, 102 | output_layout=output_layout, 103 | mean=args.rgb_mean, 104 | std=args.rgb_std, 105 | output_dtype=flow.float, 106 | ) 107 | return label, normal 108 | 109 | 110 | def load_imagenet_for_validation(args): 111 | total_device_num = args.num_nodes * args.gpu_num_per_node 112 | val_batch_size = total_device_num * args.val_batch_size_per_device 113 | output_layout = "NHWC" if args.channel_last else "NCHW" 114 | 115 | color_space = "RGB" 116 | ofrecord = flow.data.ofrecord_reader( 117 | args.val_data_dir, 118 | batch_size=val_batch_size, 119 | data_part_num=args.val_data_part_num, 120 | part_name_suffix_length=5, 121 | shuffle_after_epoch=False, 122 | ) 123 | image = flow.data.OFRecordImageDecoder(ofrecord, "encoded", color_space=color_space) 124 | label = flow.data.OFRecordRawDecoder( 125 | 
ofrecord, "class/label", shape=(), dtype=flow.int32 126 | ) 127 | 128 | rsz = flow.image.Resize( 129 | image, 130 | resize_side="shorter", 131 | keep_aspect_ratio=True, 132 | target_size=args.resize_shorter, 133 | ) 134 | 135 | normal = flow.image.CropMirrorNormalize( 136 | rsz[0], 137 | color_space=color_space, 138 | output_layout=output_layout, 139 | crop_h=args.image_size, 140 | crop_w=args.image_size, 141 | crop_pos_y=0.5, 142 | crop_pos_x=0.5, 143 | mean=args.rgb_mean, 144 | std=args.rgb_std, 145 | output_dtype=flow.float, 146 | ) 147 | return label, normal 148 | 149 | 150 | if __name__ == "__main__": 151 | import os 152 | import config as configs 153 | from util import InitNodes, Metric 154 | from job_function_util import get_val_config 155 | 156 | parser = configs.get_parser() 157 | args = parser.parse_args() 158 | configs.print_args(args) 159 | 160 | flow.config.gpu_device_num(args.gpu_num_per_node) 161 | # flow.config.enable_debug_mode(True) 162 | @flow.global_function(get_val_config(args)) 163 | def IOTest(): 164 | if args.train_data_dir: 165 | assert os.path.exists(args.train_data_dir) 166 | print("Loading data from {}".format(args.train_data_dir)) 167 | (labels, images) = load_imagenet_for_training(args) 168 | else: 169 | print("Loading synthetic data.") 170 | (labels, images) = load_synthetic(args) 171 | outputs = {"images": images, "labels": labels} 172 | return outputs 173 | 174 | total_device_num = args.num_nodes * args.gpu_num_per_node 175 | train_batch_size = total_device_num * args.batch_size_per_device 176 | metric = Metric( 177 | desc="io_test", 178 | calculate_batches=args.loss_print_every_n_iter, 179 | batch_size=train_batch_size, 180 | prediction_key=None, 181 | ) 182 | for i in range(1000): 183 | IOTest().async_get(metric.metric_cb(0, i)) 184 | -------------------------------------------------------------------------------- /Classification/cnns/optimizer_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | import oneflow.compatible.single_client as flow 17 | import math 18 | import pprint 19 | 20 | 21 | def add_optimizer_args(parser): 22 | group = parser.add_argument_group( 23 | "optimizer parameters", "entire group applies only to optimizer parameters" 24 | ) 25 | group.add_argument( 26 | "--optimizer", type=str, default="sgd", help="sgd, adam, rmsprop" 27 | ) 28 | group.add_argument("--learning_rate", type=float, default=0.256) 29 | group.add_argument("--wd", type=float, default=1.0 / 32768, help="weight decay") 30 | group.add_argument("--momentum", type=float, default=0.875, help="momentum") 31 | group.add_argument( 32 | "--lr_decay", 33 | type=str, 34 | default="cosine", 35 | help="cosine, step, polynomial, exponential, None", 36 | ) 37 | group.add_argument( 38 | "--lr_decay_rate", 39 | type=float, 40 | default="0.94", 41 | help="exponential learning decay rate", 42 | ) 43 | group.add_argument( 44 | "--lr_decay_epochs", 45 | type=int, 46 | default=2, 47 | help="exponential learning rate decay every n epochs", 48 | ) 49 | group.add_argument( 50 | "--warmup_epochs", 51 | type=int, 52 | default=5, 53 | help="the epochs to warmp-up lr to scaled large-batch value", 54 | ) 55 | group.add_argument( 56 | "--decay_rate", type=float, default="0.9", help="decay rate of RMSProp" 57 | ) 58 | group.add_argument("--epsilon", type=float, default="1", help="epsilon") 59 | group.add_argument( 60 | "--gradient_clipping", type=float, default=0.0, help="gradient clipping" 61 | ) 62 | return parser 63 | 64 | 65 | def set_up_optimizer(loss, args): 66 | total_device_num = args.num_nodes * args.gpu_num_per_node 67 | train_batch_size = total_device_num * args.batch_size_per_device 68 | batches_per_epoch = math.ceil(args.num_examples / train_batch_size) 69 | warmup_batches = batches_per_epoch * args.warmup_epochs 70 | num_train_batches = batches_per_epoch * args.num_epochs 71 | decay_batches = num_train_batches 72 | exponential_decay_batches = batches_per_epoch * args.lr_decay_epochs 73 | 74 | # set up warmup strategy 75 | warmup = ( 76 | flow.optimizer.warmup.linear(warmup_batches, 0) if warmup_batches > 0 else None 77 | ) 78 | 79 | # set up grad_clipping 80 | grad_clipping = ( 81 | flow.optimizer.grad_clipping.by_global_norm(args.gradient_clipping) 82 | if args.gradient_clipping > 0.0 83 | else None 84 | ) 85 | 86 | # set up learning rate scheduler 87 | if args.lr_decay == "cosine": 88 | # CosineScheduler 89 | lr_scheduler = flow.optimizer.CosineScheduler( 90 | base_lr=args.learning_rate, steps=decay_batches, warmup=warmup 91 | ) 92 | elif args.lr_decay == "step": 93 | # PiecewiseScalingScheduler 94 | lr_scheduler = flow.optimizer.PiecewiseScalingScheduler( 95 | base_lr=args.learning_rate, 96 | boundaries=[30, 60, 80], 97 | scale=[0.1, 0.01, 0.001], 98 | warmup=warmup, 99 | ) 100 | elif args.lr_decay == "polynomial": 101 | # PolynomialScheduler 102 | lr_scheduler = flow.optimizer.PolynomialScheduler( 103 | base_lr=args.learning_rate, 104 | steps=decay_batches, 105 | end_learning_rate=0.00001, 106 | power=1.0, 107 | cycle=False, 108 | warmup=warmup, 109 | ) 110 | elif args.lr_decay == "exponential": 111 | # ExponentialScheduler 112 | lr_scheduler = flow.optimizer.ExponentialScheduler( 113 | base_lr=args.learning_rate, 114 | steps=exponential_decay_batches, 115 | decay_rate=args.lr_decay_rate, 116 | staircase=False, 117 | warmup=warmup, 118 | ) 119 | else: 120 | lr_scheduler = flow.optimizer.PiecewiseScalingScheduler( 121 | base_lr=args.learning_rate, 122 | boundaries=[args.num_epochs], 123 | 
scale=[1.0], 124 | warmup=warmup, 125 | ) 126 | 127 | # set up optimizer 128 | loss_scale_policy = None 129 | if args.use_fp16: 130 | loss_scale_policy = flow.optimizer.loss_scale.dynamic_loss_scale( 131 | increment_period=2000 132 | ) 133 | if args.optimizer == "sgd": 134 | print("Optimizer: SGD") 135 | flow.optimizer.SGD( 136 | lr_scheduler, 137 | momentum=args.momentum if args.momentum > 0 else None, 138 | grad_clipping=grad_clipping, 139 | loss_scale_policy=loss_scale_policy, 140 | ).minimize(loss) 141 | elif args.optimizer == "adam": 142 | if args.wd > 0 and args.wd < 1.0: 143 | print("Optimizer: AdamW") 144 | flow.optimizer.AdamW( 145 | lr_scheduler=lr_scheduler, 146 | weight_decay=args.wd, 147 | weight_decay_excludes="_bn-", 148 | grad_clipping=grad_clipping, 149 | epsilon=args.epsilon, 150 | loss_scale_policy=loss_scale_policy, 151 | ).minimize(loss) 152 | else: 153 | print("Optimizer: Adam") 154 | flow.optimizer.Adam( 155 | lr_scheduler=lr_scheduler, 156 | grad_clipping=grad_clipping, 157 | epsilon=args.epsilon, 158 | loss_scale_policy=loss_scale_policy, 159 | ).minimize(loss) 160 | elif args.optimizer == "rmsprop": 161 | print("Optimizer: RMSProp") 162 | flow.optimizer.RMSProp( 163 | lr_scheduler=lr_scheduler, 164 | decay_rate=args.decay_rate, 165 | epsilon=args.epsilon, 166 | loss_scale_policy=loss_scale_policy, 167 | ).minimize(loss) 168 | 169 | 170 | if __name__ == "__main__": 171 | import config as configs 172 | 173 | parser = configs.get_parser() 174 | args = parser.parse_args() 175 | configs.print_args(args) 176 | -------------------------------------------------------------------------------- /Classification/cnns/resnet2onnx.sh: -------------------------------------------------------------------------------- 1 | python3 resnet_to_onnx.py \ 2 | --channel_last=False \ 3 | --fuse_bn_relu=False \ 4 | --fuse_bn_add_relu=False 5 | -------------------------------------------------------------------------------- /Classification/cnns/resnet_to_onnx.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | http://www.apache.org/licenses/LICENSE-2.0 7 | Unless required by applicable law or agreed to in writing, software 8 | distributed under the License is distributed on an "AS IS" BASIS, 9 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | See the License for the specific language governing permissions and 11 | limitations under the License. 
12 | """ 13 | 14 | from collections import OrderedDict 15 | import os 16 | from PIL import Image 17 | import time 18 | from typing import Callable, Text 19 | 20 | import numpy as np 21 | import oneflow.compatible.single_client as flow 22 | import oneflow.compatible.single_client.typing as tp 23 | import onnx 24 | import onnxruntime as ort 25 | 26 | from resnet_model import resnet50 27 | import config as configs 28 | from imagenet1000_clsidx_to_labels import clsidx_2_labels 29 | from oneflow_onnx.oneflow2onnx.util import export_onnx_model 30 | 31 | parser = configs.get_parser() 32 | args = parser.parse_args() 33 | 34 | 35 | def load_image(image_path: Text) -> np.ndarray: 36 | rgb_mean = [123.68, 116.779, 103.939] 37 | rgb_std = [58.393, 57.12, 57.375] 38 | print(image_path) 39 | im = Image.open(image_path) 40 | im = im.resize((224, 224)) 41 | im = im.convert("RGB") # 有的图像是单通道的,不加转换会报错 42 | im = np.array(im).astype("float32") 43 | im = (im - rgb_mean) / rgb_std 44 | im = np.transpose(im, (2, 0, 1)) 45 | im = np.expand_dims(im, axis=0) 46 | return np.ascontiguousarray(im, "float32") 47 | 48 | 49 | @flow.global_function("predict") 50 | def InferenceNet( 51 | images: tp.Numpy.Placeholder((1, 3, 224, 224), dtype=flow.float) 52 | ) -> tp.Numpy: 53 | logits = resnet50(images, args, training=False) 54 | predictions = flow.nn.softmax(logits) 55 | return predictions 56 | 57 | 58 | def onnx_inference(image: np.ndarray, onnx_model: onnx.ModelProto): 59 | """ 60 | test onnx model with onnx runtime 61 | :param image: input image, a numpy array 62 | :param onnx_model: onnx model 63 | :return: 64 | """ 65 | assert os.path.isfile(image_path) 66 | sess = ort.InferenceSession(onnx_model.SerializeToString()) 67 | assert len(sess.get_outputs()) == 1 and len(sess.get_inputs()) <= 1 68 | ipt_dict = OrderedDict() 69 | for ipt in sess.get_inputs(): 70 | ipt_dict[ipt.name] = image 71 | onnx_res = sess.run([], ipt_dict)[0] 72 | return onnx_res 73 | 74 | 75 | def oneflow_to_onnx( 76 | job_func: Callable, 77 | flow_weights_path: Text, 78 | onnx_model_dir: Text, 79 | external_data: bool = False, 80 | ): 81 | """ 82 | convert oneflow model to onnx model 83 | :param job_func: inference function in oneflow 84 | :param flow_weights_path: input oneflow model path 85 | :param onnx_model_dir: output dir path to save model.onnx 86 | :return: onnx model 87 | """ 88 | if not os.path.exists(onnx_model_dir): 89 | os.makedirs(onnx_model_dir) 90 | assert os.path.exists(flow_weights_path) and os.path.isdir(onnx_model_dir) 91 | 92 | onnx_model_path = os.path.join( 93 | onnx_model_dir, "model.onnx" 94 | ) 95 | export_onnx_model( 96 | job_func, 97 | flow_weight_dir=flow_weights_path, 98 | onnx_model_path=onnx_model_dir, 99 | opset=11, 100 | external_data=external_data, 101 | ) 102 | print("Convert to onnx success! 
>> ", onnx_model_path) 103 | return onnx.load_model(onnx_model_path) 104 | 105 | 106 | def check_equality( 107 | job_func: Callable, onnx_model: onnx.ModelProto, image_path: Text 108 | ) -> (bool, np.ndarray): 109 | image = load_image(image_path) 110 | onnx_res = onnx_inference(image, onnx_model) 111 | oneflow_res = job_func(image) 112 | is_equal = np.allclose(onnx_res, oneflow_res, rtol=1e-4, atol=1e-5) 113 | return is_equal, onnx_res 114 | 115 | 116 | if __name__ == "__main__": 117 | image_path = "data/tiger.jpg" 118 | # set up your model path 119 | flow_weights_path = "resnet_v15_of_best_model_val_top1_77318" 120 | onnx_model_dir = "onnx/model" 121 | 122 | flow.load_variables(flow.checkpoint.get(flow_weights_path)) 123 | 124 | # conver oneflow to onnx 125 | onnx_model = oneflow_to_onnx( 126 | InferenceNet, flow_weights_path, onnx_model_dir, external_data=False 127 | ) 128 | 129 | # check equality 130 | are_equal, onnx_res = check_equality(InferenceNet, onnx_model, image_path) 131 | clsidx_onnx = onnx_res.argmax() 132 | print("Are the results equal? {}".format("Yes" if are_equal else "No")) 133 | print("Class: {}; score: {}".format(clsidx_2_labels[clsidx_onnx], onnx_res.max())) -------------------------------------------------------------------------------- /Classification/cnns/tools/README.md: -------------------------------------------------------------------------------- 1 | # Tools使用说明 2 | ## 简介 3 | tools文件夹中存放的文件和python代码专门用于 **ImageNet(2012)数据集** 制作工具。通过下面的使用说明,你可以将ImageNet(2012)从原始格式转换为通用图像格式的数据集,再转换为可在OneFlow中直接训练的 **OFRecord** 格式。 4 | 5 | #### 原始数据集 6 | 7 | 往往是由成千上万的图片或文本等文件组成,这些文件被散列存储在不同的文件夹中,一个个读取的时候会非常慢,并且占用大量内存空间。 8 | 9 | #### OFRecord 10 | **OFRecord提高IO效率** 11 | 12 | 内部借助“Protocol Buffer”二进制数据编码方案,它只占用一个内存块,只需要一次性加载一个二进制文件的方式即可,简单,快速,尤其对大型训练数据很友好。另外,当我们的训练数据量比较大的时候,可以将数据分成多个OFRecord文件,来提高处理效率。 13 | 14 | 关于OFRecord的详细说明请参考:[OFRecord数据格式](https://github.com/Oneflow-Inc/oneflow-documentation/blob/master/cn/docs/extended_topics/ofrecord.md) 15 | 16 | 17 | 18 | ## 数据集制作 19 | 20 | ### 将ImageNet转换成OFRecord 21 | 22 | 在OneFlow中,提供了将原始ImageNet2012数据集文件转换成OFRecord格式的脚本,如果您已经下载过,且准备好了ImageNet2012通用图像格式的数据集,并且训练集/验证集的格式如下: 23 | 24 | ```shell 25 | │ ├── train 26 | │ │ ├── n01440764 27 | │ │ └── n01443537 28 | ... 29 | │ └── validation 30 | │ ├── n01440764 31 | │ └── n01443537 32 | ... 
33 | ``` 34 | 35 | 那么,您只需要下载:[imagenet_2012_bounding_boxes.csv](https://oneflow-public.oss-cn-beijing.aliyuncs.com/online_document/dataset/imagenet/imagenet_2012_bounding_boxes.zip) 36 | 37 | 然后执行以下脚本即可完成训练集/验证集 > OFRecord的转换: 38 | 39 | #### 转换训练集 40 | 41 | ```shell 42 | python3 imagenet_ofrecord.py \ 43 | --train_directory ../data/imagenet/train \ 44 | --output_directory ../data/imagenet/ofrecord/train \ 45 | --label_file imagenet_lsvrc_2015_synsets.txt \ 46 | --shards 256 --num_threads 8 --name train \ 47 | --bounding_box_file imagenet_2012_bounding_boxes.csv \ 48 | --height 224 --width 224 49 | ``` 50 | 51 | #### 转换验证集 52 | 53 | ```shell 54 | python3 imagenet_ofrecord.py \ 55 | --validation_directory ../data/imagenet/validation \ 56 | --output_directory ../data/imagenet/ofrecord/validation \ 57 | --label_file imagenet_lsvrc_2015_synsets.txt --name validation \ 58 | --shards 256 --num_threads 8 --name validation \ 59 | --bounding_box_file imagenet_2012_bounding_boxes.csv \ 60 | --height 224 --width 224 61 | ``` 62 | 63 | #### 参数说明 64 | 65 | ```shell 66 | --train_directory 67 | # 指定待转换的训练集文件夹路径 68 | --validation_directory 69 | # 指定待转换的验证集文件夹路径 70 | --name 71 | # 指定转换的是训练集还是验证集 72 | --output_directory 73 | # 指定转换后的ofrecord存储位置 74 | --num_threads 75 | # 任务运行线程数 76 | --shards 77 | # 指定ofrecord分片数量,建议shards = 256 78 | #(shards数量越大,则转换后的每个ofrecord分片数据量就越少) 79 | --bounding_box_file 80 | # 该参数指定的csv文件中标记了所有目标box的坐标,使转换后的ofrecord同时支持分类和目标检测任务 81 | ``` 82 | 83 | 运行以上脚本后,你可以在../data/imagenet/ofrecord/validation、../data/imagenet/ofrecord/train下看到转换好的ofrecord文件: 84 | 85 | ```shell 86 | . 87 | ├── train 88 | │ ├── part-00000 89 | │ └── part-00001 90 | ... 91 | └── validation 92 | ├── part-00000 93 | └── part-00001 94 | ... 95 | ``` 96 | 97 | 98 | 99 | 如果尚未下载/处理过ImageNet,请看下面【ImageNet的下载和预处理】部分的说明。 100 | 101 | ### ImageNet的下载和预处理 102 | 103 | 如果您尚未下载过Imagenet数据集,请准备以下文件: 104 | 105 | - ILSVRC2012_img_train.tar 106 | - ILSVRC2012_img_val.tar 107 | - ILSVRC2012_bbox_train_v2.tar.gz(非必须) 108 | 109 | 其中训练集和验证集的图片请自行下载,bbox标注可以点此下载:[ILSVRC2012_bbox_train_v2.tar.gz](https://oneflow-public.oss-cn-beijing.aliyuncs.com/online_document/dataset/imagenet/ILSVRC2012_bbox_train_v2.tar.gz) 110 | 111 | 我们将用下面三个步骤,帮您完成数据集的预处理。之后,您就可以使用【将ImageNet转换成OFRecord】部分介绍的转换脚本进行OFReciord的转换了。 112 | 113 | 114 | 115 | 下面假设您已经下载好了原始数据集和bbox标注文件,并存放在data/imagenet目录下: 116 | 117 | ```shell 118 | ├── data 119 | │ └── imagenet 120 | │ ├── ILSVRC2012_img_train.tar 121 | │ ├── ILSVRC2012_img_val.tar 122 | │ ├── ILSVRC2012_bbox_train_v2.tar.gz 123 | ├── tools 124 | │ ├── extract_trainval.sh 125 | │ ├── imagenet_2012_validation_synset_labels.txt 126 | │ ├── imagenet_lsvrc_2015_synsets.txt 127 | │ ├── imagenet_metadata.txt 128 | │ ├── imagenet_ofrecord.py 129 | │ └── preprocess_imagenet_validation_data.py 130 | ``` 131 | 132 | #### 步骤一:process_bounding_boxes 133 | 134 | 这一步,主要是将标注好的包含bboxs的xml文件提取到一个.csv文件中,方便后面代码中直接使用。完整的转换过程大约需要5分钟。 135 | 136 | 当然,你也可以直接使用我们转换好的文件:[imagenet_2012_bounding_boxes.csv](https://oneflow-public.oss-cn-beijing.aliyuncs.com/online_document/dataset/imagenet/imagenet_2012_bounding_boxes.zip) 137 | 138 | 1.解压ILSVRC2012_bbox_train_v2.tar.gz 139 | 140 | ```shell 141 | cd data/imagenet && mkdir bounding_boxes && tar -zxvf ILSVRC2012_bbox_train_v2.tar.gz -C bounding_boxes 142 | ``` 143 | 144 | 2.提取bboxs至.csv文件 145 | 146 | ```shell 147 | cd ../.. 
99 | 
100 | If you have not yet downloaded or preprocessed ImageNet, read the section "Downloading and preprocessing ImageNet" below first.
101 | 
102 | ### Downloading and preprocessing ImageNet
103 | 
104 | If you have not downloaded the ImageNet dataset yet, please prepare the following files:
105 | 
106 | - ILSVRC2012_img_train.tar
107 | - ILSVRC2012_img_val.tar
108 | - ILSVRC2012_bbox_train_v2.tar.gz (optional)
109 | 
110 | Please download the training and validation images yourself; the bbox annotations can be downloaded here: [ILSVRC2012_bbox_train_v2.tar.gz](https://oneflow-public.oss-cn-beijing.aliyuncs.com/online_document/dataset/imagenet/ILSVRC2012_bbox_train_v2.tar.gz)
111 | 
112 | The three steps below walk you through the preprocessing. Afterwards you can convert to OFRecord with the scripts from "Converting ImageNet to OFRecord" above.
113 | 
114 | The following assumes the raw archives and the bbox annotation file are stored under data/imagenet:
115 | 
116 | ```shell
117 | ├── data
118 | │   └── imagenet
119 | │       ├── ILSVRC2012_img_train.tar
120 | │       ├── ILSVRC2012_img_val.tar
121 | │       ├── ILSVRC2012_bbox_train_v2.tar.gz
122 | ├── tools
123 | │   ├── extract_trainval.sh
124 | │   ├── imagenet_2012_validation_synset_labels.txt
125 | │   ├── imagenet_lsvrc_2015_synsets.txt
126 | │   ├── imagenet_metadata.txt
127 | │   ├── imagenet_ofrecord.py
128 | │   └── preprocess_imagenet_validation_data.py
129 | ```
130 | 
131 | #### Step 1: process_bounding_boxes
132 | 
133 | This step extracts the annotated bounding boxes from the per-image xml files into a single .csv file that the conversion script can consume directly. The full conversion takes about 5 minutes.
134 | 
135 | Alternatively, you can use the file we already converted: [imagenet_2012_bounding_boxes.csv](https://oneflow-public.oss-cn-beijing.aliyuncs.com/online_document/dataset/imagenet/imagenet_2012_bounding_boxes.zip)
136 | 
137 | 1. Unpack ILSVRC2012_bbox_train_v2.tar.gz
138 | 
139 | ```shell
140 | cd data/imagenet && mkdir bounding_boxes && tar -zxvf ILSVRC2012_bbox_train_v2.tar.gz -C bounding_boxes
141 | ```
142 | 
143 | 2. Extract the boxes into a .csv file
144 | 
145 | ```shell
146 | cd ../.. && python process_bounding_boxes.py data/imagenet/bounding_boxes imagenet_lsvrc_2015_synsets.txt | sort > imagenet_2012_bounding_boxes.csv
147 | ```
148 | 
149 | #### Step 2: extract imagenet
150 | 
151 | This step unpacks ILSVRC2012_img_train.tar and ILSVRC2012_img_val.tar into train and validation folders. The train folder ends up with 1000 synset label folders (e.g. n01443537), and the training images are unpacked into the label folder they belong to; the validation folder holds the raw unpacked images.
152 | 
153 | ```shell
154 | sh extract_trainval.sh ../data/imagenet # the argument is the folder holding the raw imagenet archives
155 | ```
156 | ```shell
157 | After unpacking, the folder structure looks like this:
158 | .
159 | ├── extract_trainval.sh
160 | ├── imagenet
161 | │   ├── ILSVRC2012_img_train.tar
162 | │   ├── ILSVRC2012_img_val.tar
163 | │   ├── ILSVRC2012_bbox_train_v2.tar.gz
164 | │   ├── bounding_boxes
165 | │   ├── train
166 | │   │   ├── n01440764
167 | │   │   │   ├── n01440764_10026.JPEG
168 | │   │   │   ├── n01440764_10027.JPEG
169 | ...
170 | │   │   └── n01443537
171 | │   │       ├── n01443537_10007.JPEG
172 | │   │       ├── n01443537_10014.JPEG
173 | ...
174 | │   └── validation
175 | │       ├── ILSVRC2012_val_00000236.JPEG
176 | │       ├── ILSVRC2012_val_00000262.JPEG
177 | ...
178 | ```
179 | 
180 | #### Step 3: processing the validation data
181 | 
182 | After the previous step the training images already sit neatly in their 1000 label folders, but the validation images are still piled up in the validation folder. In this step, preprocess_imagenet_validation_data.py sorts them into label folders by class as well.
183 | ```shell
184 | python3 preprocess_imagenet_validation_data.py ../data/imagenet/validation
185 | # the argument ../data/imagenet/validation is the folder the validation images were unpacked into
186 | ```
187 | Afterwards the project folder looks like this:
188 | ```shell
189 | .
190 | ├── extract_trainval.sh
191 | ├── imagenet
192 | │   ├── ILSVRC2012_img_train.tar
193 | │   ├── ILSVRC2012_img_val.tar
194 | │   ├── ILSVRC2012_bbox_train_v2.tar.gz
195 | │   ├── bounding_boxes
196 | │   ├── train
197 | │   │   ├── n01440764
198 | │   │   └── n01443537
199 | ...
200 | │   └── validation
201 | │       ├── n01440764
202 | │       └── n01443537
203 | ...
204 | ```
205 | 
206 | This completes the preprocessing. You can now jump back to **Converting the training set** and **Converting the validation set** to turn ImageNet-2012 into OFRecord.
--------------------------------------------------------------------------------
/Classification/cnns/tools/extract_trainval.sh:
--------------------------------------------------------------------------------
1 | # usage: sh extract_trainval.sh your_path_to/imagenet
2 | # the argument is the folder holding the raw imagenet archives
3 | 
4 | set -e
5 | ROOT_DIR=$1 # your path to imagenet dataset root dir
6 | echo "Imagenet dataset in dir:${ROOT_DIR}"
7 | 
8 | SYNSETS_FILE="imagenet_lsvrc_2015_synsets.txt"
9 | TRAIN_TARBALL="${ROOT_DIR}/ILSVRC2012_img_train.tar"
10 | TRAIN_OUTPUT_PATH="${ROOT_DIR}/train/"
11 | VALIDATION_TARBALL="${ROOT_DIR}/ILSVRC2012_img_val.tar"
12 | VALIDATION_OUTPUT_PATH="${ROOT_DIR}/validation/"
13 | 
14 | mkdir -p "${TRAIN_OUTPUT_PATH}"
15 | mkdir -p "${VALIDATION_OUTPUT_PATH}"
16 | 
17 | # extract the validation .tar file
18 | tar xf "${VALIDATION_TARBALL}" -C "${VALIDATION_OUTPUT_PATH}"
19 | 
20 | # extract the train .tar file
21 | echo "Uncompressing individual train tar-balls in the training data."
22 | 
23 | while read SYNSET; do
24 |   # Uncompress into the directory; running tar inside the if condition keeps
25 |   # `set -e` from aborting the loop when a synset tarball is missing.
26 |   if tar xf "${TRAIN_TARBALL}" "${SYNSET}.tar"; then
27 |     # Create a directory and delete anything there.
28 |     mkdir -p "${TRAIN_OUTPUT_PATH}/${SYNSET}"
29 |     rm -rf "${TRAIN_OUTPUT_PATH}/${SYNSET}"/*
30 |     echo "Processing: ${SYNSET}"
31 |     tar xf "${SYNSET}.tar" -C "${TRAIN_OUTPUT_PATH}/${SYNSET}/"
32 |     rm -f "${SYNSET}.tar"
33 |     echo "Finished processing: ${SYNSET}"
34 |   else
35 |     echo "${SYNSET}.tar doesn't exist!"
36 |   fi
37 | done < "${SYNSETS_FILE}"
--------------------------------------------------------------------------------
/Classification/cnns/tools/preprocess_imagenet_validation_data.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright 2020 The OneFlow Authors. All rights reserved.
3 | 
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 | 
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | """
16 | #!/usr/bin/python
17 | # Copyright 2016 Google Inc. All Rights Reserved.
18 | #
19 | # Licensed under the Apache License, Version 2.0 (the "License");
20 | # you may not use this file except in compliance with the License.
21 | # You may obtain a copy of the License at
22 | #
23 | #     http://www.apache.org/licenses/LICENSE-2.0
24 | #
25 | # Unless required by applicable law or agreed to in writing, software
26 | # distributed under the License is distributed on an "AS IS" BASIS,
27 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
28 | # See the License for the specific language governing permissions and
29 | # limitations under the License.
30 | # ==============================================================================
31 | """Associate the ImageNet 2012 Challenge validation images with their labels,
32 | for OneFlow model training.
33 | 
34 | The raw ImageNet validation data set is expected to reside in JPEG files
35 | located in the following directory structure.
36 | 
37 |     data_dir/ILSVRC2012_val_00000001.JPEG
38 |     data_dir/ILSVRC2012_val_00000002.JPEG
39 |     ...
40 |     data_dir/ILSVRC2012_val_00050000.JPEG
41 | 
42 | This script moves the files into a directory structure like such:
43 |     data_dir/n01440764/ILSVRC2012_val_00000293.JPEG
44 |     data_dir/n01440764/ILSVRC2012_val_00000543.JPEG
45 |     ...
46 | where 'n01440764' is the unique synset label associated with
47 | these images.
48 | 
49 | Sample usage:
50 |     python3 preprocess_imagenet_validation_data.py ../data/imagenet/validation
51 | """
52 | 
53 | 
54 | import os.path
55 | import sys
56 | 
57 | from six.moves import xrange
58 | 
59 | 
60 | if __name__ == "__main__":
61 |     if len(sys.argv) < 2:
62 |         print(
63 |             "Invalid usage\n"
64 |             "usage: preprocess_imagenet_validation_data.py "
65 |             "<validation data dir>"
66 |         )
67 |         sys.exit(-1)
68 |     data_dir = sys.argv[1]
69 |     validation_labels_file = "imagenet_2012_validation_synset_labels.txt"
70 | 
71 |     # Read in the 50000 synsets associated with the validation data set.
72 |     labels = [l.strip() for l in open(validation_labels_file).readlines()]
73 |     unique_labels = set(labels)
74 | 
75 |     # Make all sub-directories in the validation data dir.
76 |     for label in unique_labels:
77 |         labeled_data_dir = os.path.join(data_dir, label)
78 |         if not os.path.exists(labeled_data_dir):
79 |             os.makedirs(labeled_data_dir)
80 | 
81 |     # Move all of the images to the appropriate sub-directory.
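    # The i-th line of imagenet_2012_validation_synset_labels.txt holds the
    # synset for validation image i + 1, so labels[i] maps straight onto the
    # ILSVRC2012_val_*.JPEG name built below.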
82 |     for i in xrange(len(labels)):
83 |         basename = "ILSVRC2012_val_000%.5d.JPEG" % (i + 1)
84 |         original_filename = os.path.join(data_dir, basename)
85 |         if not os.path.exists(original_filename):
86 |             continue
87 |         print("Get image: ", original_filename)
88 |         new_filename = os.path.join(data_dir, labels[i], basename)
89 |         os.rename(original_filename, new_filename)
90 | 
91 | 
92 |     # Delete all empty directories
93 |     for label in unique_labels:
94 |         labeled_data_dir = os.path.join(data_dir, label)
95 |         if not os.path.exists(labeled_data_dir):
96 |             continue
97 |         if not os.listdir(labeled_data_dir):
98 |             os.rmdir(labeled_data_dir)
--------------------------------------------------------------------------------
/Classification/cnns/train.sh:
--------------------------------------------------------------------------------
1 | rm -rf core.*
2 | rm -rf ./output/snapshots/*
3 | 
4 | if [ -n "$1" ]; then
5 |     NUM_EPOCH=$1
6 | else
7 |     NUM_EPOCH=50
8 | fi
9 | echo NUM_EPOCH=$NUM_EPOCH
10 | 
11 | # training with imagenet
12 | if [ -n "$2" ]; then
13 |     DATA_ROOT=$2
14 | else
15 |     DATA_ROOT=/data/imagenet/ofrecord
16 | fi
17 | echo DATA_ROOT=$DATA_ROOT
18 | 
19 | LOG_FOLDER=../logs
20 | mkdir -p $LOG_FOLDER
21 | LOGFILE=$LOG_FOLDER/resnet_training.log
22 | 
23 | python3 of_cnn_train_val.py \
24 |     --train_data_dir=$DATA_ROOT/train \
25 |     --train_data_part_num=256 \
26 |     --val_data_dir=$DATA_ROOT/validation \
27 |     --val_data_part_num=256 \
28 |     --num_nodes=1 \
29 |     --gpu_num_per_node=8 \
30 |     --optimizer="sgd" \
31 |     --momentum=0.875 \
32 |     --label_smoothing=0.1 \
33 |     --learning_rate=1.024 \
34 |     --loss_print_every_n_iter=100 \
35 |     --batch_size_per_device=128 \
36 |     --val_batch_size_per_device=50 \
37 |     --num_epoch=$NUM_EPOCH \
38 |     --model="resnet50" 2>&1 | tee ${LOGFILE}
39 | 
40 | echo "Writing log to ${LOGFILE}"
--------------------------------------------------------------------------------
/Classification/cnns/train_fp16.sh:
--------------------------------------------------------------------------------
1 | rm -rf core.*
2 | rm -rf ./output/snapshots/*
3 | 
4 | if [ -n "$1" ]; then
5 |     NUM_EPOCH=$1
6 | else
7 |     NUM_EPOCH=50
8 | fi
9 | echo NUM_EPOCH=$NUM_EPOCH
10 | 
11 | # training with imagenet
12 | if [ -n "$2" ]; then
13 |     DATA_ROOT=$2
14 | else
15 |     DATA_ROOT=/data/imagenet/ofrecord
16 | fi
17 | echo DATA_ROOT=$DATA_ROOT
18 | 
19 | LOG_FOLDER=../logs
20 | mkdir -p $LOG_FOLDER
21 | LOGFILE=$LOG_FOLDER/resnet_training.log
22 | 
23 | export PYTHONUNBUFFERED=1
24 | echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED
25 | export NCCL_LAUNCH_MODE=PARALLEL
26 | echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE
27 | export ONEFLOW_COMM_NET_IB_ENABLE=1
28 | 
29 | python3 of_cnn_train_val.py \
30 |     --train_data_dir=$DATA_ROOT/train \
31 |     --train_data_part_num=256 \
32 |     --val_data_dir=$DATA_ROOT/validation \
33 |     --val_data_part_num=256 \
34 |     --num_nodes=1 \
35 |     --gpu_num_per_node=8 \
36 |     --optimizer="sgd" \
37 |     --momentum=0.875 \
38 |     --label_smoothing=0.1 \
39 |     --learning_rate=1.536 \
40 |     --loss_print_every_n_iter=100 \
41 |     --batch_size_per_device=192 \
42 |     --val_batch_size_per_device=50 \
43 |     --use_fp16 \
44 |     --channel_last=True \
45 |     --pad_output \
46 |     --fuse_bn_relu=True \
47 |     --fuse_bn_add_relu=True \
48 |     --nccl_fusion_threshold_mb=16 \
49 |     --nccl_fusion_max_ops=24 \
50 |     --gpu_image_decoder=True \
51 |     --num_epoch=$NUM_EPOCH \
52 |     --model="resnet50" 2>&1 | tee ${LOGFILE}
53 | 
54 | echo "Writing log to ${LOGFILE}"
--------------------------------------------------------------------------------
/Classification/cnns/train_fp32.sh:
--------------------------------------------------------------------------------
1 | rm -rf core.*
2 | rm -rf ./output/snapshots/*
3 | 
4 | if [ -n "$1" ]; then
5 |     NUM_EPOCH=$1
6 | else
7 |     NUM_EPOCH=50
8 | fi
9 | echo NUM_EPOCH=$NUM_EPOCH
10 | 
11 | # training with imagenet
12 | if [ -n "$2" ]; then
13 |     DATA_ROOT=$2
14 | else
15 |     DATA_ROOT=/data/imagenet/ofrecord
16 | fi
17 | echo DATA_ROOT=$DATA_ROOT
18 | 
19 | LOG_FOLDER=../logs
20 | mkdir -p $LOG_FOLDER
21 | LOGFILE=$LOG_FOLDER/resnet_training.log
22 | 
23 | export PYTHONUNBUFFERED=1
24 | echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED
25 | export NCCL_LAUNCH_MODE=PARALLEL
26 | echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE
27 | 
28 | python3 of_cnn_train_val.py \
29 |     --train_data_dir=$DATA_ROOT/train \
30 |     --train_data_part_num=256 \
31 |     --val_data_dir=$DATA_ROOT/validation \
32 |     --val_data_part_num=256 \
33 |     --num_nodes=1 \
34 |     --gpu_num_per_node=8 \
35 |     --optimizer="sgd" \
36 |     --momentum=0.875 \
37 |     --label_smoothing=0.1 \
38 |     --learning_rate=0.768 \
39 |     --loss_print_every_n_iter=100 \
40 |     --batch_size_per_device=96 \
41 |     --val_batch_size_per_device=50 \
42 |     --channel_last=False \
43 |     --fuse_bn_relu=True \
44 |     --fuse_bn_add_relu=True \
45 |     --nccl_fusion_threshold_mb=16 \
46 |     --nccl_fusion_max_ops=24 \
47 |     --gpu_image_decoder=True \
48 |     --num_epoch=$NUM_EPOCH \
49 |     --model="resnet50" 2>&1 | tee ${LOGFILE}
50 | 
51 | echo "Writing log to ${LOGFILE}"
--------------------------------------------------------------------------------
/Classification/cnns/util.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright 2020 The OneFlow Authors. All rights reserved.
3 | 
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 | 
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | """ 16 | 17 | import os 18 | import time 19 | import numpy as np 20 | import oneflow.compatible.single_client as flow 21 | 22 | 23 | def InitNodes(args): 24 | if args.num_nodes > 1: 25 | assert args.num_nodes <= len(args.node_ips) 26 | flow.env.ctrl_port(args.ctrl_port) 27 | nodes = [] 28 | for ip in args.node_ips[: args.num_nodes]: 29 | addr_dict = {} 30 | addr_dict["addr"] = ip 31 | nodes.append(addr_dict) 32 | 33 | flow.env.machine(nodes) 34 | 35 | 36 | class Snapshot(object): 37 | def __init__(self, model_save_dir, model_load_dir, save_init=False): 38 | self._model_save_dir = model_save_dir 39 | if model_load_dir: 40 | assert os.path.isdir(model_load_dir) 41 | print("Restoring model from {}.".format(model_load_dir)) 42 | flow.load_variables(flow.checkpoint.get(model_load_dir)) 43 | elif save_init: 44 | flow.checkpoint.save("initial_model") 45 | print("Init model on demand.") 46 | 47 | def save(self, name): 48 | snapshot_save_path = os.path.join( 49 | self._model_save_dir, "snapshot_{}".format(name) 50 | ) 51 | if not os.path.exists(snapshot_save_path): 52 | os.makedirs(snapshot_save_path) 53 | print("Saving model to {}.".format(snapshot_save_path)) 54 | flow.checkpoint.save(snapshot_save_path) 55 | 56 | 57 | class StopWatch(object): 58 | def __init__(self): 59 | pass 60 | 61 | def start(self): 62 | self.start_time = time.time() 63 | self.last_split = self.start_time 64 | 65 | def split(self): 66 | now = time.time() 67 | duration = now - self.last_split 68 | self.last_split = now 69 | return duration 70 | 71 | def stop(self): 72 | self.stop_time = time.time() 73 | 74 | def duration(self): 75 | return self.stop_time - self.start_time 76 | 77 | 78 | def match_top_k(predictions, labels, top_k=1): 79 | max_k_preds = np.argpartition(predictions.numpy(), -top_k)[:, -top_k:] 80 | match_array = np.logical_or.reduce(max_k_preds == labels.reshape((-1, 1)), axis=1) 81 | num_matched = match_array.sum() 82 | return num_matched, match_array.shape[0] 83 | 84 | 85 | class Metric(object): 86 | def __init__( 87 | self, 88 | desc="train", 89 | calculate_batches=-1, 90 | batch_size=256, 91 | top_k=5, 92 | prediction_key="predictions", 93 | label_key="labels", 94 | loss_key=None, 95 | nvidia_smi_report_step=10, 96 | ): 97 | self.desc = desc 98 | self.calculate_batches = calculate_batches 99 | self.top_k = top_k 100 | self.prediction_key = prediction_key 101 | self.label_key = label_key 102 | self.loss_key = loss_key 103 | self.nvidia_smi_report_step = nvidia_smi_report_step 104 | if loss_key: 105 | self.fmt = "{}: epoch {}, iter {}, loss: {:.6f}, top_1: {:.6f}, top_k: {:.6f}, samples/s: {:.3f}" 106 | else: 107 | self.fmt = ( 108 | "{}: epoch {}, iter {}, top_1: {:.6f}, top_k: {:.6f}, samples/s: {:.3f}" 109 | ) 110 | 111 | self.timer = StopWatch() 112 | self.timer.start() 113 | self._clear() 114 | 115 | def _clear(self): 116 | self.top_1_num_matched = 0 117 | self.top_k_num_matched = 0 118 | self.num_samples = 0.0 119 | 120 | def metric_cb(self, epoch, step): 121 | def callback(outputs): 122 | if step == 0: 123 | self._clear() 124 | if self.loss_key and epoch == 0 and step == self.nvidia_smi_report_step: 125 | cmd = "nvidia-smi --query-gpu=utilization.gpu,memory.used --format=csv" 126 | os.system(cmd) 127 | 128 | if self.prediction_key: 129 | num_matched, num_samples = match_top_k( 130 | outputs[self.prediction_key], outputs[self.label_key] 131 | ) 132 | self.top_1_num_matched += num_matched 133 | num_matched, _ = match_top_k( 134 | outputs[self.prediction_key], outputs[self.label_key], self.top_k 135 | 
) 136 | self.top_k_num_matched += num_matched 137 | else: 138 | num_samples = outputs[self.label_key].shape[0] 139 | 140 | self.num_samples += num_samples 141 | 142 | if (step + 1) % self.calculate_batches == 0: 143 | throughput = self.num_samples / self.timer.split() 144 | if self.prediction_key: 145 | top_1_accuracy = self.top_1_num_matched / self.num_samples 146 | top_k_accuracy = self.top_k_num_matched / self.num_samples 147 | else: 148 | top_1_accuracy = 0.0 149 | top_k_accuracy = 0.0 150 | 151 | if self.loss_key: 152 | loss = outputs[self.loss_key].mean() 153 | print( 154 | self.fmt.format( 155 | self.desc, 156 | epoch, 157 | step + 1, 158 | loss, 159 | top_1_accuracy, 160 | top_k_accuracy, 161 | throughput, 162 | ), 163 | time.time(), 164 | ) 165 | else: 166 | print( 167 | self.fmt.format( 168 | self.desc, 169 | epoch, 170 | step + 1, 171 | top_1_accuracy, 172 | top_k_accuracy, 173 | throughput, 174 | ), 175 | time.time(), 176 | ) 177 | 178 | self._clear() 179 | 180 | return callback 181 | -------------------------------------------------------------------------------- /Classification/cnns/vgg_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | 17 | import oneflow.compatible.single_client as flow 18 | 19 | 20 | def _batch_norm(inputs, name=None, trainable=True, training=True, data_format="NCHW"): 21 | axis = 1 if data_format == "NCHW" else 3 22 | return flow.layers.batch_normalization( 23 | inputs=inputs, 24 | axis=axis, 25 | momentum=0.997, 26 | epsilon=1.001e-5, 27 | center=True, 28 | scale=True, 29 | trainable=trainable, 30 | training=training, 31 | name=name, 32 | ) 33 | 34 | 35 | def _get_regularizer(): 36 | return flow.regularizers.l2(0.00005) 37 | 38 | 39 | def conv2d_layer( 40 | name, 41 | input, 42 | filters, 43 | weight_initializer, 44 | kernel_size=3, 45 | strides=1, 46 | padding="SAME", 47 | data_format="NCHW", 48 | dilation_rate=1, 49 | activation="Relu", 50 | use_bias=True, 51 | bias_initializer=flow.zeros_initializer(), 52 | weight_regularizer=_get_regularizer(), # weight_decay 53 | bias_regularizer=_get_regularizer(), 54 | trainable=True, 55 | training=True, 56 | bn=True, 57 | ): 58 | weight_shape = ( 59 | (filters, input.shape[1], kernel_size, kernel_size) 60 | if data_format == "NCHW" 61 | else (filters, kernel_size, kernel_size, input.shape[3]) 62 | ) 63 | weight = flow.get_variable( 64 | name + "_weight", 65 | shape=weight_shape, 66 | dtype=input.dtype, 67 | initializer=weight_initializer, 68 | ) 69 | output = flow.nn.conv2d( 70 | input, weight, strides, padding, None, data_format, dilation_rate, name=name 71 | ) 72 | if use_bias: 73 | bias = flow.get_variable( 74 | name + "_bias", 75 | shape=(filters,), 76 | dtype=input.dtype, 77 | initializer=bias_initializer, 78 | ) 79 | output = flow.nn.bias_add(output, bias, data_format) 80 | 81 | if activation is not None: 82 | if activation == "Relu": 83 | if bn: 84 | output = _batch_norm( 85 | output, 86 | name + "_bn", 87 | trainable=trainable, 88 | training=training, 89 | data_format=data_format, 90 | ) 91 | output = flow.nn.relu(output) 92 | else: 93 | output = flow.nn.relu(output) 94 | else: 95 | raise NotImplementedError 96 | 97 | return output 98 | 99 | 100 | def _conv_block( 101 | in_blob, 102 | index, 103 | filters, 104 | conv_times, 105 | data_format="NCHW", 106 | trainable=True, 107 | training=True, 108 | ): 109 | conv_block = [] 110 | conv_block.insert(0, in_blob) 111 | weight_initializer = flow.variance_scaling_initializer( 112 | 2, "fan_out", "random_normal", data_format=data_format 113 | ) 114 | for i in range(conv_times): 115 | conv_i = conv2d_layer( 116 | name="conv{}".format(index), 117 | input=conv_block[i], 118 | filters=filters, 119 | kernel_size=3, 120 | strides=1, 121 | data_format=data_format, 122 | weight_initializer=weight_initializer, 123 | trainable=trainable, 124 | training=training, 125 | bn=True, 126 | ) 127 | 128 | conv_block.append(conv_i) 129 | index += 1 130 | 131 | return conv_block 132 | 133 | 134 | def vgg16bn(images, args, trainable=True, training=True): 135 | data_format = "NHWC" if args.channel_last else "NCHW" 136 | 137 | conv1 = _conv_block( 138 | images, 0, 64, 2, data_format, trainable=trainable, training=training 139 | ) 140 | pool1 = flow.nn.max_pool2d(conv1[-1], 2, 2, "VALID", data_format, name="pool1") 141 | 142 | conv2 = _conv_block( 143 | pool1, 2, 128, 2, data_format, trainable=trainable, training=training 144 | ) 145 | pool2 = flow.nn.max_pool2d(conv2[-1], 2, 2, "VALID", data_format, name="pool2") 146 | 147 | conv3 = _conv_block( 148 | pool2, 4, 256, 3, data_format, trainable=trainable, training=training 149 | ) 150 | pool3 = flow.nn.max_pool2d(conv3[-1], 2, 2, "VALID", data_format, name="pool3") 151 | 
152 |     conv4 = _conv_block(
153 |         pool3, 7, 512, 3, data_format, trainable=trainable, training=training
154 |     )
155 |     pool4 = flow.nn.max_pool2d(conv4[-1], 2, 2, "VALID", data_format, name="pool4")
156 | 
157 |     conv5 = _conv_block(
158 |         pool4, 10, 512, 3, data_format, trainable=trainable, training=training
159 |     )
160 |     pool5 = flow.nn.max_pool2d(conv5[-1], 2, 2, "VALID", data_format, name="pool5")
161 | 
162 |     def _get_kernel_initializer():
163 |         return flow.random_normal_initializer(stddev=0.01)
164 | 
165 |     def _get_bias_initializer():
166 |         return flow.zeros_initializer()
167 | 
168 |     # VGG16 classifier head: two 4096-unit dense layers with dropout,
169 |     # then the 1000-way output layer.
170 |     pool5 = flow.reshape(pool5, [pool5.shape[0], -1])
171 |     fc6 = flow.layers.dense(
172 |         inputs=pool5,
173 |         units=4096,
174 |         activation=flow.nn.relu,
175 |         use_bias=True,
176 |         kernel_initializer=_get_kernel_initializer(),
177 |         bias_initializer=_get_bias_initializer(),
178 |         kernel_regularizer=_get_regularizer(),  # weight_decay
179 |         bias_regularizer=_get_regularizer(),
180 |         trainable=trainable,
181 |         name="dense0",
182 |     )
183 | 
184 |     fc6 = flow.nn.dropout(fc6, rate=0.5)
185 | 
186 |     fc7 = flow.layers.dense(
187 |         inputs=fc6,
188 |         units=4096,
189 |         activation=flow.nn.relu,
190 |         use_bias=True,
191 |         kernel_initializer=_get_kernel_initializer(),
192 |         bias_initializer=_get_bias_initializer(),
193 |         trainable=trainable,
194 |         name="dense1",
195 |     )
196 |     fc7 = flow.nn.dropout(fc7, rate=0.5)
197 | 
198 |     fc8 = flow.layers.dense(
199 |         inputs=fc7,
200 |         units=1000,
201 |         use_bias=True,
202 |         kernel_initializer=_get_kernel_initializer(),
203 |         bias_initializer=_get_bias_initializer(),
204 |         trainable=trainable,
205 |         name="dense2",
206 |     )
207 | 
208 |     return fc8
--------------------------------------------------------------------------------
/ClickThroughRate/WideDeepLearning/README.md:
--------------------------------------------------------------------------------
1 | The main difference between `wdl_train_eval.py` and `wdl_train_eval_test.py` is:
2 | `wdl_train_eval_test.py` is an end-to-end pipeline that trains for n epochs on the training set, evaluates on the full eval set after every epoch, and tests on the test set at the end. Its main training loop is over `epoch`s.
3 | 
4 | In `wdl_train_eval.py`, by contrast, the main training loop is over `iteration`s; evaluation periodically looks at only 20 samples at a time rather than the full eval dataset, and there is no test stage.
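A schematic of the two drivers may make the difference concrete. The helper names below are placeholders, not functions that actually exist in the two scripts:

```python
def train_one_iter():
    pass  # one mini-batch update (placeholder)

def evaluate(full=False):
    pass  # eval on a small sample, or on the full eval set (placeholder)

def wdl_train_eval(max_iter, eval_interval):
    # main loop is over iterations
    for it in range(max_iter):
        train_one_iter()
        if (it + 1) % eval_interval == 0:
            evaluate(full=False)   # small eval sample, no test stage

def wdl_train_eval_test(num_epochs, iters_per_epoch):
    # main loop is over epochs
    for epoch in range(num_epochs):
        for _ in range(iters_per_epoch):
            train_one_iter()
        evaluate(full=True)        # full eval set after every epoch
    evaluate(full=True)            # final pass over the test set
```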
5 | 
6 | ## Run OneFlow-WDL with train and evaluation
7 | ```
8 | EMBD_SIZE=1603616
9 | DATA_ROOT=/DATA/disk1/criteo_wdl/ofrecord
10 | python3 wdl_train_eval.py \
11 |   --train_data_dir $DATA_ROOT/train \
12 |   --train_data_part_num 256 \
13 |   --train_part_name_suffix_length=5 \
14 |   --eval_data_dir $DATA_ROOT/val \
15 |   --eval_data_part_num 256 \
16 |   --max_iter=300000 \
17 |   --loss_print_every_n_iter=1000 \
18 |   --eval_interval=1000 \
19 |   --batch_size=512 \
20 |   --wide_vocab_size=$EMBD_SIZE \
21 |   --deep_vocab_size=$EMBD_SIZE \
22 |   --gpu_num 1
23 | ```
24 | 
25 | ## Run OneFlow-WDL with train, evaluation and test
26 | ```
27 | EMBD_SIZE=1603616
28 | DATA_ROOT=/DATA/disk1/criteo_wdl/ofrecord
29 | python3 wdl_train_eval_test.py \
30 |   --train_data_dir $DATA_ROOT/train \
31 |   --train_data_part_num 256 \
32 |   --train_part_name_suffix_length=5 \
33 |   --eval_data_dir $DATA_ROOT/val \
34 |   --eval_data_part_num 256 \
35 |   --eval_part_name_suffix_length=5 \
36 |   --test_data_dir $DATA_ROOT/test \
37 |   --test_data_part_num 256 \
38 |   --test_part_name_suffix_length=5 \
39 |   --loss_print_every_n_iter=1000 \
40 |   --batch_size=16484 \
41 |   --wide_vocab_size=$EMBD_SIZE \
42 |   --deep_vocab_size=$EMBD_SIZE \
43 |   --gpu_num 1
44 | ```
45 | 
46 | The OneFlow-WDL network implements model parallelism and sparse gradient updates. On a server with 8 12GB TitanV GPUs it supports vocabularies of more than 400 million entries, with no performance loss compared to small vocabularies. For details, see the evaluation part of [this document](https://github.com/Oneflow-Inc/oneflow-documentation/blob/master/cn/docs/adv_examples/wide_deep.md).
--------------------------------------------------------------------------------
/ClickThroughRate/WideDeepLearning/wdl_test_report.md:
--------------------------------------------------------------------------------
1 | [HugeCTR](https://github.com/NVIDIA/HugeCTR) is a recommender-specific framework provided by NVIDIA Corporation. It is designed for Click-Through-Rate (CTR) estimation.
2 | 
3 | OneFlow builds up a Wide & Deep Learning (WDL) network modeled after HugeCTR's.
4 | 
5 | The OneFlow-WDL network supports model parallelism and sparse gradient updates. It can hold a lookup table with a vocab size of over 400 million on a server with 8 TitanV 12GB GPUs, while keeping the same performance as with a small vocab table.
6 | 
7 | The purpose of this document is to introduce how to train the network with OneFlow-WDL and to present the test results.
8 | 
9 | ## Environment and Preparation
10 | Please make sure OneFlow is installed on your computer/server before running OneFlow-WDL; [scikit-learn](https://scikit-learn.org/stable/install.html) is required to calculate metrics such as AUC (a minimal example follows the list below).
11 | 
12 | ### Requirements
13 | - python 3.x (recommended)
14 | - OneFlow
15 | - scikit-learn
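For instance, the `eval_auc` values printed during training can be reproduced from a batch of labels and predicted probabilities along these lines (the arrays here are made-up stand-ins, not real evaluation data):

```python
import numpy as np
from sklearn.metrics import roc_auc_score

# Stand-in data: 0/1 click labels and the model's predicted click probabilities.
labels = np.array([0, 1, 1, 0, 1])
probs = np.array([0.10, 0.80, 0.65, 0.30, 0.90])
print("eval_auc:", roc_auc_score(labels, probs))  # 1.0 for this toy batch
```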
16 | 
17 | ### Data preparation
18 | A small [data set](https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/criteo_wdl_3000w_ofrecord_example.tgz) is prepared for a quick evaluation. The folder structure of this example dataset is:
19 | ```
20 | criteo_wdl_3000w_ofrecord_example
21 | ├── train
22 | │   └── part-00000
23 | └── val
24 |     ├── part-00000
25 |     └── README.md
26 | ```
27 | 
28 | Making a full-size dataset is laborious; [*Use Spark to create WDL dataset*](https://github.com/Oneflow-Inc/OneFlow-Benchmark/blob/master/ClickThroughRate/WideDeepLearning/how_to_make_ofrecord_for_wdl.md) can help you generate the full-size ofrecord for testing. You can download the original dataset from [CriteoLabs](http://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset/) and follow the steps in a [Spark 2.4.* shell](https://www.apache.org/dyn/closer.lua/spark/spark-2.4.6/spark-2.4.6-bin-hadoop2.7.tgz).
29 | 
30 | ### OneFlow-WDL code
31 | The main file under test is `wdl_train_eval.py`. Please download it from [here](https://github.com/Oneflow-Inc/OneFlow-Benchmark/blob/master/ClickThroughRate/WideDeepLearning/wdl_train_eval.py).
32 | 
33 | ## Run OneFlow-WDL code
34 | ```
35 | VOCAB_SIZE=1603616
36 | DATA_ROOT=/path/to/wdl/criteo_wdl_3000w_ofrecord_example
37 | python3 wdl_train_eval.py \
38 |   --train_data_dir $DATA_ROOT/train \
39 |   --train_data_part_num 1 \
40 |   --train_part_name_suffix_length=5 \
41 |   --eval_data_dir $DATA_ROOT/val \
42 |   --eval_data_part_num 1 \
43 |   --max_iter=300000 \
44 |   --loss_print_every_n_iter=1000 \
45 |   --eval_interval=1000 \
46 |   --batch_size=16384 \
47 |   --wide_vocab_size=$VOCAB_SIZE \
48 |   --deep_vocab_size=$VOCAB_SIZE \
49 |   --gpu_num 1
50 | ```
51 | 
52 | The shell command above is all you need; the only thing to configure is `DATA_ROOT`, the path of the OFRecord dataset for OneFlow-WDL. If output like the following shows up, the code is running correctly.
53 | 
54 | Note: the `criteo_wdl_3000w_ofrecord_example` dataset has only one part file, so `train_data_part_num` and `eval_data_part_num` are both set to `1`.
55 | ```
56 | 1000 time 2020-07-08 00:28:08.066281 loss 0.503295350909233
57 | 1000 eval_loss 0.4846755236387253 eval_auc 0.7616240146992771
58 | 2000 time 2020-07-08 00:28:11.613961 loss 0.48661992555856703
59 | 2000 eval_loss 0.4816856697201729 eval_auc 0.765256583562705
60 | 3000 time 2020-07-08 00:28:15.149135 loss 0.48245503094792364
61 | 3000 eval_loss 0.47835959643125536 eval_auc 0.7715609382514008
62 | 4000 time 2020-07-08 00:28:18.686327 loss 0.47975033831596375
63 | 4000 eval_loss 0.47925308644771575 eval_auc 0.7781267916810946
64 | ```
65 | ## Testing results and explanation
66 | All tests were performed on a server with 8 TitanV 12GB GPUs installed. As a reference, we ran some Nvidia HugeCTR tests in a docker container.
67 | 
68 | ### Multi-device performance
69 | This test measures the average latency across different GPU counts with a fixed total batch size of 16384. 7 hidden layers of 1024 units each are used.
70 | 
71 | Results:
72 | 
73 | ![image](https://github.com/Oneflow-Inc/oneflow-documentation/raw/master/cn/docs/adv_examples/imgs/fixed_batch_size_latency.png)
74 | 
75 | The maximum memory usage across devices is shown below:
76 | 
77 | ![image](https://github.com/Oneflow-Inc/oneflow-documentation/raw/master/cn/docs/adv_examples/imgs/fixed_batch_size_memory.png)
78 | 
79 | To summarise: from one device to 8 devices, OneFlow-WDL ran faster than HugeCTR with less memory usage.
80 | 
81 | ### Batch size per device = 16384, multi-device performance
82 | This test measures the average latency across different GPU counts with a batch size of 16384 per device, so the total batch size scales with the device count. 7 hidden layers of 1024 units each are used.
83 | 
84 | Results:
85 | 
86 | ![image](https://github.com/Oneflow-Inc/oneflow-documentation/raw/master/cn/docs/adv_examples/imgs/scaled_batch_size_latency.png)
87 | 
88 | The maximum memory usage across devices is shown below:
89 | 
90 | ![image](https://github.com/Oneflow-Inc/oneflow-documentation/raw/master/cn/docs/adv_examples/imgs/scaled_batch_size_memory.png)
91 | 
92 | Summary:
93 | - Latency increases along with the number of devices.
94 | - OneFlow-WDL ran faster than HugeCTR with less memory consumption.
95 | - There is no obvious change in memory usage across device counts.
96 | 
97 | ### Performance at different batch sizes on one GPU
98 | This test measures the average latency on a single GPU across different batch sizes. 2 hidden layers of 1024 units each are used.
99 | 
100 | Results:
101 | 
102 | ![image](https://github.com/Oneflow-Inc/oneflow-documentation/raw/master/cn/docs/adv_examples/imgs/scaled_batch_size_latency_1gpu.png)
103 | 
104 | Summary: OneFlow-WDL ran faster than HugeCTR for batch sizes from 512 to 16384.
105 | 
106 | ### Big vocab size performance
107 | Two embedding tables are configured in OneFlow-WDL:
108 | - the size of `wide_embedding` is vocab_size x 1
109 | - the size of `deep_embedding` is vocab_size x 16
110 | 
111 | In HugeCTR the vocab size is 1,603,616 (1.6 million). We kept increasing the vocab size from 3.2 million up to 409.6 million during the test; the results are below:
112 | 
113 | ![image](https://github.com/Oneflow-Inc/oneflow-documentation/raw/master/cn/docs/adv_examples/imgs/big_vocab_table_2x1024.png)
114 | 
115 | ![image](https://github.com/Oneflow-Inc/oneflow-documentation/raw/master/cn/docs/adv_examples/imgs/big_vocab_table_7x1024.png)
116 | 
117 | In the figures above, the blue columns are the average latency and the orange curve is the memory usage at each vocab size.
118 | 
119 | Conclusion: as the vocab size grows, memory usage grows with it, but the average latency stays flat.
120 | 
121 | Our test GPUs have only 12GB of memory each; imagine what vocab size OneFlow-WDL could support on devices with 16GB, 32GB or even more memory. **A vocab size of 409.6 million is not the limit but a beginning.**
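A back-of-the-envelope sizing of those two tables at the largest tested vocab makes the result tangible (assuming fp32 parameters; optimizer state would add more on top):

```python
vocab = 409_600_000
wide_dim, deep_dim = 1, 16   # wide_embedding and deep_embedding widths
bytes_per_param = 4          # fp32
total = vocab * (wide_dim + deep_dim) * bytes_per_param
print(f"{total / 1024**3:.1f} GiB total")        # ~25.9 GiB
print(f"{total / 1024**3 / 8:.1f} GiB per GPU")  # ~3.2 GiB when sharded over 8 GPUs
```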
122 | 
123 | ### Convergence test 1
124 | We chose batch size = 512 for the convergence tests.
125 | 
126 | The graph below shows the first 500 iterations; after each iteration we evaluate on 20 examples.
127 | 
128 | ![image](https://github.com/Oneflow-Inc/oneflow-documentation/raw/master/cn/docs/adv_examples/imgs/eval_auc_loss_500iters.png)
129 | 
130 | Conclusion: AUC quickly climbs above 0.75.
131 | 
132 | ### Convergence test 2
133 | Same setup as convergence test 1, but we print the average loss every 1000 iterations and then take 20 records for evaluation, training for 300,000 iterations in total. Result:
134 | 
135 | ![image](https://github.com/Oneflow-Inc/oneflow-documentation/raw/master/cn/docs/adv_examples/imgs/train_eval_auc_loss.png)
136 | 
137 | Conclusions and analysis (the epoch arithmetic behind point 1 is worked out after this list):
138 | 1. The blue train-loss curve descends in clear steps. The training set has 36,674,623 samples, so with batch_size=512 one epoch takes 71,630 steps, and 300,000 steps cover the training set more than 4 times (epochs); the stepwise descent of the blue curve reflects those epoch boundaries. OneFlow can shuffle the data during training to reduce overfitting.
139 | 2. The orange curve is the evaluation loss. It keeps descending for the first two epochs and begins to ascend in the third epoch because of overfitting.
140 | 3. The grey curve is the AUC on the evaluation set. AUC also peaks in the second epoch, exceeding 0.8, and then descends over the next few epochs.
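The epoch arithmetic from point 1, spelled out:

```python
samples, batch_size, total_steps = 36_674_623, 512, 300_000
steps_per_epoch = samples // batch_size
print(steps_per_epoch)                # 71630
print(total_steps / steps_per_epoch)  # ~4.19 epochs
```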
141 | 
--------------------------------------------------------------------------------
/Generative/README.md:
--------------------------------------------------------------------------------
1 | # GAN: Generative Adversarial Network Demo
2 | 
3 | 
4 | 
5 | ## Introduction
6 | 
7 | A generative adversarial network (GAN) in essence learns a data distribution. It consists of a generator network and a discriminator network: the generator can map a random distribution onto an arbitrary target distribution, while the discriminator determines the "direction" in which the generated distribution moves. Their adversarial game is, in theory, equivalent to fitting the data distribution. This demo runs DCGAN, a generative adversarial network built on convolution/deconvolution operations that is widely used for image generation.
8 | 
9 | 
10 | ## Demo
11 | 
12 | DCGAN training can be started directly from the script:
13 | 
14 | ```bash
15 | python dcgan.py
16 | ```
17 | 
18 | Script arguments:
19 | 
20 | - `-lr` learning rate, default 1e-4
21 | - `-e` number of training epochs, default 10
22 | - `-b` batch size
23 | - `-g` number of GPUs
24 | 
25 | 
26 | 
27 | Other things to note:
28 | 
29 | - Training uses the MNIST dataset by default; on first run the script downloads it into the `.data/` directory
30 | - After training, the model is saved into the `.checkpoint/` directory by default
31 | - The model structure and parameters follow the TensorFlow [official tutorial](https://www.tensorflow.org/tutorials/generative/dcgan)
32 | - The model periodically writes generated images into the `.gout/` directory and, after training, renders an animated gif of their evolution; generating the gif depends on the Python package `imageio`
33 | 
34 | ![dcgan demo](https://github.com/Oneflow-Inc/OneFlow-Benchmark/blob/dev_gan/Generative/pic/1.png)
35 | 
--------------------------------------------------------------------------------
/Generative/layers.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright 2020 The OneFlow Authors. All rights reserved.
3 | 
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 | 
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | """ 16 | import oneflow.compatible.single_client as flow 17 | 18 | def get_const_initializer(): 19 | return flow.constant_initializer(0.002) 20 | 21 | def deconv2d( 22 | input, 23 | filters, 24 | size, 25 | name, 26 | strides=2, 27 | trainable=True, 28 | reuse=False, 29 | const_init=False, 30 | use_bias=False, 31 | ): 32 | name_ = name if reuse == False else name + "_reuse" 33 | # weight : [in_channels, out_channels, height, width] 34 | weight_shape = (input.shape[1], filters, size, size) 35 | output_shape = ( 36 | input.shape[0], 37 | input.shape[1], 38 | input.shape[2] * strides, 39 | input.shape[3] * strides, 40 | ) 41 | 42 | weight = flow.get_variable( 43 | name + "-weight", 44 | shape=weight_shape, 45 | dtype=input.dtype, 46 | initializer=flow.random_normal_initializer(stddev=0.02) 47 | if not const_init 48 | else get_const_initializer(), 49 | trainable=trainable, 50 | ) 51 | 52 | output = flow.nn.conv2d_transpose( 53 | input, 54 | weight, 55 | strides=[strides, strides], 56 | output_shape=output_shape, 57 | padding="SAME", 58 | data_format="NCHW", 59 | name=name_, 60 | ) 61 | 62 | if use_bias: 63 | bias = flow.get_variable( 64 | name + "-bias", 65 | shape=(filters,), 66 | dtype=input.dtype, 67 | initializer=flow.constant_initializer(0.0), 68 | trainable=trainable, 69 | ) 70 | 71 | output = flow.nn.bias_add(output, bias, "NCHW") 72 | return output 73 | 74 | 75 | def conv2d( 76 | input, 77 | filters, 78 | size, 79 | name, 80 | strides=2, 81 | padding="same", 82 | trainable=True, 83 | reuse=False, 84 | const_init=False, 85 | use_bias=True, 86 | ): 87 | name_ = name if reuse == False else name + "_reuse" 88 | 89 | # (output_dim, k_h, k_w, input.shape[3]) if NHWC 90 | weight_shape = (filters, input.shape[1], size, size) 91 | weight = flow.get_variable( 92 | name + "-weight", 93 | shape=weight_shape, 94 | dtype=input.dtype, 95 | initializer=flow.random_normal_initializer(stddev=0.02) 96 | if not const_init 97 | else get_const_initializer(), 98 | trainable=trainable, 99 | reuse=reuse, 100 | ) 101 | 102 | output = flow.nn.compat_conv2d( 103 | input, 104 | weight, 105 | strides=[strides, strides], 106 | padding=padding, 107 | data_format="NCHW", 108 | name=name_, 109 | ) 110 | 111 | if use_bias: 112 | bias = flow.get_variable( 113 | name + "-bias", 114 | shape=(filters,), 115 | dtype=input.dtype, 116 | initializer=flow.constant_initializer(0.0), 117 | trainable=trainable, 118 | reuse=reuse, 119 | ) 120 | 121 | output = flow.nn.bias_add(output, bias, "NCHW") 122 | return output 123 | 124 | 125 | def batchnorm(input, name, axis=1, reuse=False): 126 | name_ = name if reuse == False else name + "_reuse" 127 | return flow.layers.batch_normalization(input, axis=axis, name=name_) 128 | 129 | def dense( 130 | input, units, name, use_bias=False, trainable=True, reuse=False, const_init=False 131 | ): 132 | name_ = name if reuse == False else name + "_reuse" 133 | 134 | in_shape = input.shape 135 | in_num_axes = len(in_shape) 136 | assert in_num_axes >= 2 137 | 138 | inputs = flow.reshape(input, (-1, in_shape[-1])) if in_num_axes > 2 else input 139 | 140 | weight = flow.get_variable( 141 | name="{}-weight".format(name), 142 | shape=(units, inputs.shape[1]), 143 | dtype=inputs.dtype, 144 | initializer=flow.random_normal_initializer(stddev=0.02) 145 | if not const_init 146 | else get_const_initializer(), 147 | trainable=trainable, 148 | reuse=reuse, 149 | model_name="weight", 150 | ) 151 | 152 | out = flow.matmul(a=inputs, b=weight, transpose_b=True, name=name_ + "matmul",) 153 | 154 | if use_bias: 
155 | bias = flow.get_variable( 156 | name="{}-bias".format(name), 157 | shape=(units,), 158 | dtype=inputs.dtype, 159 | initializer=flow.random_normal_initializer() 160 | if not const_init 161 | else get_const_initializer(), 162 | trainable=trainable, 163 | reuse=reuse, 164 | model_name="bias", 165 | ) 166 | out = flow.nn.bias_add(out, bias, name=name_ + "_bias_add") 167 | 168 | out = flow.reshape(out, in_shape[:-1] + (units,)) if in_num_axes > 2 else out 169 | return out 170 | -------------------------------------------------------------------------------- /Generative/pic/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/OneFlow-Benchmark/de4698a7acce3cca2eaf1564eaf25d59c9954b86/Generative/pic/1.png -------------------------------------------------------------------------------- /Generative/pic/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/OneFlow-Benchmark/de4698a7acce3cca2eaf1564eaf25d59c9954b86/Generative/pic/2.png -------------------------------------------------------------------------------- /LanguageModeling/BERT/classifier.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | import oneflow.compatible.single_client as flow 17 | import bert as bert_util 18 | 19 | 20 | def GlueBERT( 21 | input_ids_blob, 22 | input_mask_blob, 23 | token_type_ids_blob, 24 | label_blob, 25 | vocab_size, 26 | seq_length=512, 27 | hidden_size=768, 28 | num_hidden_layers=12, 29 | num_attention_heads=12, 30 | intermediate_size=3072, 31 | hidden_act="gelu", 32 | hidden_dropout_prob=0.1, 33 | attention_probs_dropout_prob=0.1, 34 | max_position_embeddings=512, 35 | type_vocab_size=16, 36 | initializer_range=0.02, 37 | label_num=2, 38 | replace_prob=None, 39 | ): 40 | backbone = bert_util.BertBackbone( 41 | input_ids_blob=input_ids_blob, 42 | input_mask_blob=input_mask_blob, 43 | token_type_ids_blob=token_type_ids_blob, 44 | vocab_size=vocab_size, 45 | seq_length=seq_length, 46 | hidden_size=hidden_size, 47 | num_hidden_layers=num_hidden_layers, 48 | num_attention_heads=num_attention_heads, 49 | intermediate_size=intermediate_size, 50 | hidden_act=hidden_act, 51 | hidden_dropout_prob=hidden_dropout_prob, 52 | attention_probs_dropout_prob=attention_probs_dropout_prob, 53 | max_position_embeddings=max_position_embeddings, 54 | type_vocab_size=type_vocab_size, 55 | initializer_range=initializer_range, 56 | ) 57 | pooled_output = PooledOutput( 58 | sequence_output=backbone.sequence_output(), 59 | hidden_size=hidden_size, 60 | initializer_range=initializer_range 61 | ) 62 | loss, _, logit_blob = _AddClassficationLoss( 63 | input_blob=pooled_output, 64 | label_blob=label_blob, 65 | hidden_size=hidden_size, 66 | label_num=label_num, 67 | initializer_range=initializer_range, 68 | scope_name='classification' 69 | ) 70 | 71 | return loss, logit_blob 72 | 73 | 74 | def PooledOutput(sequence_output, hidden_size, initializer_range): 75 | with flow.scope.namespace("bert-pooler"): 76 | first_token_tensor = flow.slice( 77 | sequence_output, [None, 0, 0], [None, 1, -1]) 78 | first_token_tensor = flow.reshape( 79 | first_token_tensor, [-1, hidden_size]) 80 | pooled_output = bert_util._FullyConnected( 81 | first_token_tensor, 82 | input_size=hidden_size, 83 | units=hidden_size, 84 | weight_initializer=bert_util.CreateInitializer(initializer_range), 85 | name="dense", 86 | ) 87 | pooled_output = flow.math.tanh(pooled_output) 88 | return pooled_output 89 | 90 | 91 | def _AddClassficationLoss(input_blob, label_blob, hidden_size, label_num, initializer_range, 92 | scope_name='classification'): 93 | with flow.scope.namespace(scope_name): 94 | output_weight_blob = flow.get_variable( 95 | name="output_weights", 96 | shape=[label_num, hidden_size], 97 | dtype=input_blob.dtype, 98 | # initializer=bert_util.CreateInitializer(initializer_range), 99 | initializer=flow.random_normal_initializer( 100 | mean=0.0, stddev=initializer_range, seed=None, dtype=None) 101 | ) 102 | output_bias_blob = flow.get_variable( 103 | name="output_bias", 104 | shape=[label_num], 105 | dtype=input_blob.dtype, 106 | initializer=flow.constant_initializer(0.0), 107 | ) 108 | logit_blob = flow.matmul( 109 | input_blob, output_weight_blob, transpose_b=True) 110 | logit_blob = flow.nn.bias_add(logit_blob, output_bias_blob) 111 | pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits( 112 | logits=logit_blob, labels=label_blob 113 | ) 114 | loss = pre_example_loss 115 | return loss, pre_example_loss, logit_blob 116 | -------------------------------------------------------------------------------- /LanguageModeling/BERT/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | 
Copyright 2020 The OneFlow Authors. All rights reserved.
3 | 
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 | 
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | """
16 | 
17 | import argparse
18 | from datetime import datetime
19 | 
20 | 
21 | def str_list(x):
22 |     return x.split(",")
23 | 
24 | 
25 | def int_list(x):
26 |     return list(map(int, x.split(",")))
27 | 
28 | 
29 | def float_list(x):
30 |     return list(map(float, x.split(",")))
31 | 
32 | 
33 | def str2bool(v):
34 |     if v.lower() in ("yes", "true", "t", "y", "1"):
35 |         return True
36 |     elif v.lower() in ("no", "false", "f", "n", "0"):
37 |         return False
38 |     else:
39 |         raise argparse.ArgumentTypeError("Unsupported value encountered.")
40 | 
41 | 
42 | def get_parser(parser=None):
43 | 
44 |     parser = argparse.ArgumentParser(description="flags for bert")
45 | 
46 |     parser.add_argument(
47 |         "--do_train", type=str2bool, nargs="?", const=True, help="train or not"
48 |     )
49 |     parser.add_argument(
50 |         "--do_eval", type=str2bool, nargs="?", const=True, help="eval or not"
51 |     )
52 |     # resource
53 |     parser.add_argument("--model", type=str, default="BERT Pretrain")
54 |     parser.add_argument("--gpu_num_per_node", type=int, default=1)
55 |     parser.add_argument(
56 |         "--num_nodes", type=int, default=1, help="node/machine number for training"
57 |     )
58 |     parser.add_argument(
59 |         "--node_ips",
60 |         type=str_list,
61 |         default=["192.168.1.13", "192.168.1.14"],
62 |         help='node ip list for training, divided by ",", length >= num_nodes',
63 |     )
64 |     parser.add_argument(
65 |         "--ctrl_port", type=int, default=50051, help="ctrl_port for multinode job"
66 |     )
67 | 
68 |     # train
69 |     parser.add_argument(
70 |         "--learning_rate", type=float, default=1e-4, help="Learning rate"
71 |     )
72 |     parser.add_argument(
73 |         "--weight_decay_rate", type=float, default=0.01, help="weight decay rate"
74 |     )
75 |     parser.add_argument("--warmup_proportion", type=float, default=0.1)
76 |     parser.add_argument(
77 |         "--use_fp16",
78 |         type=str2bool,
79 |         nargs="?",
80 |         default="False",
81 |         const=True,
82 |         help="use fp16 or not",
83 |     )
84 |     parser.add_argument(
85 |         "--use_xla", type=str2bool, nargs="?", const=True, help="whether to use xla"
86 |     )
87 |     parser.add_argument(
88 |         "--num_accumulation_steps",
89 |         type=int,
90 |         default=1,
91 |         help="Number of accumulation steps before gradient update; global batch size = num_accumulation_steps * train_batch_size",
92 |     )
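    # Worked example: train_batch_size = 64 with num_accumulation_steps = 8
    # accumulates gradients over 8 micro-batches per optimizer update,
    # for an effective global batch size of 64 * 8 = 512.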
93 |     parser.add_argument(
94 |         "--optimizer_type",
95 |         type=str,
96 |         default="adam",
97 |         help="Optimizer used for training - LAMB or ADAM",
98 |     )
99 | 
100 |     # log and restore/save
101 |     parser.add_argument(
102 |         "--loss_print_every_n_iter",
103 |         type=int,
104 |         default=10,
105 |         required=False,
106 |         help="print loss every n iterations",
107 |     )
108 |     parser.add_argument(
109 |         "--model_save_every_n_iter",
110 |         type=int,
111 |         default=10000,
112 |         required=False,
113 |         help="save the model every n iterations",
114 |     )
115 |     parser.add_argument(
116 |         "--model_save_dir",
117 |         type=str,
118 |         default="./output/model_save-{}".format(
119 |             str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
120 |         ),
121 |         required=False,
122 |         help="model save directory",
123 |     )
124 |     parser.add_argument(
125 |         "--model_save_init",
126 |         action="store_true",
127 |         default=False,
128 |         help="save a snapshot of the freshly initialized model",
129 |     )
130 | 
131 |     parser.add_argument(
132 |         "--save_last_snapshot",
133 |         type=str2bool,
134 |         default=False,
135 |         required=False,
136 |         help="save a model snapshot at the last iteration",
137 |     )
138 |     parser.add_argument(
139 |         "--model_load_dir", type=str, default=None, help="model load directory"
140 |     )
141 |     parser.add_argument(
142 |         "--log_dir", type=str, default="./output", help="log info save directory"
143 |     )
144 | 
145 |     # bert backbone
146 |     parser.add_argument(
147 |         "--do_lower_case", type=str2bool, nargs="?", const=True, default="True"
148 |     )
149 |     parser.add_argument("--seq_length", type=int, default=512)
150 |     parser.add_argument("--max_predictions_per_seq", type=int, default=80)
151 |     parser.add_argument("--num_hidden_layers", type=int, default=24)
152 |     parser.add_argument("--num_attention_heads", type=int, default=16)
153 |     parser.add_argument("--max_position_embeddings", type=int, default=512)
154 |     parser.add_argument("--type_vocab_size", type=int, default=2)
155 |     parser.add_argument("--vocab_size", type=int, default=30522)
156 |     parser.add_argument("--attention_probs_dropout_prob", type=float, default=0.1)
157 |     parser.add_argument("--hidden_dropout_prob", type=float, default=0.1)
158 |     parser.add_argument("--hidden_size_per_head", type=int, default=64)
159 | 
160 |     return parser
161 | 
162 | 
163 | def print_args(args):
164 |     print("=".ljust(66, "="))
165 |     print(
166 |         "Running {}: num_gpu_per_node = {}, num_nodes = {}.".format(
167 |             args.model, args.gpu_num_per_node, args.num_nodes
168 |         )
169 |     )
170 |     print("=".ljust(66, "="))
171 |     for arg in vars(args):
172 |         print("{} = {}".format(arg, getattr(args, arg)))
173 |     print("-".ljust(66, "-"))
174 |     print("Time stamp: {}".format(str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))))
175 | 
176 | 
177 | if __name__ == "__main__":
178 |     parser = get_parser()
179 |     args = parser.parse_args()
180 |     print_args(args)
--------------------------------------------------------------------------------
/LanguageModeling/BERT/convert_tf_ckpt_to_of.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright 2020 The OneFlow Authors. All rights reserved.
3 | 
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 | 
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | """ 16 | """Convert tensorflow checkpoint to oneflow snapshot""" 17 | 18 | import re 19 | import argparse 20 | import tensorflow as tf 21 | import numpy as np 22 | import os 23 | 24 | parser = argparse.ArgumentParser() 25 | 26 | ## Required parameters 27 | parser.add_argument("--tf_checkpoint_path", 28 | default = None, 29 | type = str, 30 | required = True, 31 | help = "Path the TensorFlow checkpoint path.") 32 | parser.add_argument("--of_dump_path", 33 | default = None, 34 | type = str, 35 | required = True, 36 | help = "Path to the output OneFlow model.") 37 | 38 | #args = parser.parse_args() 39 | args, unknown = parser.parse_known_args() 40 | print(args) 41 | 42 | # parse unknown arguments for extra weights 43 | extra_weights = {} 44 | for u in unknown: 45 | w = u.split("=") 46 | assert len(w) == 2 47 | if len(w) == 2: 48 | extra_weights[w[0]] = float(w[1]) 49 | 50 | 51 | def _write_blob(folder, blob): 52 | os.makedirs(folder, exist_ok=True) 53 | filename = os.path.join(folder, "out") 54 | f = open(filename, 'wb') 55 | f.write(blob.tobytes()) 56 | f.close() 57 | print(filename, blob.shape) 58 | 59 | def _SaveWeightBlob2File(blob, folder): 60 | _write_blob(folder, blob) 61 | 62 | for weight, default_value in extra_weights.items(): 63 | d = np.full_like(blob, default_value) 64 | _write_blob(folder + weight, d) 65 | 66 | def convert(): 67 | path = args.tf_checkpoint_path 68 | init_vars = tf.train.list_variables(path) 69 | for name, shape in init_vars: 70 | array = tf.train.load_variable(path, name) 71 | 72 | sep = name.rfind('/') 73 | blob_name = name[sep + 1:] 74 | op_name = name[:sep].replace('/', '-') 75 | 76 | if blob_name == "kernel": 77 | blob_name = "weight" 78 | elif blob_name in ['adam_m', 'adam_v']: 79 | print("find m, v weights") 80 | 81 | folder_name = op_name+"-"+blob_name 82 | folder = os.path.join(args.of_dump_path, folder_name) 83 | #print("saved to:", folder) 84 | 85 | _SaveWeightBlob2File(array, folder) 86 | 87 | 88 | if __name__ == "__main__": 89 | convert() 90 | 91 | -------------------------------------------------------------------------------- /LanguageModeling/BERT/pretrain.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | import oneflow.compatible.single_client as flow 17 | import bert as bert_util 18 | import oneflow.core.operator.op_conf_pb2 as op_conf_util 19 | 20 | 21 | def PreTrain( 22 | input_ids_blob, 23 | input_mask_blob, 24 | token_type_ids_blob, 25 | masked_lm_positions_blob, 26 | masked_lm_ids_blob, 27 | masked_lm_weights_blob, 28 | next_sentence_label_blob, 29 | vocab_size, 30 | seq_length=512, 31 | hidden_size=768, 32 | num_hidden_layers=12, 33 | num_attention_heads=12, 34 | intermediate_size=3072, 35 | hidden_act="gelu", 36 | hidden_dropout_prob=0.1, 37 | attention_probs_dropout_prob=0.1, 38 | max_position_embeddings=512, 39 | type_vocab_size=16, 40 | max_predictions_per_seq=20, 41 | initializer_range=0.02, 42 | use_fp16=False, 43 | ): 44 | backbone = bert_util.BertBackbone( 45 | input_ids_blob=input_ids_blob, 46 | input_mask_blob=input_mask_blob, 47 | token_type_ids_blob=token_type_ids_blob, 48 | vocab_size=vocab_size, 49 | seq_length=seq_length, 50 | hidden_size=hidden_size, 51 | num_hidden_layers=num_hidden_layers, 52 | num_attention_heads=num_attention_heads, 53 | intermediate_size=intermediate_size, 54 | hidden_act=hidden_act, 55 | hidden_dropout_prob=hidden_dropout_prob, 56 | attention_probs_dropout_prob=attention_probs_dropout_prob, 57 | max_position_embeddings=max_position_embeddings, 58 | type_vocab_size=type_vocab_size, 59 | initializer_range=initializer_range, 60 | ) 61 | 62 | (lm_loss, _, _) = _AddMaskedLanguageModelLoss( 63 | input_blob=backbone.sequence_output(), 64 | output_weights_blob=backbone.embedding_table(), 65 | positions_blob=masked_lm_positions_blob, 66 | label_id_blob=masked_lm_ids_blob, 67 | label_weight_blob=masked_lm_weights_blob, 68 | seq_length=seq_length, 69 | hidden_size=hidden_size, 70 | vocab_size=vocab_size, 71 | max_predictions_per_seq=max_predictions_per_seq, 72 | hidden_act=bert_util.GetActivation(hidden_act), 73 | initializer_range=initializer_range, 74 | ) 75 | pooled_output = PooledOutput( 76 | backbone.sequence_output(), hidden_size, initializer_range 77 | ) 78 | (ns_loss, _, _) = _AddNextSentenceOutput( 79 | input_blob=pooled_output, 80 | label_blob=next_sentence_label_blob, 81 | hidden_size=hidden_size, 82 | initializer_range=initializer_range, 83 | ) 84 | with flow.scope.namespace("cls-loss"): 85 | lm_loss = flow.math.reduce_mean(lm_loss) 86 | ns_loss = flow.math.reduce_mean(ns_loss) 87 | total_loss = lm_loss + ns_loss 88 | return total_loss, lm_loss, ns_loss 89 | 90 | 91 | def PooledOutput(sequence_output, hidden_size, initializer_range): 92 | with flow.scope.namespace("bert-pooler"): 93 | first_token_tensor = flow.slice(sequence_output, [None, 0, 0], [None, 1, -1]) 94 | first_token_tensor = flow.reshape(first_token_tensor, [-1, hidden_size]) 95 | pooled_output = bert_util._FullyConnected( 96 | first_token_tensor, 97 | input_size=hidden_size, 98 | units=hidden_size, 99 | weight_initializer=bert_util.CreateInitializer(initializer_range), 100 | name="dense", 101 | ) 102 | pooled_output = flow.math.tanh(pooled_output) 103 | return pooled_output 104 | 105 | 106 | def _AddMaskedLanguageModelLoss( 107 | input_blob, 108 | output_weights_blob, 109 | positions_blob, 110 | label_id_blob, 111 | label_weight_blob, 112 | seq_length, 113 | hidden_size, 114 | vocab_size, 115 | max_predictions_per_seq, 116 | hidden_act, 117 | initializer_range, 118 | ): 119 | with flow.scope.namespace("other"): 120 | sum_label_weight_blob = flow.math.reduce_sum(label_weight_blob, axis=[-1]) 121 | ones = sum_label_weight_blob * 0.0 + 1.0 122 | sum_label_weight_blob = 
flow.math.reduce_sum(sum_label_weight_blob) 123 | batch_size = flow.math.reduce_sum(ones) 124 | sum_label_weight_blob = sum_label_weight_blob / batch_size 125 | with flow.scope.namespace("cls-predictions"): 126 | input_blob = _GatherIndexes(input_blob, positions_blob, seq_length, hidden_size) 127 | with flow.scope.namespace("transform"): 128 | if callable(hidden_act): 129 | act_fn = op_conf_util.kNone 130 | else: 131 | act_fn = hidden_act 132 | input_blob = bert_util._FullyConnected( 133 | input_blob, 134 | input_size=hidden_size, 135 | units=hidden_size, 136 | activation=act_fn, 137 | weight_initializer=bert_util.CreateInitializer(initializer_range), 138 | name="dense", 139 | ) 140 | if callable(hidden_act): 141 | input_blob = hidden_act(input_blob) 142 | input_blob = bert_util._LayerNorm(input_blob, hidden_size) 143 | output_bias = flow.get_variable( 144 | name="output_bias", 145 | shape=[vocab_size], 146 | dtype=input_blob.dtype, 147 | initializer=flow.constant_initializer(1.0), 148 | ) 149 | logit_blob = flow.matmul(input_blob, output_weights_blob, transpose_b=True) 150 | logit_blob = flow.nn.bias_add(logit_blob, output_bias) 151 | label_id_blob = flow.reshape(label_id_blob, [-1]) 152 | pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits( 153 | logits=logit_blob, labels=label_id_blob 154 | ) 155 | pre_example_loss = flow.reshape(pre_example_loss, [-1, max_predictions_per_seq]) 156 | numerator = pre_example_loss * label_weight_blob 157 | with flow.scope.namespace("loss"): 158 | numerator = flow.math.reduce_sum(numerator, axis=[-1]) 159 | denominator = sum_label_weight_blob + 1e-5 160 | loss = numerator / denominator 161 | return loss, pre_example_loss, logit_blob 162 | 163 | 164 | def _GatherIndexes(sequence_blob, positions_blob, seq_length, hidden_size): 165 | output = flow.gather( 166 | params=sequence_blob, indices=positions_blob, axis=2, batch_dims=2 167 | ) 168 | output = flow.reshape(output, [-1, hidden_size]) 169 | return output 170 | 171 | 172 | def _AddNextSentenceOutput(input_blob, label_blob, hidden_size, initializer_range): 173 | with flow.scope.namespace("cls-seq_relationship"): 174 | output_weight_blob = flow.get_variable( 175 | name="output_weights", 176 | shape=[2, hidden_size], 177 | dtype=input_blob.dtype, 178 | initializer=bert_util.CreateInitializer(initializer_range), 179 | ) 180 | output_bias_blob = flow.get_variable( 181 | name="output_bias", 182 | shape=[2], 183 | dtype=input_blob.dtype, 184 | initializer=flow.constant_initializer(0.0), 185 | ) 186 | logit_blob = flow.matmul(input_blob, output_weight_blob, transpose_b=True) 187 | logit_blob = flow.nn.bias_add(logit_blob, output_bias_blob) 188 | pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits( 189 | logits=logit_blob, labels=label_blob 190 | ) 191 | loss = pre_example_loss 192 | return loss, pre_example_loss, logit_blob 193 | -------------------------------------------------------------------------------- /LanguageModeling/BERT/run_pretraining.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | import os 18 | import argparse 19 | from datetime import datetime 20 | 21 | import config as configs 22 | import oneflow.compatible.single_client as flow 23 | 24 | from pretrain import PreTrain 25 | from util import Snapshot, InitNodes, Metric, CreateOptimizer, GetFunctionConfig 26 | 27 | parser = configs.get_parser() 28 | parser.add_argument("--data_dir", type=str, default=None) 29 | parser.add_argument( 30 | "--data_part_num", type=int, default=32, help="number of data part files in the dataset" 31 | ) 32 | parser.add_argument( 33 | "--iter_num", type=int, default=1144000, help="total iterations to run" 34 | ) 35 | parser.add_argument("--batch_size_per_device", type=int, default=64) 36 | args = parser.parse_args() 37 | configs.print_args(args) 38 | 39 | batch_size = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device 40 | 41 | 42 | def BertDecoder( 43 | data_dir, batch_size, data_part_num, seq_length, max_predictions_per_seq 44 | ): 45 | ofrecord = flow.data.ofrecord_reader( 46 | data_dir, 47 | batch_size=batch_size, 48 | data_part_num=data_part_num, 49 | random_shuffle=True, 50 | shuffle_after_epoch=True, 51 | ) 52 | blob_confs = {} 53 | 54 | def _blob_conf(name, shape, dtype=flow.int32): 55 | blob_confs[name] = flow.data.OFRecordRawDecoder( 56 | ofrecord, name, shape=shape, dtype=dtype 57 | ) 58 | 59 | _blob_conf("input_ids", [seq_length]) 60 | _blob_conf("next_sentence_labels", [1]) 61 | _blob_conf("input_mask", [seq_length]) 62 | _blob_conf("segment_ids", [seq_length]) 63 | _blob_conf("masked_lm_ids", [max_predictions_per_seq]) 64 | _blob_conf("masked_lm_positions", [max_predictions_per_seq]) 65 | _blob_conf("masked_lm_weights", [max_predictions_per_seq], flow.float) 66 | return blob_confs 67 | 68 | 69 | @flow.global_function(type="train", function_config=GetFunctionConfig(args)) 70 | def PretrainJob(): 71 | hidden_size = 64 * args.num_attention_heads # H = 64, size per head 72 | intermediate_size = hidden_size * 4 73 | 74 | if args.data_part_num == 1: 75 | with flow.scope.placement("cpu", "0:0"): 76 | decoders = BertDecoder( 77 | args.data_dir, 78 | batch_size, 79 | args.data_part_num, 80 | args.seq_length, 81 | args.max_predictions_per_seq, 82 | ) 83 | else: 84 | assert args.data_part_num > 1 85 | decoders = BertDecoder( 86 | args.data_dir, 87 | batch_size, 88 | args.data_part_num, 89 | args.seq_length, 90 | args.max_predictions_per_seq, 91 | ) 92 | 93 | total_loss, mlm_loss, nsp_loss = PreTrain( 94 | decoders["input_ids"], 95 | decoders["input_mask"], 96 | decoders["segment_ids"], 97 | decoders["masked_lm_positions"], 98 | decoders["masked_lm_ids"], 99 | decoders["masked_lm_weights"], 100 | decoders["next_sentence_labels"], 101 | args.vocab_size, 102 | seq_length=args.seq_length, 103 | hidden_size=hidden_size, 104 | num_hidden_layers=args.num_hidden_layers, 105 | num_attention_heads=args.num_attention_heads, 106 | intermediate_size=intermediate_size, 107 | hidden_act="gelu", 108 | hidden_dropout_prob=args.hidden_dropout_prob, 109 | attention_probs_dropout_prob=args.attention_probs_dropout_prob, 110 | 
max_position_embeddings=args.max_position_embeddings, 111 | type_vocab_size=args.type_vocab_size, 112 | max_predictions_per_seq=args.max_predictions_per_seq, 113 | initializer_range=0.02, 114 | use_fp16=args.use_fp16, 115 | ) 116 | opt = CreateOptimizer(args) 117 | opt.minimize(total_loss) 118 | return {"total_loss": total_loss, "mlm_loss": mlm_loss, "nsp_loss": nsp_loss} 119 | 120 | 121 | def main(): 122 | flow.config.gpu_device_num(args.gpu_num_per_node) 123 | flow.env.log_dir(args.log_dir) 124 | 125 | InitNodes(args) 126 | 127 | snapshot = Snapshot(args.model_save_dir, args.model_load_dir, args.model_save_init) 128 | 129 | print("num_accumulation_steps:", args.num_accumulation_steps) 130 | metric = Metric( 131 | desc="train", 132 | print_steps=args.loss_print_every_n_iter, 133 | batch_size=batch_size * args.num_accumulation_steps, 134 | keys=["total_loss", "mlm_loss", "nsp_loss"], 135 | ) 136 | 137 | for step in range(args.iter_num): 138 | PretrainJob().async_get(metric.metric_cb(step)) 139 | # PretrainJob().async_get(metric.metric_cb(step, epoch=3)) 140 | if (step + 1) % args.model_save_every_n_iter == 0: 141 | snapshot.save("snapshot_%d" % (step + 1)) 142 | 143 | if args.save_last_snapshot: 144 | snapshot.save("last_snapshot") 145 | 146 | 147 | if __name__ == "__main__": 148 | main() 149 | -------------------------------------------------------------------------------- /LanguageModeling/BERT/run_pretraining_adam.sh: -------------------------------------------------------------------------------- 1 | BENCH_ROOT_DIR=/path/to/OneFlow-Benchmark/LanguageModeling/BERT 2 | OUTPUT_DIR=/DATA/disk1/of_output 3 | 4 | DATA_DIR=/DATA/disk1/bert/wiki_seq_len_128 5 | 6 | 7 | BZ=48 8 | ITER_NUM=1000000 9 | max_seq_length=128 10 | max_predictions_per_seq=20 11 | 12 | of_log_dir=$OUTPUT_DIR/bert_master/of 13 | rm -rf ${of_log_dir} 14 | mkdir -p ${of_log_dir} 15 | rm -rf core.* 16 | 17 | export PYTHONUNBUFFERED=1 18 | export ONEFLOW_DEBUG_MODE=True 19 | export GLOG_v=3 20 | export CUDA_VISIBLE_DEVICES=6 21 | python3 $BENCH_ROOT_DIR/run_pretraining.py \ 22 | --gpu_num_per_node=1 \ 23 | --num_nodes=1 \ 24 | --learning_rate=1.25e-5 \ 25 | --warmup_proportion=0.01 \ 26 | --weight_decay_rate=0.01 \ 27 | --batch_size_per_device=${BZ} \ 28 | --iter_num=${ITER_NUM} \ 29 | --loss_print_every_n_iter=1 \ 30 | --seq_length=128 \ 31 | --use_fp16 \ 32 | --max_predictions_per_seq=20 \ 33 | --num_hidden_layers=12 \ 34 | --num_attention_heads=12 \ 35 | --num_accumulation_steps=1 \ 36 | --max_position_embeddings=512 \ 37 | --type_vocab_size=2 \ 38 | --vocab_size=30522 \ 39 | --attention_probs_dropout_prob=0.1 \ 40 | --hidden_dropout_prob=0.1 \ 41 | --hidden_size_per_head=64 \ 42 | --data_part_num=64 \ 43 | --data_dir=$DATA_DIR \ 44 | --log_dir=${of_log_dir} \ 45 | --model_save_every_n_iter=50000 \ 46 | --model_save_dir=${of_log_dir} 47 | -------------------------------------------------------------------------------- /LanguageModeling/BERT/run_pretraining_lamb.sh: -------------------------------------------------------------------------------- 1 | BENCH_ROOT_DIR=/path/to/OneFlow-Benchmark/LanguageModeling/BERT 2 | OUTPUT_DIR=/DATA/disk1/of_output 3 | 4 | DATA_DIR=/DATA/disk1/bert/wiki_seq_len_128 5 | 6 | 7 | BZ=16 8 | ITER_NUM=1000000 9 | max_seq_length=128 10 | max_predictions_per_seq=20 11 | 12 | of_log_dir=$OUTPUT_DIR/bert_master/of 13 | rm -rf ${of_log_dir} 14 | mkdir -p ${of_log_dir} 15 | rm -rf core.* 16 | 17 | export PYTHONUNBUFFERED=1 18 | export ONEFLOW_DEBUG_MODE=True 19 | export GLOG_v=3 20 | 21 | python3 
$BENCH_ROOT_DIR/run_pretraining.py \ 22 | --gpu_num_per_node=8 \ 23 | --num_nodes=1 \ 24 | --learning_rate=1e-4 \ 25 | --warmup_proportion=0.01 \ 26 | --weight_decay_rate=0.01 \ 27 | --batch_size_per_device=${BZ} \ 28 | --iter_num=${ITER_NUM} \ 29 | --loss_print_every_n_iter=1 \ 30 | --seq_length=128 \ 31 | --use_fp16 \ 32 | --optimizer_type="lamb" \ 33 | --max_predictions_per_seq=20 \ 34 | --num_hidden_layers=12 \ 35 | --num_attention_heads=12 \ 36 | --num_accumulation_steps=512 \ 37 | --max_position_embeddings=512 \ 38 | --type_vocab_size=2 \ 39 | --vocab_size=30522 \ 40 | --attention_probs_dropout_prob=0.1 \ 41 | --hidden_dropout_prob=0.1 \ 42 | --hidden_size_per_head=64 \ 43 | --data_part_num=64 \ 44 | --data_dir=$DATA_DIR \ 45 | --log_dir=${of_log_dir} \ 46 | --model_save_every_n_iter=50000 \ 47 | --model_save_dir=${of_log_dir} 48 | -------------------------------------------------------------------------------- /LanguageModeling/BERT/run_squad.sh: -------------------------------------------------------------------------------- 1 | BENCH_ROOT_DIR=/path/to/OneFlow-Benchmark/LanguageModeling/BERT 2 | # pretrained model dir 3 | PRETRAINED_MODEL=/DATA/disk1/of_output/uncased_L-12_H-768_A-12_oneflow 4 | 5 | # squad ofrecord dataset dir 6 | DATA_ROOT=/DATA/disk1/of_output/bert/of_squad 7 | 8 | # `vocab.txt` dir 9 | REF_ROOT_DIR=/DATA/disk1/of_output/uncased_L-12_H-768_A-12 10 | 11 | # `evaluate-v*.py` and `dev-v*.json` dir 12 | SQUAD_TOOL_DIR=/DATA/disk1/of_output/bert/of_squad 13 | db_version=${1:-"v2.0"} 14 | if [ "$db_version" = "v1.1" ]; then 15 | train_example_num=88614 16 | eval_example_num=10833 17 | version_2_with_negative="False" 18 | elif [ "$db_version" = "v2.0" ]; then 19 | train_example_num=131944 20 | eval_example_num=12232 21 | version_2_with_negative="True" 22 | else 23 | echo "db_version must be 'v1.1' or 'v2.0'" 24 | exit 1 25 | fi 26 | 27 | train_data_dir=$DATA_ROOT/train-$db_version 28 | eval_data_dir=$DATA_ROOT/dev-$db_version 29 | LOGFILE=./bert_fp_training.log 30 | export PYTHONUNBUFFERED=1 31 | export ONEFLOW_DEBUG_MODE=True 32 | export CUDA_VISIBLE_DEVICES=7 33 | # finetune and eval SQuAD, 34 | # `predictions.json` will be saved to folder `./squad_output` 35 | python3 $BENCH_ROOT_DIR/run_squad.py \ 36 | --model=SQuAD \ 37 | --do_train=True \ 38 | --do_eval=True \ 39 | --gpu_num_per_node=1 \ 40 | --learning_rate=3e-5 \ 41 | --batch_size_per_device=16 \ 42 | --eval_batch_size_per_device=16 \ 43 | --num_epoch=3 \ 44 | --use_fp16 \ 45 | --version_2_with_negative=$version_2_with_negative \ 46 | --loss_print_every_n_iter=20 \ 47 | --do_lower_case=True \ 48 | --seq_length=384 \ 49 | --num_hidden_layers=12 \ 50 | --num_attention_heads=12 \ 51 | --max_position_embeddings=512 \ 52 | --type_vocab_size=2 \ 53 | --vocab_size=30522 \ 54 | --attention_probs_dropout_prob=0.1 \ 55 | --hidden_dropout_prob=0.1 \ 56 | --hidden_size_per_head=64 \ 57 | --train_data_dir=$train_data_dir \ 58 | --train_example_num=$train_example_num \ 59 | --eval_data_dir=$eval_data_dir \ 60 | --eval_example_num=$eval_example_num \ 61 | --log_dir=./log \ 62 | --model_load_dir=${PRETRAINED_MODEL} \ 63 | --save_last_snapshot=True \ 64 | --model_save_dir=./squad_snapshots \ 65 | --vocab_file=$REF_ROOT_DIR/vocab.txt \ 66 | --predict_file=$SQUAD_TOOL_DIR/dev-${db_version}.json \ 67 | --output_dir=./squad_output 2>&1 | tee ${LOGFILE} 68 | 69 | 70 | # evaluate predictions.json to get metrics 71 | python3 $SQUAD_TOOL_DIR/evaluate-${db_version}.py \ 72 | $SQUAD_TOOL_DIR/dev-${db_version}.json \ 73 | 
./squad_output/predictions.json 74 | -------------------------------------------------------------------------------- /LanguageModeling/BERT/squad.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | import oneflow.compatible.single_client as flow 17 | import bert as bert_util 18 | 19 | 20 | def SQuAD( 21 | input_ids_blob, 22 | input_mask_blob, 23 | token_type_ids_blob, 24 | vocab_size, 25 | seq_length=512, 26 | hidden_size=768, 27 | num_hidden_layers=12, 28 | num_attention_heads=12, 29 | intermediate_size=3072, 30 | hidden_act="gelu", 31 | hidden_dropout_prob=0.1, 32 | attention_probs_dropout_prob=0.1, 33 | max_position_embeddings=512, 34 | type_vocab_size=16, 35 | initializer_range=0.02, 36 | ): 37 | 38 | backbone = bert_util.BertBackbone( 39 | input_ids_blob=input_ids_blob, 40 | input_mask_blob=input_mask_blob, 41 | token_type_ids_blob=token_type_ids_blob, 42 | vocab_size=vocab_size, 43 | seq_length=seq_length, 44 | hidden_size=hidden_size, 45 | num_hidden_layers=num_hidden_layers, 46 | num_attention_heads=num_attention_heads, 47 | intermediate_size=intermediate_size, 48 | hidden_act=hidden_act, 49 | hidden_dropout_prob=hidden_dropout_prob, 50 | attention_probs_dropout_prob=attention_probs_dropout_prob, 51 | max_position_embeddings=max_position_embeddings, 52 | type_vocab_size=type_vocab_size, 53 | initializer_range=initializer_range, 54 | ) 55 | 56 | with flow.scope.namespace("cls-squad"): 57 | final_hidden = backbone.sequence_output() 58 | final_hidden_matrix = flow.reshape(final_hidden, [-1, hidden_size]) 59 | logits = bert_util._FullyConnected( 60 | final_hidden_matrix, 61 | hidden_size, 62 | units=2, 63 | weight_initializer=bert_util.CreateInitializer(initializer_range), 64 | name='output') 65 | logits = flow.reshape(logits, [-1, seq_length, 2]) 66 | 67 | start_logits = flow.slice(logits, [None, None, 0], [None, None, 1]) 68 | end_logits = flow.slice(logits, [None, None, 1], [None, None, 1]) 69 | 70 | return start_logits, end_logits 71 | -------------------------------------------------------------------------------- /LanguageModeling/BERT/util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | 17 | import os 18 | import time 19 | from collections import OrderedDict 20 | import oneflow.compatible.single_client as flow 21 | 22 | 23 | def InitNodes(args): 24 | if args.num_nodes > 1: 25 | assert args.num_nodes <= len(args.node_ips) 26 | flow.env.ctrl_port(args.ctrl_port) 27 | nodes = [] 28 | for ip in args.node_ips[: args.num_nodes]: 29 | addr_dict = {} 30 | addr_dict["addr"] = ip 31 | nodes.append(addr_dict) 32 | 33 | flow.env.machine(nodes) 34 | 35 | 36 | class Snapshot(object): 37 | def __init__(self, model_save_dir, model_load_dir, model_save_init=False): 38 | self._model_save_dir = model_save_dir 39 | if model_load_dir: 40 | assert os.path.isdir(model_load_dir) 41 | print("Restoring model from {}.".format(model_load_dir)) 42 | flow.load_variables(flow.checkpoint.get(model_load_dir)) 43 | elif model_save_init: 44 | flow.checkpoint.save("initial_model") 45 | print("Init model on demand.") 46 | 47 | def save(self, name): 48 | snapshot_save_path = os.path.join( 49 | self._model_save_dir, "snapshot_{}".format(name) 50 | ) 51 | if not os.path.exists(snapshot_save_path): 52 | os.makedirs(snapshot_save_path) 53 | print("Saving model to {}.".format(snapshot_save_path)) 54 | flow.checkpoint.save(snapshot_save_path) 55 | 56 | 57 | class StopWatch(object): 58 | def __init__(self): 59 | pass 60 | 61 | def start(self): 62 | self.start_time = time.time() 63 | self.last_split = self.start_time 64 | 65 | def split(self): 66 | now = time.time() 67 | duration = now - self.last_split 68 | self.last_split = now 69 | return duration 70 | 71 | def stop(self): 72 | self.stop_time = time.time() 73 | 74 | def duration(self): 75 | return self.stop_time - self.start_time 76 | 77 | 78 | class Metric(object): 79 | def __init__( 80 | self, 81 | desc="train", 82 | print_steps=-1, 83 | batch_size=256, 84 | keys=[], 85 | nvidia_smi_report_step=10, 86 | ): 87 | r"""accumulate and calculate metric 88 | 89 | Args: 90 | desc: `str` general description of the metric to show 91 | print_steps: `Int` print metrics every nth steps 92 | batch_size: `Int` batch size per step 93 | keys: keys in callback outputs 94 | Returns: 95 | """ 96 | self.desc = desc 97 | self.print_steps = print_steps 98 | assert batch_size > 0 99 | self.batch_size = batch_size 100 | self.nvidia_smi_report_step = nvidia_smi_report_step 101 | 102 | assert isinstance(keys, (list, tuple)) 103 | self.keys = keys 104 | self.metric_dict = OrderedDict() 105 | self.metric_dict["step"] = 0 106 | 107 | self.timer = StopWatch() 108 | self.timer.start() 109 | self._clear() 110 | 111 | def _clear(self): 112 | for key in self.keys: 113 | self.metric_dict[key] = 0.0 114 | self.metric_dict["n_" + key] = 0.0 115 | self.metric_dict["throughput"] = 0.0 116 | self.num_samples = 0.0 117 | 118 | def update_and_save(self, key, value, step, **kwargs): 119 | self.metric_dict[key] = value 120 | self.metric_dict.pop("n_" + key, None) 121 | 122 | def metric_cb(self, step=0, **kwargs): 123 | def callback(outputs): 124 | if step == 0: 125 | self._clear() 126 | 127 | if step == self.nvidia_smi_report_step: 128 | cmd = "nvidia-smi --query-gpu=utilization.gpu,memory.used --format=csv" 129 | os.system(cmd) 130 | 131 | for key in self.keys: 132 | self.metric_dict[key] += outputs[key].sum() 133 | self.metric_dict["n_" + key] += outputs[key].size 134 | 135 | self.num_samples += self.batch_size 136 | 137 | if (step + 1) % self.print_steps == 0: 138 | self.metric_dict["step"] = step 139 | for k, v in kwargs.items(): 140 | self.metric_dict[k] = v 141 | throughput = 
self.num_samples / self.timer.split() 142 | self.update_and_save("throughput", throughput, step) 143 | for key in self.keys: 144 | value = self.metric_dict[key] / self.metric_dict["n_" + key] 145 | self.update_and_save(key, value, step, **kwargs) 146 | print( 147 | ", ".join( 148 | ("{}: {}" if type(v) is int else "{}: {:.3f}").format(k, v) 149 | for k, v in self.metric_dict.items() 150 | ), 151 | time.time(), 152 | ) 153 | self._clear() 154 | 155 | return callback 156 | 157 | 158 | def CreateOptimizer(args): 159 | warmup_batches = int(args.iter_num * args.warmup_proportion) 160 | lr_warmup = flow.optimizer.warmup.linear(warmup_batches, 0) 161 | lr_scheduler = flow.optimizer.PolynomialScheduler( 162 | args.learning_rate, args.iter_num, 0.0, warmup=lr_warmup 163 | ) 164 | loss_scale_policy = None 165 | if args.use_fp16: 166 | loss_scale_policy = flow.optimizer.loss_scale.dynamic_loss_scale( 167 | increment_period=2000 168 | ) 169 | 170 | if args.optimizer_type == "lamb": 171 | return flow.optimizer.LAMB( 172 | lr_scheduler, 173 | beta1=0.9, 174 | beta2=0.999, 175 | epsilon=1e-6, 176 | weight_decay=args.weight_decay_rate, 177 | weight_decay_excludes=["bias", "LayerNorm", "layer_norm"], 178 | grad_clipping=flow.optimizer.grad_clipping.by_global_norm(1.0), 179 | loss_scale_policy=loss_scale_policy, 180 | ) 181 | else: 182 | return flow.optimizer.AdamW( 183 | lr_scheduler, 184 | epsilon=1e-6, 185 | weight_decay=args.weight_decay_rate, 186 | weight_decay_excludes=["bias", "LayerNorm", "layer_norm"], 187 | grad_clipping=flow.optimizer.grad_clipping.by_global_norm(1.0), 188 | loss_scale_policy=loss_scale_policy, 189 | ) 190 | 191 | 192 | def GetFunctionConfig(args): 193 | config = flow.function_config() 194 | config.enable_auto_mixed_precision(args.use_fp16) 195 | config.train.num_gradient_accumulation_steps(args.num_accumulation_steps) 196 | if args.use_xla: 197 | config.use_xla_jit(True) 198 | config.enable_fuse_add_to_output(True) 199 | config.enable_fuse_model_update_ops(True) 200 | return config 201 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020 The OneFlow Authors. All rights reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/examples/distribute_pretrain_2n4d.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | # set -ex 3 | 4 | export ONEFLOW_GPT_NUM_GPUS_PER_NODE=4 5 | export ONEFLOW_GPT_NUM_NODES=2 6 | # Set this env to your training nodes' IPs 7 | # export ONEFLOW_GPT_NODE_IPS="192.168.1.16,192.168.1.15" 8 | 9 | # If you keep the training data somewhere else, set this env 10 | # export ONEFLOW_GPT_DATASET=/data/gpt/gpt_sample_dataset_text_document 11 | export ONEFLOW_GPT_SEQ_LENGTH=2048 12 | 13 | export ONEFLOW_GPT_HIDDEN_SIZE=1536 14 | export ONEFLOW_GPT_NUM_ATTENTION_HEADS=16 15 | export ONEFLOW_GPT_NUM_LAYERS=16 16 | 17 | export ONEFLOW_GPT_TENSOR_MODEL_PARALLEL_SIZE=4 18 | export ONEFLOW_GPT_PIPELINE_MODEL_PARALLEL_SIZE=1 19 | 20 | export ONEFLOW_GPT_MICRO_BATCH_SIZE=8 21 | export ONEFLOW_GPT_GLOBAL_BATCH_SIZE=16 22 | 23 | source $(dirname $0)/pretrain.sh 24 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/examples/distribute_pretrain_4n8d.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # set -ex 3 | 4 | export ONEFLOW_GPT_NUM_GPUS_PER_NODE=8 5 | export ONEFLOW_GPT_NUM_NODES=4 6 | # Set this env to your training nodes' IPs 7 | # export ONEFLOW_GPT_NODE_IPS="10.11.0.2,10.11.0.3,10.11.0.4,10.11.0.5" 8 | 9 | # If you keep the training data somewhere else, set this env 10 | # export ONEFLOW_GPT_DATASET=/data/gpt/gpt_sample_dataset_text_document 11 | export ONEFLOW_GPT_SEQ_LENGTH=2048 12 | 13 | export ONEFLOW_GPT_HIDDEN_SIZE=2304 14 | export ONEFLOW_GPT_NUM_ATTENTION_HEADS=24 15 | export ONEFLOW_GPT_NUM_LAYERS=24 16 | 17 | export ONEFLOW_GPT_TENSOR_MODEL_PARALLEL_SIZE=8 18 | export ONEFLOW_GPT_PIPELINE_MODEL_PARALLEL_SIZE=1 19 | 20 | export ONEFLOW_GPT_MICRO_BATCH_SIZE=8 21 | export ONEFLOW_GPT_GLOBAL_BATCH_SIZE=32 22 | 23 | source $(dirname $0)/pretrain.sh 24 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/examples/distribute_pretrain_4n8d_2x4x4_512_2304x24.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # set -ex 3 | 4 | export ONEFLOW_GPT_NUM_GPUS_PER_NODE=8 5 | export ONEFLOW_GPT_NUM_NODES=4 6 | # Set this env to your training nodes' IPs 7 | # export ONEFLOW_GPT_NODE_IPS="10.11.0.2,10.11.0.3,10.11.0.4,10.11.0.5" 8 | 9 | # If you keep the training data somewhere else, set this env 10 | # export ONEFLOW_GPT_DATASET=/data/gpt/gpt_sample_dataset_text_document 11 | export ONEFLOW_GPT_SEQ_LENGTH=2048 12 | 13 | export ONEFLOW_GPT_HIDDEN_SIZE=2304 14 | export ONEFLOW_GPT_NUM_ATTENTION_HEADS=24 15 | export ONEFLOW_GPT_NUM_LAYERS=24 16 | 17 | export ONEFLOW_GPT_TENSOR_MODEL_PARALLEL_SIZE=4 18 | export ONEFLOW_GPT_PIPELINE_MODEL_PARALLEL_SIZE=4 19 | 20 | export ONEFLOW_GPT_MICRO_BATCH_SIZE=8 21 | export ONEFLOW_GPT_GLOBAL_BATCH_SIZE=512 22 | 23 | source $(dirname $0)/pretrain.sh 24 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/examples/distribute_pretrain_with_container.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | # set -ex 3 | 4 | export ONEFLOW_GPT_NUM_GPUS_PER_NODE=8 5 | export ONEFLOW_GPT_NUM_NODES=4 6 | # export ONEFLOW_GPT_NODE_IPS="192.168.1.16,192.168.1.15,192.168.1.14,192.168.1.13" 7 | 8 | # export ONEFLOW_GPT_DATASET=/data/gpt/gpt_sample_dataset_text_document 9 | export ONEFLOW_GPT_SEQ_LENGTH=2048 10 | 11 | export ONEFLOW_GPT_HIDDEN_SIZE=2304 12 | export ONEFLOW_GPT_NUM_ATTENTION_HEADS=24 13 | export ONEFLOW_GPT_NUM_LAYERS=24 14 | 15 | export ONEFLOW_GPT_TENSOR_MODEL_PARALLEL_SIZE=4 16 | export ONEFLOW_GPT_PIPELINE_MODEL_PARALLEL_SIZE=4 17 | 18 | export ONEFLOW_GPT_MICRO_BATCH_SIZE=8 19 | export ONEFLOW_GPT_GLOBAL_BATCH_SIZE=512 20 | 21 | export ONEFLOW_GTP_PRETRAIN_WITH_CONTAINER=ON 22 | export ONEFLOW_GPT_SRC_DIR=$(realpath $(dirname $(dirname $0))) 23 | export ONEFLOW_DEV_IMAGE=oneflow-manylinux2014-cuda11.2:0.1 24 | export ONEFLOW_GPT_PYTHON_VERSION=3.7 25 | export ONEFLOW_WHEEL=$PWD/packages/oneflow-0.3.5+cu102.git.8b222eed2-cp37-cp37m-manylinux2014_x86_64.whl 26 | # Set this env for mounting data dir for container 27 | # export ONEFLOW_GPT_DATA_DIR=$(dirname $ONEFLOW_GPT_DATASET) 28 | 29 | source $(dirname $0)/pretrain.sh 30 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/examples/lambada_cloze_accuracy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | TASK="LAMBADA" 6 | VALID_DATA=/path/to/lambada_test.json 7 | VOCAB_FILE=/path/to/gpt2-vocab.json 8 | MERGE_FILE=/path/to/gpt2-merges.txt 9 | CHECKPOINT_PATH=/path/to/model 10 | 11 | 12 | gpu_num_per_node=1 13 | micro_batch_size=8 14 | hidden_size=768 15 | num_attn_heads=12 16 | num_layers=12 17 | seq_length=1024 18 | dropout_rate=0.0 19 | 20 | cmd="" 21 | cmd+="python3 tasks/main.py " 22 | cmd+="--task $TASK " 23 | cmd+="--valid-data $VALID_DATA " 24 | cmd+="--tokenizer-type GPT2BPETokenizer " 25 | cmd+="--strict-lambada " 26 | cmd+="--merge-file $MERGE_FILE " 27 | cmd+="--vocab-file $VOCAB_FILE " 28 | cmd+="--load $CHECKPOINT_PATH " 29 | cmd+="--dataset $VALID_DATA " 30 | cmd+="--vocab-size 50257 " 31 | cmd+="--hidden-size $hidden_size " 32 | cmd+="--num-attention-heads $num_attn_heads " 33 | cmd+="--num-layers $num_layers " 34 | cmd+="--seq-length $seq_length " 35 | cmd+="--hidden-dropout $dropout_rate " 36 | cmd+="--attention-dropout $dropout_rate " 37 | cmd+="--fp16 " 38 | cmd+="--checkpoint-activations " 39 | cmd+="--multihead-attention-fusion " 40 | cmd+="--make-vocab-size-divisible-by=128 " 41 | cmd+="--log-interval=10 " 42 | cmd+="--metric-print-format=table " 43 | cmd+="--micro-batch-size=$micro_batch_size " 44 | cmd+="--num-gpus-per-node=$gpu_num_per_node " 45 | cmd+="--num-nodes=1 " 46 | cmd+="--node-ips=10.11.0.2 " 47 | 48 | set -x 49 | 50 | $cmd 51 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/examples/pretrain.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | dataset=${ONEFLOW_GPT_DATASET:-"/data/gpt/gpt_sample_dataset_text_document"} 4 | seq_length=${ONEFLOW_GPT_SEQ_LENGTH:-"2048"} 5 | 6 | num_layers=${ONEFLOW_GPT_NUM_LAYERS:-"16"} 7 | hidden_size=${ONEFLOW_GPT_HIDDEN_SIZE:-"1536"} 8 | num_attn_heads=${ONEFLOW_GPT_NUM_ATTENTION_HEADS:-"16"} 9 | 10 | micro_batch_size=${ONEFLOW_GPT_MICRO_BATCH_SIZE:-"8"} 11 | global_batch_size=${ONEFLOW_GPT_GLOBAL_BATCH_SIZE} 12 | tensor_model_parallel_size=${ONEFLOW_GPT_TENSOR_MODEL_PARALLEL_SIZE} 13 | pipeline_model_parallel_size=${ONEFLOW_GPT_PIPELINE_MODEL_PARALLEL_SIZE} 14 | num_accumulation_steps=${ONEFLOW_GPT_NUM_ACCUMULATION_STEPS} 15 | 16 | num_gpus_per_node=${ONEFLOW_GPT_NUM_GPUS_PER_NODE:-"4"} 17 | num_nodes=${ONEFLOW_GPT_NUM_NODES:-"1"} 18 | node_ips=${ONEFLOW_GPT_NODE_IPS:-"10.11.0.2,10.11.0.3,10.11.0.4,10.11.0.5"} 19 | 20 | train_iters=${ONEFLOW_GPT_TRAIN_ITERS:-"500000"} 21 | log_interval=${ONEFLOW_GPT_LOG_INTERVAL:-"100"} 22 | 23 | init_loss_scale=${ONEFLOW_GPT_INIT_LOSS_SCALE:-"4294967296"} 24 | 25 | load_path=${ONEFLOW_GPT_LOAD_PATH:-"checkpoint"} 26 | save_path=${ONEFLOW_GPT_SAVE_PATH:-"checkpoint"} 27 | save_interval=${ONEFLOW_GPT_SAVE_INTERVAL:-"10000"} 28 | 29 | cmd="" 30 | 31 | if [[ ! -z "${ONEFLOW_GTP_PROFILE_FILE}" ]]; then 32 | cmd+="nsys profile --stats true --output ${ONEFLOW_GTP_PROFILE_FILE} " 33 | fi 34 | 35 | if [[ ! -z "${ONEFLOW_GTP_GDB}" ]]; then 36 | cmd+="gdb --args " 37 | fi 38 | 39 | cmd+="python3 -m oneflow_gpt.training" 40 | cmd+=" --num-layers ${num_layers}" 41 | cmd+=" --hidden-size ${hidden_size}" 42 | cmd+=" --num-attention-heads ${num_attn_heads}" 43 | cmd+=" --micro-batch-size ${micro_batch_size}" 44 | 45 | if [[ ! -z "${global_batch_size}" ]]; then 46 | cmd+=" --global-batch-size ${global_batch_size}" 47 | fi 48 | 49 | if [[ ! -z "${tensor_model_parallel_size}" ]]; then 50 | cmd+=" --tensor-model-parallel-size ${tensor_model_parallel_size}" 51 | fi 52 | 53 | if [[ ! -z "${pipeline_model_parallel_size}" ]]; then 54 | cmd+=" --pipeline-model-parallel-size ${pipeline_model_parallel_size}" 55 | fi 56 | 57 | if [[ ! -z "${num_accumulation_steps}" ]]; then 58 | cmd+=" --num-accumulation-steps ${num_accumulation_steps}" 59 | fi 60 | 61 | cmd+=" --num-gpus-per-node ${num_gpus_per_node}" 62 | cmd+=" --num-nodes ${num_nodes}" 63 | cmd+=" --node-ips ${node_ips}" 64 | cmd+=" --train-iters ${train_iters}" 65 | cmd+=" --dataset ${dataset}" 66 | cmd+=" --seq-length ${seq_length}" 67 | cmd+=" --vocab-size 50257" 68 | cmd+=" --split 949,50,1" 69 | cmd+=" --learning-rate 0.00015" 70 | cmd+=" --min-lr 1.0e-5" 71 | cmd+=" --lr-decay-style cosine" 72 | cmd+=" --lr-decay-iters 320000" 73 | cmd+=" --lr-warmup-fraction 0.01" 74 | cmd+=" --optimizer adamw" 75 | cmd+=" --initial-loss-scale ${init_loss_scale}" 76 | cmd+=" --weight-decay 1e-2" 77 | cmd+=" --clip-grad 1.0" 78 | cmd+=" --load ${load_path}" 79 | cmd+=" --save ${save_path}" 80 | cmd+=" --save-interval ${save_interval}" 81 | cmd+=" --log-interval ${log_interval}" 82 | cmd+=" --checkpoint-activations" 83 | cmd+=" --multihead-attention-fusion" 84 | cmd+=" --fp16" 85 | 86 | if [[ ${num_nodes} -gt 1 ]]; then 87 | export ONEFLOW_COMM_NET_IB_ENABLE=1 88 | fi 89 | 90 | if [[ ! 
-z "${ONEFLOW_GTP_PROFILE_FILE}" ]]; then 91 | cmd+=" --profile-transformer-layer" 92 | fi 93 | 94 | if [[ -z "${ONEFLOW_GTP_PRETRAIN_WITH_CONTAINER}" ]]; then 95 | ${cmd} 96 | else 97 | oneflow_gpt_src_dir=${ONEFLOW_GPT_SRC_DIR:-"$(dirname $(dirname $0))"} 98 | oneflow_dev_image=${ONEFLOW_DEV_IMAGE:-"oneflow-manylinux2014-cuda11.2:0.1"} 99 | python_version=${ONEFLOW_GPT_PYTHON_VERSION:-"3.7"} 100 | oneflow_gpt_data_dir=${ONEFLOW_GPT_DATA_DIR:-"/data"} 101 | 102 | if [[ -z "${ONEFLOW_WHEEL}" ]]; then 103 | echo "ONEFLOW_WHEEL env var not set" 104 | exit 1 105 | fi 106 | 107 | python3 ${oneflow_gpt_src_dir}/tools/launch_container.py \ 108 | --src ${oneflow_gpt_src_dir} \ 109 | --py ${python_version} \ 110 | --image ${oneflow_dev_image} \ 111 | --wheel ${ONEFLOW_WHEEL} \ 112 | --extra-mount ${oneflow_gpt_data_dir} \ 113 | --cmd "$cmd" 114 | fi 115 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/examples/pretrain_117M.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # set -ex 3 | 4 | export ONEFLOW_GPT_NUM_GPUS_PER_NODE=1 5 | 6 | # If you place training data on somewhere else, set this env 7 | # export ONEFLOW_GPT_DATASET=/data/gpt/gpt_sample_dataset_text_document 8 | export ONEFLOW_GPT_SEQ_LENGTH=1024 9 | 10 | export ONEFLOW_GPT_NUM_LAYERS=12 11 | export ONEFLOW_GPT_HIDDEN_SIZE=768 12 | export ONEFLOW_GPT_NUM_ATTENTION_HEADS=12 13 | 14 | export ONEFLOW_GPT_MICRO_BATCH_SIZE=8 15 | export ONEFLOW_GPT_GLOBAL_BATCH_SIZE=8 16 | 17 | source $(dirname $0)/pretrain.sh 18 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/examples/pretrain_1n8d_2x4x1_16_1536x16.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # set -ex 3 | 4 | export ONEFLOW_GPT_NUM_GPUS_PER_NODE=8 5 | 6 | # If you place training data on somewhere else, set this env 7 | # export ONEFLOW_GPT_DATASET=/data/gpt/gpt_sample_dataset_text_document 8 | export ONEFLOW_GPT_SEQ_LENGTH=2048 9 | 10 | export ONEFLOW_GPT_HIDDEN_SIZE=1536 11 | export ONEFLOW_GPT_NUM_ATTENTION_HEADS=16 12 | export ONEFLOW_GPT_NUM_LAYERS=16 13 | 14 | export ONEFLOW_GPT_TENSOR_MODEL_PARALLEL_SIZE=4 15 | export ONEFLOW_GPT_PIPELINE_MODEL_PARALLEL_SIZE=1 16 | 17 | export ONEFLOW_GPT_MICRO_BATCH_SIZE=8 18 | export ONEFLOW_GPT_GLOBAL_BATCH_SIZE=16 19 | 20 | source $(dirname $0)/pretrain.sh 21 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/examples/pretrain_345M.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # set -ex 3 | 4 | export ONEFLOW_GPT_NUM_GPUS_PER_NODE=1 5 | 6 | # If you place training data on somewhere else, set this env 7 | # export ONEFLOW_GPT_DATASET=/data/gpt/gpt_sample_dataset_text_document 8 | export ONEFLOW_GPT_SEQ_LENGTH=1024 9 | 10 | export ONEFLOW_GPT_NUM_LAYERS=24 11 | export ONEFLOW_GPT_HIDDEN_SIZE=1024 12 | export ONEFLOW_GPT_NUM_ATTENTION_HEADS=16 13 | 14 | export ONEFLOW_GPT_MICRO_BATCH_SIZE=8 15 | export ONEFLOW_GPT_GLOBAL_BATCH_SIZE=8 16 | export ONEFLOW_GPT_TRAIN_ITERS=500000 17 | 18 | source $(dirname $0)/pretrain.sh 19 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/examples/pretrain_with_container.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | # set -ex 3 | 4 | export ONEFLOW_GPT_NUM_GPUS_PER_NODE=8 5 | 6 | export ONEFLOW_GPT_SEQ_LENGTH=2048 7 | export ONEFLOW_GPT_HIDDEN_SIZE=2304 8 | export ONEFLOW_GPT_NUM_ATTENTION_HEADS=24 9 | export ONEFLOW_GPT_NUM_LAYERS=24 10 | 11 | export ONEFLOW_GPT_MICRO_BATCH_SIZE=16 12 | export ONEFLOW_GPT_GLOBAL_BATCH_SIZE=16 13 | 14 | export ONEFLOW_GPT_TENSOR_MODEL_PARALLEL_SIZE=8 15 | export ONEFLOW_GPT_PIPELINE_MODEL_PARALLEL_SIZE=1 16 | 17 | export ONEFLOW_GTP_PRETRAIN_WITH_CONTAINER=ON 18 | export ONEFLOW_GPT_SRC_DIR=$(realpath $(dirname $(dirname $0))) 19 | export ONEFLOW_DEV_IMAGE=oneflow-manylinux2014-cuda11.2:0.1 20 | export ONEFLOW_GPT_PYTHON_VERSION=3.7 21 | export ONEFLOW_WHEEL=$PWD/packages/oneflow-0.3.5+cu102.git.8b222eed2-cp37-cp37m-manylinux2014_x86_64.whl 22 | 23 | source $(dirname $0)/pretrain.sh 24 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/examples/pretrain_with_profile.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # set -ex 3 | 4 | export ONEFLOW_GPT_SEQ_LENGTH=1024 5 | export ONEFLOW_GPT_NUM_LAYERS=12 6 | export ONEFLOW_GPT_HIDDEN_SIZE=768 7 | export ONEFLOW_GPT_NUM_ATTENTION_HEADS=12 8 | export ONEFLOW_GPT_NUM_GPUS_PER_NODE=1 9 | export ONEFLOW_GPT_MICRO_BATCH_SIZE=8 10 | 11 | export PYTHONUNBUFFERED=1 12 | export NCCL_DEBUG=INFO 13 | export ONEFLOW_DEBUG_MODE=1 14 | export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1 15 | export ONEFLOW_GTP_PROFILE_FILE="117M_1n1d_bz8" 16 | 17 | source $(dirname $0)/pretrain.sh 18 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/oneflow_gpt/__init__.py: -------------------------------------------------------------------------------- 1 | MAJOR = 0 2 | MINOR = 0.1 3 | VERSION = (MAJOR, MINOR) 4 | 5 | __version__ = ".".join(map(str, VERSION)) 6 | __package_name__ = "oneflow_gpt" 7 | __description__ = "OneFlow GPT" 8 | __license__ = "" 9 | __keywords__ = "deep learning, Megatron, gpu, NLP, nvidia, cuda, oneflow" 10 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/oneflow_gpt/data.py: -------------------------------------------------------------------------------- 1 | import oneflow.compatible.single_client as flow 2 | 3 | from oneflow_gpt import distribute 4 | from oneflow_gpt.config import get_args 5 | 6 | 7 | def get_train_val_test_num_samples(split, num_samples): 8 | assert len(split) == 3 9 | total = sum(split) 10 | return [int((s / total) * num_samples) for s in split] 11 | 12 | 13 | class GPTDataLoader(object): 14 | def __init__(self, name): 15 | self.name = name 16 | args = get_args() 17 | assert args.dataset is not None 18 | self.dataset = args.dataset 19 | self.batch_size = args.global_batch_size // args.num_accumulation_steps 20 | self.seq_length = args.seq_length 21 | self.seed = args.seed 22 | self.split = args.split 23 | self.num_samples = args.train_samples 24 | 25 | def __call__(self): 26 | with distribute.data_placement_scope(): 27 | x = flow.data.megatron_gpt_mmap_data_loader( 28 | data_file_prefix=self.dataset, 29 | seq_length=self.seq_length, 30 | num_samples=self.num_samples, 31 | batch_size=self.batch_size, 32 | dtype=flow.int64, 33 | shuffle=True, 34 | random_seed=self.seed, 35 | split_sizes=self.split, 36 | split_index=0, 37 | nd_sbp=distribute.get_data_parallel_dist(), 38 | name=self.name, 39 | ) 40 | 41 | # embedding is on pipeline first stage 42 | with 
distribute.layer_placement_scope(0): 43 | data = flow.slice(x, begin=(None, 0), size=(None, self.seq_length)) 44 | 45 | # loss is on pipeline last stage 46 | with distribute.layer_placement_scope(-1): 47 | labels = flow.slice(x, begin=(None, 1), size=(None, self.seq_length)) 48 | 49 | return data, labels 50 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/oneflow_gpt/optimizer.py: -------------------------------------------------------------------------------- 1 | import oneflow.compatible.single_client as flow 2 | 3 | 4 | def get_lr_scheduler(args): 5 | # set up warmup strategy 6 | warmup = None 7 | if args.lr_warmup_iters is not None and args.lr_warmup_iters > 0: 8 | warmup = flow.optimizer.warmup.linear(args.lr_warmup_iters, 0) 9 | 10 | lr_decay_alpha = args.min_lr / args.lr 11 | # set up learning rate scheduler 12 | if args.lr_decay_style == "cosine" and args.lr_decay_iters is not None: 13 | lr_scheduler = flow.optimizer.CosineScheduler( 14 | base_lr=args.lr, 15 | steps=args.lr_decay_iters, 16 | alpha=lr_decay_alpha, 17 | warmup=warmup, 18 | ) 19 | else: 20 | raise NotImplementedError("not supported yet") 21 | 22 | return lr_scheduler 23 | 24 | 25 | def make_optimizer(args): 26 | lr_scheduler = get_lr_scheduler(args) 27 | 28 | loss_scale_policy = None 29 | if args.fp16: 30 | if args.loss_scale is not None: 31 | loss_scale_policy = flow.optimizer.loss_scale.static_loss_scale( 32 | args.loss_scale 33 | ) 34 | else: 35 | loss_scale_policy = flow.optimizer.loss_scale.dynamic_loss_scale( 36 | initial_loss_scale=args.initial_loss_scale, 37 | increment_period=args.loss_scale_window, 38 | ) 39 | 40 | if args.optimizer == "adamw": 41 | optimizer = flow.optimizer.AdamW( 42 | lr_scheduler, 43 | do_bias_correction=True, 44 | loss_scale_policy=loss_scale_policy, 45 | beta1=args.adam_beta1, 46 | beta2=args.adam_beta2, 47 | epsilon=args.adam_eps, 48 | weight_decay_excludes=["bias", "LayerNorm", "layernorm"], 49 | weight_decay=args.weight_decay, 50 | grad_clipping=flow.optimizer.grad_clipping.by_global_norm(args.clip_grad), 51 | ) 52 | else: 53 | raise NotImplementedError("not supported yet") 54 | 55 | return optimizer 56 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/oneflow_gpt/snapshot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import glob 4 | import operator 5 | import oneflow.compatible.single_client as flow 6 | 7 | 8 | class Snapshot(object): 9 | def __init__( 10 | self, 11 | load_dir=None, 12 | save_dir=None, 13 | save_interval=0, 14 | total_iters=0, 15 | save_last=False, 16 | save_init=False, 17 | ): 18 | self.load_dir_ = load_dir 19 | self.save_dir_ = save_dir 20 | self.save_interval_ = save_interval 21 | self.total_iters_ = total_iters 22 | self.save_last_ = save_last 23 | self.save_init_ = save_init 24 | self.checkpoint_ = flow.train.CheckPoint() 25 | 26 | self.iter_, snapshot_dir = self._find_max_iter_snapshot_from_load_dir() 27 | if snapshot_dir is None: 28 | self.checkpoint_.init() 29 | else: 30 | print(f"Loading model from {snapshot_dir}") 31 | self.checkpoint_.load(snapshot_dir) 32 | 33 | self._check_save_dir_snapshot_existence(self.iter_) 34 | 35 | def _extract_iter_from_snapshot_dirname(self, s): 36 | itr_str = re.findall(r"\d+", s) 37 | itr = list(map(int, itr_str)) 38 | assert len(itr) > 0 39 | return itr[0] 40 | 41 | def _collect_snapshot2iter(self, basedir): 42 | snapshot_dirs = 
glob.glob(f"{basedir}/iter*_snapshot") 43 | snapshot2iter = dict() 44 | for s_dir in snapshot_dirs: 45 | assert os.path.isdir(s_dir) 46 | s = os.path.basename(s_dir) 47 | snapshot2iter[s_dir] = self._extract_iter_from_snapshot_dirname(s) 48 | return snapshot2iter 49 | 50 | def _check_save_dir_snapshot_existence(self, start_iter): 51 | snapshot2iter = self._collect_snapshot2iter(self.save_dir_) 52 | for s, i in snapshot2iter.items(): 53 | if self.save_init_ and i == 0: 54 | raise ValueError(f"{s} already exist") 55 | 56 | if self.save_last_ and i == self.total_iters_: 57 | raise ValueError(f"{s} already exist") 58 | 59 | if ( 60 | i > start_iter 61 | and self.save_interval_ > 0 62 | and (i - start_iter) % self.save_interval_ == 0 63 | and i <= self.total_iters_ 64 | ): 65 | raise ValueError(f"{s} already exist") 66 | 67 | def _find_max_iter_snapshot_from_load_dir(self): 68 | if self.load_dir_ is None: 69 | return 0, None 70 | 71 | snapshot2iter = self._collect_snapshot2iter(self.load_dir_) 72 | if len(snapshot2iter) == 0: 73 | return 0, None 74 | 75 | s, i = max(snapshot2iter.items(), key=operator.itemgetter(1)) 76 | return i, s 77 | 78 | @property 79 | def iter(self): 80 | return self.iter_ 81 | 82 | def save(self, name): 83 | if self.save_dir_ is None: 84 | return 85 | 86 | save_path = os.path.join(self.save_dir_, name) 87 | if os.path.exists(save_path): 88 | return 89 | 90 | os.makedirs(save_path) 91 | print(f"Saving model to {save_path}") 92 | self.checkpoint_.save(save_path) 93 | 94 | def step(self): 95 | if self.iter_ == 0 and self.save_init_: 96 | self.save("iter0_snapshot") 97 | 98 | self.iter_ += 1 99 | 100 | if self.save_interval_ > 0 and self.iter_ % self.save_interval_ == 0: 101 | self.save(f"iter{self.iter_}_snapshot") 102 | 103 | if self.iter_ == self.total_iters_ and self.save_last_: 104 | self.save(f"iter{self.total_iters_}_snapshot") 105 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/oneflow_gpt/third_party/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/OneFlow-Benchmark/de4698a7acce3cca2eaf1564eaf25d59c9954b86/LanguageModeling/GPT/oneflow_gpt/third_party/__init__.py -------------------------------------------------------------------------------- /LanguageModeling/GPT/oneflow_gpt/third_party/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import indexed_dataset 2 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/oneflow_gpt/training.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.append( 5 | os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) 6 | ) 7 | 8 | import numpy as np 9 | import oneflow.compatible.single_client as flow 10 | 11 | from oneflow_gpt.config import get_args 12 | from oneflow_gpt import distribute 13 | from oneflow_gpt.data import GPTDataLoader, get_train_val_test_num_samples 14 | from oneflow_gpt.model import GPTModel, ParallelSparseSoftmaxCrossEntropyLoss 15 | from oneflow_gpt.optimizer import make_optimizer 16 | from oneflow_gpt.snapshot import Snapshot 17 | from oneflow_gpt.util import Metric 18 | from oneflow_gpt.third_party.data.gpt_dataset import build_train_valid_test_datasets 19 | 20 | 21 | def _init_env(args): 22 | if args.num_nodes > 1: 23 | if args.num_nodes > len(args.node_ips): 24 | raise ValueError( 25 | f"num_nodes {args.num_nodes} is greater than" 26 | f" the length of node ips {args.node_ips}" 27 | ) 28 | 29 | flow.env.ctrl_port(args.ctrl_port) 30 | nodes = [] 31 | for ip in args.node_ips[: args.num_nodes]: 32 | nodes.append({"addr": ip}) 33 | 34 | flow.env.machine(nodes) 35 | 36 | flow.env.log_dir(args.log) 37 | 38 | 39 | def _init_config(args): 40 | flow.config.gpu_device_num(args.num_gpus_per_node) 41 | if args.tensor_model_parallel_size * args.pipeline_model_parallel_size > 1: 42 | if hasattr(flow.config, "nccl_use_compute_stream"): 43 | flow.config.nccl_use_compute_stream(True) 44 | else: 45 | print( 46 | "WARNING: This version of OneFlow does not support placing nccl on the compute stream," 47 | " please try another version." 48 | ) 49 | 50 | 51 | flow.config.enable_legacy_model_io() 52 | flow.config.enable_model_io_v2(True) 53 | 54 | 55 | def _make_func_config(args): 56 | func_cfg = flow.function_config() 57 | if args.fp16: 58 | func_cfg.enable_auto_mixed_precision(True) 59 | func_cfg.prune_parallel_cast_ops(True) 60 | func_cfg.enable_fuse_add_to_output(True) 61 | func_cfg.enable_fuse_model_update_ops(True) 62 | func_cfg.enable_fuse_cast_scale(True) 63 | # turn on this flag to match ZeRO & DeepSpeed 64 | func_cfg.enable_non_distributed_optimizer(False) 65 | if args.num_accumulation_steps > 1: 66 | if hasattr(func_cfg.train, "num_gradient_accumulation_steps"): 67 | func_cfg.train.num_gradient_accumulation_steps(args.num_accumulation_steps) 68 | else: 69 | args.num_accumulation_steps = 1 70 | print( 71 | "WARNING: This version of OneFlow does not support gradient accumulation," 72 | " please try a newer version." 
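# note: the else-branch above already reset num_accumulation_steps to 1 as a fallback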
73 | ) 74 | 75 | return func_cfg 76 | 77 | 78 | def _make_gpt_train_func(args): 79 | model = GPTModel("model") 80 | loss = ParallelSparseSoftmaxCrossEntropyLoss() 81 | optimizer = make_optimizer(args) 82 | 83 | if args.use_external_dataset: 84 | 85 | @flow.global_function("train", _make_func_config(args)) 86 | def train( 87 | x: flow.typing.Numpy.Placeholder( 88 | (args.global_batch_size, args.seq_length + 1), dtype=flow.int64 89 | ) 90 | ): 91 | x = distribute.input_data_parallel_cast(x) 92 | with distribute.layer_placement_scope(0): 93 | data = flow.slice(x, begin=(None, 0), size=(None, args.seq_length)) 94 | with distribute.layer_placement_scope(-1): 95 | labels = flow.slice(x, begin=(None, 1), size=(None, args.seq_length)) 96 | 97 | logits = model(data) 98 | losses = loss(logits, labels) 99 | optimizer.minimize(losses) 100 | 101 | losses = distribute.output_parallel_cast(losses) 102 | return {"loss": losses} 103 | 104 | else: 105 | data_loader = GPTDataLoader("gpt_data_loader") 106 | 107 | @flow.global_function("train", _make_func_config(args)) 108 | def train(): 109 | data, labels = data_loader() 110 | logits = model(data) 111 | losses = loss(logits, labels) 112 | optimizer.minimize(losses) 113 | 114 | losses = distribute.output_parallel_cast(losses) 115 | return {"loss": losses} 116 | 117 | return train 118 | 119 | 120 | def train(): 121 | args = get_args() 122 | _init_env(args) 123 | _init_config(args) 124 | trainer = _make_gpt_train_func(args) 125 | snapshot = Snapshot( 126 | load_dir=args.load, 127 | save_dir=args.save, 128 | save_interval=args.save_interval, 129 | total_iters=args.train_iters, 130 | save_last=args.save_last, 131 | save_init=args.save_init, 132 | ) 133 | 134 | metric = Metric( 135 | print_steps=args.log_interval, 136 | start_step=snapshot.iter, 137 | max_step=args.train_iters, 138 | num_samples_per_batch=args.micro_batch_size * args.data_parallel_size, 139 | keys=["loss"], 140 | print_format=args.metric_print_format, 141 | nvidia_smi_report_step=10, 142 | nvidia_smi_report_file=None, 143 | ) 144 | 145 | if args.use_external_dataset: 146 | train_val_test_num_samples = get_train_val_test_num_samples( 147 | args.split, args.train_samples 148 | ) 149 | train_ds, _, _ = build_train_valid_test_datasets( 150 | data_prefix=[args.dataset], 151 | data_impl="mmap", 152 | splits_string=args.split, 153 | train_valid_test_num_samples=train_val_test_num_samples, 154 | seq_length=args.seq_length, 155 | seed=args.seed, 156 | skip_warmup=0, 157 | ) 158 | 159 | if args.train_iters is None and args.train_samples is None: 160 | raise ValueError("either train_iters or train_samples must be set") 161 | 162 | print("Training...") 163 | try: 164 | batch_size = args.micro_batch_size * args.num_accumulation_steps 165 | iteration = snapshot.iter 166 | while iteration < args.train_iters: 167 | if args.use_external_dataset: 168 | batch = [ 169 | train_ds[iteration * batch_size + i] for i in range(batch_size) 170 | ] 171 | data = np.stack(batch) 172 | trainer(data).async_get(metric.metric_cb()) 173 | else: 174 | trainer().async_get(metric.metric_cb()) 175 | 176 | snapshot.step() 177 | iteration = snapshot.iter 178 | 179 | except KeyboardInterrupt: 180 | print("interrupted") 181 | 182 | 183 | if __name__ == "__main__": 184 | train() 185 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/oneflow_gpt/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import numpy as np 4 
| 5 | 6 | class _Timer(object): 7 | def __init__(self): 8 | pass 9 | 10 | def start(self): 11 | now = time.perf_counter() 12 | self.start_ = now 13 | self.step_ = now 14 | 15 | def step(self): 16 | now = time.perf_counter() 17 | duration = now - self.step_ 18 | self.step_ = now 19 | return duration 20 | 21 | def stop(self): 22 | self.stop_ = time.perf_counter() 23 | 24 | def cur_step(self): 25 | return self.step_ 26 | 27 | def duration(self): 28 | return self.stop_ - self.start_ 29 | 30 | 31 | class Metric(object): 32 | def __init__( 33 | self, 34 | print_steps, 35 | start_step, 36 | max_step, 37 | num_samples_per_batch, 38 | keys=None, 39 | print_format="normal", 40 | nvidia_smi_report_step=10, 41 | nvidia_smi_report_file=None, 42 | ): 43 | r"""accumulate and calculate metrics 44 | 45 | Args: 46 | print_steps: `Int` print metrics every nth step 47 | num_samples_per_batch: `Int` number of samples per micro-batch 48 | keys: keys in callback outputs 49 | Returns: 50 | """ 51 | self.print_steps_ = print_steps 52 | self.max_step_ = max_step 53 | self.num_samples_per_batch_ = num_samples_per_batch 54 | 55 | self.nvidia_smi_report_step_ = nvidia_smi_report_step 56 | self.nvidia_smi_report_file_ = nvidia_smi_report_file 57 | 58 | self.step_ = start_step 59 | self.micro_batches_ = 0 60 | self.samples_ = 0 61 | self.throughput_ = 0.0 62 | self.latency_ = 0.0 63 | self.timestamp_ = 0.0 64 | 65 | self.kv_store_ = dict() 66 | if keys is None: 67 | self.keys_ = [] 68 | else: 69 | self.keys_ = list(keys) 70 | 71 | for key in self.keys_: 72 | self.kv_store_[key] = 0.0 73 | 74 | # need reset after every print 75 | self.acc_elapsed_time_ = 0.0 76 | self.acc_micro_batches_ = 0 77 | self.acc_samples_ = 0 78 | 79 | self.timer_ = _Timer() 80 | self.timer_.start() 81 | 82 | if print_format == "normal": 83 | self.print_fn_ = self.step_print 84 | elif print_format == "table": 85 | self.print_fn_ = self.step_print_by_table 86 | self.print_title_ = False 87 | else: 88 | raise ValueError("print_format must be 'normal' or 'table'") 89 | 90 | def step_print(self): 91 | record = ( 92 | f"step={self.step_}," 93 | f"micro_batches={self.micro_batches_}," 94 | f"samples={self.samples_}," 95 | f"throughput={self.throughput_:.5f}," 96 | f"latency={self.latency_:.5f}," 97 | ) 98 | for key in self.keys_: 99 | record += f"{key}={self.kv_store_[key]:.5f}," 100 | 101 | print(record) 102 | 103 | def step_print_by_table(self): 104 | title = ( 105 | f"| {'step'.ljust(8)} " 106 | f"| {'micro_batches'.ljust(15)} " 107 | f"| {'samples'.ljust(15)} " 108 | f"| {'throughput'.ljust(10)} " 109 | f"| {'latency'.ljust(10)} " 110 | ) 111 | sep = f"| {'-' * 8} | {'-' * 15} | {'-' * 15} | {'-' * 10} | {'-' * 10} " 112 | 113 | record = ( 114 | f"| {self.step_:<8d} " 115 | f"| {self.micro_batches_:<15d} " 116 | f"| {self.samples_:<15d} " 117 | f"| {self.throughput_:<10.5f} " 118 | f"| {self.latency_:<10.5f} " 119 | ) 120 | 121 | for key in self.keys_: 122 | title += f"| {key.ljust(10)} " 123 | sep += f"| {'-' * 10} " 124 | record += f"| {self.kv_store_[key]:<10.5f} " 125 | 126 | title += "|" 127 | sep += "|" 128 | record += "|" 129 | 130 | if not self.print_title_: 131 | print(title) 132 | print(sep) 133 | self.print_title_ = True 134 | 135 | print(record) 136 | 137 | def metric_cb(self): 138 | def callback(outputs): 139 | elapsed_time = self.timer_.step() 140 | self.timestamp_ = self.timer_.cur_step() 141 | self.acc_elapsed_time_ += elapsed_time 142 | 143 | micro_batches = None 144 | for key in self.keys_: 145 | output = outputs[key].numpy() 146 | assert isinstance(output, np.ndarray) 147 
| if micro_batches is None: 148 | micro_batches = output.shape[0] if output.shape else 1 149 | else: 150 | assert micro_batches == output.shape[0] 151 | self.kv_store_[key] += output.sum() 152 | 153 | self.step_ += 1 154 | self.acc_micro_batches_ += micro_batches 155 | self.acc_samples_ += micro_batches * self.num_samples_per_batch_ 156 | 157 | if self.step_ == self.nvidia_smi_report_step_: 158 | cmd = "nvidia-smi --query-gpu=utilization.gpu,memory.used --format=csv" 159 | if self.nvidia_smi_report_file_ is not None: 160 | cmd += f" -f {self.nvidia_smi_report_file_}" 161 | os.system(cmd) 162 | self.print_title_ = False 163 | 164 | if self.step_ % self.print_steps_ == 0 or self.step_ == self.max_step_: 165 | self.throughput_ = self.acc_samples_ / self.acc_elapsed_time_ 166 | self.latency_ = self.acc_elapsed_time_ / self.print_steps_ 167 | 168 | for key in self.keys_: 169 | value = self.kv_store_[key] / self.acc_micro_batches_ 170 | self.kv_store_[key] = value 171 | 172 | self.micro_batches_ += self.acc_micro_batches_ 173 | self.samples_ += self.acc_samples_ 174 | 175 | self.print_fn_() 176 | 177 | for key in self.keys_: 178 | self.kv_store_[key] = 0.0 179 | self.acc_elapsed_time_ = 0.0 180 | self.acc_micro_batches_ = 0 181 | self.acc_samples_ = 0 182 | 183 | return callback 184 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/requirements.txt: -------------------------------------------------------------------------------- 1 | oneflow 2 | numpy 3 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import setuptools 3 | 4 | from oneflow_gpt import ( 5 | __package_name__, 6 | __version__, 7 | __description__, 8 | __license__, 9 | __keywords__, 10 | ) 11 | 12 | 13 | if sys.version_info < (3,): 14 | raise Exception("Python 2 is not supported.") 15 | 16 | 17 | with open("README.md", "r") as fh: 18 | long_description = fh.read() 19 | 20 | 21 | def req_file(filename): 22 | with open(filename) as f: 23 | content = f.readlines() 24 | return [x.strip() for x in content] 25 | 26 | 27 | install_requires = req_file("requirements.txt") 28 | 29 | setuptools.setup( 30 | name=__package_name__, 31 | # Versions should comply with PEP440. For a discussion on single-sourcing 32 | # the version across setup.py and the project code, see 33 | # https://packaging.python.org/en/latest/single_source_version.html 34 | version=__version__, 35 | description=__description__, 36 | long_description=long_description, 37 | long_description_content_type="text/markdown", 38 | # The project's main homepage. 39 | # url=__url__, 40 | # author=__contact_names__, 41 | # maintainer=__contact_names__, 42 | # The license under which the project is released 43 | license=__license__, 44 | classifiers=[ 45 | "Development Status :: 4 - Beta", 46 | "Intended Audience :: Developers", 47 | "Operating System :: POSIX", 48 | "Operating System :: POSIX :: Linux", 49 | "License :: OSI Approved :: Apache Software License", 50 | "Programming Language :: Python :: 3.6", 51 | "Programming Language :: Python :: 3.7", 52 | "Programming Language :: Python :: 3.8", 53 | ], 54 | python_requires=">=3.6", 55 | packages=setuptools.find_packages(), 56 | install_requires=install_requires, 57 | # Add in any packaged data. 58 | include_package_data=True, 59 | zip_safe=False, 60 | # PyPI package information. 
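Name, version, description, license and keywords are imported from oneflow_gpt/__init__.py above.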
61 | keywords=__keywords__, 62 | ) 63 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/OneFlow-Benchmark/de4698a7acce3cca2eaf1564eaf25d59c9954b86/LanguageModeling/GPT/tasks/__init__.py -------------------------------------------------------------------------------- /LanguageModeling/GPT/tasks/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.append( 5 | os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) 6 | ) 7 | from oneflow_gpt.config import get_args 8 | 9 | 10 | def get_tasks_args(parser): 11 | """Provide extra arguments required for tasks.""" 12 | group = parser.add_argument_group(title="tasks") 13 | 14 | group.add_argument("--task", type=str, required=True, help="Task name.") 15 | group.add_argument( 16 | "--epochs", 17 | type=int, 18 | default=None, 19 | help="Number of fine-tuning epochs. Zero results in " "evaluation only.", 20 | ) 21 | group.add_argument( 22 | "--pretrained-checkpoint", 23 | type=str, 24 | default=None, 25 | help="Pretrained checkpoint used for fine-tuning.", 26 | ) 27 | group.add_argument( 28 | "--keep-last", 29 | action="store_true", 30 | help="Keep the last batch (maybe incomplete) in " "the data loader.", 31 | ) 32 | group.add_argument( 33 | "--train-data", 34 | nargs="+", 35 | default=None, 36 | help="Whitespace separated paths or corpora names " "for training.", 37 | ) 38 | group.add_argument( 39 | "--valid-data", nargs="*", default=None, help="Path(s) to the validation data." 40 | ) 41 | group.add_argument( 42 | "--overlapping-eval", 43 | type=int, 44 | default=32, 45 | help="Sliding window for overlapping evaluation.", 46 | ) 47 | group.add_argument( 48 | "--strict-lambada", 49 | action="store_true", 50 | help="Use more difficult formulation of lambada.", 51 | ) 52 | group.add_argument( 53 | "--vocab-file", type=str, default=None, help="Path to the vocab file." 54 | ) 55 | group.add_argument( 56 | "--merge-file", type=str, default=None, help="Path to the BPE merge file."
57 | ) 58 | group.add_argument( 59 | "--tokenizer-type", 60 | type=str, 61 | default=None, 62 | choices=["BertWordPieceLowerCase", "BertWordPieceCase", "GPT2BPETokenizer"], 63 | help="What type of tokenizer to use.", 64 | ) 65 | group.add_argument( 66 | "--reset-position-ids", 67 | action="store_true", 68 | help="Reset position ids after end-of-document token.", 69 | ) 70 | group.add_argument( 71 | "--reset-attention-mask", 72 | action="store_true", 73 | help="Reset self attention mask after " "end-of-document token.", 74 | ) 75 | group.add_argument( 76 | "--eod-mask-loss", 77 | action="store_true", 78 | help="Mask loss for the end of document tokens.", 79 | ) 80 | 81 | return parser 82 | 83 | 84 | if __name__ == "__main__": 85 | 86 | args = get_args(extra_args_provider=get_tasks_args) 87 | 88 | if args.task in ["LAMBADA"]: 89 | from zeroshot_gpt.evaluate import main 90 | else: 91 | raise NotImplementedError("Task {} is not implemented.".format(args.task)) 92 | 93 | main(args) 94 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/tasks/zeroshot_gpt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/OneFlow-Benchmark/de4698a7acce3cca2eaf1564eaf25d59c9954b86/LanguageModeling/GPT/tasks/zeroshot_gpt/__init__.py -------------------------------------------------------------------------------- /LanguageModeling/GPT/tasks/zeroshot_gpt/datasets.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | 16 | import json 17 | import math 18 | import numpy as np 19 | from tokenizer.tokenizer import build_tokenizer 20 | 21 | 22 | def build_dataset(args): 23 | """Helper function to select and build dataset.""" 24 | if args.task == "LAMBADA": 25 | return _build_lambada_dataset(args) 26 | 27 | raise NotImplementedError("dataset for {} task is not " "implemented.".format(args.task)) 28 | 29 | 30 | class _LambadaDataset: 31 | def __init__(self, path, pad_idx, tokenizer, seq_len, strict=False): 32 | print("> building lambada dataset from {} ...".format(path)) 33 | self.seq_len = seq_len 34 | self.pad_idx = pad_idx 35 | self.tokenizer = tokenizer 36 | self.strict = strict 37 | 38 | self.tokens = [] 39 | self.labels = [] 40 | with open(path, "r") as f: 41 | for line in f.readlines(): 42 | text = json.loads(line)["text"] 43 | tokens, labels = self.get_tokens(text) 44 | self.tokens.append(tokens) 45 | self.labels.append(labels) 46 | 47 | def get_tokens(self, text): 48 | if not self.strict: 49 | tokens = self.tokenizer.tokenize(text) 50 | return tokens[:-1], [tokens[-1]] 51 | last_token = text.split()[-1] 52 | start_idx = text.rfind(last_token) 53 | beginning_tokens = self.tokenizer.tokenize(text[:start_idx].strip()) 54 | last_token = self.tokenizer.tokenize(" " + last_token) 55 | return beginning_tokens, last_token 56 | 57 | def __len__(self): 58 | return len(self.tokens) 59 | 60 | def __getitem__(self, idx): 61 | tokens = self.tokens[idx] 62 | num_tokens = len(tokens) 63 | pad_mask = [0] * num_tokens 64 | labels = self.labels[idx] 65 | pad_mask += [1] * len(labels) 66 | tokens = tokens + labels 67 | num_tokens = len(tokens) 68 | if num_tokens < self.seq_len + 1: 69 | num_pad = self.seq_len + 1 - num_tokens 70 | pad_mask += [0] * (num_pad) 71 | tokens += [self.pad_idx] * num_pad 72 | pad_mask = np.array(pad_mask[1:]) 73 | 74 | return {"text": np.array(tokens), "pad_mask": pad_mask} 75 | 76 | 77 | def _build_lambada_dataset(args): 78 | """Build lambada dataset.""" 79 | tokenizer = build_tokenizer(args) 80 | 81 | assert len(args.valid_data) == 1 82 | val_dataset = _LambadaDataset( 83 | args.valid_data[0], 84 | tokenizer.eod, 85 | tokenizer, 86 | args.seq_length, 87 | args.strict_lambada, 88 | ) 89 | print(" > found {} samples.".format(len(val_dataset))) 90 | 91 | return val_dataset 92 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/tasks/zeroshot_gpt/evaluate.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import sys 4 | 5 | sys.path.append( 6 | os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) 7 | ) 8 | 9 | from oneflow_gpt.model import GPTModel, ParallelSparseSoftmaxCrossEntropyLoss 10 | from oneflow_gpt import util 11 | from .datasets import build_dataset 12 | import numpy as np 13 | import oneflow.compatible.single_client as flow 14 | 15 | 16 | def _init_env(args): 17 | if args.num_nodes > 1: 18 | if args.num_nodes > len(args.node_ips): 19 | raise ValueError( 20 | f"num_nodes {args.num_nodes} greater than" 21 | f" length of node ips {args.node_ips}" 22 | ) 23 | 24 | flow.env.ctrl_port(args.ctrl_port) 25 | nodes = [] 26 | for ip in args.node_ips[: args.num_nodes]: 27 | nodes.append({"addr": ip}) 28 | 29 | flow.env.machine(nodes) 30 | 31 | flow.env.log_dir(args.log) 32 | 33 | 34 | def _init_config(args): 35 | flow.config.gpu_device_num(args.num_gpus_per_node) 36 | flow.config.collective_boxing.nccl_fusion_reduce_scatter(True) 37 | flow.config.collective_boxing.nccl_fusion_all_gather(True)
38 | flow.config.collective_boxing.nccl_enable_mixed_fusion(True) 39 | if args.tensor_model_parallel_size > 1: 40 | if hasattr(flow.config, "nccl_use_compute_stream"): 41 | flow.config.nccl_use_compute_stream(True) 42 | else: 43 | print( 44 | "WARNING: This version of OneFlow does not support placing nccl on compute stream," 45 | " please try another version." 46 | ) 47 | 48 | flow.config.enable_legacy_model_io() 49 | flow.config.enable_model_io_v2(True) 50 | 51 | 52 | def _make_func_config(args): 53 | func_cfg = flow.function_config() 54 | if args.fp16: 55 | func_cfg.enable_auto_mixed_precision(True) 56 | func_cfg.prune_parallel_cast_ops(True) 57 | func_cfg.enable_fuse_add_to_output(True) 58 | func_cfg.enable_fuse_model_update_ops(True) 59 | func_cfg.enable_fuse_cast_scale(True) 60 | # turn on this flag when matching ZeRO & DeepSpeed 61 | func_cfg.enable_non_distributed_optimizer(False) 62 | if args.num_accumulation_steps > 1: 63 | if hasattr(func_cfg.train, "num_gradient_accumulation_steps"): 64 | func_cfg.train.num_gradient_accumulation_steps(args.num_accumulation_steps) 65 | else: 66 | args.num_accumulation_steps = 1 67 | print( 68 | "WARNING: This version of OneFlow does not support gradient accumulation," 69 | " please try a newer version." 70 | ) 71 | 72 | return func_cfg 73 | 74 | 75 | def make_gpt_eval_func(args): 76 | @flow.global_function("predict", _make_func_config(args)) 77 | def gpt_func( 78 | x: flow.typing.Numpy.Placeholder( 79 | (args.global_batch_size, args.seq_length), dtype=flow.int64 80 | ) 81 | ): 82 | gpt = GPTModel("model") 83 | return gpt(x) 84 | 85 | return gpt_func 86 | 87 | 88 | def process_batch(args, batch): 89 | """Process batch and produce inputs for the model.""" 90 | 91 | loss_mask = batch["pad_mask"] 92 | tokens_ = batch["text"] 93 | labels = tokens_[:, 1:] 94 | tokens = tokens_[:, :-1] 95 | 96 | return tokens, labels, None, None, loss_mask 97 | 98 | 99 | def forward_step(args, batch, model, eval_metric): 100 | """Forward step.""" 101 | 102 | # Get the batch. 103 | tokens, labels, attention_mask, position_ids, loss_mask = process_batch(args, batch) 104 | # Tell the model what our actual batch size will be 105 | # args.micro_batch_size = len(labels) 106 | 107 | 108 | 109 | # Forward pass through the model. 110 | logits = model(tokens).get() 111 | 112 | if eval_metric == "accuracy": 113 | bs, e = logits.numpy().shape 114 | outputs = np.argmax( 115 | logits.numpy().reshape( 116 | (args.micro_batch_size, int(bs / args.micro_batch_size), e) 117 | ), 118 | -1, 119 | ) 120 | correct = (outputs == labels).astype(np.float32) 121 | correct[(1 - loss_mask).astype(np.bool_)] = 1 122 | correct = np.prod(correct, -1) 123 | return np.sum(correct) 124 | 125 | raise NotImplementedError( 126 | "forward method for evaluation metric {} " 127 | "is not implemented.".format(eval_metric) 128 | ) 129 | 130 | 131 | 132 | 133 | def evaluate(args, data_sets, model, eval_metric): 134 | """Evaluation.""" 135 | total_output = 0.0 136 | 137 | # For all the batches in the dataset.
138 | for iteration in range(int(len(data_sets) / args.micro_batch_size)): 139 | text = [ 140 | data_sets[iteration * args.micro_batch_size + i]["text"] 141 | for i in range(args.micro_batch_size) 142 | ] 143 | text = np.stack(text) 144 | pad_mask = [ 145 | data_sets[iteration * args.micro_batch_size + i]["pad_mask"] 146 | for i in range(args.micro_batch_size) 147 | ] 148 | pad_mask = np.stack(pad_mask) 149 | if iteration % args.log_interval == 0: 150 | print("> working on iteration: {}".format(iteration)) 151 | # Forward evaluation. 152 | output = forward_step( 153 | args, {"text": text, "pad_mask": pad_mask}, model, eval_metric 154 | ) 155 | total_output += output 156 | 157 | return total_output 158 | 159 | 160 | def evaluate_and_print_results(args, data_sets, model, eval_metric): 161 | """Evaluate and print results on screen.""" 162 | # Evaluate and get results. 163 | output = evaluate(args, data_sets, model, eval_metric) 164 | 165 | string = " validation results on {} | ".format(args.task) 166 | if eval_metric == "accuracy": 167 | num_examples = ( 168 | int(len(data_sets) / args.micro_batch_size) * args.micro_batch_size 169 | ) 170 | acc = output / num_examples 171 | string += "number correct: {:.4E} | ".format(output) 172 | string += "total examples: {:.4E} | ".format(num_examples) 173 | string += "avg accuracy: {:.4E}".format(acc) 174 | print(string) 175 | else: 176 | raise NotImplementedError( 177 | "evaluation method for {} metric is not " 178 | "implemented yet.".format(eval_metric) 179 | ) 180 | 181 | 182 | def main(args): 183 | """Main program.""" 184 | 185 | if args.task == "LAMBADA": 186 | eval_metric = "accuracy" 187 | else: 188 | raise NotImplementedError("{} task is not implemented.".format(args.task)) 189 | 190 | # Set up model and load checkpoint. 191 | _init_env(args) 192 | _init_config(args) 193 | gpt_eval = make_gpt_eval_func(args) 194 | check_point = flow.train.CheckPoint() 195 | 196 | assert args.load is not None 197 | check_point.load(args.load) 198 | 199 | dataset = build_dataset(args) 200 | # Run evaluation. 201 | evaluate_and_print_results(args, dataset, gpt_eval, eval_metric) 202 | 203 | print("done :-)") 204 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/tokenizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/OneFlow-Benchmark/de4698a7acce3cca2eaf1564eaf25d59c9954b86/LanguageModeling/GPT/tokenizer/__init__.py -------------------------------------------------------------------------------- /LanguageModeling/GPT/tokenizer/tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | from abc import ABC 17 | from abc import abstractmethod 18 | import math 19 | from .gpt2_tokenization import GPT2Tokenizer 20 | 21 | 22 | def build_tokenizer(args): 23 | """Initialize tokenizer.""" 24 | 25 | # Select and instantiate the tokenizer. 26 | assert args.vocab_file is not None 27 | assert args.merge_file is not None 28 | if args.tokenizer_type == "GPT2BPETokenizer": 29 | 30 | tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) 31 | else: 32 | raise NotImplementedError( 33 | "{} tokenizer is not " "implemented.".format(args.tokenizer_type) 34 | ) 35 | 36 | return tokenizer 37 | 38 | 39 | def initialize_model_parallel(args): 40 | device_num = args.gpu_num_per_node * args.num_nodes 41 | if device_num == 1: 42 | print("WARNING: there is only 1 device, setting model parallel size to 1") 43 | return [ 44 | 1, 45 | ] 46 | 47 | assert device_num % args.model_parallel_size == 0 48 | parallel_hierarchy = [ 49 | device_num // args.model_parallel_size, 50 | args.model_parallel_size, 51 | ] 52 | return parallel_hierarchy 53 | 54 | 55 | def pad_vocab_size(vocab_size, alignment, model_parallel_size): 56 | """Pad the vocab size so it is divisible by the model parallel size and 57 | still has a GPU-friendly size.""" 58 | assert isinstance(alignment, int) 59 | if alignment == 0: 60 | return vocab_size 61 | 62 | alignment *= model_parallel_size 63 | 64 | padded_vocab_size = int(math.ceil(vocab_size / alignment)) * alignment 65 | print( 66 | " > padded vocab (size: {}) with {} dummy tokens " 67 | "(new size: {})".format( 68 | vocab_size, padded_vocab_size - vocab_size, padded_vocab_size 69 | ) 70 | ) 71 | return padded_vocab_size 72 | 73 | 74 | class AbstractTokenizer(ABC): 75 | """Abstract class for tokenizer.""" 76 | 77 | def __init__(self, name): 78 | self.name = name 79 | super().__init__() 80 | 81 | @property 82 | @abstractmethod 83 | def vocab_size(self): 84 | pass 85 | 86 | @property 87 | @abstractmethod 88 | def vocab(self): 89 | """Dictionary from vocab text token to id token.""" 90 | pass 91 | 92 | @property 93 | @abstractmethod 94 | def inv_vocab(self): 95 | """Dictionary from vocab id token to text token.""" 96 | pass 97 | 98 | @abstractmethod 99 | def tokenize(self, text): 100 | pass 101 | 102 | def detokenize(self, token_ids): 103 | raise NotImplementedError( 104 | "detokenizer is not implemented for {} " "tokenizer".format(self.name) 105 | ) 106 | 107 | @property 108 | def cls(self): 109 | raise NotImplementedError( 110 | "CLS is not provided for {} " "tokenizer".format(self.name) 111 | ) 112 | 113 | @property 114 | def sep(self): 115 | raise NotImplementedError( 116 | "SEP is not provided for {} " "tokenizer".format(self.name) 117 | ) 118 | 119 | @property 120 | def pad(self): 121 | raise NotImplementedError( 122 | "PAD is not provided for {} " "tokenizer".format(self.name) 123 | ) 124 | 125 | @property 126 | def eod(self): 127 | raise NotImplementedError( 128 | "EOD is not provided for {} " "tokenizer".format(self.name) 129 | ) 130 | 131 | @property 132 | def mask(self): 133 | raise NotImplementedError( 134 | "MASK is not provided for {} " "tokenizer".format(self.name) 135 | ) 136 | 137 | 138 | class _GPT2BPETokenizer(AbstractTokenizer): 139 | """Original GPT2 BPE tokenizer.""" 140 | 141 | def __init__(self, vocab_file, merge_file): 142 | name = "GPT2 BPE" 143 | super().__init__(name) 144 | 145 | self.tokenizer = GPT2Tokenizer( 146 | vocab_file, merge_file, errors="replace", special_tokens=[], max_len=None 147 | ) 148 | self.eod_id = self.tokenizer.encoder["<|endoftext|>"]
149 | 150 | @property 151 | def vocab_size(self): 152 | return len(self.tokenizer.encoder) 153 | 154 | @property 155 | def vocab(self): 156 | return self.tokenizer.encoder 157 | 158 | @property 159 | def inv_vocab(self): 160 | return self.tokenizer.decoder 161 | 162 | def tokenize(self, text): 163 | return self.tokenizer.encode(text) 164 | 165 | def detokenize(self, token_ids): 166 | return self.tokenizer.decode(token_ids) 167 | 168 | @property 169 | def eod(self): 170 | return self.eod_id 171 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/tools/README.md: -------------------------------------------------------------------------------- 1 | # GPT Model Conversion 2 | 3 | ### Converting a PyTorch model to a OneFlow model 4 | - `meta.proto` defines the `meta` file placed in each model directory. Run `protoc --python_out=. meta.proto` to generate `meta_pb2.py`, which can then be imported with `import meta_pb2 as meta_pb`: 5 | ``` 6 | syntax = "proto2"; 7 | package gpt; 8 | 9 | message Shape { 10 | repeated int32 dim = 1; 11 | } 12 | 13 | enum DataType { 14 | kInvalidDataType = 0; 15 | kChar = 1; 16 | kFloat = 2; 17 | kDouble = 3; 18 | kInt8 = 4; 19 | kInt32 = 5; 20 | kInt64 = 6; 21 | kUInt8 = 7; 22 | kOFRecord = 8; 23 | kFloat16 = 9; 24 | kTensorBuffer = 10; 25 | } 26 | 27 | message Meta { 28 | required Shape shape = 1; 29 | required DataType data_type = 2 [default = kFloat16]; 30 | } 31 | ``` 32 | - The conversion script is `convert_pt_to_of_gpt.py`. Running `python3 convert_pt_to_of_gpt.py --py_model_dir /path/to/iter_0500000/mp_rank_00/model_optim_rng.pt` generates the OneFlow model under `convert_pt_to_of_gpt` in the current directory. 33 | - `--py_model_dir`, path of the PyTorch model 34 | - `--of_dump_path`, path where the converted model is saved 35 | 36 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/tools/ansible_inventory: -------------------------------------------------------------------------------- 1 | [local] 2 | localhost ansible_connection=local 3 | [of] 4 | of11 ansible_host=192.168.1.11 5 | of12 ansible_host=192.168.1.12 6 | of13 ansible_host=192.168.1.13 7 | of14 ansible_host=192.168.1.14 8 | of15 ansible_host=192.168.1.15 9 | of16 ansible_host=192.168.1.16 10 | [ln] 11 | vs002 ansible_host=10.11.0.2 12 | vs003 ansible_host=10.11.0.3 13 | vs004 ansible_host=10.11.0.4 14 | vs005 ansible_host=10.11.0.5 15 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/tools/compare_loss.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | 5 | 6 | def parse_losses_for_log_file( 7 | log_file, loss_pattern, step_pattern, max_step, verbose=False 8 | ): 9 | if not os.path.isfile(log_file): 10 | raise ValueError(f"log file {log_file} does not exist") 11 | 12 | loss_dict = {} 13 | with open(log_file, "rt") as f: 14 | for line in f: 15 | step = None 16 | loss = None 17 | 18 | m = re.search(loss_pattern, line.strip()) 19 | if m: 20 | loss = float(m.group(1)) 21 | elif verbose: 22 | print(f"loss not found in line: {line.strip()}") 23 | else: 24 | pass 25 | 26 | m = re.search(step_pattern, line.strip()) 27 | if m: 28 | step = int(m.group(1)) 29 | elif verbose: 30 | print(f"step not found in line: {line.strip()}") 31 | else: 32 | pass 33 | 34 | if loss is not None and step is not None: 35 | assert step not in loss_dict 36 | loss_dict[step] = loss 37 | if len(loss_dict) >= max_step: 38 | break 39 | 40 | return loss_dict 41 | 42 | 43 | def plot_losses_comparison(oneflow_log_file, openai_log_file, verbose=False): 44 | import matplotlib.pyplot as plt
45 | 46 | loss_pattern = r"loss=[+-]?((\d+(\.\d+)?)|(\.\d+))" 47 | of_step_pattern = r"step=(\d+)" 48 | of_loss_dict = parse_losses_for_log_file( 49 | oneflow_log_file, loss_pattern, of_step_pattern, 100, verbose 50 | ) 51 | 52 | oa_step_pattern = r"\[(\d+)\s\|\s\d+\.\d+\]" 53 | oa_loss_dict = parse_losses_for_log_file( 54 | openai_log_file, loss_pattern, oa_step_pattern, 100, verbose 55 | ) 56 | 57 | if verbose: 58 | print("of_loss_dict:", of_loss_dict) 59 | print("oa_loss_dict:", oa_loss_dict) 60 | 61 | plt.plot(*zip(*sorted(of_loss_dict.items()))) 62 | plt.plot(*zip(*sorted(oa_loss_dict.items()))) 63 | plt.show() 64 | 65 | 66 | if __name__ == "__main__": 67 | if len(sys.argv) <= 1: 68 | raise ValueError("usage: python3 compare_loss.py <log_file>") 69 | 70 | loss_pattern = r"loss=[+-]?((\d+(\.\d+)?)|(\.\d+))" 71 | # step_pattern = r"step=(\d+)" 72 | step_pattern = r"\[(\d+)\s\|\s\d+\.\d+\]" 73 | losses = parse_losses_for_log_file(sys.argv[1], loss_pattern, step_pattern, 100) 74 | print(losses) 75 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/tools/convert_py_model_to_of.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | import torch 5 | import meta_pb2 as meta_pb 6 | 7 | 8 | def get_args(): 9 | 10 | parser = argparse.ArgumentParser() 11 | 12 | ## Required parameters 13 | parser.add_argument( 14 | "--py_model_dir", 15 | type=str, 16 | default="/path/to/iter_0500000/mp_rank_00/model_optim_rng.pt", 17 | help="Path to the PyTorch checkpoint file.", 18 | ) 19 | parser.add_argument( 20 | "--of_dump_path", 21 | type=str, 22 | default="./convert_pt_to_of_gpt_release", 23 | help="Path to the output OneFlow model.", 24 | ) 25 | 26 | return parser.parse_args() 27 | 28 | 29 | def _SaveWeightBlob2File(blob, op_name, save_path, var="out", meta="meta"): 30 | folder = os.path.join(save_path, op_name) 31 | if not os.path.exists(folder): 32 | os.makedirs(folder) 33 | filename = os.path.join(folder, var) 34 | with open(filename, "wb") as f: 35 | f.write(blob.tobytes()) 36 | meta_info = meta_pb.Meta() 37 | meta_info.shape.dim[:] = blob.shape 38 | meta_info.data_type = meta_pb.kFloat 39 | filename = os.path.join(folder, meta) 40 | with open(filename, "w") as f: 41 | f.write(str(meta_info)) 42 | 43 | np.save(filename, blob)  # np.save appends ".npy", so this also dumps the raw blob next to the meta file 44 | 45 | 46 | def _SaveWeightBlob2FileExtend(blob, op_name, save_path, var="out", meta="meta"): 47 | _SaveWeightBlob2File(blob.numpy(), op_name, save_path, var=var, meta=meta) 48 | _SaveWeightBlob2File( 49 | np.ones_like(blob), op_name + "-v", save_path, var=var, meta=meta 50 | ) 51 | _SaveWeightBlob2File( 52 | np.zeros_like(blob), op_name + "-m", save_path, var=var, meta=meta 53 | ) 54 | 55 | 56 | def convert(args): 57 | path = args.py_model_dir 58 | state_dict = torch.load(path, map_location="cpu") 59 | for model_key, model_value in state_dict["model"]["language_model"][ 60 | "transformer" 61 | ].items(): 62 | if len(model_value.shape) > 1: 63 | model_value = torch.transpose(model_value, 0, 1) 64 | model_value = model_value.float() 65 | op_name_list = model_key.split(".") 66 | if "layers." in model_key: 67 | op_name = model_key.replace("layers.", "model-") 68 | op_name = op_name.replace( 69 | "-%s."
% (op_name_list[1]), "-h%s-" % (op_name_list[1]) 70 | ) 71 | else: 72 | op_name = model_key.replace("final_layernorm.", "model-layernorm_f-") 73 | op_name = op_name.replace("input_layernorm.", "layernorm_1-") 74 | op_name = op_name.replace("post_attention_layernorm.", "layernorm_2-") 75 | op_name = op_name.replace("attention.", "attn-") 76 | op_name = op_name.replace("query_key_value.", "c_attn-") 77 | op_name = op_name.replace("dense.", "c_proj-") 78 | op_name = op_name.replace("mlp.dense_h_to_4h.", "mlp-c_fc-") 79 | op_name = op_name.replace("mlp.dense_4h_to_h.", "mlp-c_proj-") 80 | 81 | if ( 82 | "layernorm_1" in op_name 83 | or "layernorm_2" in op_name 84 | or "layernorm_f" in op_name 85 | ): 86 | op_name = op_name.replace("-weight", "-gamma") 87 | op_name = op_name.replace("-bias", "-beta") 88 | 89 | print(model_key, "-" * 8, op_name) 90 | _SaveWeightBlob2FileExtend(model_value, op_name, args.of_dump_path) 91 | 92 | _SaveWeightBlob2FileExtend( 93 | state_dict["model"]["language_model"]["embedding"]["position_embeddings"][ 94 | "weight" 95 | ].float(), 96 | "model-wpe", 97 | args.of_dump_path, 98 | ) 99 | _SaveWeightBlob2FileExtend( 100 | state_dict["model"]["language_model"]["embedding"]["word_embeddings"][ 101 | "weight" 102 | ].float(), 103 | "model-wte", 104 | args.of_dump_path, 105 | ) 106 | 107 | 108 | if __name__ == "__main__": 109 | args = get_args() 110 | convert(args) 111 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/tools/launch_container.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import subprocess 4 | import tempfile 5 | 6 | 7 | def pwd(): 8 | return os.getcwd() 9 | 10 | 11 | def homepath(relative_path=None): 12 | if relative_path is None: 13 | return os.path.expanduser("~") 14 | 15 | return os.path.expanduser(f"~/{relative_path}") 16 | 17 | 18 | def py_bin_path(py_ver): 19 | py_ver_list = py_ver.split(".") 20 | major, minor = py_ver_list[:2] 21 | ver = f"{major}{minor}" 22 | return f"/opt/python/cp{ver}-cp{ver}m/bin" 23 | 24 | 25 | def launch_oneflow_gpt_container( 26 | cmd, 27 | src, 28 | image, 29 | wheel, 30 | extra_mount=None, 31 | py_ver="3.7", 32 | proxy=None, 33 | interactive=True, 34 | name="oneflow_gpt", 35 | ): 36 | bash_script = f"""set -ex 37 | export PATH={py_bin_path(py_ver)}:$PATH 38 | python3 -m pip install {wheel} 39 | python3 -m pip install -e {src} 40 | {cmd or 'bash'} 41 | """ 42 | 43 | docker_args = "" 44 | 45 | if proxy is not None: 46 | docker_args += f" -e http_proxy={proxy} -e https_proxy={proxy} -e HTTP_PROXY={proxy} -e HTTPS_PROXY={proxy}" 47 | 48 | if extra_mount is not None: 49 | docker_args += f" -v {extra_mount}:{extra_mount}" 50 | 51 | docker_cmd = "docker run" 52 | 53 | if interactive: 54 | docker_cmd += " -it" 55 | 56 | docker_cmd += " --rm" 57 | docker_cmd += " --runtime nvidia" 58 | docker_cmd += " --privileged" 59 | docker_cmd += " --network host" 60 | docker_cmd += " --shm-size=8g" 61 | 62 | docker_cmd += docker_args 63 | docker_cmd += f" -v {src}:{src}" 64 | docker_cmd += f" -v {homepath('var-cache')}:/var/cache" 65 | docker_cmd += " -v /tmp:/host/tmp" 66 | docker_cmd += f" -v {pwd()}:{pwd()}" 67 | docker_cmd += f" -w {pwd()}" 68 | docker_cmd += f" --name {name}" 69 | docker_cmd += f" {image}" 70 | 71 | with tempfile.NamedTemporaryFile(mode="w+", encoding="utf-8") as f: 72 | t_fname = f.name 73 | f.write(bash_script) 74 | f.flush() 75 | print("tempfile name:", t_fname) 76 | docker_cmd += f" bash 
/host{t_fname}" 77 | print(docker_cmd) 78 | subprocess.check_call(docker_cmd, shell=True) 79 | 80 | 81 | def parse_args(): 82 | parser = argparse.ArgumentParser() 83 | parser.add_argument("--cmd", type=str, default=None, help="") 84 | parser.add_argument("--src", type=str, default=f"{pwd()}/oneflow_gpt", help="") 85 | parser.add_argument( 86 | "--image", type=str, default="oneflow-manylinux2014-cuda11.2:0.1", help="", 87 | ) 88 | parser.add_argument( 89 | "--wheel", 90 | type=str, 91 | default="$PWD/packages/oneflow-0.3.5+cu112.git.4a4f032-cp37-cp37m-linux_x86_64.whl", 92 | help="", 93 | ) 94 | parser.add_argument("--extra-mount", type=str, default="/data", help="") 95 | parser.add_argument("--py", type=str, default="3.7", help="") 96 | parser.add_argument("--proxy", type=str, default=None, help="") 97 | parser.add_argument("--no-interactive", action="store_false", dest="interactive", help="") 98 | return parser.parse_args() 99 | 100 | 101 | if __name__ == "__main__": 102 | args = parse_args() 103 | launch_oneflow_gpt_container( 104 | args.cmd, 105 | args.src, 106 | args.image, 107 | args.wheel, 108 | args.extra_mount, 109 | args.py, 110 | args.proxy, 111 | args.interactive, 112 | ) 113 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/tools/meta.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | message Shape { 4 | repeated int32 dim = 1; 5 | } 6 | 7 | enum DataType { 8 | kInvalidDataType = 0; 9 | kChar = 1; 10 | kFloat = 2; 11 | kDouble = 3; 12 | kInt8 = 4; 13 | kInt32 = 5; 14 | kInt64 = 6; 15 | kUInt8 = 7; 16 | kOFRecord = 8; 17 | kFloat16 = 9; 18 | kTensorBuffer = 10; 19 | } 20 | 21 | message Meta { 22 | required Shape shape = 1; 23 | required DataType data_type = 2 [default = kFloat16]; 24 | } 25 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/tools/meta_pb2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by the protocol buffer compiler. DO NOT EDIT! 
3 | # source: meta.proto 4 | """Generated protocol buffer code.""" 5 | from google.protobuf.internal import enum_type_wrapper 6 | from google.protobuf import descriptor as _descriptor 7 | from google.protobuf import message as _message 8 | from google.protobuf import reflection as _reflection 9 | from google.protobuf import symbol_database as _symbol_database 10 | # @@protoc_insertion_point(imports) 11 | 12 | _sym_db = _symbol_database.Default() 13 | 14 | 15 | 16 | 17 | DESCRIPTOR = _descriptor.FileDescriptor( 18 | name='meta.proto', 19 | package='', 20 | syntax='proto2', 21 | serialized_options=None, 22 | create_key=_descriptor._internal_create_key, 23 | serialized_pb=b'\n\nmeta.proto\"\x14\n\x05Shape\x12\x0b\n\x03\x64im\x18\x01 \x03(\x05\"E\n\x04Meta\x12\x15\n\x05shape\x18\x01 \x02(\x0b\x32\x06.Shape\x12&\n\tdata_type\x18\x02 \x02(\x0e\x32\t.DataType:\x08kFloat16*\xa3\x01\n\x08\x44\x61taType\x12\x14\n\x10kInvalidDataType\x10\x00\x12\t\n\x05kChar\x10\x01\x12\n\n\x06kFloat\x10\x02\x12\x0b\n\x07kDouble\x10\x03\x12\t\n\x05kInt8\x10\x04\x12\n\n\x06kInt32\x10\x05\x12\n\n\x06kInt64\x10\x06\x12\n\n\x06kUInt8\x10\x07\x12\r\n\tkOFRecord\x10\x08\x12\x0c\n\x08kFloat16\x10\t\x12\x11\n\rkTensorBuffer\x10\n' 24 | ) 25 | 26 | _DATATYPE = _descriptor.EnumDescriptor( 27 | name='DataType', 28 | full_name='DataType', 29 | filename=None, 30 | file=DESCRIPTOR, 31 | create_key=_descriptor._internal_create_key, 32 | values=[ 33 | _descriptor.EnumValueDescriptor( 34 | name='kInvalidDataType', index=0, number=0, 35 | serialized_options=None, 36 | type=None, 37 | create_key=_descriptor._internal_create_key), 38 | _descriptor.EnumValueDescriptor( 39 | name='kChar', index=1, number=1, 40 | serialized_options=None, 41 | type=None, 42 | create_key=_descriptor._internal_create_key), 43 | _descriptor.EnumValueDescriptor( 44 | name='kFloat', index=2, number=2, 45 | serialized_options=None, 46 | type=None, 47 | create_key=_descriptor._internal_create_key), 48 | _descriptor.EnumValueDescriptor( 49 | name='kDouble', index=3, number=3, 50 | serialized_options=None, 51 | type=None, 52 | create_key=_descriptor._internal_create_key), 53 | _descriptor.EnumValueDescriptor( 54 | name='kInt8', index=4, number=4, 55 | serialized_options=None, 56 | type=None, 57 | create_key=_descriptor._internal_create_key), 58 | _descriptor.EnumValueDescriptor( 59 | name='kInt32', index=5, number=5, 60 | serialized_options=None, 61 | type=None, 62 | create_key=_descriptor._internal_create_key), 63 | _descriptor.EnumValueDescriptor( 64 | name='kInt64', index=6, number=6, 65 | serialized_options=None, 66 | type=None, 67 | create_key=_descriptor._internal_create_key), 68 | _descriptor.EnumValueDescriptor( 69 | name='kUInt8', index=7, number=7, 70 | serialized_options=None, 71 | type=None, 72 | create_key=_descriptor._internal_create_key), 73 | _descriptor.EnumValueDescriptor( 74 | name='kOFRecord', index=8, number=8, 75 | serialized_options=None, 76 | type=None, 77 | create_key=_descriptor._internal_create_key), 78 | _descriptor.EnumValueDescriptor( 79 | name='kFloat16', index=9, number=9, 80 | serialized_options=None, 81 | type=None, 82 | create_key=_descriptor._internal_create_key), 83 | _descriptor.EnumValueDescriptor( 84 | name='kTensorBuffer', index=10, number=10, 85 | serialized_options=None, 86 | type=None, 87 | create_key=_descriptor._internal_create_key), 88 | ], 89 | containing_type=None, 90 | serialized_options=None, 91 | serialized_start=108, 92 | serialized_end=271, 93 | ) 94 | _sym_db.RegisterEnumDescriptor(_DATATYPE) 95 | 96 | DataType 
= enum_type_wrapper.EnumTypeWrapper(_DATATYPE) 97 | kInvalidDataType = 0 98 | kChar = 1 99 | kFloat = 2 100 | kDouble = 3 101 | kInt8 = 4 102 | kInt32 = 5 103 | kInt64 = 6 104 | kUInt8 = 7 105 | kOFRecord = 8 106 | kFloat16 = 9 107 | kTensorBuffer = 10 108 | 109 | 110 | 111 | _SHAPE = _descriptor.Descriptor( 112 | name='Shape', 113 | full_name='Shape', 114 | filename=None, 115 | file=DESCRIPTOR, 116 | containing_type=None, 117 | create_key=_descriptor._internal_create_key, 118 | fields=[ 119 | _descriptor.FieldDescriptor( 120 | name='dim', full_name='Shape.dim', index=0, 121 | number=1, type=5, cpp_type=1, label=3, 122 | has_default_value=False, default_value=[], 123 | message_type=None, enum_type=None, containing_type=None, 124 | is_extension=False, extension_scope=None, 125 | serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), 126 | ], 127 | extensions=[ 128 | ], 129 | nested_types=[], 130 | enum_types=[ 131 | ], 132 | serialized_options=None, 133 | is_extendable=False, 134 | syntax='proto2', 135 | extension_ranges=[], 136 | oneofs=[ 137 | ], 138 | serialized_start=14, 139 | serialized_end=34, 140 | ) 141 | 142 | 143 | _META = _descriptor.Descriptor( 144 | name='Meta', 145 | full_name='Meta', 146 | filename=None, 147 | file=DESCRIPTOR, 148 | containing_type=None, 149 | create_key=_descriptor._internal_create_key, 150 | fields=[ 151 | _descriptor.FieldDescriptor( 152 | name='shape', full_name='Meta.shape', index=0, 153 | number=1, type=11, cpp_type=10, label=2, 154 | has_default_value=False, default_value=None, 155 | message_type=None, enum_type=None, containing_type=None, 156 | is_extension=False, extension_scope=None, 157 | serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), 158 | _descriptor.FieldDescriptor( 159 | name='data_type', full_name='Meta.data_type', index=1, 160 | number=2, type=14, cpp_type=8, label=2, 161 | has_default_value=True, default_value=9, 162 | message_type=None, enum_type=None, containing_type=None, 163 | is_extension=False, extension_scope=None, 164 | serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), 165 | ], 166 | extensions=[ 167 | ], 168 | nested_types=[], 169 | enum_types=[ 170 | ], 171 | serialized_options=None, 172 | is_extendable=False, 173 | syntax='proto2', 174 | extension_ranges=[], 175 | oneofs=[ 176 | ], 177 | serialized_start=36, 178 | serialized_end=105, 179 | ) 180 | 181 | _META.fields_by_name['shape'].message_type = _SHAPE 182 | _META.fields_by_name['data_type'].enum_type = _DATATYPE 183 | DESCRIPTOR.message_types_by_name['Shape'] = _SHAPE 184 | DESCRIPTOR.message_types_by_name['Meta'] = _META 185 | DESCRIPTOR.enum_types_by_name['DataType'] = _DATATYPE 186 | _sym_db.RegisterFileDescriptor(DESCRIPTOR) 187 | 188 | Shape = _reflection.GeneratedProtocolMessageType('Shape', (_message.Message,), { 189 | 'DESCRIPTOR' : _SHAPE, 190 | '__module__' : 'meta_pb2' 191 | # @@protoc_insertion_point(class_scope:Shape) 192 | }) 193 | _sym_db.RegisterMessage(Shape) 194 | 195 | Meta = _reflection.GeneratedProtocolMessageType('Meta', (_message.Message,), { 196 | 'DESCRIPTOR' : _META, 197 | '__module__' : 'meta_pb2' 198 | # @@protoc_insertion_point(class_scope:Meta) 199 | }) 200 | _sym_db.RegisterMessage(Meta) 201 | 202 | 203 | # @@protoc_insertion_point(module_scope) 204 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/tools/prepare_distribute.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | inventory=$(dirname $0)/ansible_inventory 4 | hosts= 5 | wksp= 6 | oneflow_gpt_src_dir= 7 | wheel= 8 | pip_install=on 9 | 10 | function help() { 11 | echo "Usage: prepare_distribute.sh [ -h | --help ] 12 | [ -i | --inventory inventory_file ] 13 | [ -n | --hosts hosts] 14 | [ -s | --src oneflow_gpt_src_dir ] 15 | [ -w | --wheel wheel_file ] 16 | [ --no-install ] workspace_dir" 17 | exit 2 18 | } 19 | 20 | function parse_args() { 21 | args=$(getopt -o hi:n:s:w: -a -l help,inventory:,hosts:,src:,wheel:,no-install -- "$@") 22 | if [[ $? -ne 0 ]]; then 23 | help 24 | fi 25 | 26 | echo "parsed args is ${args}" 27 | eval set -- "${args}" 28 | 29 | while : 30 | do 31 | case "$1" in 32 | -h|--help) 33 | help 34 | ;; 35 | -i|--inventory) 36 | inventory="$2" 37 | shift 38 | ;; 39 | -n|--hosts) 40 | hosts="$2" 41 | shift 42 | ;; 43 | -s|--src) 44 | oneflow_gpt_src_dir="$2" 45 | shift 46 | ;; 47 | -w|--wheel) 48 | wheel="$2" 49 | shift 50 | ;; 51 | --no-install) 52 | pip_install= 53 | ;; 54 | --) 55 | shift 56 | break 57 | ;; 58 | *) 59 | echo "Unexpected option: $1" 60 | help 61 | ;; 62 | esac 63 | shift 64 | done 65 | 66 | echo "remaining args are: $@" 67 | echo "remaining args number are: $#" 68 | if [[ $# -ne 0 ]]; then 69 | wksp=$1 70 | else 71 | wksp=$PWD 72 | fi 73 | } 74 | 75 | parse_args "$@" 76 | 77 | if [[ -z "${hosts}" ]]; then 78 | echo "hosts is unset" 79 | exit 1 80 | fi 81 | 82 | ansible ${hosts} -i ${inventory} -m file -a "path=${wksp} state=directory" 83 | 84 | if [[ ! -z "${wheel}" ]]; then 85 | wheel=$(realpath "${wheel}") 86 | wheel_dir=$(realpath $(dirname "${wheel}")) 87 | ansible ${hosts} -i ${inventory} -m file -a "path=${wheel_dir} state=directory" 88 | ansible ${hosts} -i ${inventory} -m copy -a "src=${wheel} dest=${wheel}" 89 | if [[ ! -z "${pip_install}" ]]; then 90 | ansible ${hosts} -i ${inventory} -m shell -a "python3 -m pip install ${wheel} --user" 91 | fi 92 | fi 93 | 94 | if [[ ! -z "${oneflow_gpt_src_dir}" ]]; then 95 | ansible ${hosts} -i ${inventory} -m copy -a "src=${oneflow_gpt_src_dir} dest=${wksp}/oneflow_gpt" 96 | if [[ ! -z "${pip_install}" ]]; then 97 | ansible ${hosts} -i ${inventory} -m shell -a "python3 -m pip install -e ${wksp}/oneflow_gpt --user" 98 | fi 99 | fi 100 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OneFlow Deep Learning Benchmarks 2 | ## Introduction 3 | This repository provides OneFlow deep learning benchmark examples for CV, CTR and NLP; more models are on the way and will be added here when ready.
4 | 5 | ## [Convolutional Networks](./Classification/cnns) for Computer Vision Classification 6 | - [ResNet-50](./Classification/cnns) 7 | - [ResNeXt-50-32*4d](./Classification/cnns) 8 | - [VGG-16](./Classification/cnns) 9 | - [Inception-V3](./Classification/cnns) 10 | - [AlexNet](./Classification/cnns) 11 | - [MobileNet-V2](./Classification/cnns) 12 | 13 | ## [Wide Deep Learning](./ClickThroughRate/WideDeepLearning) for Click-Through-Rate (CTR) Recommender Systems 14 | - [OneFlow-WDL](./ClickThroughRate/WideDeepLearning) 15 | 16 | ## [BERT](./LanguageModeling/BERT) for Natural Language Processing 17 | - [BERT Pretrain for Language Modeling](./LanguageModeling/BERT) 18 | - [SQuAD for Question Answering](./LanguageModeling/BERT) 19 | - [CoLA and MRPC of GLUE](./LanguageModeling/BERT) 20 | 21 | ## [GPT](./LanguageModeling/GPT) for Generative Pre-trained Transformer 22 | - [Generative Pre-trained Transformer](./LanguageModeling/GPT) 23 | 24 | ## OneFlow Benchmark Test Reports 25 | 26 | | Model | DType | XLA | Throughput | Speedup on 32 devices | 27 | | ----- | ----- | --- | ---------- | ------- | 28 | | [ResNet50-V1.5](./reports/resnet50_v15_fp32_report.md) | Float32 | No | 11.6k images/sec | 30.4 | 29 | | [BERT base Pretrain](./reports/bert_fp32_report.md) | Float32 | No | 530k tokens/sec | 28.54 | 30 | --------------------------------------------------------------------------------
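For reference, here is a minimal sketch of how the zero-shot LAMBADA evaluation wired up in `tasks/main.py` and `tasks/zeroshot_gpt/evaluate.py` above might be launched. It assumes the GPT package has been installed (e.g. `python3 -m pip install -e .` inside `LanguageModeling/GPT`), that `--load` and the batch/sequence flags are provided by `get_args` in `oneflow_gpt/config.py` (not shown in this dump), and every path below is illustrative only:

```bash
# Hypothetical invocation of the zero-shot LAMBADA task; all paths are examples.
# --task, --valid-data, --tokenizer-type, --vocab-file, --merge-file and
# --strict-lambada are defined by get_tasks_args in tasks/main.py; --load is
# assumed to be the checkpoint flag from oneflow_gpt/config.py.
python3 tasks/main.py \
    --task LAMBADA \
    --load ./model_checkpoint \
    --valid-data ./data/lambada_test.jsonl \
    --tokenizer-type GPT2BPETokenizer \
    --vocab-file ./data/gpt2-vocab.json \
    --merge-file ./data/gpt2-merges.txt \
    --strict-lambada
```

Note that `evaluate.py` asserts `--load` points at a checkpoint and `_build_lambada_dataset` asserts exactly one `--valid-data` file is given, so both are required here.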