├── .github
│   └── workflows
│       └── cnn_e2e.yml
├── .gitignore
├── Classification
│   └── cnns
│       ├── README.md
│       ├── alexnet_model.py
│       ├── benchmark.sh
│       ├── config.py
│       ├── data
│       │   ├── ILSVRC2012_val_00020287.JPEG
│       │   ├── fish.jpg
│       │   └── tiger.jpg
│       ├── docs
│       │   ├── resnet50_lr_schedule.png
│       │   └── resnet50_validation_acuracy.png
│       ├── evaluate.sh
│       ├── imagenet1000_clsidx_to_labels.py
│       ├── inception_model.py
│       ├── inference.sh
│       ├── job_function_util.py
│       ├── mobilenet_v2_model.py
│       ├── of_cnn_evaluate.py
│       ├── of_cnn_inference.py
│       ├── of_cnn_train_val.py
│       ├── ofrecord_util.py
│       ├── optimizer_util.py
│       ├── resnet2onnx.sh
│       ├── resnet_model.py
│       ├── resnet_to_onnx.py
│       ├── resnext_model.py
│       ├── tools
│       │   ├── README.md
│       │   ├── extract_trainval.sh
│       │   ├── imagenet_2012_validation_synset_labels.txt
│       │   ├── imagenet_lsvrc_2015_synsets.txt
│       │   ├── imagenet_metadata.txt
│       │   ├── imagenet_ofrecord.py
│       │   ├── preprocess_imagenet_validation_data.py
│       │   └── process_bounding_boxes.py
│       ├── train.sh
│       ├── train_fp16.sh
│       ├── train_fp32.sh
│       ├── util.py
│       └── vgg_model.py
├── ClickThroughRate
│   └── WideDeepLearning
│       ├── README.md
│       ├── how_to_make_hf_dataset.md
│       ├── how_to_make_ofrecord_for_wdl.md
│       ├── wdl_test_report.md
│       ├── wdl_train_eval.py
│       ├── wdl_train_eval_test.py
│       └── wdl_train_eval_with_hybrid_embd.py
├── Generative
│   ├── README.md
│   ├── dcgan.py
│   ├── layers.py
│   └── pic
│       ├── 1.png
│       └── 2.png
├── LanguageModeling
│   ├── BERT
│   │   ├── README.md
│   │   ├── bert.py
│   │   ├── classifier.py
│   │   ├── config.py
│   │   ├── convert_tf_ckpt_to_of.py
│   │   ├── pretrain.py
│   │   ├── run_classifier.py
│   │   ├── run_pretraining.py
│   │   ├── run_pretraining_adam.sh
│   │   ├── run_pretraining_lamb.sh
│   │   ├── run_squad.py
│   │   ├── run_squad.sh
│   │   ├── squad.py
│   │   ├── squad_util.py
│   │   ├── tokenization.py
│   │   └── util.py
│   └── GPT
│       ├── LICENSE
│       ├── README.md
│       ├── examples
│       │   ├── distribute_pretrain_2n4d.sh
│       │   ├── distribute_pretrain_4n8d.sh
│       │   ├── distribute_pretrain_4n8d_2x4x4_512_2304x24.sh
│       │   ├── distribute_pretrain_with_container.sh
│       │   ├── lambada_cloze_accuracy.sh
│       │   ├── pretrain.sh
│       │   ├── pretrain_117M.sh
│       │   ├── pretrain_1n8d_2x4x1_16_1536x16.sh
│       │   ├── pretrain_345M.sh
│       │   ├── pretrain_with_container.sh
│       │   └── pretrain_with_profile.sh
│       ├── oneflow_gpt
│       │   ├── __init__.py
│       │   ├── config.py
│       │   ├── data.py
│       │   ├── distribute.py
│       │   ├── model.py
│       │   ├── optimizer.py
│       │   ├── snapshot.py
│       │   ├── third_party
│       │   │   ├── __init__.py
│       │   │   └── data
│       │   │       ├── __init__.py
│       │   │       ├── gpt_dataset.py
│       │   │       └── indexed_dataset.py
│       │   ├── training.py
│       │   └── util.py
│       ├── requirements.txt
│       ├── setup.py
│       ├── tasks
│       │   ├── __init__.py
│       │   ├── main.py
│       │   └── zeroshot_gpt
│       │       ├── __init__.py
│       │       ├── datasets.py
│       │       └── evaluate.py
│       ├── tokenizer
│       │   ├── __init__.py
│       │   ├── gpt2_tokenization.py
│       │   └── tokenizer.py
│       └── tools
│           ├── README.md
│           ├── ansible_inventory
│           ├── compare_loss.py
│           ├── convert_py_model_to_of.py
│           ├── launch_container.py
│           ├── meta.proto
│           ├── meta_pb2.py
│           └── prepare_distribute.sh
├── README.md
└── reports
    ├── bert_fp32_report.md
    └── resnet50_v15_fp32_report.md

/.github/workflows/cnn_e2e.yml:
--------------------------------------------------------------------------------
name: 'resnet e2e test'
on:
  pull_request:
    types: [review_requested]
    branches:
      - "*"
  workflow_dispatch:
    inputs:
      placeholder:
        description: "placeholder, no effect"
        required: false

jobs:
  build:
    name: 'Build and test this repo'
    runs-on: ubuntu-latest
    steps:
      - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event."
19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # IDE 7 | .idea 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | output/ 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # pyenv 80 | .python-version 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # SageMath parsed files 86 | *.sage.py 87 | 88 | # Environments 89 | .env 90 | .venv 91 | env/ 92 | venv/ 93 | ENV/ 94 | env.bak/ 95 | venv.bak/ 96 | 97 | # Spyder project settings 98 | .spyderproject 99 | .spyproject 100 | 101 | # Rope project settings 102 | .ropeproject 103 | 104 | # mkdocs documentation 105 | /site 106 | 107 | # mypy 108 | .mypy_cache/ 109 | -------------------------------------------------------------------------------- /Classification/cnns/alexnet_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | 17 | import oneflow.compatible.single_client as flow 18 | 19 | 20 | def _get_kernel_initializer(data_format="NCHW"): 21 | return flow.variance_scaling_initializer( 22 | distribution="random_normal", data_format=data_format 23 | ) 24 | 25 | 26 | def _get_regularizer(): 27 | return flow.regularizers.l2(0.00005) 28 | 29 | 30 | def _get_bias_initializer(): 31 | return flow.zeros_initializer() 32 | 33 | 34 | def conv2d_layer( 35 | name, 36 | input, 37 | filters, 38 | kernel_size=3, 39 | strides=1, 40 | padding="SAME", 41 | data_format="NCHW", 42 | dilation_rate=1, 43 | activation="Relu", 44 | use_bias=True, 45 | bias_initializer=_get_bias_initializer(), 46 | weight_regularizer=_get_regularizer(), 47 | bias_regularizer=_get_regularizer(), 48 | ): 49 | if isinstance(kernel_size, int): 50 | kernel_size_1 = kernel_size 51 | kernel_size_2 = kernel_size 52 | if isinstance(kernel_size, list): 53 | kernel_size_1 = kernel_size[0] 54 | kernel_size_2 = kernel_size[1] 55 | 56 | weight_initializer = _get_kernel_initializer(data_format) 57 | weight_shape = ( 58 | (filters, input.shape[1], kernel_size_1, kernel_size_2) 59 | if data_format == "NCHW" 60 | else (filters, kernel_size_1, kernel_size_2, input.shape[3]) 61 | ) 62 | weight = flow.get_variable( 63 | name + "-weight", 64 | shape=weight_shape, 65 | dtype=input.dtype, 66 | initializer=weight_initializer, 67 | regularizer=weight_regularizer, 68 | ) 69 | output = flow.nn.conv2d( 70 | input, weight, strides, padding, None, data_format, dilation_rate, name=name 71 | ) 72 | if use_bias: 73 | bias = flow.get_variable( 74 | name + "-bias", 75 | shape=(filters,), 76 | dtype=input.dtype, 77 | initializer=bias_initializer, 78 | regularizer=bias_regularizer, 79 | ) 80 | output = flow.nn.bias_add(output, bias, data_format) 81 | 82 | if activation is not None: 83 | if activation == "Relu": 84 | output = flow.nn.relu(output) 85 | else: 86 | raise NotImplementedError 87 | 88 | return output 89 | 90 | 91 | def alexnet(images, args, trainable=True): 92 | data_format = "NHWC" if args.channel_last else "NCHW" 93 | 94 | conv1 = conv2d_layer( 95 | "conv1", 96 | images, 97 | filters=64, 98 | kernel_size=11, 99 | strides=4, 100 | padding="VALID", 101 | data_format=data_format, 102 | ) 103 | 104 | pool1 = flow.nn.avg_pool2d(conv1, 3, 2, "VALID", data_format, name="pool1") 105 | 106 | conv2 = conv2d_layer( 107 | "conv2", pool1, filters=192, kernel_size=5, data_format=data_format 108 | ) 109 | 110 | pool2 = flow.nn.avg_pool2d(conv2, 3, 2, "VALID", data_format, name="pool2") 111 | 112 | conv3 = conv2d_layer("conv3", pool2, filters=384, data_format=data_format) 113 | 114 | conv4 = conv2d_layer("conv4", conv3, filters=384, data_format=data_format) 115 | 116 | conv5 = conv2d_layer("conv5", conv4, filters=256, data_format=data_format) 117 | 118 | pool5 = flow.nn.avg_pool2d(conv5, 3, 2, "VALID", data_format, name="pool5") 119 | 120 | if len(pool5.shape) > 2: 121 | pool5 = flow.reshape(pool5, shape=(pool5.shape[0], -1)) 122 | 123 | fc1 = flow.layers.dense( 124 | inputs=pool5, 125 | units=4096, 126 | activation=flow.nn.relu, 127 | use_bias=True, 128 | # kernel_initializer=flow.random_uniform_initializer(), 129 | kernel_initializer=_get_kernel_initializer(), 130 | bias_initializer=_get_bias_initializer(), 131 | kernel_regularizer=_get_regularizer(), 132 | bias_regularizer=_get_regularizer(), 133 | trainable=trainable, 134 | name="fc1", 135 | ) 136 | 137 | dropout1 = flow.nn.dropout(fc1, rate=0.5) 138 | 139 | fc2 = flow.layers.dense( 140 | inputs=dropout1, 141 | units=4096, 
142 | activation=flow.nn.relu, 143 | use_bias=True, 144 | kernel_initializer=_get_kernel_initializer(), 145 | bias_initializer=_get_bias_initializer(), 146 | kernel_regularizer=_get_regularizer(), 147 | bias_regularizer=_get_regularizer(), 148 | trainable=trainable, 149 | name="fc2", 150 | ) 151 | 152 | dropout2 = flow.nn.dropout(fc2, rate=0.5) 153 | 154 | fc3 = flow.layers.dense( 155 | inputs=dropout2, 156 | units=1000, 157 | activation=None, 158 | use_bias=False, 159 | kernel_initializer=_get_kernel_initializer(), 160 | kernel_regularizer=_get_regularizer(), 161 | bias_initializer=False, 162 | trainable=trainable, 163 | name="fc3", 164 | ) 165 | 166 | return fc3 167 | -------------------------------------------------------------------------------- /Classification/cnns/benchmark.sh: -------------------------------------------------------------------------------- 1 | BENCH_ROOT=$1 2 | NUM_NODES=$2 3 | GPU_NUM_PER_NODE=$3 4 | BSZ_PER_DEVICE=$4 5 | 6 | if [ -n "$5" ]; then 7 | DATA_ROOT=$5 8 | else 9 | DATA_ROOT=/datasets/ImageNet/OneFlow 10 | fi 11 | 12 | DATA_PART_NUM=44 13 | 14 | rm -rf ./log 15 | mkdir ./log 16 | 17 | NUM_ITERS=300 18 | NUM_EXAMPLES=$(($NUM_NODES * $GPU_NUM_PER_NODE * $BSZ_PER_DEVICE * $NUM_ITERS)) 19 | 20 | export PYTHONUNBUFFERED=1 21 | echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED 22 | export NCCL_LAUNCH_MODE=PARALLEL 23 | echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE 24 | 25 | python3 $BENCH_ROOT/of_cnn_train_val.py \ 26 | --num_examples=$NUM_EXAMPLES \ 27 | --train_data_dir=$DATA_ROOT/train \ 28 | --train_data_part_num=$DATA_PART_NUM \ 29 | --num_nodes=$NUM_NODES \ 30 | --gpu_num_per_node=$GPU_NUM_PER_NODE \ 31 | --optimizer="sgd" \ 32 | --momentum=0.875 \ 33 | --label_smoothing=0.1 \ 34 | --learning_rate=0.001 \ 35 | --loss_print_every_n_iter=100 \ 36 | --batch_size_per_device=$BSZ_PER_DEVICE \ 37 | --val_batch_size_per_device=125 \ 38 | --num_epoch=1 \ 39 | --log_dir=./log \ 40 | --use_fp16 \ 41 | --channel_last=True \ 42 | --pad_output \ 43 | --fuse_bn_relu=True \ 44 | --fuse_bn_add_relu=True \ 45 | --nccl_fusion_threshold_mb=16 \ 46 | --nccl_fusion_max_ops=24 \ 47 | --gpu_image_decoder=True \ 48 | --model="resnet50" 49 | -------------------------------------------------------------------------------- /Classification/cnns/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | 17 | import argparse 18 | from datetime import datetime 19 | 20 | 21 | from optimizer_util import add_optimizer_args 22 | from ofrecord_util import add_ofrecord_args 23 | 24 | 25 | def get_parser(parser=None): 26 | def str_list(x): 27 | return [i.strip() for i in x.split(",")] 28 | 29 | def int_list(x): 30 | return list(map(int, x.split(","))) 31 | 32 | def float_list(x): 33 | return list(map(float, x.split(","))) 34 | 35 | def str2bool(v): 36 | if v.lower() in ("yes", "true", "t", "y", "1"): 37 | return True 38 | elif v.lower() in ("no", "false", "f", "n", "0"): 39 | return False 40 | else: 41 | raise argparse.ArgumentTypeError("Unsupported value encountered.") 42 | 43 | if parser is None: 44 | parser = argparse.ArgumentParser("flags for cnn benchmark") 45 | 46 | parser.add_argument("--dtype", type=str, default="float32", help="float16 float32") 47 | 48 | # resouce 49 | parser.add_argument("--gpu_num_per_node", type=int, default=1) 50 | parser.add_argument( 51 | "--num_nodes", type=int, default=1, help="node/machine number for training" 52 | ) 53 | parser.add_argument( 54 | "--node_ips", 55 | type=str_list, 56 | default=["192.168.1.13", "192.168.1.14"], 57 | help='nodes ip list for training, devided by ",", length >= num_nodes', 58 | ) 59 | parser.add_argument( 60 | "--ctrl_port", type=int, default=50051, help="ctrl_port for multinode job" 61 | ) 62 | 63 | parser.add_argument("--model", type=str, default="resnet50", help="resnet50") 64 | parser.add_argument( 65 | "--use_fp16", 66 | type=str2bool, 67 | nargs="?", 68 | const=True, 69 | help="Whether to use use fp16", 70 | ) 71 | parser.add_argument( 72 | "--use_xla", type=str2bool, nargs="?", const=True, help="Whether to use use xla" 73 | ) 74 | 75 | parser.add_argument( 76 | "--channel_last", 77 | type=str2bool, 78 | nargs="?", 79 | const=False, 80 | help="Whether to use use channel last mode(nhwc)", 81 | ) 82 | parser.add_argument( 83 | "--pad_output", 84 | type=str2bool, 85 | nargs="?", 86 | const=True, 87 | help="Whether to pad the output to number of image channels to 4.", 88 | ) 89 | 90 | # train and validaion 91 | parser.add_argument("--num_epochs", type=int, default=90, help="number of epochs") 92 | parser.add_argument( 93 | "--model_load_dir", type=str, default=None, help="model load directory if need" 94 | ) 95 | parser.add_argument( 96 | "--save_epoch_interval", 97 | type=int, 98 | default=10, 99 | help="Number of iterations between checkpoint saves.", 100 | ) 101 | parser.add_argument( 102 | "--save_last", 103 | action="store_true", 104 | default=False, 105 | help="save model snapshot for last iteration", 106 | ) 107 | parser.add_argument( 108 | "--save_init", 109 | action="store_true", 110 | default=False, 111 | help="save model snapshot for inited", 112 | ) 113 | parser.add_argument("--batch_size_per_device", type=int, default=64) 114 | parser.add_argument("--val_batch_size_per_device", type=int, default=8) 115 | 116 | parser.add_argument( 117 | "--nccl_fusion_threshold_mb", 118 | type=int, 119 | default=0, 120 | help="NCCL fusion threshold megabytes, set to 0 to compatible with previous version of OneFlow.", 121 | ) 122 | parser.add_argument( 123 | "--nccl_fusion_max_ops", 124 | type=int, 125 | default=0, 126 | help="Maximum number of ops of NCCL fusion, set to 0 to compatible with previous version of OneFlow.", 127 | ) 128 | 129 | # fuse bn relu or bn add relu 130 | parser.add_argument( 131 | "--fuse_bn_relu", 132 | type=str2bool, 133 | default=False, 134 | help="Whether to use use fuse batch normalization 
relu. Currently supported in origin/master of OneFlow only.", 135 | ) 136 | parser.add_argument( 137 | "--fuse_bn_add_relu", 138 | type=str2bool, 139 | default=False, 140 | help="Whether to use use fuse batch normalization add relu. Currently supported in origin/master of OneFlow only.", 141 | ) 142 | parser.add_argument( 143 | "--gpu_image_decoder", 144 | type=str2bool, 145 | default=False, 146 | help="Whether to use use ImageDecoderRandomCropResize.", 147 | ) 148 | # inference 149 | parser.add_argument( 150 | "--image_path", type=str, default="test_img/tiger.jpg", help="image path" 151 | ) 152 | 153 | # for data process 154 | parser.add_argument( 155 | "--num_classes", type=int, default=1000, help="num of pic classes" 156 | ) 157 | parser.add_argument( 158 | "--num_examples", type=int, default=1281167, help="train pic number" 159 | ) 160 | parser.add_argument( 161 | "--num_val_examples", type=int, default=50000, help="validation pic number" 162 | ) 163 | parser.add_argument( 164 | "--rgb-mean", 165 | type=float_list, 166 | default=[123.68, 116.779, 103.939], 167 | help="a tuple of size 3 for the mean rgb", 168 | ) 169 | parser.add_argument( 170 | "--rgb-std", 171 | type=float_list, 172 | default=[58.393, 57.12, 57.375], 173 | help="a tuple of size 3 for the std rgb", 174 | ) 175 | parser.add_argument( 176 | "--image-shape", 177 | type=int_list, 178 | default=[3, 224, 224], 179 | help="the image shape feed into the network", 180 | ) 181 | parser.add_argument( 182 | "--label_smoothing", type=float, default=0.1, help="label smoothing factor" 183 | ) 184 | 185 | # snapshot 186 | parser.add_argument( 187 | "--model_save_dir", 188 | type=str, 189 | default="./output/snapshots/model_save-{}".format( 190 | str(datetime.now().strftime("%Y%m%d%H%M%S")) 191 | ), 192 | help="model save directory", 193 | ) 194 | 195 | # log and loss print 196 | parser.add_argument( 197 | "--log_dir", type=str, default="./output", help="log info save directory" 198 | ) 199 | parser.add_argument( 200 | "--loss_print_every_n_iter", 201 | type=int, 202 | default=1, 203 | help="print loss every n iteration", 204 | ) 205 | add_ofrecord_args(parser) 206 | add_optimizer_args(parser) 207 | return parser 208 | 209 | 210 | def print_args(args): 211 | print("=".ljust(66, "=")) 212 | print( 213 | "Running {}: num_gpu_per_node = {}, num_nodes = {}.".format( 214 | args.model, args.gpu_num_per_node, args.num_nodes 215 | ) 216 | ) 217 | print("=".ljust(66, "=")) 218 | for arg in vars(args): 219 | print("{} = {}".format(arg, getattr(args, arg))) 220 | print("-".ljust(66, "-")) 221 | print("Time stamp: {}".format(str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S")))) 222 | 223 | 224 | if __name__ == "__main__": 225 | parser = get_parser() 226 | args = parser.parse_args() 227 | print_args(args) 228 | -------------------------------------------------------------------------------- /Classification/cnns/data/ILSVRC2012_val_00020287.JPEG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/OneFlow-Benchmark/de4698a7acce3cca2eaf1564eaf25d59c9954b86/Classification/cnns/data/ILSVRC2012_val_00020287.JPEG -------------------------------------------------------------------------------- /Classification/cnns/data/fish.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/OneFlow-Benchmark/de4698a7acce3cca2eaf1564eaf25d59c9954b86/Classification/cnns/data/fish.jpg 
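The sample images in this data folder are the inputs used by inference.sh, of_cnn_inference.py, and resnet_to_onnx.py. A minimal, self-contained sketch of the preprocessing they go through before entering the network (mirroring load_image() in of_cnn_inference.py, and assuming the default --rgb-mean, --rgb-std, and --image-shape values from config.py):

```python
import numpy as np
from PIL import Image

# Defaults mirror --rgb-mean / --rgb-std / --image-shape in config.py.
rgb_mean = [123.68, 116.779, 103.939]
rgb_std = [58.393, 57.12, 57.375]

im = Image.open("data/fish.jpg").convert("RGB").resize((224, 224))
arr = (np.asarray(im, dtype="float32") - rgb_mean) / rgb_std  # HWC, RGB order
arr = np.ascontiguousarray(arr.transpose(2, 0, 1)[None], dtype="float32")  # NCHW, batch of 1
print(arr.shape)  # (1, 3, 224, 224)
```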
-------------------------------------------------------------------------------- /Classification/cnns/data/tiger.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/OneFlow-Benchmark/de4698a7acce3cca2eaf1564eaf25d59c9954b86/Classification/cnns/data/tiger.jpg -------------------------------------------------------------------------------- /Classification/cnns/docs/resnet50_lr_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/OneFlow-Benchmark/de4698a7acce3cca2eaf1564eaf25d59c9954b86/Classification/cnns/docs/resnet50_lr_schedule.png -------------------------------------------------------------------------------- /Classification/cnns/docs/resnet50_validation_acuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/OneFlow-Benchmark/de4698a7acce3cca2eaf1564eaf25d59c9954b86/Classification/cnns/docs/resnet50_validation_acuracy.png -------------------------------------------------------------------------------- /Classification/cnns/evaluate.sh: -------------------------------------------------------------------------------- 1 | rm -rf core.* 2 | 3 | # Set up dataset root dir 4 | DATA_ROOT=/dataset/ImageNet/ofrecord 5 | 6 | # Set up model path, e.g. : vgg16_of_best_model_val_top1_721 alexnet_of_best_model_val_top1_54762 7 | MODEL_LOAD_DIR="resnet_v15_of_best_model_val_top1_77318" 8 | 9 | python3 of_cnn_evaluate.py \ 10 | --num_epochs=3 \ 11 | --num_val_examples=50000 \ 12 | --model_load_dir=$MODEL_LOAD_DIR \ 13 | --val_data_dir=$DATA_ROOT/validation \ 14 | --val_data_part_num=256 \ 15 | --num_nodes=1 \ 16 | --node_ips='127.0.0.1' \ 17 | --gpu_num_per_node=4 \ 18 | --val_batch_size_per_device=64 \ 19 | --model="resnet50" 20 | -------------------------------------------------------------------------------- /Classification/cnns/inference.sh: -------------------------------------------------------------------------------- 1 | rm -rf core.* 2 | 3 | # Set up model path, e.g. : vgg16_of_best_model_val_top1_721 alexnet_of_best_model_val_top1_54762 4 | MODEL_LOAD_DIR="resnet_v15_of_best_model_val_top1_77318" 5 | 6 | python3 of_cnn_inference.py \ 7 | --model="resnet50" \ 8 | --image_path="data/fish.jpg" \ 9 | --model_load_dir=$MODEL_LOAD_DIR -------------------------------------------------------------------------------- /Classification/cnns/job_function_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | 17 | import oneflow.compatible.single_client as flow 18 | 19 | 20 | def _default_config(args): 21 | config = flow.function_config() 22 | config.default_logical_view(flow.scope.consistent_view()) 23 | config.default_data_type(flow.float) 24 | if args.use_fp16: 25 | config.enable_auto_mixed_precision(True) 26 | if args.use_xla: 27 | config.use_xla_jit(True) 28 | config.enable_fuse_add_to_output(True) 29 | return config 30 | 31 | 32 | def get_train_config(args): 33 | train_config = _default_config(args) 34 | train_config.cudnn_conv_heuristic_search_algo(False) 35 | 36 | train_config.prune_parallel_cast_ops(True) 37 | train_config.enable_inplace(True) 38 | if args.num_nodes > 1: 39 | train_config.cudnn_conv_heuristic_search_algo(True) 40 | else: 41 | train_config.cudnn_conv_heuristic_search_algo(False) 42 | train_config.enable_fuse_model_update_ops(True) 43 | return train_config 44 | 45 | 46 | def get_val_config(args): 47 | return _default_config(args) 48 | -------------------------------------------------------------------------------- /Classification/cnns/of_cnn_evaluate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | import os 18 | import time 19 | import math 20 | import numpy as np 21 | 22 | import config as configs 23 | 24 | parser = configs.get_parser() 25 | args = parser.parse_args() 26 | configs.print_args(args) 27 | 28 | from util import Snapshot, InitNodes, Metric 29 | import ofrecord_util 30 | from job_function_util import get_train_config, get_val_config 31 | import oneflow.compatible.single_client as flow 32 | import vgg_model 33 | import resnet_model 34 | import resnext_model 35 | import alexnet_model 36 | import mobilenet_v2_model 37 | 38 | 39 | total_device_num = args.num_nodes * args.gpu_num_per_node 40 | val_batch_size = total_device_num * args.val_batch_size_per_device 41 | (C, H, W) = args.image_shape 42 | num_val_steps = int(args.num_val_examples / val_batch_size) 43 | 44 | 45 | model_dict = { 46 | "resnet50": resnet_model.resnet50, 47 | "vgg": vgg_model.vgg16bn, 48 | "alexnet": alexnet_model.alexnet, 49 | "mobilenetv2": mobilenet_v2_model.Mobilenet, 50 | "resnext50": resnext_model.resnext50, 51 | } 52 | 53 | 54 | flow.config.gpu_device_num(args.gpu_num_per_node) 55 | # flow.config.enable_debug_mode(True) 56 | @flow.global_function("predict", get_val_config(args)) 57 | def InferenceNet(): 58 | assert os.path.exists(args.val_data_dir) 59 | print("Loading data from {}".format(args.val_data_dir)) 60 | (labels, images) = ofrecord_util.load_imagenet_for_validation(args) 61 | 62 | logits = model_dict[args.model](images, args) 63 | predictions = flow.nn.softmax(logits) 64 | outputs = {"predictions": predictions, "labels": labels} 65 | return outputs 66 | 67 | 68 | def main(): 69 | InitNodes(args) 70 | assert args.model_load_dir, "Must have model load dir!" 

    flow.env.log_dir(args.log_dir)
    # snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    print("Restoring model from {}.".format(args.model_load_dir))
    flow.load_variables(flow.checkpoint.get(args.model_load_dir))
    metric = Metric(
        desc="validation", calculate_batches=num_val_steps, batch_size=val_batch_size
    )

    for i in range(args.num_epochs):
        for j in range(num_val_steps):
            InferenceNet().async_get(metric.metric_cb(0, j))


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/Classification/cnns/of_cnn_inference.py:
--------------------------------------------------------------------------------
"""
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import os
import numpy as np
from PIL import Image

import config as configs

parser = configs.get_parser()
args = parser.parse_args()
configs.print_args(args)

import oneflow.compatible.single_client as flow
import oneflow.compatible.single_client.typing as tp
from imagenet1000_clsidx_to_labels import clsidx_2_labels

import resnet_model
import resnext_model
import vgg_model
import alexnet_model
import mobilenet_v2_model

model_dict = {
    "resnet50": resnet_model.resnet50,
    "vgg": vgg_model.vgg16bn,
    "alexnet": alexnet_model.alexnet,
    "mobilenetv2": mobilenet_v2_model.Mobilenet,
    "resnext50": resnext_model.resnext50,
}


def load_image(image_path="test_img/ILSVRC2012_val_00020287.JPEG"):
    print(image_path)
    im = Image.open(image_path)
    im = im.resize((224, 224))
    im = im.convert("RGB")  # some images are single-channel; without this conversion they would raise an error
    im = np.array(im).astype("float32")
    im = (im - args.rgb_mean) / args.rgb_std
    im = np.transpose(im, (2, 0, 1))
    im = np.expand_dims(im, axis=0)
    return np.ascontiguousarray(im, "float32")


@flow.global_function("predict", flow.function_config())
def InferenceNet(
    images: tp.Numpy.Placeholder((1, 3, 224, 224), dtype=flow.float)
) -> tp.Numpy:
    logits = model_dict[args.model](images, args)
    predictions = flow.nn.softmax(logits)
    return predictions


def main():
    flow.env.log_dir(args.log_dir)
    assert os.path.isdir(args.model_load_dir)
    flow.load_variables(flow.checkpoint.get(args.model_load_dir))
    image = load_image(args.image_path)
    predictions = InferenceNet(image)
    clsidx = predictions.argmax()
    print(predictions.max(), clsidx_2_labels[clsidx])


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/Classification/cnns/of_cnn_train_val.py:
--------------------------------------------------------------------------------
"""
Copyright 2020 The OneFlow Authors.
All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | import os 17 | import math 18 | import oneflow.compatible.single_client as flow 19 | import ofrecord_util 20 | import optimizer_util 21 | import config as configs 22 | from util import Snapshot, InitNodes, Metric 23 | from job_function_util import get_train_config, get_val_config 24 | import resnet_model 25 | import resnext_model 26 | import vgg_model 27 | import alexnet_model 28 | import inception_model 29 | import mobilenet_v2_model 30 | 31 | parser = configs.get_parser() 32 | args = parser.parse_args() 33 | configs.print_args(args) 34 | 35 | total_device_num = args.num_nodes * args.gpu_num_per_node 36 | train_batch_size = total_device_num * args.batch_size_per_device 37 | val_batch_size = total_device_num * args.val_batch_size_per_device 38 | (C, H, W) = args.image_shape 39 | epoch_size = math.ceil(args.num_examples / train_batch_size) 40 | num_val_steps = int(args.num_val_examples / val_batch_size) 41 | 42 | 43 | model_dict = { 44 | "resnet50": resnet_model.resnet50, 45 | "vgg": vgg_model.vgg16bn, 46 | "alexnet": alexnet_model.alexnet, 47 | "inceptionv3": inception_model.inceptionv3, 48 | "mobilenetv2": mobilenet_v2_model.Mobilenet, 49 | "resnext50": resnext_model.resnext50, 50 | } 51 | 52 | 53 | flow.config.gpu_device_num(args.gpu_num_per_node) 54 | # flow.config.enable_debug_mode(True) 55 | 56 | if args.use_fp16 and args.num_nodes * args.gpu_num_per_node > 1: 57 | flow.config.collective_boxing.nccl_fusion_all_reduce_use_buffer(False) 58 | 59 | if args.nccl_fusion_threshold_mb: 60 | flow.config.collective_boxing.nccl_fusion_threshold_mb( 61 | args.nccl_fusion_threshold_mb 62 | ) 63 | 64 | if args.nccl_fusion_max_ops: 65 | flow.config.collective_boxing.nccl_fusion_max_ops(args.nccl_fusion_max_ops) 66 | 67 | 68 | def label_smoothing(labels, classes, eta, dtype): 69 | assert classes > 0 70 | assert eta >= 0.0 and eta < 1.0 71 | return flow.one_hot( 72 | labels, 73 | depth=classes, 74 | dtype=dtype, 75 | on_value=1 - eta + eta / classes, 76 | off_value=eta / classes, 77 | ) 78 | 79 | 80 | @flow.global_function("train", get_train_config(args)) 81 | def TrainNet(): 82 | if args.train_data_dir: 83 | assert os.path.exists(args.train_data_dir) 84 | print("Loading data from {}".format(args.train_data_dir)) 85 | (labels, images) = ofrecord_util.load_imagenet_for_training(args) 86 | 87 | else: 88 | print("Loading synthetic data.") 89 | (labels, images) = ofrecord_util.load_synthetic(args) 90 | logits = model_dict[args.model](images, args) 91 | if args.label_smoothing > 0: 92 | one_hot_labels = label_smoothing( 93 | labels, args.num_classes, args.label_smoothing, logits.dtype 94 | ) 95 | loss = flow.nn.softmax_cross_entropy_with_logits( 96 | one_hot_labels, logits, name="softmax_loss" 97 | ) 98 | else: 99 | loss = flow.nn.sparse_softmax_cross_entropy_with_logits( 100 | labels, logits, name="softmax_loss" 101 | ) 102 | 103 | loss = flow.math.reduce_mean(loss) 104 | predictions = flow.nn.softmax(logits) 
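    # Worked example for label_smoothing() above: with classes=1000 and eta=0.1,
    # each on-target entry becomes 1 - 0.1 + 0.1/1000 = 0.9001 and each off-target
    # entry becomes 0.1/1000 = 0.0001, so every smoothed label row still sums to 1.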
105 | outputs = {"loss": loss, "predictions": predictions, "labels": labels} 106 | 107 | # set up warmup,learning rate and optimizer 108 | optimizer_util.set_up_optimizer(loss, args) 109 | return outputs 110 | 111 | 112 | @flow.global_function("predict", get_val_config(args)) 113 | def InferenceNet(): 114 | if args.val_data_dir: 115 | assert os.path.exists(args.val_data_dir) 116 | print("Loading data from {}".format(args.val_data_dir)) 117 | (labels, images) = ofrecord_util.load_imagenet_for_validation(args) 118 | 119 | else: 120 | print("Loading synthetic data.") 121 | (labels, images) = ofrecord_util.load_synthetic(args) 122 | 123 | logits = model_dict[args.model](images, args) 124 | predictions = flow.nn.softmax(logits) 125 | outputs = {"predictions": predictions, "labels": labels} 126 | return outputs 127 | 128 | 129 | def main(): 130 | InitNodes(args) 131 | flow.env.log_dir(args.log_dir) 132 | 133 | snapshot = Snapshot(args.model_save_dir, args.model_load_dir, args.save_init) 134 | 135 | print(" {} iter per epoch...".format(epoch_size)) 136 | 137 | for epoch in range(1, args.num_epochs + 1): 138 | metric = Metric( 139 | desc="train", 140 | calculate_batches=args.loss_print_every_n_iter, 141 | batch_size=train_batch_size, 142 | loss_key="loss", 143 | ) 144 | for i in range(epoch_size): 145 | TrainNet().async_get(metric.metric_cb(epoch, i)) 146 | 147 | if args.val_data_dir: 148 | metric = Metric( 149 | desc="validation", 150 | calculate_batches=num_val_steps, 151 | batch_size=val_batch_size, 152 | ) 153 | for i in range(num_val_steps): 154 | InferenceNet().async_get(metric.metric_cb(epoch, i)) 155 | if epoch % args.save_epoch_interval == 0: 156 | snapshot.save("epoch_{}".format(epoch)) 157 | 158 | if args.save_last: 159 | snapshot.save("epoch_{}".format("last")) 160 | 161 | 162 | if __name__ == "__main__": 163 | main() 164 | -------------------------------------------------------------------------------- /Classification/cnns/ofrecord_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | 17 | import oneflow.compatible.single_client as flow 18 | 19 | 20 | def add_ofrecord_args(parser): 21 | parser.add_argument( 22 | "--image_size", type=int, default=224, required=False, help="image size" 23 | ) 24 | parser.add_argument( 25 | "--resize_shorter", 26 | type=int, 27 | default=256, 28 | required=False, 29 | help="resize shorter for validation", 30 | ) 31 | parser.add_argument( 32 | "--train_data_dir", type=str, default=None, help="train dataset directory" 33 | ) 34 | parser.add_argument( 35 | "--train_data_part_num", type=int, default=256, help="train data part num" 36 | ) 37 | parser.add_argument( 38 | "--val_data_dir", type=str, default=None, help="val dataset directory" 39 | ) 40 | parser.add_argument( 41 | "--val_data_part_num", type=int, default=256, help="val data part num" 42 | ) 43 | return parser 44 | 45 | 46 | def load_synthetic(args): 47 | total_device_num = args.num_nodes * args.gpu_num_per_node 48 | batch_size = total_device_num * args.batch_size_per_device 49 | label = flow.data.decode_random( 50 | shape=(), 51 | dtype=flow.int32, 52 | batch_size=batch_size, 53 | initializer=flow.zeros_initializer(flow.int32), 54 | ) 55 | 56 | shape = ( 57 | (args.image_size, args.image_size, 3) 58 | if args.channel_last 59 | else (3, args.image_size, args.image_size) 60 | ) 61 | image = flow.data.decode_random( 62 | shape=shape, dtype=flow.float, batch_size=batch_size 63 | ) 64 | 65 | return label, image 66 | 67 | 68 | def load_imagenet_for_training(args): 69 | total_device_num = args.num_nodes * args.gpu_num_per_node 70 | train_batch_size = total_device_num * args.batch_size_per_device 71 | output_layout = "NHWC" if args.channel_last else "NCHW" 72 | 73 | color_space = "RGB" 74 | ofrecord = flow.data.ofrecord_reader( 75 | args.train_data_dir, 76 | batch_size=train_batch_size, 77 | data_part_num=args.train_data_part_num, 78 | part_name_suffix_length=5, 79 | random_shuffle=True, 80 | shuffle_after_epoch=True, 81 | ) 82 | label = flow.data.OFRecordRawDecoder( 83 | ofrecord, "class/label", shape=(), dtype=flow.int32 84 | ) 85 | if args.gpu_image_decoder: 86 | encoded = flow.data.OFRecordBytesDecoder(ofrecord, "encoded") 87 | image = flow.data.ImageDecoderRandomCropResize( 88 | encoded, target_width=224, target_height=224, num_workers=3, warmup_size=2048 89 | ) 90 | else: 91 | image = flow.data.OFRecordImageDecoderRandomCrop( 92 | ofrecord, "encoded", color_space=color_space # seed=seed, 93 | ) 94 | rsz = flow.image.Resize(image, target_size=[args.image_size, args.image_size]) 95 | image = rsz[0] 96 | 97 | rng = flow.random.CoinFlip(batch_size=train_batch_size) # , seed=seed) 98 | normal = flow.image.CropMirrorNormalize( 99 | image, 100 | mirror_blob=rng, 101 | color_space=color_space, 102 | output_layout=output_layout, 103 | mean=args.rgb_mean, 104 | std=args.rgb_std, 105 | output_dtype=flow.float, 106 | ) 107 | return label, normal 108 | 109 | 110 | def load_imagenet_for_validation(args): 111 | total_device_num = args.num_nodes * args.gpu_num_per_node 112 | val_batch_size = total_device_num * args.val_batch_size_per_device 113 | output_layout = "NHWC" if args.channel_last else "NCHW" 114 | 115 | color_space = "RGB" 116 | ofrecord = flow.data.ofrecord_reader( 117 | args.val_data_dir, 118 | batch_size=val_batch_size, 119 | data_part_num=args.val_data_part_num, 120 | part_name_suffix_length=5, 121 | shuffle_after_epoch=False, 122 | ) 123 | image = flow.data.OFRecordImageDecoder(ofrecord, "encoded", color_space=color_space) 124 | label = flow.data.OFRecordRawDecoder( 125 | 
ofrecord, "class/label", shape=(), dtype=flow.int32 126 | ) 127 | 128 | rsz = flow.image.Resize( 129 | image, 130 | resize_side="shorter", 131 | keep_aspect_ratio=True, 132 | target_size=args.resize_shorter, 133 | ) 134 | 135 | normal = flow.image.CropMirrorNormalize( 136 | rsz[0], 137 | color_space=color_space, 138 | output_layout=output_layout, 139 | crop_h=args.image_size, 140 | crop_w=args.image_size, 141 | crop_pos_y=0.5, 142 | crop_pos_x=0.5, 143 | mean=args.rgb_mean, 144 | std=args.rgb_std, 145 | output_dtype=flow.float, 146 | ) 147 | return label, normal 148 | 149 | 150 | if __name__ == "__main__": 151 | import os 152 | import config as configs 153 | from util import InitNodes, Metric 154 | from job_function_util import get_val_config 155 | 156 | parser = configs.get_parser() 157 | args = parser.parse_args() 158 | configs.print_args(args) 159 | 160 | flow.config.gpu_device_num(args.gpu_num_per_node) 161 | # flow.config.enable_debug_mode(True) 162 | @flow.global_function(get_val_config(args)) 163 | def IOTest(): 164 | if args.train_data_dir: 165 | assert os.path.exists(args.train_data_dir) 166 | print("Loading data from {}".format(args.train_data_dir)) 167 | (labels, images) = load_imagenet_for_training(args) 168 | else: 169 | print("Loading synthetic data.") 170 | (labels, images) = load_synthetic(args) 171 | outputs = {"images": images, "labels": labels} 172 | return outputs 173 | 174 | total_device_num = args.num_nodes * args.gpu_num_per_node 175 | train_batch_size = total_device_num * args.batch_size_per_device 176 | metric = Metric( 177 | desc="io_test", 178 | calculate_batches=args.loss_print_every_n_iter, 179 | batch_size=train_batch_size, 180 | prediction_key=None, 181 | ) 182 | for i in range(1000): 183 | IOTest().async_get(metric.metric_cb(0, i)) 184 | -------------------------------------------------------------------------------- /Classification/cnns/optimizer_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | import oneflow.compatible.single_client as flow 17 | import math 18 | import pprint 19 | 20 | 21 | def add_optimizer_args(parser): 22 | group = parser.add_argument_group( 23 | "optimizer parameters", "entire group applies only to optimizer parameters" 24 | ) 25 | group.add_argument( 26 | "--optimizer", type=str, default="sgd", help="sgd, adam, rmsprop" 27 | ) 28 | group.add_argument("--learning_rate", type=float, default=0.256) 29 | group.add_argument("--wd", type=float, default=1.0 / 32768, help="weight decay") 30 | group.add_argument("--momentum", type=float, default=0.875, help="momentum") 31 | group.add_argument( 32 | "--lr_decay", 33 | type=str, 34 | default="cosine", 35 | help="cosine, step, polynomial, exponential, None", 36 | ) 37 | group.add_argument( 38 | "--lr_decay_rate", 39 | type=float, 40 | default="0.94", 41 | help="exponential learning decay rate", 42 | ) 43 | group.add_argument( 44 | "--lr_decay_epochs", 45 | type=int, 46 | default=2, 47 | help="exponential learning rate decay every n epochs", 48 | ) 49 | group.add_argument( 50 | "--warmup_epochs", 51 | type=int, 52 | default=5, 53 | help="the epochs to warmp-up lr to scaled large-batch value", 54 | ) 55 | group.add_argument( 56 | "--decay_rate", type=float, default="0.9", help="decay rate of RMSProp" 57 | ) 58 | group.add_argument("--epsilon", type=float, default="1", help="epsilon") 59 | group.add_argument( 60 | "--gradient_clipping", type=float, default=0.0, help="gradient clipping" 61 | ) 62 | return parser 63 | 64 | 65 | def set_up_optimizer(loss, args): 66 | total_device_num = args.num_nodes * args.gpu_num_per_node 67 | train_batch_size = total_device_num * args.batch_size_per_device 68 | batches_per_epoch = math.ceil(args.num_examples / train_batch_size) 69 | warmup_batches = batches_per_epoch * args.warmup_epochs 70 | num_train_batches = batches_per_epoch * args.num_epochs 71 | decay_batches = num_train_batches 72 | exponential_decay_batches = batches_per_epoch * args.lr_decay_epochs 73 | 74 | # set up warmup strategy 75 | warmup = ( 76 | flow.optimizer.warmup.linear(warmup_batches, 0) if warmup_batches > 0 else None 77 | ) 78 | 79 | # set up grad_clipping 80 | grad_clipping = ( 81 | flow.optimizer.grad_clipping.by_global_norm(args.gradient_clipping) 82 | if args.gradient_clipping > 0.0 83 | else None 84 | ) 85 | 86 | # set up learning rate scheduler 87 | if args.lr_decay == "cosine": 88 | # CosineScheduler 89 | lr_scheduler = flow.optimizer.CosineScheduler( 90 | base_lr=args.learning_rate, steps=decay_batches, warmup=warmup 91 | ) 92 | elif args.lr_decay == "step": 93 | # PiecewiseScalingScheduler 94 | lr_scheduler = flow.optimizer.PiecewiseScalingScheduler( 95 | base_lr=args.learning_rate, 96 | boundaries=[30, 60, 80], 97 | scale=[0.1, 0.01, 0.001], 98 | warmup=warmup, 99 | ) 100 | elif args.lr_decay == "polynomial": 101 | # PolynomialScheduler 102 | lr_scheduler = flow.optimizer.PolynomialScheduler( 103 | base_lr=args.learning_rate, 104 | steps=decay_batches, 105 | end_learning_rate=0.00001, 106 | power=1.0, 107 | cycle=False, 108 | warmup=warmup, 109 | ) 110 | elif args.lr_decay == "exponential": 111 | # ExponentialScheduler 112 | lr_scheduler = flow.optimizer.ExponentialScheduler( 113 | base_lr=args.learning_rate, 114 | steps=exponential_decay_batches, 115 | decay_rate=args.lr_decay_rate, 116 | staircase=False, 117 | warmup=warmup, 118 | ) 119 | else: 120 | lr_scheduler = flow.optimizer.PiecewiseScalingScheduler( 121 | base_lr=args.learning_rate, 122 | boundaries=[args.num_epochs], 123 | 
scale=[1.0], 124 | warmup=warmup, 125 | ) 126 | 127 | # set up optimizer 128 | loss_scale_policy = None 129 | if args.use_fp16: 130 | loss_scale_policy = flow.optimizer.loss_scale.dynamic_loss_scale( 131 | increment_period=2000 132 | ) 133 | if args.optimizer == "sgd": 134 | print("Optimizer: SGD") 135 | flow.optimizer.SGD( 136 | lr_scheduler, 137 | momentum=args.momentum if args.momentum > 0 else None, 138 | grad_clipping=grad_clipping, 139 | loss_scale_policy=loss_scale_policy, 140 | ).minimize(loss) 141 | elif args.optimizer == "adam": 142 | if args.wd > 0 and args.wd < 1.0: 143 | print("Optimizer: AdamW") 144 | flow.optimizer.AdamW( 145 | lr_scheduler=lr_scheduler, 146 | weight_decay=args.wd, 147 | weight_decay_excludes="_bn-", 148 | grad_clipping=grad_clipping, 149 | epsilon=args.epsilon, 150 | loss_scale_policy=loss_scale_policy, 151 | ).minimize(loss) 152 | else: 153 | print("Optimizer: Adam") 154 | flow.optimizer.Adam( 155 | lr_scheduler=lr_scheduler, 156 | grad_clipping=grad_clipping, 157 | epsilon=args.epsilon, 158 | loss_scale_policy=loss_scale_policy, 159 | ).minimize(loss) 160 | elif args.optimizer == "rmsprop": 161 | print("Optimizer: RMSProp") 162 | flow.optimizer.RMSProp( 163 | lr_scheduler=lr_scheduler, 164 | decay_rate=args.decay_rate, 165 | epsilon=args.epsilon, 166 | loss_scale_policy=loss_scale_policy, 167 | ).minimize(loss) 168 | 169 | 170 | if __name__ == "__main__": 171 | import config as configs 172 | 173 | parser = configs.get_parser() 174 | args = parser.parse_args() 175 | configs.print_args(args) 176 | -------------------------------------------------------------------------------- /Classification/cnns/resnet2onnx.sh: -------------------------------------------------------------------------------- 1 | python3 resnet_to_onnx.py \ 2 | --channel_last=False \ 3 | --fuse_bn_relu=False \ 4 | --fuse_bn_add_relu=False 5 | -------------------------------------------------------------------------------- /Classification/cnns/resnet_to_onnx.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | http://www.apache.org/licenses/LICENSE-2.0 7 | Unless required by applicable law or agreed to in writing, software 8 | distributed under the License is distributed on an "AS IS" BASIS, 9 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | See the License for the specific language governing permissions and 11 | limitations under the License. 
12 | """ 13 | 14 | from collections import OrderedDict 15 | import os 16 | from PIL import Image 17 | import time 18 | from typing import Callable, Text 19 | 20 | import numpy as np 21 | import oneflow.compatible.single_client as flow 22 | import oneflow.compatible.single_client.typing as tp 23 | import onnx 24 | import onnxruntime as ort 25 | 26 | from resnet_model import resnet50 27 | import config as configs 28 | from imagenet1000_clsidx_to_labels import clsidx_2_labels 29 | from oneflow_onnx.oneflow2onnx.util import export_onnx_model 30 | 31 | parser = configs.get_parser() 32 | args = parser.parse_args() 33 | 34 | 35 | def load_image(image_path: Text) -> np.ndarray: 36 | rgb_mean = [123.68, 116.779, 103.939] 37 | rgb_std = [58.393, 57.12, 57.375] 38 | print(image_path) 39 | im = Image.open(image_path) 40 | im = im.resize((224, 224)) 41 | im = im.convert("RGB") # 有的图像是单通道的,不加转换会报错 42 | im = np.array(im).astype("float32") 43 | im = (im - rgb_mean) / rgb_std 44 | im = np.transpose(im, (2, 0, 1)) 45 | im = np.expand_dims(im, axis=0) 46 | return np.ascontiguousarray(im, "float32") 47 | 48 | 49 | @flow.global_function("predict") 50 | def InferenceNet( 51 | images: tp.Numpy.Placeholder((1, 3, 224, 224), dtype=flow.float) 52 | ) -> tp.Numpy: 53 | logits = resnet50(images, args, training=False) 54 | predictions = flow.nn.softmax(logits) 55 | return predictions 56 | 57 | 58 | def onnx_inference(image: np.ndarray, onnx_model: onnx.ModelProto): 59 | """ 60 | test onnx model with onnx runtime 61 | :param image: input image, a numpy array 62 | :param onnx_model: onnx model 63 | :return: 64 | """ 65 | assert os.path.isfile(image_path) 66 | sess = ort.InferenceSession(onnx_model.SerializeToString()) 67 | assert len(sess.get_outputs()) == 1 and len(sess.get_inputs()) <= 1 68 | ipt_dict = OrderedDict() 69 | for ipt in sess.get_inputs(): 70 | ipt_dict[ipt.name] = image 71 | onnx_res = sess.run([], ipt_dict)[0] 72 | return onnx_res 73 | 74 | 75 | def oneflow_to_onnx( 76 | job_func: Callable, 77 | flow_weights_path: Text, 78 | onnx_model_dir: Text, 79 | external_data: bool = False, 80 | ): 81 | """ 82 | convert oneflow model to onnx model 83 | :param job_func: inference function in oneflow 84 | :param flow_weights_path: input oneflow model path 85 | :param onnx_model_dir: output dir path to save model.onnx 86 | :return: onnx model 87 | """ 88 | if not os.path.exists(onnx_model_dir): 89 | os.makedirs(onnx_model_dir) 90 | assert os.path.exists(flow_weights_path) and os.path.isdir(onnx_model_dir) 91 | 92 | onnx_model_path = os.path.join( 93 | onnx_model_dir, "model.onnx" 94 | ) 95 | export_onnx_model( 96 | job_func, 97 | flow_weight_dir=flow_weights_path, 98 | onnx_model_path=onnx_model_dir, 99 | opset=11, 100 | external_data=external_data, 101 | ) 102 | print("Convert to onnx success! 
>> ", onnx_model_path) 103 | return onnx.load_model(onnx_model_path) 104 | 105 | 106 | def check_equality( 107 | job_func: Callable, onnx_model: onnx.ModelProto, image_path: Text 108 | ) -> (bool, np.ndarray): 109 | image = load_image(image_path) 110 | onnx_res = onnx_inference(image, onnx_model) 111 | oneflow_res = job_func(image) 112 | is_equal = np.allclose(onnx_res, oneflow_res, rtol=1e-4, atol=1e-5) 113 | return is_equal, onnx_res 114 | 115 | 116 | if __name__ == "__main__": 117 | image_path = "data/tiger.jpg" 118 | # set up your model path 119 | flow_weights_path = "resnet_v15_of_best_model_val_top1_77318" 120 | onnx_model_dir = "onnx/model" 121 | 122 | flow.load_variables(flow.checkpoint.get(flow_weights_path)) 123 | 124 | # conver oneflow to onnx 125 | onnx_model = oneflow_to_onnx( 126 | InferenceNet, flow_weights_path, onnx_model_dir, external_data=False 127 | ) 128 | 129 | # check equality 130 | are_equal, onnx_res = check_equality(InferenceNet, onnx_model, image_path) 131 | clsidx_onnx = onnx_res.argmax() 132 | print("Are the results equal? {}".format("Yes" if are_equal else "No")) 133 | print("Class: {}; score: {}".format(clsidx_2_labels[clsidx_onnx], onnx_res.max())) -------------------------------------------------------------------------------- /Classification/cnns/tools/README.md: -------------------------------------------------------------------------------- 1 | # Tools使用说明 2 | ## 简介 3 | tools文件夹中存放的文件和python代码专门用于 **ImageNet(2012)数据集** 制作工具。通过下面的使用说明,你可以将ImageNet(2012)从原始格式转换为通用图像格式的数据集,再转换为可在OneFlow中直接训练的 **OFRecord** 格式。 4 | 5 | #### 原始数据集 6 | 7 | 往往是由成千上万的图片或文本等文件组成,这些文件被散列存储在不同的文件夹中,一个个读取的时候会非常慢,并且占用大量内存空间。 8 | 9 | #### OFRecord 10 | **OFRecord提高IO效率** 11 | 12 | 内部借助“Protocol Buffer”二进制数据编码方案,它只占用一个内存块,只需要一次性加载一个二进制文件的方式即可,简单,快速,尤其对大型训练数据很友好。另外,当我们的训练数据量比较大的时候,可以将数据分成多个OFRecord文件,来提高处理效率。 13 | 14 | 关于OFRecord的详细说明请参考:[OFRecord数据格式](https://github.com/Oneflow-Inc/oneflow-documentation/blob/master/cn/docs/extended_topics/ofrecord.md) 15 | 16 | 17 | 18 | ## 数据集制作 19 | 20 | ### 将ImageNet转换成OFRecord 21 | 22 | 在OneFlow中,提供了将原始ImageNet2012数据集文件转换成OFRecord格式的脚本,如果您已经下载过,且准备好了ImageNet2012通用图像格式的数据集,并且训练集/验证集的格式如下: 23 | 24 | ```shell 25 | │ ├── train 26 | │ │ ├── n01440764 27 | │ │ └── n01443537 28 | ... 29 | │ └── validation 30 | │ ├── n01440764 31 | │ └── n01443537 32 | ... 
33 | ``` 34 | 35 | 那么,您只需要下载:[imagenet_2012_bounding_boxes.csv](https://oneflow-public.oss-cn-beijing.aliyuncs.com/online_document/dataset/imagenet/imagenet_2012_bounding_boxes.zip) 36 | 37 | 然后执行以下脚本即可完成训练集/验证集 > OFRecord的转换: 38 | 39 | #### 转换训练集 40 | 41 | ```shell 42 | python3 imagenet_ofrecord.py \ 43 | --train_directory ../data/imagenet/train \ 44 | --output_directory ../data/imagenet/ofrecord/train \ 45 | --label_file imagenet_lsvrc_2015_synsets.txt \ 46 | --shards 256 --num_threads 8 --name train \ 47 | --bounding_box_file imagenet_2012_bounding_boxes.csv \ 48 | --height 224 --width 224 49 | ``` 50 | 51 | #### 转换验证集 52 | 53 | ```shell 54 | python3 imagenet_ofrecord.py \ 55 | --validation_directory ../data/imagenet/validation \ 56 | --output_directory ../data/imagenet/ofrecord/validation \ 57 | --label_file imagenet_lsvrc_2015_synsets.txt --name validation \ 58 | --shards 256 --num_threads 8 --name validation \ 59 | --bounding_box_file imagenet_2012_bounding_boxes.csv \ 60 | --height 224 --width 224 61 | ``` 62 | 63 | #### 参数说明 64 | 65 | ```shell 66 | --train_directory 67 | # 指定待转换的训练集文件夹路径 68 | --validation_directory 69 | # 指定待转换的验证集文件夹路径 70 | --name 71 | # 指定转换的是训练集还是验证集 72 | --output_directory 73 | # 指定转换后的ofrecord存储位置 74 | --num_threads 75 | # 任务运行线程数 76 | --shards 77 | # 指定ofrecord分片数量,建议shards = 256 78 | #(shards数量越大,则转换后的每个ofrecord分片数据量就越少) 79 | --bounding_box_file 80 | # 该参数指定的csv文件中标记了所有目标box的坐标,使转换后的ofrecord同时支持分类和目标检测任务 81 | ``` 82 | 83 | 运行以上脚本后,你可以在../data/imagenet/ofrecord/validation、../data/imagenet/ofrecord/train下看到转换好的ofrecord文件: 84 | 85 | ```shell 86 | . 87 | ├── train 88 | │ ├── part-00000 89 | │ └── part-00001 90 | ... 91 | └── validation 92 | ├── part-00000 93 | └── part-00001 94 | ... 95 | ``` 96 | 97 | 98 | 99 | 如果尚未下载/处理过ImageNet,请看下面【ImageNet的下载和预处理】部分的说明。 100 | 101 | ### ImageNet的下载和预处理 102 | 103 | 如果您尚未下载过Imagenet数据集,请准备以下文件: 104 | 105 | - ILSVRC2012_img_train.tar 106 | - ILSVRC2012_img_val.tar 107 | - ILSVRC2012_bbox_train_v2.tar.gz(非必须) 108 | 109 | 其中训练集和验证集的图片请自行下载,bbox标注可以点此下载:[ILSVRC2012_bbox_train_v2.tar.gz](https://oneflow-public.oss-cn-beijing.aliyuncs.com/online_document/dataset/imagenet/ILSVRC2012_bbox_train_v2.tar.gz) 110 | 111 | 我们将用下面三个步骤,帮您完成数据集的预处理。之后,您就可以使用【将ImageNet转换成OFRecord】部分介绍的转换脚本进行OFReciord的转换了。 112 | 113 | 114 | 115 | 下面假设您已经下载好了原始数据集和bbox标注文件,并存放在data/imagenet目录下: 116 | 117 | ```shell 118 | ├── data 119 | │ └── imagenet 120 | │ ├── ILSVRC2012_img_train.tar 121 | │ ├── ILSVRC2012_img_val.tar 122 | │ ├── ILSVRC2012_bbox_train_v2.tar.gz 123 | ├── tools 124 | │ ├── extract_trainval.sh 125 | │ ├── imagenet_2012_validation_synset_labels.txt 126 | │ ├── imagenet_lsvrc_2015_synsets.txt 127 | │ ├── imagenet_metadata.txt 128 | │ ├── imagenet_ofrecord.py 129 | │ └── preprocess_imagenet_validation_data.py 130 | ``` 131 | 132 | #### 步骤一:process_bounding_boxes 133 | 134 | 这一步,主要是将标注好的包含bboxs的xml文件提取到一个.csv文件中,方便后面代码中直接使用。完整的转换过程大约需要5分钟。 135 | 136 | 当然,你也可以直接使用我们转换好的文件:[imagenet_2012_bounding_boxes.csv](https://oneflow-public.oss-cn-beijing.aliyuncs.com/online_document/dataset/imagenet/imagenet_2012_bounding_boxes.zip) 137 | 138 | 1.解压ILSVRC2012_bbox_train_v2.tar.gz 139 | 140 | ```shell 141 | cd data/imagenet && mkdir bounding_boxes && tar -zxvf ILSVRC2012_bbox_train_v2.tar.gz -C bounding_boxes 142 | ``` 143 | 144 | 2.提取bboxs至.csv文件 145 | 146 | ```shell 147 | cd ../.. 
99 | 
100 | If you have not yet downloaded or preprocessed ImageNet, read the section "Downloading and preprocessing ImageNet" below first.
101 | 
102 | ### Downloading and preprocessing ImageNet
103 | 
104 | If you have not downloaded the ImageNet dataset yet, please prepare the following files:
105 | 
106 | - ILSVRC2012_img_train.tar
107 | - ILSVRC2012_img_val.tar
108 | - ILSVRC2012_bbox_train_v2.tar.gz (optional)
109 | 
110 | Please download the training and validation images yourself; the bbox annotations can be downloaded here: [ILSVRC2012_bbox_train_v2.tar.gz](https://oneflow-public.oss-cn-beijing.aliyuncs.com/online_document/dataset/imagenet/ILSVRC2012_bbox_train_v2.tar.gz)
111 | 
112 | The three steps below walk you through the preprocessing. Afterwards you can convert to OFRecord with the scripts from "Converting ImageNet to OFRecord" above.
113 | 
114 | The following assumes the raw archives and the bbox annotation file are stored under data/imagenet:
115 | 
116 | ```shell
117 | ├── data
118 | │   └── imagenet
119 | │       ├── ILSVRC2012_img_train.tar
120 | │       ├── ILSVRC2012_img_val.tar
121 | │       ├── ILSVRC2012_bbox_train_v2.tar.gz
122 | ├── tools
123 | │   ├── extract_trainval.sh
124 | │   ├── imagenet_2012_validation_synset_labels.txt
125 | │   ├── imagenet_lsvrc_2015_synsets.txt
126 | │   ├── imagenet_metadata.txt
127 | │   ├── imagenet_ofrecord.py
128 | │   └── preprocess_imagenet_validation_data.py
129 | ```
130 | 
131 | #### Step 1: process_bounding_boxes
132 | 
133 | This step extracts the annotated bounding boxes from the per-image xml files into a single .csv file that the conversion script can consume directly. The full conversion takes about 5 minutes.
134 | 
135 | Alternatively, you can use the file we already converted: [imagenet_2012_bounding_boxes.csv](https://oneflow-public.oss-cn-beijing.aliyuncs.com/online_document/dataset/imagenet/imagenet_2012_bounding_boxes.zip)
136 | 
137 | 1. Unpack ILSVRC2012_bbox_train_v2.tar.gz
138 | 
139 | ```shell
140 | cd data/imagenet && mkdir bounding_boxes && tar -zxvf ILSVRC2012_bbox_train_v2.tar.gz -C bounding_boxes
141 | ```
142 | 
143 | 2. Extract the boxes into a .csv file
144 | 
145 | ```shell
146 | cd ../.. && python process_bounding_boxes.py data/imagenet/bounding_boxes imagenet_lsvrc_2015_synsets.txt | sort > imagenet_2012_bounding_boxes.csv
147 | ```
148 | 
149 | #### Step 2: extract imagenet
150 | 
151 | This step unpacks ILSVRC2012_img_train.tar and ILSVRC2012_img_val.tar into train and validation folders. The train folder ends up with 1000 synset label folders (e.g. n01443537), and the training images are unpacked into the label folder they belong to; the validation folder holds the raw unpacked images.
152 | 
153 | ```shell
154 | sh extract_trainval.sh ../data/imagenet # the argument is the folder holding the raw imagenet archives
155 | ```
156 | ```shell
157 | After unpacking, the folder structure looks like this:
158 | .
159 | ├── extract_trainval.sh
160 | ├── imagenet
161 | │   ├── ILSVRC2012_img_train.tar
162 | │   ├── ILSVRC2012_img_val.tar
163 | │   ├── ILSVRC2012_bbox_train_v2.tar.gz
164 | │   ├── bounding_boxes
165 | │   ├── train
166 | │   │   ├── n01440764
167 | │   │   │   ├── n01440764_10026.JPEG
168 | │   │   │   ├── n01440764_10027.JPEG
169 | ...
170 | │   │   └── n01443537
171 | │   │       ├── n01443537_10007.JPEG
172 | │   │       ├── n01443537_10014.JPEG
173 | ...
174 | │   └── validation
175 | │       ├── ILSVRC2012_val_00000236.JPEG
176 | │       ├── ILSVRC2012_val_00000262.JPEG
177 | ...
178 | ```
179 | 
180 | #### Step 3: processing the validation data
181 | 
182 | After the previous step the training images already sit neatly in their 1000 label folders, but the validation images are still piled up in the validation folder. In this step, preprocess_imagenet_validation_data.py sorts them into label folders by class as well.
183 | ```shell
184 | python3 preprocess_imagenet_validation_data.py ../data/imagenet/validation
185 | # the argument ../data/imagenet/validation is the folder the validation images were unpacked into
186 | ```
187 | Afterwards the project folder looks like this:
188 | ```shell
189 | .
190 | ├── extract_trainval.sh
191 | ├── imagenet
192 | │   ├── ILSVRC2012_img_train.tar
193 | │   ├── ILSVRC2012_img_val.tar
194 | │   ├── ILSVRC2012_bbox_train_v2.tar.gz
195 | │   ├── bounding_boxes
196 | │   ├── train
197 | │   │   ├── n01440764
198 | │   │   └── n01443537
199 | ...
200 | │   └── validation
201 | │       ├── n01440764
202 | │       └── n01443537
203 | ...
204 | ```
205 | 
206 | This completes the preprocessing. You can now jump back to **Converting the training set** and **Converting the validation set** to turn ImageNet-2012 into OFRecord.
--------------------------------------------------------------------------------
/Classification/cnns/tools/extract_trainval.sh:
--------------------------------------------------------------------------------
1 | # usage: sh extract_trainval.sh your_path_to/imagenet
2 | # the argument is the folder holding the raw imagenet archives
3 | 
4 | set -e
5 | ROOT_DIR=$1 # your path to imagenet dataset root dir
6 | echo "Imagenet dataset in dir:${ROOT_DIR}"
7 | 
8 | SYNSETS_FILE="imagenet_lsvrc_2015_synsets.txt"
9 | TRAIN_TARBALL="${ROOT_DIR}/ILSVRC2012_img_train.tar"
10 | TRAIN_OUTPUT_PATH="${ROOT_DIR}/train/"
11 | VALIDATION_TARBALL="${ROOT_DIR}/ILSVRC2012_img_val.tar"
12 | VALIDATION_OUTPUT_PATH="${ROOT_DIR}/validation/"
13 | 
14 | mkdir -p "${TRAIN_OUTPUT_PATH}"
15 | mkdir -p "${VALIDATION_OUTPUT_PATH}"
16 | 
17 | # extract the validation .tar file
18 | tar xf "${VALIDATION_TARBALL}" -C "${VALIDATION_OUTPUT_PATH}"
19 | 
20 | # extract the train .tar file
21 | echo "Uncompressing individual train tar-balls in the training data."
22 | 
23 | while read SYNSET; do
24 |   # Uncompress into the directory; running tar inside the if condition keeps
25 |   # `set -e` from aborting the loop when a synset tarball is missing.
26 |   if tar xf "${TRAIN_TARBALL}" "${SYNSET}.tar"; then
27 |     # Create a directory and delete anything there.
28 |     mkdir -p "${TRAIN_OUTPUT_PATH}/${SYNSET}"
29 |     rm -rf "${TRAIN_OUTPUT_PATH}/${SYNSET}"/*
30 |     echo "Processing: ${SYNSET}"
31 |     tar xf "${SYNSET}.tar" -C "${TRAIN_OUTPUT_PATH}/${SYNSET}/"
32 |     rm -f "${SYNSET}.tar"
33 |     echo "Finished processing: ${SYNSET}"
34 |   else
35 |     echo "${SYNSET}.tar doesn't exist!"
36 |   fi
37 | done < "${SYNSETS_FILE}"
--------------------------------------------------------------------------------
/Classification/cnns/tools/preprocess_imagenet_validation_data.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright 2020 The OneFlow Authors. All rights reserved.
3 | 
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 | 
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | """
16 | #!/usr/bin/python
17 | # Copyright 2016 Google Inc. All Rights Reserved.
18 | #
19 | # Licensed under the Apache License, Version 2.0 (the "License");
20 | # you may not use this file except in compliance with the License.
21 | # You may obtain a copy of the License at
22 | #
23 | #     http://www.apache.org/licenses/LICENSE-2.0
24 | #
25 | # Unless required by applicable law or agreed to in writing, software
26 | # distributed under the License is distributed on an "AS IS" BASIS,
27 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
28 | # See the License for the specific language governing permissions and
29 | # limitations under the License.
30 | # ==============================================================================
31 | """Associate the ImageNet 2012 Challenge validation images with their labels,
32 | for OneFlow model training.
33 | 
34 | The raw ImageNet validation data set is expected to reside in JPEG files
35 | located in the following directory structure.
36 | 
37 |     data_dir/ILSVRC2012_val_00000001.JPEG
38 |     data_dir/ILSVRC2012_val_00000002.JPEG
39 |     ...
40 |     data_dir/ILSVRC2012_val_00050000.JPEG
41 | 
42 | This script moves the files into a directory structure like such:
43 |     data_dir/n01440764/ILSVRC2012_val_00000293.JPEG
44 |     data_dir/n01440764/ILSVRC2012_val_00000543.JPEG
45 |     ...
46 | where 'n01440764' is the unique synset label associated with
47 | these images.
48 | 
49 | Sample usage:
50 |     python3 preprocess_imagenet_validation_data.py ../data/imagenet/validation
51 | """
52 | 
53 | 
54 | import os.path
55 | import sys
56 | 
57 | from six.moves import xrange
58 | 
59 | 
60 | if __name__ == "__main__":
61 |     if len(sys.argv) < 2:
62 |         print(
63 |             "Invalid usage\n"
64 |             "usage: preprocess_imagenet_validation_data.py "
65 |             "<validation data dir>"
66 |         )
67 |         sys.exit(-1)
68 |     data_dir = sys.argv[1]
69 |     validation_labels_file = "imagenet_2012_validation_synset_labels.txt"
70 | 
71 |     # Read in the 50000 synsets associated with the validation data set.
72 |     labels = [l.strip() for l in open(validation_labels_file).readlines()]
73 |     unique_labels = set(labels)
74 | 
75 |     # Make all sub-directories in the validation data dir.
76 |     for label in unique_labels:
77 |         labeled_data_dir = os.path.join(data_dir, label)
78 |         if not os.path.exists(labeled_data_dir):
79 |             os.makedirs(labeled_data_dir)
80 | 
81 |     # Move all of the images to the appropriate sub-directory.
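    # The i-th line of imagenet_2012_validation_synset_labels.txt holds the
    # synset for validation image i + 1, so labels[i] maps straight onto the
    # ILSVRC2012_val_*.JPEG name built below.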
82 |     for i in xrange(len(labels)):
83 |         basename = "ILSVRC2012_val_000%.5d.JPEG" % (i + 1)
84 |         original_filename = os.path.join(data_dir, basename)
85 |         if not os.path.exists(original_filename):
86 |             continue
87 |         print("Get image: ", original_filename)
88 |         new_filename = os.path.join(data_dir, labels[i], basename)
89 |         os.rename(original_filename, new_filename)
90 | 
91 | 
92 |     # Delete all empty directories
93 |     for label in unique_labels:
94 |         labeled_data_dir = os.path.join(data_dir, label)
95 |         if not os.path.exists(labeled_data_dir):
96 |             continue
97 |         if not os.listdir(labeled_data_dir):
98 |             os.rmdir(labeled_data_dir)
--------------------------------------------------------------------------------
/Classification/cnns/train.sh:
--------------------------------------------------------------------------------
1 | rm -rf core.*
2 | rm -rf ./output/snapshots/*
3 | 
4 | if [ -n "$1" ]; then
5 |     NUM_EPOCH=$1
6 | else
7 |     NUM_EPOCH=50
8 | fi
9 | echo NUM_EPOCH=$NUM_EPOCH
10 | 
11 | # training with imagenet
12 | if [ -n "$2" ]; then
13 |     DATA_ROOT=$2
14 | else
15 |     DATA_ROOT=/data/imagenet/ofrecord
16 | fi
17 | echo DATA_ROOT=$DATA_ROOT
18 | 
19 | LOG_FOLDER=../logs
20 | mkdir -p $LOG_FOLDER
21 | LOGFILE=$LOG_FOLDER/resnet_training.log
22 | 
23 | python3 of_cnn_train_val.py \
24 |     --train_data_dir=$DATA_ROOT/train \
25 |     --train_data_part_num=256 \
26 |     --val_data_dir=$DATA_ROOT/validation \
27 |     --val_data_part_num=256 \
28 |     --num_nodes=1 \
29 |     --gpu_num_per_node=8 \
30 |     --optimizer="sgd" \
31 |     --momentum=0.875 \
32 |     --label_smoothing=0.1 \
33 |     --learning_rate=1.024 \
34 |     --loss_print_every_n_iter=100 \
35 |     --batch_size_per_device=128 \
36 |     --val_batch_size_per_device=50 \
37 |     --num_epoch=$NUM_EPOCH \
38 |     --model="resnet50" 2>&1 | tee ${LOGFILE}
39 | 
40 | echo "Writing log to ${LOGFILE}"
--------------------------------------------------------------------------------
/Classification/cnns/train_fp16.sh:
--------------------------------------------------------------------------------
1 | rm -rf core.*
2 | rm -rf ./output/snapshots/*
3 | 
4 | if [ -n "$1" ]; then
5 |     NUM_EPOCH=$1
6 | else
7 |     NUM_EPOCH=50
8 | fi
9 | echo NUM_EPOCH=$NUM_EPOCH
10 | 
11 | # training with imagenet
12 | if [ -n "$2" ]; then
13 |     DATA_ROOT=$2
14 | else
15 |     DATA_ROOT=/data/imagenet/ofrecord
16 | fi
17 | echo DATA_ROOT=$DATA_ROOT
18 | 
19 | LOG_FOLDER=../logs
20 | mkdir -p $LOG_FOLDER
21 | LOGFILE=$LOG_FOLDER/resnet_training.log
22 | 
23 | export PYTHONUNBUFFERED=1
24 | echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED
25 | export NCCL_LAUNCH_MODE=PARALLEL
26 | echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE
27 | export ONEFLOW_COMM_NET_IB_ENABLE=1
28 | 
29 | python3 of_cnn_train_val.py \
30 |     --train_data_dir=$DATA_ROOT/train \
31 |     --train_data_part_num=256 \
32 |     --val_data_dir=$DATA_ROOT/validation \
33 |     --val_data_part_num=256 \
34 |     --num_nodes=1 \
35 |     --gpu_num_per_node=8 \
36 |     --optimizer="sgd" \
37 |     --momentum=0.875 \
38 |     --label_smoothing=0.1 \
39 |     --learning_rate=1.536 \
40 |     --loss_print_every_n_iter=100 \
41 |     --batch_size_per_device=192 \
42 |     --val_batch_size_per_device=50 \
43 |     --use_fp16 \
44 |     --channel_last=True \
45 |     --pad_output \
46 |     --fuse_bn_relu=True \
47 |     --fuse_bn_add_relu=True \
48 |     --nccl_fusion_threshold_mb=16 \
49 |     --nccl_fusion_max_ops=24 \
50 |     --gpu_image_decoder=True \
51 |     --num_epoch=$NUM_EPOCH \
52 |     --model="resnet50" 2>&1 | tee ${LOGFILE}
53 | 
54 | echo "Writing log to ${LOGFILE}"
--------------------------------------------------------------------------------
/Classification/cnns/train_fp32.sh:
--------------------------------------------------------------------------------
1 | rm -rf core.*
2 | rm -rf ./output/snapshots/*
3 | 
4 | if [ -n "$1" ]; then
5 |     NUM_EPOCH=$1
6 | else
7 |     NUM_EPOCH=50
8 | fi
9 | echo NUM_EPOCH=$NUM_EPOCH
10 | 
11 | # training with imagenet
12 | if [ -n "$2" ]; then
13 |     DATA_ROOT=$2
14 | else
15 |     DATA_ROOT=/data/imagenet/ofrecord
16 | fi
17 | echo DATA_ROOT=$DATA_ROOT
18 | 
19 | LOG_FOLDER=../logs
20 | mkdir -p $LOG_FOLDER
21 | LOGFILE=$LOG_FOLDER/resnet_training.log
22 | 
23 | export PYTHONUNBUFFERED=1
24 | echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED
25 | export NCCL_LAUNCH_MODE=PARALLEL
26 | echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE
27 | 
28 | python3 of_cnn_train_val.py \
29 |     --train_data_dir=$DATA_ROOT/train \
30 |     --train_data_part_num=256 \
31 |     --val_data_dir=$DATA_ROOT/validation \
32 |     --val_data_part_num=256 \
33 |     --num_nodes=1 \
34 |     --gpu_num_per_node=8 \
35 |     --optimizer="sgd" \
36 |     --momentum=0.875 \
37 |     --label_smoothing=0.1 \
38 |     --learning_rate=0.768 \
39 |     --loss_print_every_n_iter=100 \
40 |     --batch_size_per_device=96 \
41 |     --val_batch_size_per_device=50 \
42 |     --channel_last=False \
43 |     --fuse_bn_relu=True \
44 |     --fuse_bn_add_relu=True \
45 |     --nccl_fusion_threshold_mb=16 \
46 |     --nccl_fusion_max_ops=24 \
47 |     --gpu_image_decoder=True \
48 |     --num_epoch=$NUM_EPOCH \
49 |     --model="resnet50" 2>&1 | tee ${LOGFILE}
50 | 
51 | echo "Writing log to ${LOGFILE}"
--------------------------------------------------------------------------------
/Classification/cnns/util.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright 2020 The OneFlow Authors. All rights reserved.
3 | 
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 | 
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | """ 16 | 17 | import os 18 | import time 19 | import numpy as np 20 | import oneflow.compatible.single_client as flow 21 | 22 | 23 | def InitNodes(args): 24 | if args.num_nodes > 1: 25 | assert args.num_nodes <= len(args.node_ips) 26 | flow.env.ctrl_port(args.ctrl_port) 27 | nodes = [] 28 | for ip in args.node_ips[: args.num_nodes]: 29 | addr_dict = {} 30 | addr_dict["addr"] = ip 31 | nodes.append(addr_dict) 32 | 33 | flow.env.machine(nodes) 34 | 35 | 36 | class Snapshot(object): 37 | def __init__(self, model_save_dir, model_load_dir, save_init=False): 38 | self._model_save_dir = model_save_dir 39 | if model_load_dir: 40 | assert os.path.isdir(model_load_dir) 41 | print("Restoring model from {}.".format(model_load_dir)) 42 | flow.load_variables(flow.checkpoint.get(model_load_dir)) 43 | elif save_init: 44 | flow.checkpoint.save("initial_model") 45 | print("Init model on demand.") 46 | 47 | def save(self, name): 48 | snapshot_save_path = os.path.join( 49 | self._model_save_dir, "snapshot_{}".format(name) 50 | ) 51 | if not os.path.exists(snapshot_save_path): 52 | os.makedirs(snapshot_save_path) 53 | print("Saving model to {}.".format(snapshot_save_path)) 54 | flow.checkpoint.save(snapshot_save_path) 55 | 56 | 57 | class StopWatch(object): 58 | def __init__(self): 59 | pass 60 | 61 | def start(self): 62 | self.start_time = time.time() 63 | self.last_split = self.start_time 64 | 65 | def split(self): 66 | now = time.time() 67 | duration = now - self.last_split 68 | self.last_split = now 69 | return duration 70 | 71 | def stop(self): 72 | self.stop_time = time.time() 73 | 74 | def duration(self): 75 | return self.stop_time - self.start_time 76 | 77 | 78 | def match_top_k(predictions, labels, top_k=1): 79 | max_k_preds = np.argpartition(predictions.numpy(), -top_k)[:, -top_k:] 80 | match_array = np.logical_or.reduce(max_k_preds == labels.reshape((-1, 1)), axis=1) 81 | num_matched = match_array.sum() 82 | return num_matched, match_array.shape[0] 83 | 84 | 85 | class Metric(object): 86 | def __init__( 87 | self, 88 | desc="train", 89 | calculate_batches=-1, 90 | batch_size=256, 91 | top_k=5, 92 | prediction_key="predictions", 93 | label_key="labels", 94 | loss_key=None, 95 | nvidia_smi_report_step=10, 96 | ): 97 | self.desc = desc 98 | self.calculate_batches = calculate_batches 99 | self.top_k = top_k 100 | self.prediction_key = prediction_key 101 | self.label_key = label_key 102 | self.loss_key = loss_key 103 | self.nvidia_smi_report_step = nvidia_smi_report_step 104 | if loss_key: 105 | self.fmt = "{}: epoch {}, iter {}, loss: {:.6f}, top_1: {:.6f}, top_k: {:.6f}, samples/s: {:.3f}" 106 | else: 107 | self.fmt = ( 108 | "{}: epoch {}, iter {}, top_1: {:.6f}, top_k: {:.6f}, samples/s: {:.3f}" 109 | ) 110 | 111 | self.timer = StopWatch() 112 | self.timer.start() 113 | self._clear() 114 | 115 | def _clear(self): 116 | self.top_1_num_matched = 0 117 | self.top_k_num_matched = 0 118 | self.num_samples = 0.0 119 | 120 | def metric_cb(self, epoch, step): 121 | def callback(outputs): 122 | if step == 0: 123 | self._clear() 124 | if self.loss_key and epoch == 0 and step == self.nvidia_smi_report_step: 125 | cmd = "nvidia-smi --query-gpu=utilization.gpu,memory.used --format=csv" 126 | os.system(cmd) 127 | 128 | if self.prediction_key: 129 | num_matched, num_samples = match_top_k( 130 | outputs[self.prediction_key], outputs[self.label_key] 131 | ) 132 | self.top_1_num_matched += num_matched 133 | num_matched, _ = match_top_k( 134 | outputs[self.prediction_key], outputs[self.label_key], self.top_k 135 | 
) 136 | self.top_k_num_matched += num_matched 137 | else: 138 | num_samples = outputs[self.label_key].shape[0] 139 | 140 | self.num_samples += num_samples 141 | 142 | if (step + 1) % self.calculate_batches == 0: 143 | throughput = self.num_samples / self.timer.split() 144 | if self.prediction_key: 145 | top_1_accuracy = self.top_1_num_matched / self.num_samples 146 | top_k_accuracy = self.top_k_num_matched / self.num_samples 147 | else: 148 | top_1_accuracy = 0.0 149 | top_k_accuracy = 0.0 150 | 151 | if self.loss_key: 152 | loss = outputs[self.loss_key].mean() 153 | print( 154 | self.fmt.format( 155 | self.desc, 156 | epoch, 157 | step + 1, 158 | loss, 159 | top_1_accuracy, 160 | top_k_accuracy, 161 | throughput, 162 | ), 163 | time.time(), 164 | ) 165 | else: 166 | print( 167 | self.fmt.format( 168 | self.desc, 169 | epoch, 170 | step + 1, 171 | top_1_accuracy, 172 | top_k_accuracy, 173 | throughput, 174 | ), 175 | time.time(), 176 | ) 177 | 178 | self._clear() 179 | 180 | return callback 181 | -------------------------------------------------------------------------------- /Classification/cnns/vgg_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | 17 | import oneflow.compatible.single_client as flow 18 | 19 | 20 | def _batch_norm(inputs, name=None, trainable=True, training=True, data_format="NCHW"): 21 | axis = 1 if data_format == "NCHW" else 3 22 | return flow.layers.batch_normalization( 23 | inputs=inputs, 24 | axis=axis, 25 | momentum=0.997, 26 | epsilon=1.001e-5, 27 | center=True, 28 | scale=True, 29 | trainable=trainable, 30 | training=training, 31 | name=name, 32 | ) 33 | 34 | 35 | def _get_regularizer(): 36 | return flow.regularizers.l2(0.00005) 37 | 38 | 39 | def conv2d_layer( 40 | name, 41 | input, 42 | filters, 43 | weight_initializer, 44 | kernel_size=3, 45 | strides=1, 46 | padding="SAME", 47 | data_format="NCHW", 48 | dilation_rate=1, 49 | activation="Relu", 50 | use_bias=True, 51 | bias_initializer=flow.zeros_initializer(), 52 | weight_regularizer=_get_regularizer(), # weight_decay 53 | bias_regularizer=_get_regularizer(), 54 | trainable=True, 55 | training=True, 56 | bn=True, 57 | ): 58 | weight_shape = ( 59 | (filters, input.shape[1], kernel_size, kernel_size) 60 | if data_format == "NCHW" 61 | else (filters, kernel_size, kernel_size, input.shape[3]) 62 | ) 63 | weight = flow.get_variable( 64 | name + "_weight", 65 | shape=weight_shape, 66 | dtype=input.dtype, 67 | initializer=weight_initializer, 68 | ) 69 | output = flow.nn.conv2d( 70 | input, weight, strides, padding, None, data_format, dilation_rate, name=name 71 | ) 72 | if use_bias: 73 | bias = flow.get_variable( 74 | name + "_bias", 75 | shape=(filters,), 76 | dtype=input.dtype, 77 | initializer=bias_initializer, 78 | ) 79 | output = flow.nn.bias_add(output, bias, data_format) 80 | 81 | if activation is not None: 82 | if activation == "Relu": 83 | if bn: 84 | output = _batch_norm( 85 | output, 86 | name + "_bn", 87 | trainable=trainable, 88 | training=training, 89 | data_format=data_format, 90 | ) 91 | output = flow.nn.relu(output) 92 | else: 93 | output = flow.nn.relu(output) 94 | else: 95 | raise NotImplementedError 96 | 97 | return output 98 | 99 | 100 | def _conv_block( 101 | in_blob, 102 | index, 103 | filters, 104 | conv_times, 105 | data_format="NCHW", 106 | trainable=True, 107 | training=True, 108 | ): 109 | conv_block = [] 110 | conv_block.insert(0, in_blob) 111 | weight_initializer = flow.variance_scaling_initializer( 112 | 2, "fan_out", "random_normal", data_format=data_format 113 | ) 114 | for i in range(conv_times): 115 | conv_i = conv2d_layer( 116 | name="conv{}".format(index), 117 | input=conv_block[i], 118 | filters=filters, 119 | kernel_size=3, 120 | strides=1, 121 | data_format=data_format, 122 | weight_initializer=weight_initializer, 123 | trainable=trainable, 124 | training=training, 125 | bn=True, 126 | ) 127 | 128 | conv_block.append(conv_i) 129 | index += 1 130 | 131 | return conv_block 132 | 133 | 134 | def vgg16bn(images, args, trainable=True, training=True): 135 | data_format = "NHWC" if args.channel_last else "NCHW" 136 | 137 | conv1 = _conv_block( 138 | images, 0, 64, 2, data_format, trainable=trainable, training=training 139 | ) 140 | pool1 = flow.nn.max_pool2d(conv1[-1], 2, 2, "VALID", data_format, name="pool1") 141 | 142 | conv2 = _conv_block( 143 | pool1, 2, 128, 2, data_format, trainable=trainable, training=training 144 | ) 145 | pool2 = flow.nn.max_pool2d(conv2[-1], 2, 2, "VALID", data_format, name="pool2") 146 | 147 | conv3 = _conv_block( 148 | pool2, 4, 256, 3, data_format, trainable=trainable, training=training 149 | ) 150 | pool3 = flow.nn.max_pool2d(conv3[-1], 2, 2, "VALID", data_format, name="pool3") 151 | 
152 |     conv4 = _conv_block(
153 |         pool3, 7, 512, 3, data_format, trainable=trainable, training=training
154 |     )
155 |     pool4 = flow.nn.max_pool2d(conv4[-1], 2, 2, "VALID", data_format, name="pool4")
156 | 
157 |     conv5 = _conv_block(
158 |         pool4, 10, 512, 3, data_format, trainable=trainable, training=training
159 |     )
160 |     pool5 = flow.nn.max_pool2d(conv5[-1], 2, 2, "VALID", data_format, name="pool5")
161 | 
162 |     def _get_kernel_initializer():
163 |         return flow.random_normal_initializer(stddev=0.01)
164 | 
165 |     def _get_bias_initializer():
166 |         return flow.zeros_initializer()
167 | 
168 |     # VGG16 classifier head: two 4096-unit dense layers with dropout,
169 |     # then the 1000-way output layer.
170 |     pool5 = flow.reshape(pool5, [pool5.shape[0], -1])
171 |     fc6 = flow.layers.dense(
172 |         inputs=pool5,
173 |         units=4096,
174 |         activation=flow.nn.relu,
175 |         use_bias=True,
176 |         kernel_initializer=_get_kernel_initializer(),
177 |         bias_initializer=_get_bias_initializer(),
178 |         kernel_regularizer=_get_regularizer(),  # weight_decay
179 |         bias_regularizer=_get_regularizer(),
180 |         trainable=trainable,
181 |         name="dense0",
182 |     )
183 | 
184 |     fc6 = flow.nn.dropout(fc6, rate=0.5)
185 | 
186 |     fc7 = flow.layers.dense(
187 |         inputs=fc6,
188 |         units=4096,
189 |         activation=flow.nn.relu,
190 |         use_bias=True,
191 |         kernel_initializer=_get_kernel_initializer(),
192 |         bias_initializer=_get_bias_initializer(),
193 |         trainable=trainable,
194 |         name="dense1",
195 |     )
196 |     fc7 = flow.nn.dropout(fc7, rate=0.5)
197 | 
198 |     fc8 = flow.layers.dense(
199 |         inputs=fc7,
200 |         units=1000,
201 |         use_bias=True,
202 |         kernel_initializer=_get_kernel_initializer(),
203 |         bias_initializer=_get_bias_initializer(),
204 |         trainable=trainable,
205 |         name="dense2",
206 |     )
207 | 
208 |     return fc8
--------------------------------------------------------------------------------
/ClickThroughRate/WideDeepLearning/README.md:
--------------------------------------------------------------------------------
1 | The main difference between `wdl_train_eval.py` and `wdl_train_eval_test.py` is:
2 | `wdl_train_eval_test.py` is an end-to-end pipeline that trains for n epochs on the training set, evaluates on the full eval set after every epoch, and tests on the test set at the end. Its main training loop is over `epoch`s.
3 | 
4 | In `wdl_train_eval.py`, by contrast, the main training loop is over `iteration`s; evaluation periodically looks at only 20 samples at a time rather than the full eval dataset, and there is no test stage.
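A schematic of the two drivers may make the difference concrete. The helper names below are placeholders, not functions that actually exist in the two scripts:

```python
def train_one_iter():
    pass  # one mini-batch update (placeholder)

def evaluate(full=False):
    pass  # eval on a small sample, or on the full eval set (placeholder)

def wdl_train_eval(max_iter, eval_interval):
    # main loop is over iterations
    for it in range(max_iter):
        train_one_iter()
        if (it + 1) % eval_interval == 0:
            evaluate(full=False)   # small eval sample, no test stage

def wdl_train_eval_test(num_epochs, iters_per_epoch):
    # main loop is over epochs
    for epoch in range(num_epochs):
        for _ in range(iters_per_epoch):
            train_one_iter()
        evaluate(full=True)        # full eval set after every epoch
    evaluate(full=True)            # final pass over the test set
```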
5 | 
6 | ## Run OneFlow-WDL with train and evaluation
7 | ```
8 | EMBD_SIZE=1603616
9 | DATA_ROOT=/DATA/disk1/criteo_wdl/ofrecord
10 | python3 wdl_train_eval.py \
11 |   --train_data_dir $DATA_ROOT/train \
12 |   --train_data_part_num 256 \
13 |   --train_part_name_suffix_length=5 \
14 |   --eval_data_dir $DATA_ROOT/val \
15 |   --eval_data_part_num 256 \
16 |   --max_iter=300000 \
17 |   --loss_print_every_n_iter=1000 \
18 |   --eval_interval=1000 \
19 |   --batch_size=512 \
20 |   --wide_vocab_size=$EMBD_SIZE \
21 |   --deep_vocab_size=$EMBD_SIZE \
22 |   --gpu_num 1
23 | ```
24 | 
25 | ## Run OneFlow-WDL with train, evaluation and test
26 | ```
27 | EMBD_SIZE=1603616
28 | DATA_ROOT=/DATA/disk1/criteo_wdl/ofrecord
29 | python3 wdl_train_eval_test.py \
30 |   --train_data_dir $DATA_ROOT/train \
31 |   --train_data_part_num 256 \
32 |   --train_part_name_suffix_length=5 \
33 |   --eval_data_dir $DATA_ROOT/val \
34 |   --eval_data_part_num 256 \
35 |   --eval_part_name_suffix_length=5 \
36 |   --test_data_dir $DATA_ROOT/test \
37 |   --test_data_part_num 256 \
38 |   --test_part_name_suffix_length=5 \
39 |   --loss_print_every_n_iter=1000 \
40 |   --batch_size=16484 \
41 |   --wide_vocab_size=$EMBD_SIZE \
42 |   --deep_vocab_size=$EMBD_SIZE \
43 |   --gpu_num 1
44 | ```
45 | 
46 | The OneFlow-WDL network implements model parallelism and sparse gradient updates. On a server with 8 12GB TitanV GPUs it supports vocabularies of more than 400 million entries, with no performance loss compared to small vocabularies. For details, see the evaluation part of [this document](https://github.com/Oneflow-Inc/oneflow-documentation/blob/master/cn/docs/adv_examples/wide_deep.md).
--------------------------------------------------------------------------------
/ClickThroughRate/WideDeepLearning/wdl_test_report.md:
--------------------------------------------------------------------------------
1 | [HugeCTR](https://github.com/NVIDIA/HugeCTR) is a recommender-specific framework provided by NVIDIA Corporation. It is designed for Click-Through-Rate (CTR) estimation.
2 | 
3 | OneFlow builds up a Wide & Deep Learning (WDL) network modeled after HugeCTR's.
4 | 
5 | The OneFlow-WDL network supports model parallelism and sparse gradient updates. It can hold a lookup table with a vocab size of over 400 million on a server with 8 TitanV 12GB GPUs, while keeping the same performance as with a small vocab table.
6 | 
7 | The purpose of this document is to introduce how to train the network with OneFlow-WDL and to present the test results.
8 | 
9 | ## Environment and Preparation
10 | Please make sure OneFlow is installed on your computer/server before running OneFlow-WDL; [scikit-learn](https://scikit-learn.org/stable/install.html) is required to calculate metrics such as AUC (a minimal example follows the list below).
11 | 
12 | ### Requirements
13 | - python 3.x (recommended)
14 | - OneFlow
15 | - scikit-learn
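For instance, the `eval_auc` values printed during training can be reproduced from a batch of labels and predicted probabilities along these lines (the arrays here are made-up stand-ins, not real evaluation data):

```python
import numpy as np
from sklearn.metrics import roc_auc_score

# Stand-in data: 0/1 click labels and the model's predicted click probabilities.
labels = np.array([0, 1, 1, 0, 1])
probs = np.array([0.10, 0.80, 0.65, 0.30, 0.90])
print("eval_auc:", roc_auc_score(labels, probs))  # 1.0 for this toy batch
```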
16 | 
17 | ### Data preparation
18 | A small [data set](https://oneflow-public.oss-cn-beijing.aliyuncs.com/datasets/criteo_wdl_3000w_ofrecord_example.tgz) is prepared for a quick evaluation. The folder structure of this example dataset is:
19 | ```
20 | criteo_wdl_3000w_ofrecord_example
21 | ├── train
22 | │   └── part-00000
23 | └── val
24 |     ├── part-00000
25 |     └── README.md
26 | ```
27 | 
28 | Making a full-size dataset is laborious; [*Use Spark to create WDL dataset*](https://github.com/Oneflow-Inc/OneFlow-Benchmark/blob/master/ClickThroughRate/WideDeepLearning/how_to_make_ofrecord_for_wdl.md) can help you generate the full-size ofrecord for testing. You can download the original dataset from [CriteoLabs](http://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset/) and follow the steps in a [Spark 2.4.* shell](https://www.apache.org/dyn/closer.lua/spark/spark-2.4.6/spark-2.4.6-bin-hadoop2.7.tgz).
29 | 
30 | ### OneFlow-WDL code
31 | The main file under test is `wdl_train_eval.py`. Please download it from [here](https://github.com/Oneflow-Inc/OneFlow-Benchmark/blob/master/ClickThroughRate/WideDeepLearning/wdl_train_eval.py).
32 | 
33 | ## Run OneFlow-WDL code
34 | ```
35 | VOCAB_SIZE=1603616
36 | DATA_ROOT=/path/to/wdl/criteo_wdl_3000w_ofrecord_example
37 | python3 wdl_train_eval.py \
38 |   --train_data_dir $DATA_ROOT/train \
39 |   --train_data_part_num 1 \
40 |   --train_part_name_suffix_length=5 \
41 |   --eval_data_dir $DATA_ROOT/val \
42 |   --eval_data_part_num 1 \
43 |   --max_iter=300000 \
44 |   --loss_print_every_n_iter=1000 \
45 |   --eval_interval=1000 \
46 |   --batch_size=16384 \
47 |   --wide_vocab_size=$VOCAB_SIZE \
48 |   --deep_vocab_size=$VOCAB_SIZE \
49 |   --gpu_num 1
50 | ```
51 | 
52 | The shell command above is all you need; the only thing to configure is `DATA_ROOT`, the path of the OFRecord dataset for OneFlow-WDL. If output like the following shows up, the code is running correctly.
53 | 
54 | Note: the `criteo_wdl_3000w_ofrecord_example` dataset has only one part file, so `train_data_part_num` and `eval_data_part_num` are both set to `1`.
55 | ```
56 | 1000 time 2020-07-08 00:28:08.066281 loss 0.503295350909233
57 | 1000 eval_loss 0.4846755236387253 eval_auc 0.7616240146992771
58 | 2000 time 2020-07-08 00:28:11.613961 loss 0.48661992555856703
59 | 2000 eval_loss 0.4816856697201729 eval_auc 0.765256583562705
60 | 3000 time 2020-07-08 00:28:15.149135 loss 0.48245503094792364
61 | 3000 eval_loss 0.47835959643125536 eval_auc 0.7715609382514008
62 | 4000 time 2020-07-08 00:28:18.686327 loss 0.47975033831596375
63 | 4000 eval_loss 0.47925308644771575 eval_auc 0.7781267916810946
64 | ```
65 | ## Testing results and explanation
66 | All tests were performed on a server with 8 TitanV 12GB GPUs installed. As a reference, we ran some Nvidia HugeCTR tests in a docker container.
67 | 
68 | ### Multi-device performance
69 | This test measures the average latency across different GPU counts with a fixed total batch size of 16384. 7 hidden layers of 1024 units each are used.
70 | 
71 | Results:
72 | 
73 | ![image](https://github.com/Oneflow-Inc/oneflow-documentation/raw/master/cn/docs/adv_examples/imgs/fixed_batch_size_latency.png)
74 | 
75 | The maximum memory usage across devices is shown below:
76 | 
77 | ![image](https://github.com/Oneflow-Inc/oneflow-documentation/raw/master/cn/docs/adv_examples/imgs/fixed_batch_size_memory.png)
78 | 
79 | To summarise: from one device to 8 devices, OneFlow-WDL ran faster than HugeCTR with less memory usage.
80 | 
81 | ### Batch size per device = 16384, multi-device performance
82 | This test measures the average latency across different GPU counts with a batch size of 16384 per device, so the total batch size scales with the device count. 7 hidden layers of 1024 units each are used.
83 | 
84 | Results:
85 | 
86 | ![image](https://github.com/Oneflow-Inc/oneflow-documentation/raw/master/cn/docs/adv_examples/imgs/scaled_batch_size_latency.png)
87 | 
88 | The maximum memory usage across devices is shown below:
89 | 
90 | ![image](https://github.com/Oneflow-Inc/oneflow-documentation/raw/master/cn/docs/adv_examples/imgs/scaled_batch_size_memory.png)
91 | 
92 | Summary:
93 | - Latency increases along with the number of devices.
94 | - OneFlow-WDL ran faster than HugeCTR with less memory consumption.
95 | - There is no obvious change in memory usage across device counts.
96 | 
97 | ### Performance at different batch sizes on one GPU
98 | This test measures the average latency on a single GPU across different batch sizes. 2 hidden layers of 1024 units each are used.
99 | 
100 | Results:
101 | 
102 | ![image](https://github.com/Oneflow-Inc/oneflow-documentation/raw/master/cn/docs/adv_examples/imgs/scaled_batch_size_latency_1gpu.png)
103 | 
104 | Summary: OneFlow-WDL ran faster than HugeCTR for batch sizes from 512 to 16384.
105 | 
106 | ### Big vocab size performance
107 | Two embedding tables are configured in OneFlow-WDL:
108 | - the size of `wide_embedding` is vocab_size x 1
109 | - the size of `deep_embedding` is vocab_size x 16
110 | 
111 | In HugeCTR the vocab size is 1,603,616 (1.6 million). We kept increasing the vocab size from 3.2 million up to 409.6 million during the test; the results are below:
112 | 
113 | ![image](https://github.com/Oneflow-Inc/oneflow-documentation/raw/master/cn/docs/adv_examples/imgs/big_vocab_table_2x1024.png)
114 | 
115 | ![image](https://github.com/Oneflow-Inc/oneflow-documentation/raw/master/cn/docs/adv_examples/imgs/big_vocab_table_7x1024.png)
116 | 
117 | In the figures above, the blue columns are the average latency and the orange curve is the memory usage at each vocab size.
118 | 
119 | Conclusion: as the vocab size grows, memory usage grows with it, but the average latency stays flat.
120 | 
121 | Our test GPUs have only 12GB of memory each; imagine what vocab size OneFlow-WDL could support on devices with 16GB, 32GB or even more memory. **A vocab size of 409.6 million is not the limit but a beginning.**
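A back-of-the-envelope sizing of those two tables at the largest tested vocab makes the result tangible (assuming fp32 parameters; optimizer state would add more on top):

```python
vocab = 409_600_000
wide_dim, deep_dim = 1, 16   # wide_embedding and deep_embedding widths
bytes_per_param = 4          # fp32
total = vocab * (wide_dim + deep_dim) * bytes_per_param
print(f"{total / 1024**3:.1f} GiB total")        # ~25.9 GiB
print(f"{total / 1024**3 / 8:.1f} GiB per GPU")  # ~3.2 GiB when sharded over 8 GPUs
```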
122 | 
123 | ### Convergence test 1
124 | We chose batch size = 512 for the convergence tests.
125 | 
126 | The graph below shows the first 500 iterations; after each iteration we evaluate on 20 examples.
127 | 
128 | ![image](https://github.com/Oneflow-Inc/oneflow-documentation/raw/master/cn/docs/adv_examples/imgs/eval_auc_loss_500iters.png)
129 | 
130 | Conclusion: AUC quickly climbs above 0.75.
131 | 
132 | ### Convergence test 2
133 | Same setup as convergence test 1, but we print the average loss every 1000 iterations and then take 20 records for evaluation, training for 300,000 iterations in total. Result:
134 | 
135 | ![image](https://github.com/Oneflow-Inc/oneflow-documentation/raw/master/cn/docs/adv_examples/imgs/train_eval_auc_loss.png)
136 | 
137 | Conclusions and analysis (the epoch arithmetic behind point 1 is worked out after this list):
138 | 1. The blue train-loss curve descends in clear steps. The training set has 36,674,623 samples, so with batch_size=512 one epoch takes 71,630 steps, and 300,000 steps cover the training set more than 4 times (epochs); the stepwise descent of the blue curve reflects those epoch boundaries. OneFlow can shuffle the data during training to reduce overfitting.
139 | 2. The orange curve is the evaluation loss. It keeps descending for the first two epochs and begins to ascend in the third epoch because of overfitting.
140 | 3. The grey curve is the AUC on the evaluation set. AUC also peaks in the second epoch, exceeding 0.8, and then descends over the next few epochs.
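The epoch arithmetic from point 1, spelled out:

```python
samples, batch_size, total_steps = 36_674_623, 512, 300_000
steps_per_epoch = samples // batch_size
print(steps_per_epoch)                # 71630
print(total_steps / steps_per_epoch)  # ~4.19 epochs
```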
141 | 
--------------------------------------------------------------------------------
/Generative/README.md:
--------------------------------------------------------------------------------
1 | # GAN: Generative Adversarial Network Demo
2 | 
3 | 
4 | 
5 | ## Introduction
6 | 
7 | A generative adversarial network (GAN) in essence learns a data distribution. It consists of a generator network and a discriminator network: the generator can map a random distribution onto an arbitrary target distribution, while the discriminator determines the "direction" in which the generated distribution moves. Their adversarial game is, in theory, equivalent to fitting the data distribution. This demo runs DCGAN, a generative adversarial network built on convolution/deconvolution operations that is widely used for image generation.
8 | 
9 | 
10 | ## Demo
11 | 
12 | DCGAN training can be started directly from the script:
13 | 
14 | ```bash
15 | python dcgan.py
16 | ```
17 | 
18 | Script arguments:
19 | 
20 | - `-lr` learning rate, default 1e-4
21 | - `-e` number of training epochs, default 10
22 | - `-b` batch size
23 | - `-g` number of GPUs
24 | 
25 | 
26 | 
27 | Other things to note:
28 | 
29 | - Training uses the MNIST dataset by default; on first run the script downloads it into the `.data/` directory
30 | - After training, the model is saved into the `.checkpoint/` directory by default
31 | - The model structure and parameters follow the TensorFlow [official tutorial](https://www.tensorflow.org/tutorials/generative/dcgan)
32 | - The model periodically writes generated images into the `.gout/` directory and, after training, renders an animated gif of their evolution; generating the gif depends on the Python package `imageio`
33 | 
34 | ![dcgan demo](https://github.com/Oneflow-Inc/OneFlow-Benchmark/blob/dev_gan/Generative/pic/1.png)
35 | 
--------------------------------------------------------------------------------
/Generative/layers.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright 2020 The OneFlow Authors. All rights reserved.
3 | 
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 | 
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | """ 16 | import oneflow.compatible.single_client as flow 17 | 18 | def get_const_initializer(): 19 | return flow.constant_initializer(0.002) 20 | 21 | def deconv2d( 22 | input, 23 | filters, 24 | size, 25 | name, 26 | strides=2, 27 | trainable=True, 28 | reuse=False, 29 | const_init=False, 30 | use_bias=False, 31 | ): 32 | name_ = name if reuse == False else name + "_reuse" 33 | # weight : [in_channels, out_channels, height, width] 34 | weight_shape = (input.shape[1], filters, size, size) 35 | output_shape = ( 36 | input.shape[0], 37 | input.shape[1], 38 | input.shape[2] * strides, 39 | input.shape[3] * strides, 40 | ) 41 | 42 | weight = flow.get_variable( 43 | name + "-weight", 44 | shape=weight_shape, 45 | dtype=input.dtype, 46 | initializer=flow.random_normal_initializer(stddev=0.02) 47 | if not const_init 48 | else get_const_initializer(), 49 | trainable=trainable, 50 | ) 51 | 52 | output = flow.nn.conv2d_transpose( 53 | input, 54 | weight, 55 | strides=[strides, strides], 56 | output_shape=output_shape, 57 | padding="SAME", 58 | data_format="NCHW", 59 | name=name_, 60 | ) 61 | 62 | if use_bias: 63 | bias = flow.get_variable( 64 | name + "-bias", 65 | shape=(filters,), 66 | dtype=input.dtype, 67 | initializer=flow.constant_initializer(0.0), 68 | trainable=trainable, 69 | ) 70 | 71 | output = flow.nn.bias_add(output, bias, "NCHW") 72 | return output 73 | 74 | 75 | def conv2d( 76 | input, 77 | filters, 78 | size, 79 | name, 80 | strides=2, 81 | padding="same", 82 | trainable=True, 83 | reuse=False, 84 | const_init=False, 85 | use_bias=True, 86 | ): 87 | name_ = name if reuse == False else name + "_reuse" 88 | 89 | # (output_dim, k_h, k_w, input.shape[3]) if NHWC 90 | weight_shape = (filters, input.shape[1], size, size) 91 | weight = flow.get_variable( 92 | name + "-weight", 93 | shape=weight_shape, 94 | dtype=input.dtype, 95 | initializer=flow.random_normal_initializer(stddev=0.02) 96 | if not const_init 97 | else get_const_initializer(), 98 | trainable=trainable, 99 | reuse=reuse, 100 | ) 101 | 102 | output = flow.nn.compat_conv2d( 103 | input, 104 | weight, 105 | strides=[strides, strides], 106 | padding=padding, 107 | data_format="NCHW", 108 | name=name_, 109 | ) 110 | 111 | if use_bias: 112 | bias = flow.get_variable( 113 | name + "-bias", 114 | shape=(filters,), 115 | dtype=input.dtype, 116 | initializer=flow.constant_initializer(0.0), 117 | trainable=trainable, 118 | reuse=reuse, 119 | ) 120 | 121 | output = flow.nn.bias_add(output, bias, "NCHW") 122 | return output 123 | 124 | 125 | def batchnorm(input, name, axis=1, reuse=False): 126 | name_ = name if reuse == False else name + "_reuse" 127 | return flow.layers.batch_normalization(input, axis=axis, name=name_) 128 | 129 | def dense( 130 | input, units, name, use_bias=False, trainable=True, reuse=False, const_init=False 131 | ): 132 | name_ = name if reuse == False else name + "_reuse" 133 | 134 | in_shape = input.shape 135 | in_num_axes = len(in_shape) 136 | assert in_num_axes >= 2 137 | 138 | inputs = flow.reshape(input, (-1, in_shape[-1])) if in_num_axes > 2 else input 139 | 140 | weight = flow.get_variable( 141 | name="{}-weight".format(name), 142 | shape=(units, inputs.shape[1]), 143 | dtype=inputs.dtype, 144 | initializer=flow.random_normal_initializer(stddev=0.02) 145 | if not const_init 146 | else get_const_initializer(), 147 | trainable=trainable, 148 | reuse=reuse, 149 | model_name="weight", 150 | ) 151 | 152 | out = flow.matmul(a=inputs, b=weight, transpose_b=True, name=name_ + "matmul",) 153 | 154 | if use_bias: 
155 | bias = flow.get_variable( 156 | name="{}-bias".format(name), 157 | shape=(units,), 158 | dtype=inputs.dtype, 159 | initializer=flow.random_normal_initializer() 160 | if not const_init 161 | else get_const_initializer(), 162 | trainable=trainable, 163 | reuse=reuse, 164 | model_name="bias", 165 | ) 166 | out = flow.nn.bias_add(out, bias, name=name_ + "_bias_add") 167 | 168 | out = flow.reshape(out, in_shape[:-1] + (units,)) if in_num_axes > 2 else out 169 | return out 170 | -------------------------------------------------------------------------------- /Generative/pic/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/OneFlow-Benchmark/de4698a7acce3cca2eaf1564eaf25d59c9954b86/Generative/pic/1.png -------------------------------------------------------------------------------- /Generative/pic/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/OneFlow-Benchmark/de4698a7acce3cca2eaf1564eaf25d59c9954b86/Generative/pic/2.png -------------------------------------------------------------------------------- /LanguageModeling/BERT/classifier.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | import oneflow.compatible.single_client as flow 17 | import bert as bert_util 18 | 19 | 20 | def GlueBERT( 21 | input_ids_blob, 22 | input_mask_blob, 23 | token_type_ids_blob, 24 | label_blob, 25 | vocab_size, 26 | seq_length=512, 27 | hidden_size=768, 28 | num_hidden_layers=12, 29 | num_attention_heads=12, 30 | intermediate_size=3072, 31 | hidden_act="gelu", 32 | hidden_dropout_prob=0.1, 33 | attention_probs_dropout_prob=0.1, 34 | max_position_embeddings=512, 35 | type_vocab_size=16, 36 | initializer_range=0.02, 37 | label_num=2, 38 | replace_prob=None, 39 | ): 40 | backbone = bert_util.BertBackbone( 41 | input_ids_blob=input_ids_blob, 42 | input_mask_blob=input_mask_blob, 43 | token_type_ids_blob=token_type_ids_blob, 44 | vocab_size=vocab_size, 45 | seq_length=seq_length, 46 | hidden_size=hidden_size, 47 | num_hidden_layers=num_hidden_layers, 48 | num_attention_heads=num_attention_heads, 49 | intermediate_size=intermediate_size, 50 | hidden_act=hidden_act, 51 | hidden_dropout_prob=hidden_dropout_prob, 52 | attention_probs_dropout_prob=attention_probs_dropout_prob, 53 | max_position_embeddings=max_position_embeddings, 54 | type_vocab_size=type_vocab_size, 55 | initializer_range=initializer_range, 56 | ) 57 | pooled_output = PooledOutput( 58 | sequence_output=backbone.sequence_output(), 59 | hidden_size=hidden_size, 60 | initializer_range=initializer_range 61 | ) 62 | loss, _, logit_blob = _AddClassficationLoss( 63 | input_blob=pooled_output, 64 | label_blob=label_blob, 65 | hidden_size=hidden_size, 66 | label_num=label_num, 67 | initializer_range=initializer_range, 68 | scope_name='classification' 69 | ) 70 | 71 | return loss, logit_blob 72 | 73 | 74 | def PooledOutput(sequence_output, hidden_size, initializer_range): 75 | with flow.scope.namespace("bert-pooler"): 76 | first_token_tensor = flow.slice( 77 | sequence_output, [None, 0, 0], [None, 1, -1]) 78 | first_token_tensor = flow.reshape( 79 | first_token_tensor, [-1, hidden_size]) 80 | pooled_output = bert_util._FullyConnected( 81 | first_token_tensor, 82 | input_size=hidden_size, 83 | units=hidden_size, 84 | weight_initializer=bert_util.CreateInitializer(initializer_range), 85 | name="dense", 86 | ) 87 | pooled_output = flow.math.tanh(pooled_output) 88 | return pooled_output 89 | 90 | 91 | def _AddClassficationLoss(input_blob, label_blob, hidden_size, label_num, initializer_range, 92 | scope_name='classification'): 93 | with flow.scope.namespace(scope_name): 94 | output_weight_blob = flow.get_variable( 95 | name="output_weights", 96 | shape=[label_num, hidden_size], 97 | dtype=input_blob.dtype, 98 | # initializer=bert_util.CreateInitializer(initializer_range), 99 | initializer=flow.random_normal_initializer( 100 | mean=0.0, stddev=initializer_range, seed=None, dtype=None) 101 | ) 102 | output_bias_blob = flow.get_variable( 103 | name="output_bias", 104 | shape=[label_num], 105 | dtype=input_blob.dtype, 106 | initializer=flow.constant_initializer(0.0), 107 | ) 108 | logit_blob = flow.matmul( 109 | input_blob, output_weight_blob, transpose_b=True) 110 | logit_blob = flow.nn.bias_add(logit_blob, output_bias_blob) 111 | pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits( 112 | logits=logit_blob, labels=label_blob 113 | ) 114 | loss = pre_example_loss 115 | return loss, pre_example_loss, logit_blob 116 | -------------------------------------------------------------------------------- /LanguageModeling/BERT/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | 
Copyright 2020 The OneFlow Authors. All rights reserved.
3 | 
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 | 
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | """
16 | 
17 | import argparse
18 | from datetime import datetime
19 | 
20 | 
21 | def str_list(x):
22 |     return x.split(",")
23 | 
24 | 
25 | def int_list(x):
26 |     return list(map(int, x.split(",")))
27 | 
28 | 
29 | def float_list(x):
30 |     return list(map(float, x.split(",")))
31 | 
32 | 
33 | def str2bool(v):
34 |     if v.lower() in ("yes", "true", "t", "y", "1"):
35 |         return True
36 |     elif v.lower() in ("no", "false", "f", "n", "0"):
37 |         return False
38 |     else:
39 |         raise argparse.ArgumentTypeError("Unsupported value encountered.")
40 | 
41 | 
42 | def get_parser(parser=None):
43 | 
44 |     parser = argparse.ArgumentParser(description="flags for bert")
45 | 
46 |     parser.add_argument(
47 |         "--do_train", type=str2bool, nargs="?", const=True, help="train or not"
48 |     )
49 |     parser.add_argument(
50 |         "--do_eval", type=str2bool, nargs="?", const=True, help="eval or not"
51 |     )
52 |     # resource
53 |     parser.add_argument("--model", type=str, default="BERT Pretrain")
54 |     parser.add_argument("--gpu_num_per_node", type=int, default=1)
55 |     parser.add_argument(
56 |         "--num_nodes", type=int, default=1, help="node/machine number for training"
57 |     )
58 |     parser.add_argument(
59 |         "--node_ips",
60 |         type=str_list,
61 |         default=["192.168.1.13", "192.168.1.14"],
62 |         help='node ip list for training, divided by ",", length >= num_nodes',
63 |     )
64 |     parser.add_argument(
65 |         "--ctrl_port", type=int, default=50051, help="ctrl_port for multinode job"
66 |     )
67 | 
68 |     # train
69 |     parser.add_argument(
70 |         "--learning_rate", type=float, default=1e-4, help="Learning rate"
71 |     )
72 |     parser.add_argument(
73 |         "--weight_decay_rate", type=float, default=0.01, help="weight decay rate"
74 |     )
75 |     parser.add_argument("--warmup_proportion", type=float, default=0.1)
76 |     parser.add_argument(
77 |         "--use_fp16",
78 |         type=str2bool,
79 |         nargs="?",
80 |         default="False",
81 |         const=True,
82 |         help="use fp16 or not",
83 |     )
84 |     parser.add_argument(
85 |         "--use_xla", type=str2bool, nargs="?", const=True, help="whether to use xla"
86 |     )
87 |     parser.add_argument(
88 |         "--num_accumulation_steps",
89 |         type=int,
90 |         default=1,
91 |         help="Number of accumulation steps before gradient update; global batch size = num_accumulation_steps * train_batch_size",
92 |     )
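    # Worked example: train_batch_size = 64 with num_accumulation_steps = 8
    # accumulates gradients over 8 micro-batches per optimizer update,
    # for an effective global batch size of 64 * 8 = 512.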
93 |     parser.add_argument(
94 |         "--optimizer_type",
95 |         type=str,
96 |         default="adam",
97 |         help="Optimizer used for training - LAMB or ADAM",
98 |     )
99 | 
100 |     # log and restore/save
101 |     parser.add_argument(
102 |         "--loss_print_every_n_iter",
103 |         type=int,
104 |         default=10,
105 |         required=False,
106 |         help="print loss every n iterations",
107 |     )
108 |     parser.add_argument(
109 |         "--model_save_every_n_iter",
110 |         type=int,
111 |         default=10000,
112 |         required=False,
113 |         help="save the model every n iterations",
114 |     )
115 |     parser.add_argument(
116 |         "--model_save_dir",
117 |         type=str,
118 |         default="./output/model_save-{}".format(
119 |             str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
120 |         ),
121 |         required=False,
122 |         help="model save directory",
123 |     )
124 |     parser.add_argument(
125 |         "--model_save_init",
126 |         action="store_true",
127 |         default=False,
128 |         help="save a snapshot of the freshly initialized model",
129 |     )
130 | 
131 |     parser.add_argument(
132 |         "--save_last_snapshot",
133 |         type=str2bool,
134 |         default=False,
135 |         required=False,
136 |         help="save a model snapshot at the last iteration",
137 |     )
138 |     parser.add_argument(
139 |         "--model_load_dir", type=str, default=None, help="model load directory"
140 |     )
141 |     parser.add_argument(
142 |         "--log_dir", type=str, default="./output", help="log info save directory"
143 |     )
144 | 
145 |     # bert backbone
146 |     parser.add_argument(
147 |         "--do_lower_case", type=str2bool, nargs="?", const=True, default="True"
148 |     )
149 |     parser.add_argument("--seq_length", type=int, default=512)
150 |     parser.add_argument("--max_predictions_per_seq", type=int, default=80)
151 |     parser.add_argument("--num_hidden_layers", type=int, default=24)
152 |     parser.add_argument("--num_attention_heads", type=int, default=16)
153 |     parser.add_argument("--max_position_embeddings", type=int, default=512)
154 |     parser.add_argument("--type_vocab_size", type=int, default=2)
155 |     parser.add_argument("--vocab_size", type=int, default=30522)
156 |     parser.add_argument("--attention_probs_dropout_prob", type=float, default=0.1)
157 |     parser.add_argument("--hidden_dropout_prob", type=float, default=0.1)
158 |     parser.add_argument("--hidden_size_per_head", type=int, default=64)
159 | 
160 |     return parser
161 | 
162 | 
163 | def print_args(args):
164 |     print("=".ljust(66, "="))
165 |     print(
166 |         "Running {}: num_gpu_per_node = {}, num_nodes = {}.".format(
167 |             args.model, args.gpu_num_per_node, args.num_nodes
168 |         )
169 |     )
170 |     print("=".ljust(66, "="))
171 |     for arg in vars(args):
172 |         print("{} = {}".format(arg, getattr(args, arg)))
173 |     print("-".ljust(66, "-"))
174 |     print("Time stamp: {}".format(str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))))
175 | 
176 | 
177 | if __name__ == "__main__":
178 |     parser = get_parser()
179 |     args = parser.parse_args()
180 |     print_args(args)
--------------------------------------------------------------------------------
/LanguageModeling/BERT/convert_tf_ckpt_to_of.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright 2020 The OneFlow Authors. All rights reserved.
3 | 
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 | 
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | """ 16 | """Convert tensorflow checkpoint to oneflow snapshot""" 17 | 18 | import re 19 | import argparse 20 | import tensorflow as tf 21 | import numpy as np 22 | import os 23 | 24 | parser = argparse.ArgumentParser() 25 | 26 | ## Required parameters 27 | parser.add_argument("--tf_checkpoint_path", 28 | default = None, 29 | type = str, 30 | required = True, 31 | help = "Path the TensorFlow checkpoint path.") 32 | parser.add_argument("--of_dump_path", 33 | default = None, 34 | type = str, 35 | required = True, 36 | help = "Path to the output OneFlow model.") 37 | 38 | #args = parser.parse_args() 39 | args, unknown = parser.parse_known_args() 40 | print(args) 41 | 42 | # parse unknown arguments for extra weights 43 | extra_weights = {} 44 | for u in unknown: 45 | w = u.split("=") 46 | assert len(w) == 2 47 | if len(w) == 2: 48 | extra_weights[w[0]] = float(w[1]) 49 | 50 | 51 | def _write_blob(folder, blob): 52 | os.makedirs(folder, exist_ok=True) 53 | filename = os.path.join(folder, "out") 54 | f = open(filename, 'wb') 55 | f.write(blob.tobytes()) 56 | f.close() 57 | print(filename, blob.shape) 58 | 59 | def _SaveWeightBlob2File(blob, folder): 60 | _write_blob(folder, blob) 61 | 62 | for weight, default_value in extra_weights.items(): 63 | d = np.full_like(blob, default_value) 64 | _write_blob(folder + weight, d) 65 | 66 | def convert(): 67 | path = args.tf_checkpoint_path 68 | init_vars = tf.train.list_variables(path) 69 | for name, shape in init_vars: 70 | array = tf.train.load_variable(path, name) 71 | 72 | sep = name.rfind('/') 73 | blob_name = name[sep + 1:] 74 | op_name = name[:sep].replace('/', '-') 75 | 76 | if blob_name == "kernel": 77 | blob_name = "weight" 78 | elif blob_name in ['adam_m', 'adam_v']: 79 | print("find m, v weights") 80 | 81 | folder_name = op_name+"-"+blob_name 82 | folder = os.path.join(args.of_dump_path, folder_name) 83 | #print("saved to:", folder) 84 | 85 | _SaveWeightBlob2File(array, folder) 86 | 87 | 88 | if __name__ == "__main__": 89 | convert() 90 | 91 | -------------------------------------------------------------------------------- /LanguageModeling/BERT/pretrain.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | import oneflow.compatible.single_client as flow 17 | import bert as bert_util 18 | import oneflow.core.operator.op_conf_pb2 as op_conf_util 19 | 20 | 21 | def PreTrain( 22 | input_ids_blob, 23 | input_mask_blob, 24 | token_type_ids_blob, 25 | masked_lm_positions_blob, 26 | masked_lm_ids_blob, 27 | masked_lm_weights_blob, 28 | next_sentence_label_blob, 29 | vocab_size, 30 | seq_length=512, 31 | hidden_size=768, 32 | num_hidden_layers=12, 33 | num_attention_heads=12, 34 | intermediate_size=3072, 35 | hidden_act="gelu", 36 | hidden_dropout_prob=0.1, 37 | attention_probs_dropout_prob=0.1, 38 | max_position_embeddings=512, 39 | type_vocab_size=16, 40 | max_predictions_per_seq=20, 41 | initializer_range=0.02, 42 | use_fp16=False, 43 | ): 44 | backbone = bert_util.BertBackbone( 45 | input_ids_blob=input_ids_blob, 46 | input_mask_blob=input_mask_blob, 47 | token_type_ids_blob=token_type_ids_blob, 48 | vocab_size=vocab_size, 49 | seq_length=seq_length, 50 | hidden_size=hidden_size, 51 | num_hidden_layers=num_hidden_layers, 52 | num_attention_heads=num_attention_heads, 53 | intermediate_size=intermediate_size, 54 | hidden_act=hidden_act, 55 | hidden_dropout_prob=hidden_dropout_prob, 56 | attention_probs_dropout_prob=attention_probs_dropout_prob, 57 | max_position_embeddings=max_position_embeddings, 58 | type_vocab_size=type_vocab_size, 59 | initializer_range=initializer_range, 60 | ) 61 | 62 | (lm_loss, _, _) = _AddMaskedLanguageModelLoss( 63 | input_blob=backbone.sequence_output(), 64 | output_weights_blob=backbone.embedding_table(), 65 | positions_blob=masked_lm_positions_blob, 66 | label_id_blob=masked_lm_ids_blob, 67 | label_weight_blob=masked_lm_weights_blob, 68 | seq_length=seq_length, 69 | hidden_size=hidden_size, 70 | vocab_size=vocab_size, 71 | max_predictions_per_seq=max_predictions_per_seq, 72 | hidden_act=bert_util.GetActivation(hidden_act), 73 | initializer_range=initializer_range, 74 | ) 75 | pooled_output = PooledOutput( 76 | backbone.sequence_output(), hidden_size, initializer_range 77 | ) 78 | (ns_loss, _, _) = _AddNextSentenceOutput( 79 | input_blob=pooled_output, 80 | label_blob=next_sentence_label_blob, 81 | hidden_size=hidden_size, 82 | initializer_range=initializer_range, 83 | ) 84 | with flow.scope.namespace("cls-loss"): 85 | lm_loss = flow.math.reduce_mean(lm_loss) 86 | ns_loss = flow.math.reduce_mean(ns_loss) 87 | total_loss = lm_loss + ns_loss 88 | return total_loss, lm_loss, ns_loss 89 | 90 | 91 | def PooledOutput(sequence_output, hidden_size, initializer_range): 92 | with flow.scope.namespace("bert-pooler"): 93 | first_token_tensor = flow.slice(sequence_output, [None, 0, 0], [None, 1, -1]) 94 | first_token_tensor = flow.reshape(first_token_tensor, [-1, hidden_size]) 95 | pooled_output = bert_util._FullyConnected( 96 | first_token_tensor, 97 | input_size=hidden_size, 98 | units=hidden_size, 99 | weight_initializer=bert_util.CreateInitializer(initializer_range), 100 | name="dense", 101 | ) 102 | pooled_output = flow.math.tanh(pooled_output) 103 | return pooled_output 104 | 105 | 106 | def _AddMaskedLanguageModelLoss( 107 | input_blob, 108 | output_weights_blob, 109 | positions_blob, 110 | label_id_blob, 111 | label_weight_blob, 112 | seq_length, 113 | hidden_size, 114 | vocab_size, 115 | max_predictions_per_seq, 116 | hidden_act, 117 | initializer_range, 118 | ): 119 | with flow.scope.namespace("other"): 120 | sum_label_weight_blob = flow.math.reduce_sum(label_weight_blob, axis=[-1]) 121 | ones = sum_label_weight_blob * 0.0 + 1.0 122 | sum_label_weight_blob = 
flow.math.reduce_sum(sum_label_weight_blob) 123 | batch_size = flow.math.reduce_sum(ones) 124 | sum_label_weight_blob = sum_label_weight_blob / batch_size 125 | with flow.scope.namespace("cls-predictions"): 126 | input_blob = _GatherIndexes(input_blob, positions_blob, seq_length, hidden_size) 127 | with flow.scope.namespace("transform"): 128 | if callable(hidden_act): 129 | act_fn = op_conf_util.kNone 130 | else: 131 | act_fn = hidden_act 132 | input_blob = bert_util._FullyConnected( 133 | input_blob, 134 | input_size=hidden_size, 135 | units=hidden_size, 136 | activation=act_fn, 137 | weight_initializer=bert_util.CreateInitializer(initializer_range), 138 | name="dense", 139 | ) 140 | if callable(hidden_act): 141 | input_blob = hidden_act(input_blob) 142 | input_blob = bert_util._LayerNorm(input_blob, hidden_size) 143 | output_bias = flow.get_variable( 144 | name="output_bias", 145 | shape=[vocab_size], 146 | dtype=input_blob.dtype, 147 | initializer=flow.constant_initializer(1.0), 148 | ) 149 | logit_blob = flow.matmul(input_blob, output_weights_blob, transpose_b=True) 150 | logit_blob = flow.nn.bias_add(logit_blob, output_bias) 151 | label_id_blob = flow.reshape(label_id_blob, [-1]) 152 | pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits( 153 | logits=logit_blob, labels=label_id_blob 154 | ) 155 | pre_example_loss = flow.reshape(pre_example_loss, [-1, max_predictions_per_seq]) 156 | numerator = pre_example_loss * label_weight_blob 157 | with flow.scope.namespace("loss"): 158 | numerator = flow.math.reduce_sum(numerator, axis=[-1]) 159 | denominator = sum_label_weight_blob + 1e-5 160 | loss = numerator / denominator 161 | return loss, pre_example_loss, logit_blob 162 | 163 | 164 | def _GatherIndexes(sequence_blob, positions_blob, seq_length, hidden_size): 165 | output = flow.gather( 166 | params=sequence_blob, indices=positions_blob, axis=2, batch_dims=2 167 | ) 168 | output = flow.reshape(output, [-1, hidden_size]) 169 | return output 170 | 171 | 172 | def _AddNextSentenceOutput(input_blob, label_blob, hidden_size, initializer_range): 173 | with flow.scope.namespace("cls-seq_relationship"): 174 | output_weight_blob = flow.get_variable( 175 | name="output_weights", 176 | shape=[2, hidden_size], 177 | dtype=input_blob.dtype, 178 | initializer=bert_util.CreateInitializer(initializer_range), 179 | ) 180 | output_bias_blob = flow.get_variable( 181 | name="output_bias", 182 | shape=[2], 183 | dtype=input_blob.dtype, 184 | initializer=flow.constant_initializer(0.0), 185 | ) 186 | logit_blob = flow.matmul(input_blob, output_weight_blob, transpose_b=True) 187 | logit_blob = flow.nn.bias_add(logit_blob, output_bias_blob) 188 | pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits( 189 | logits=logit_blob, labels=label_blob 190 | ) 191 | loss = pre_example_loss 192 | return loss, pre_example_loss, logit_blob 193 | -------------------------------------------------------------------------------- /LanguageModeling/BERT/run_pretraining.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | import os 18 | import argparse 19 | from datetime import datetime 20 | 21 | import config as configs 22 | import oneflow.compatible.single_client as flow 23 | 24 | from pretrain import PreTrain 25 | from util import Snapshot, InitNodes, Metric, CreateOptimizer, GetFunctionConfig 26 | 27 | parser = configs.get_parser() 28 | parser.add_argument("--data_dir", type=str, default=None) 29 | parser.add_argument( 30 | "--data_part_num", type=int, default=32, help="number of data part files in the dataset" 31 | ) 32 | parser.add_argument( 33 | "--iter_num", type=int, default=1144000, help="total iterations to run" 34 | ) 35 | parser.add_argument("--batch_size_per_device", type=int, default=64) 36 | args = parser.parse_args() 37 | configs.print_args(args) 38 | 39 | batch_size = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device 40 | 41 | 42 | def BertDecoder( 43 | data_dir, batch_size, data_part_num, seq_length, max_predictions_per_seq 44 | ): 45 | ofrecord = flow.data.ofrecord_reader( 46 | data_dir, 47 | batch_size=batch_size, 48 | data_part_num=data_part_num, 49 | random_shuffle=True, 50 | shuffle_after_epoch=True, 51 | ) 52 | blob_confs = {} 53 | 54 | def _blob_conf(name, shape, dtype=flow.int32): 55 | blob_confs[name] = flow.data.OFRecordRawDecoder( 56 | ofrecord, name, shape=shape, dtype=dtype 57 | ) 58 | 59 | _blob_conf("input_ids", [seq_length]) 60 | _blob_conf("next_sentence_labels", [1]) 61 | _blob_conf("input_mask", [seq_length]) 62 | _blob_conf("segment_ids", [seq_length]) 63 | _blob_conf("masked_lm_ids", [max_predictions_per_seq]) 64 | _blob_conf("masked_lm_positions", [max_predictions_per_seq]) 65 | _blob_conf("masked_lm_weights", [max_predictions_per_seq], flow.float) 66 | return blob_confs 67 | 68 | 69 | @flow.global_function(type="train", function_config=GetFunctionConfig(args)) 70 | def PretrainJob(): 71 | hidden_size = 64 * args.num_attention_heads # H = 64, size per head 72 | intermediate_size = hidden_size * 4 73 | 74 | if args.data_part_num == 1: 75 | with flow.scope.placement("cpu", "0:0"): 76 | decoders = BertDecoder( 77 | args.data_dir, 78 | batch_size, 79 | args.data_part_num, 80 | args.seq_length, 81 | args.max_predictions_per_seq, 82 | ) 83 | else: 84 | assert args.data_part_num > 1 85 | decoders = BertDecoder( 86 | args.data_dir, 87 | batch_size, 88 | args.data_part_num, 89 | args.seq_length, 90 | args.max_predictions_per_seq, 91 | ) 92 | 93 | total_loss, mlm_loss, nsp_loss = PreTrain( 94 | decoders["input_ids"], 95 | decoders["input_mask"], 96 | decoders["segment_ids"], 97 | decoders["masked_lm_positions"], 98 | decoders["masked_lm_ids"], 99 | decoders["masked_lm_weights"], 100 | decoders["next_sentence_labels"], 101 | args.vocab_size, 102 | seq_length=args.seq_length, 103 | hidden_size=hidden_size, 104 | num_hidden_layers=args.num_hidden_layers, 105 | num_attention_heads=args.num_attention_heads, 106 | intermediate_size=intermediate_size, 107 | hidden_act="gelu", 108 | hidden_dropout_prob=args.hidden_dropout_prob, 109 | attention_probs_dropout_prob=args.attention_probs_dropout_prob, 110 | 
max_position_embeddings=args.max_position_embeddings, 111 | type_vocab_size=args.type_vocab_size, 112 | max_predictions_per_seq=args.max_predictions_per_seq, 113 | initializer_range=0.02, 114 | use_fp16=args.use_fp16, 115 | ) 116 | opt = CreateOptimizer(args) 117 | opt.minimize(total_loss) 118 | return {"total_loss": total_loss, "mlm_loss": mlm_loss, "nsp_loss": nsp_loss} 119 | 120 | 121 | def main(): 122 | flow.config.gpu_device_num(args.gpu_num_per_node) 123 | flow.env.log_dir(args.log_dir) 124 | 125 | InitNodes(args) 126 | 127 | snapshot = Snapshot(args.model_save_dir, args.model_load_dir, args.model_save_init) 128 | 129 | print("num_accumulation_steps:", args.num_accumulation_steps) 130 | metric = Metric( 131 | desc="train", 132 | print_steps=args.loss_print_every_n_iter, 133 | batch_size=batch_size * args.num_accumulation_steps, 134 | keys=["total_loss", "mlm_loss", "nsp_loss"], 135 | ) 136 | 137 | for step in range(args.iter_num): 138 | PretrainJob().async_get(metric.metric_cb(step)) 139 | # PretrainJob().async_get(metric.metric_cb(step, epoch=3)) 140 | if (step + 1) % args.model_save_every_n_iter == 0: 141 | snapshot.save("snapshot_%d" % (step + 1)) 142 | 143 | if args.save_last_snapshot: 144 | snapshot.save("last_snapshot") 145 | 146 | 147 | if __name__ == "__main__": 148 | main() 149 | -------------------------------------------------------------------------------- /LanguageModeling/BERT/run_pretraining_adam.sh: -------------------------------------------------------------------------------- 1 | BENCH_ROOT_DIR=/path/to/OneFlow-Benchmark/LanguageModeling/BERT 2 | OUTPUT_DIR=/DATA/disk1/of_output 3 | 4 | DATA_DIR=/DATA/disk1/bert/wiki_seq_len_128 5 | 6 | 7 | BZ=48 8 | ITER_NUM=1000000 9 | max_seq_length=128 10 | max_predictions_per_seq=20 11 | 12 | of_log_dir=$OUTPUT_DIR/bert_master/of 13 | rm -rf ${of_log_dir} 14 | mkdir -p ${of_log_dir} 15 | rm -rf core.* 16 | 17 | export PYTHONUNBUFFERED=1 18 | export ONEFLOW_DEBUG_MODE=True 19 | export GLOG_v=3 20 | export CUDA_VISIBLE_DEVICES=6 21 | python3 $BENCH_ROOT_DIR/run_pretraining.py \ 22 | --gpu_num_per_node=1 \ 23 | --num_nodes=1 \ 24 | --learning_rate=1.25e-5 \ 25 | --warmup_proportion=0.01 \ 26 | --weight_decay_rate=0.01 \ 27 | --batch_size_per_device=${BZ} \ 28 | --iter_num=${ITER_NUM} \ 29 | --loss_print_every_n_iter=1 \ 30 | --seq_length=128 \ 31 | --use_fp16 \ 32 | --max_predictions_per_seq=20 \ 33 | --num_hidden_layers=12 \ 34 | --num_attention_heads=12 \ 35 | --num_accumulation_steps=1 \ 36 | --max_position_embeddings=512 \ 37 | --type_vocab_size=2 \ 38 | --vocab_size=30522 \ 39 | --attention_probs_dropout_prob=0.1 \ 40 | --hidden_dropout_prob=0.1 \ 41 | --hidden_size_per_head=64 \ 42 | --data_part_num=64 \ 43 | --data_dir=$DATA_DIR \ 44 | --log_dir=${of_log_dir} \ 45 | --model_save_every_n_iter=50000 \ 46 | --model_save_dir=${of_log_dir} 47 | -------------------------------------------------------------------------------- /LanguageModeling/BERT/run_pretraining_lamb.sh: -------------------------------------------------------------------------------- 1 | BENCH_ROOT_DIR=/path/to/OneFlow-Benchmark/LanguageModeling/BERT 2 | OUTPUT_DIR=/DATA/disk1/of_output 3 | 4 | DATA_DIR=/DATA/disk1/bert/wiki_seq_len_128 5 | 6 | 7 | BZ=16 8 | ITER_NUM=1000000 9 | max_seq_length=128 10 | max_predictions_per_seq=20 11 | 12 | of_log_dir=$OUTPUT_DIR/bert_master/of 13 | rm -rf ${of_log_dir} 14 | mkdir -p ${of_log_dir} 15 | rm -rf core.* 16 | 17 | export PYTHONUNBUFFERED=1 18 | export ONEFLOW_DEBUG_MODE=True 19 | export GLOG_v=3 20 | 21 | python3 
$BENCH_ROOT_DIR/run_pretraining.py \ 22 | --gpu_num_per_node=8 \ 23 | --num_nodes=1 \ 24 | --learning_rate=1e-4 \ 25 | --warmup_proportion=0.01 \ 26 | --weight_decay_rate=0.01 \ 27 | --batch_size_per_device=${BZ} \ 28 | --iter_num=${ITER_NUM} \ 29 | --loss_print_every_n_iter=1 \ 30 | --seq_length=128 \ 31 | --use_fp16 \ 32 | --optimizer_type="lamb" \ 33 | --max_predictions_per_seq=20 \ 34 | --num_hidden_layers=12 \ 35 | --num_attention_heads=12 \ 36 | --num_accumulation_steps=512 \ 37 | --max_position_embeddings=512 \ 38 | --type_vocab_size=2 \ 39 | --vocab_size=30522 \ 40 | --attention_probs_dropout_prob=0.1 \ 41 | --hidden_dropout_prob=0.1 \ 42 | --hidden_size_per_head=64 \ 43 | --data_part_num=64 \ 44 | --data_dir=$DATA_DIR \ 45 | --log_dir=${of_log_dir} \ 46 | --model_save_every_n_iter=50000 \ 47 | --model_save_dir=${of_log_dir} 48 | -------------------------------------------------------------------------------- /LanguageModeling/BERT/run_squad.sh: -------------------------------------------------------------------------------- 1 | BENCH_ROOT_DIR=/path/to/OneFlow-Benchmark/LanguageModeling/BERT 2 | # pretrained model dir 3 | PRETRAINED_MODEL=/DATA/disk1/of_output/uncased_L-12_H-768_A-12_oneflow 4 | 5 | # squad ofrecord dataset dir 6 | DATA_ROOT=/DATA/disk1/of_output/bert/of_squad 7 | 8 | # `vocab.txt` dir 9 | REF_ROOT_DIR=/DATA/disk1/of_output/uncased_L-12_H-768_A-12 10 | 11 | # `evaluate-v*.py` and `dev-v*.json` dir 12 | SQUAD_TOOL_DIR=/DATA/disk1/of_output/bert/of_squad 13 | db_version=${1:-"v2.0"} 14 | if [ "$db_version" = "v1.1" ]; then 15 | train_example_num=88614 16 | eval_example_num=10833 17 | version_2_with_negative="False" 18 | elif [ "$db_version" = "v2.0" ]; then 19 | train_example_num=131944 20 | eval_example_num=12232 21 | version_2_with_negative="True" 22 | else 23 | echo "db_version must be 'v1.1' or 'v2.0'" 24 | exit 1 25 | fi 26 | 27 | train_data_dir=$DATA_ROOT/train-$db_version 28 | eval_data_dir=$DATA_ROOT/dev-$db_version 29 | LOGFILE=./bert_fp_training.log 30 | export PYTHONUNBUFFERED=1 31 | export ONEFLOW_DEBUG_MODE=True 32 | export CUDA_VISIBLE_DEVICES=7 33 | # finetune and eval SQuAD, 34 | # `predictions.json` will be saved to folder `./squad_output` 35 | python3 $BENCH_ROOT_DIR/run_squad.py \ 36 | --model=SQuAD \ 37 | --do_train=True \ 38 | --do_eval=True \ 39 | --gpu_num_per_node=1 \ 40 | --learning_rate=3e-5 \ 41 | --batch_size_per_device=16 \ 42 | --eval_batch_size_per_device=16 \ 43 | --num_epoch=3 \ 44 | --use_fp16 \ 45 | --version_2_with_negative=$version_2_with_negative \ 46 | --loss_print_every_n_iter=20 \ 47 | --do_lower_case=True \ 48 | --seq_length=384 \ 49 | --num_hidden_layers=12 \ 50 | --num_attention_heads=12 \ 51 | --max_position_embeddings=512 \ 52 | --type_vocab_size=2 \ 53 | --vocab_size=30522 \ 54 | --attention_probs_dropout_prob=0.1 \ 55 | --hidden_dropout_prob=0.1 \ 56 | --hidden_size_per_head=64 \ 57 | --train_data_dir=$train_data_dir \ 58 | --train_example_num=$train_example_num \ 59 | --eval_data_dir=$eval_data_dir \ 60 | --eval_example_num=$eval_example_num \ 61 | --log_dir=./log \ 62 | --model_load_dir=${PRETRAINED_MODEL} \ 63 | --save_last_snapshot=True \ 64 | --model_save_dir=./squad_snapshots \ 65 | --vocab_file=$REF_ROOT_DIR/vocab.txt \ 66 | --predict_file=$SQUAD_TOOL_DIR/dev-${db_version}.json \ 67 | --output_dir=./squad_output 2>&1 | tee ${LOGFILE} 68 | 69 | 70 | # evaluate predictions.json to get metrics 71 | python3 $SQUAD_TOOL_DIR/evaluate-${db_version}.py \ 72 | $SQUAD_TOOL_DIR/dev-${db_version}.json \ 73 | 
./squad_output/predictions.json 74 | -------------------------------------------------------------------------------- /LanguageModeling/BERT/squad.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | import oneflow.compatible.single_client as flow 17 | import bert as bert_util 18 | 19 | 20 | def SQuAD( 21 | input_ids_blob, 22 | input_mask_blob, 23 | token_type_ids_blob, 24 | vocab_size, 25 | seq_length=512, 26 | hidden_size=768, 27 | num_hidden_layers=12, 28 | num_attention_heads=12, 29 | intermediate_size=3072, 30 | hidden_act="gelu", 31 | hidden_dropout_prob=0.1, 32 | attention_probs_dropout_prob=0.1, 33 | max_position_embeddings=512, 34 | type_vocab_size=16, 35 | initializer_range=0.02, 36 | ): 37 | 38 | backbone = bert_util.BertBackbone( 39 | input_ids_blob=input_ids_blob, 40 | input_mask_blob=input_mask_blob, 41 | token_type_ids_blob=token_type_ids_blob, 42 | vocab_size=vocab_size, 43 | seq_length=seq_length, 44 | hidden_size=hidden_size, 45 | num_hidden_layers=num_hidden_layers, 46 | num_attention_heads=num_attention_heads, 47 | intermediate_size=intermediate_size, 48 | hidden_act=hidden_act, 49 | hidden_dropout_prob=hidden_dropout_prob, 50 | attention_probs_dropout_prob=attention_probs_dropout_prob, 51 | max_position_embeddings=max_position_embeddings, 52 | type_vocab_size=type_vocab_size, 53 | initializer_range=initializer_range, 54 | ) 55 | 56 | with flow.scope.namespace("cls-squad"): 57 | final_hidden = backbone.sequence_output() 58 | final_hidden_matrix = flow.reshape(final_hidden, [-1, hidden_size]) 59 | logits = bert_util._FullyConnected( 60 | final_hidden_matrix, 61 | hidden_size, 62 | units=2, 63 | weight_initializer=bert_util.CreateInitializer(initializer_range), 64 | name='output') 65 | logits = flow.reshape(logits, [-1, seq_length, 2]) 66 | 67 | start_logits = flow.slice(logits, [None, None, 0], [None, None, 1]) 68 | end_logits = flow.slice(logits, [None, None, 1], [None, None, 1]) 69 | 70 | return start_logits, end_logits 71 | -------------------------------------------------------------------------------- /LanguageModeling/BERT/util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2020 The OneFlow Authors. All rights reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | 17 | import os 18 | import time 19 | from collections import OrderedDict 20 | import oneflow.compatible.single_client as flow 21 | 22 | 23 | def InitNodes(args): 24 | if args.num_nodes > 1: 25 | assert args.num_nodes <= len(args.node_ips) 26 | flow.env.ctrl_port(args.ctrl_port) 27 | nodes = [] 28 | for ip in args.node_ips[: args.num_nodes]: 29 | addr_dict = {} 30 | addr_dict["addr"] = ip 31 | nodes.append(addr_dict) 32 | 33 | flow.env.machine(nodes) 34 | 35 | 36 | class Snapshot(object): 37 | def __init__(self, model_save_dir, model_load_dir, model_save_init=False): 38 | self._model_save_dir = model_save_dir 39 | if model_load_dir: 40 | assert os.path.isdir(model_load_dir) 41 | print("Restoring model from {}.".format(model_load_dir)) 42 | flow.load_variables(flow.checkpoint.get(model_load_dir)) 43 | elif model_save_init: 44 | flow.checkpoint.save("initial_model") 45 | print("Init model on demand.") 46 | 47 | def save(self, name): 48 | snapshot_save_path = os.path.join( 49 | self._model_save_dir, "snapshot_{}".format(name) 50 | ) 51 | if not os.path.exists(snapshot_save_path): 52 | os.makedirs(snapshot_save_path) 53 | print("Saving model to {}.".format(snapshot_save_path)) 54 | flow.checkpoint.save(snapshot_save_path) 55 | 56 | 57 | class StopWatch(object): 58 | def __init__(self): 59 | pass 60 | 61 | def start(self): 62 | self.start_time = time.time() 63 | self.last_split = self.start_time 64 | 65 | def split(self): 66 | now = time.time() 67 | duration = now - self.last_split 68 | self.last_split = now 69 | return duration 70 | 71 | def stop(self): 72 | self.stop_time = time.time() 73 | 74 | def duration(self): 75 | return self.stop_time - self.start_time 76 | 77 | 78 | class Metric(object): 79 | def __init__( 80 | self, 81 | desc="train", 82 | print_steps=-1, 83 | batch_size=256, 84 | keys=[], 85 | nvidia_smi_report_step=10, 86 | ): 87 | r"""accumulate and calculate metric 88 | 89 | Args: 90 | desc: `str` general description of the metric to show 91 | print_steps: `Int` print metrics every nth steps 92 | batch_size: `Int` batch size per step 93 | keys: keys in callback outputs 94 | Returns: 95 | """ 96 | self.desc = desc 97 | self.print_steps = print_steps 98 | assert batch_size > 0 99 | self.batch_size = batch_size 100 | self.nvidia_smi_report_step = nvidia_smi_report_step 101 | 102 | assert isinstance(keys, (list, tuple)) 103 | self.keys = keys 104 | self.metric_dict = OrderedDict() 105 | self.metric_dict["step"] = 0 106 | 107 | self.timer = StopWatch() 108 | self.timer.start() 109 | self._clear() 110 | 111 | def _clear(self): 112 | for key in self.keys: 113 | self.metric_dict[key] = 0.0 114 | self.metric_dict["n_" + key] = 0.0 115 | self.metric_dict["throughput"] = 0.0 116 | self.num_samples = 0.0 117 | 118 | def update_and_save(self, key, value, step, **kwargs): 119 | self.metric_dict[key] = value 120 | self.metric_dict.pop("n_" + key, None) 121 | 122 | def metric_cb(self, step=0, **kwargs): 123 | def callback(outputs): 124 | if step == 0: 125 | self._clear() 126 | 127 | if step == self.nvidia_smi_report_step: 128 | cmd = "nvidia-smi --query-gpu=utilization.gpu,memory.used --format=csv" 129 | os.system(cmd) 130 | 131 | for key in self.keys: 132 | self.metric_dict[key] += outputs[key].sum() 133 | self.metric_dict["n_" + key] += outputs[key].size 134 | 135 | self.num_samples += self.batch_size 136 | 137 | if (step + 1) % self.print_steps == 0: 138 | self.metric_dict["step"] = step 139 | for k, v in kwargs.items(): 140 | self.metric_dict[k] = v 141 | throughput = 
self.num_samples / self.timer.split() 142 | self.update_and_save("throughput", throughput, step) 143 | for key in self.keys: 144 | value = self.metric_dict[key] / self.metric_dict["n_" + key] 145 | self.update_and_save(key, value, step, **kwargs) 146 | print( 147 | ", ".join( 148 | ("{}: {}" if type(v) is int else "{}: {:.3f}").format(k, v) 149 | for k, v in self.metric_dict.items() 150 | ), 151 | time.time(), 152 | ) 153 | self._clear() 154 | 155 | return callback 156 | 157 | 158 | def CreateOptimizer(args): 159 | warmup_batches = int(args.iter_num * args.warmup_proportion) 160 | lr_warmup = flow.optimizer.warmup.linear(warmup_batches, 0) 161 | lr_scheduler = flow.optimizer.PolynomialScheduler( 162 | args.learning_rate, args.iter_num, 0.0, warmup=lr_warmup 163 | ) 164 | loss_scale_policy = None 165 | if args.use_fp16: 166 | loss_scale_policy = flow.optimizer.loss_scale.dynamic_loss_scale( 167 | increment_period=2000 168 | ) 169 | 170 | if args.optimizer_type == "lamb": 171 | return flow.optimizer.LAMB( 172 | lr_scheduler, 173 | beta1=0.9, 174 | beta2=0.999, 175 | epsilon=1e-6, 176 | weight_decay=args.weight_decay_rate, 177 | weight_decay_excludes=["bias", "LayerNorm", "layer_norm"], 178 | grad_clipping=flow.optimizer.grad_clipping.by_global_norm(1.0), 179 | loss_scale_policy=loss_scale_policy, 180 | ) 181 | else: 182 | return flow.optimizer.AdamW( 183 | lr_scheduler, 184 | epsilon=1e-6, 185 | weight_decay=args.weight_decay_rate, 186 | weight_decay_excludes=["bias", "LayerNorm", "layer_norm"], 187 | grad_clipping=flow.optimizer.grad_clipping.by_global_norm(1.0), 188 | loss_scale_policy=loss_scale_policy, 189 | ) 190 | 191 | 192 | def GetFunctionConfig(args): 193 | config = flow.function_config() 194 | config.enable_auto_mixed_precision(args.use_fp16) 195 | config.train.num_gradient_accumulation_steps(args.num_accumulation_steps) 196 | if args.use_xla: 197 | config.use_xla_jit(True) 198 | config.enable_fuse_add_to_output(True) 199 | config.enable_fuse_model_update_ops(True) 200 | return config 201 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020 The OneFlow Authors. All rights reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/examples/distribute_pretrain_2n4d.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | # set -ex 3 | 4 | export ONEFLOW_GPT_NUM_GPUS_PER_NODE=4 5 | export ONEFLOW_GPT_NUM_NODES=2 6 | # Set this env to your training nodes' IPs 7 | # export ONEFLOW_GPT_NODE_IPS="192.168.1.16,192.168.1.15" 8 | 9 | # If you keep the training data somewhere else, set this env 10 | # export ONEFLOW_GPT_DATASET=/data/gpt/gpt_sample_dataset_text_document 11 | export ONEFLOW_GPT_SEQ_LENGTH=2048 12 | 13 | export ONEFLOW_GPT_HIDDEN_SIZE=1536 14 | export ONEFLOW_GPT_NUM_ATTENTION_HEADS=16 15 | export ONEFLOW_GPT_NUM_LAYERS=16 16 | 17 | export ONEFLOW_GPT_TENSOR_MODEL_PARALLEL_SIZE=4 18 | export ONEFLOW_GPT_PIPELINE_MODEL_PARALLEL_SIZE=1 19 | 20 | export ONEFLOW_GPT_MICRO_BATCH_SIZE=8 21 | export ONEFLOW_GPT_GLOBAL_BATCH_SIZE=16 22 | 23 | source $(dirname $0)/pretrain.sh 24 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/examples/distribute_pretrain_4n8d.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # set -ex 3 | 4 | export ONEFLOW_GPT_NUM_GPUS_PER_NODE=8 5 | export ONEFLOW_GPT_NUM_NODES=4 6 | # Set this env to your training nodes' IPs 7 | # export ONEFLOW_GPT_NODE_IPS="10.11.0.2,10.11.0.3,10.11.0.4,10.11.0.5" 8 | 9 | # If you keep the training data somewhere else, set this env 10 | # export ONEFLOW_GPT_DATASET=/data/gpt/gpt_sample_dataset_text_document 11 | export ONEFLOW_GPT_SEQ_LENGTH=2048 12 | 13 | export ONEFLOW_GPT_HIDDEN_SIZE=2304 14 | export ONEFLOW_GPT_NUM_ATTENTION_HEADS=24 15 | export ONEFLOW_GPT_NUM_LAYERS=24 16 | 17 | export ONEFLOW_GPT_TENSOR_MODEL_PARALLEL_SIZE=8 18 | export ONEFLOW_GPT_PIPELINE_MODEL_PARALLEL_SIZE=1 19 | 20 | export ONEFLOW_GPT_MICRO_BATCH_SIZE=8 21 | export ONEFLOW_GPT_GLOBAL_BATCH_SIZE=32 22 | 23 | source $(dirname $0)/pretrain.sh 24 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/examples/distribute_pretrain_4n8d_2x4x4_512_2304x24.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # set -ex 3 | 4 | export ONEFLOW_GPT_NUM_GPUS_PER_NODE=8 5 | export ONEFLOW_GPT_NUM_NODES=4 6 | # Set this env to your training nodes' IPs 7 | # export ONEFLOW_GPT_NODE_IPS="10.11.0.2,10.11.0.3,10.11.0.4,10.11.0.5" 8 | 9 | # If you keep the training data somewhere else, set this env 10 | # export ONEFLOW_GPT_DATASET=/data/gpt/gpt_sample_dataset_text_document 11 | export ONEFLOW_GPT_SEQ_LENGTH=2048 12 | 13 | export ONEFLOW_GPT_HIDDEN_SIZE=2304 14 | export ONEFLOW_GPT_NUM_ATTENTION_HEADS=24 15 | export ONEFLOW_GPT_NUM_LAYERS=24 16 | 17 | export ONEFLOW_GPT_TENSOR_MODEL_PARALLEL_SIZE=4 18 | export ONEFLOW_GPT_PIPELINE_MODEL_PARALLEL_SIZE=4 19 | 20 | export ONEFLOW_GPT_MICRO_BATCH_SIZE=8 21 | export ONEFLOW_GPT_GLOBAL_BATCH_SIZE=512 22 | 23 | source $(dirname $0)/pretrain.sh 24 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/examples/distribute_pretrain_with_container.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | # set -ex 3 | 4 | export ONEFLOW_GPT_NUM_GPUS_PER_NODE=8 5 | export ONEFLOW_GPT_NUM_NODES=4 6 | # export ONEFLOW_GPT_NODE_IPS="192.168.1.16,192.168.1.15,192.168.1.14,192.168.1.13" 7 | 8 | # export ONEFLOW_GPT_DATASET=/data/gpt/gpt_sample_dataset_text_document 9 | export ONEFLOW_GPT_SEQ_LENGTH=2048 10 | 11 | export ONEFLOW_GPT_HIDDEN_SIZE=2304 12 | export ONEFLOW_GPT_NUM_ATTENTION_HEADS=24 13 | export ONEFLOW_GPT_NUM_LAYERS=24 14 | 15 | export ONEFLOW_GPT_TENSOR_MODEL_PARALLEL_SIZE=4 16 | export ONEFLOW_GPT_PIPELINE_MODEL_PARALLEL_SIZE=4 17 | 18 | export ONEFLOW_GPT_MICRO_BATCH_SIZE=8 19 | export ONEFLOW_GPT_GLOBAL_BATCH_SIZE=512 20 | 21 | export ONEFLOW_GTP_PRETRAIN_WITH_CONTAINER=ON 22 | export ONEFLOW_GPT_SRC_DIR=$(realpath $(dirname $(dirname $0))) 23 | export ONEFLOW_DEV_IMAGE=oneflow-manylinux2014-cuda11.2:0.1 24 | export ONEFLOW_GPT_PYTHON_VERSION=3.7 25 | export ONEFLOW_WHEEL=$PWD/packages/oneflow-0.3.5+cu102.git.8b222eed2-cp37-cp37m-manylinux2014_x86_64.whl 26 | # Set this env for mounting data dir for container 27 | # export ONEFLOW_GPT_DATA_DIR=$(dirname $ONEFLOW_GPT_DATASET) 28 | 29 | source $(dirname $0)/pretrain.sh 30 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/examples/lambada_cloze_accuracy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | TASK="LAMBADA" 6 | VALID_DATA=/path/to/lambada_test.json 7 | VOCAB_FILE=/path/to/gpt2-vocab.json 8 | MERGE_FILE=/path/to/gpt2-merges.txt 9 | CHECKPOINT_PATH=/path/to/model 10 | 11 | 12 | gpu_num_per_node=1 13 | micro_batch_size=8 14 | hidden_size=768 15 | num_attn_heads=12 16 | num_layers=12 17 | seq_length=1024 18 | dropout_rate=0.0 19 | 20 | cmd="" 21 | cmd+="python3 tasks/main.py " 22 | cmd+="--task $TASK " 23 | cmd+="--valid-data $VALID_DATA " 24 | cmd+="--tokenizer-type GPT2BPETokenizer " 25 | cmd+="--strict-lambada " 26 | cmd+="--merge-file $MERGE_FILE " 27 | cmd+="--vocab-file $VOCAB_FILE " 28 | cmd+="--load $CHECKPOINT_PATH " 29 | cmd+="--dataset $VALID_DATA " 30 | cmd+="--vocab-size 50257 " 31 | cmd+="--hidden-size $hidden_size " 32 | cmd+="--num-attention-heads $num_attn_heads " 33 | cmd+="--num-layers $num_layers " 34 | cmd+="--seq-length $seq_length " 35 | cmd+="--hidden-dropout $dropout_rate " 36 | cmd+="--attention-dropout $dropout_rate " 37 | cmd+="--fp16 " 38 | cmd+="--checkpoint-activations " 39 | cmd+="--multihead-attention-fusion " 40 | cmd+="--make-vocab-size-divisible-by=128 " 41 | cmd+="--log-interval=10 " 42 | cmd+="--metric-print-format=table " 43 | cmd+="--micro-batch-size=$micro_batch_size " 44 | cmd+="--num-gpus-per-node=$gpu_num_per_node " 45 | cmd+="--num-nodes=1 " 46 | cmd+="--node-ips=10.11.0.2 " 47 | 48 | set -x 49 | 50 | $cmd 51 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/examples/pretrain.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | dataset=${ONEFLOW_GPT_DATASET:-"/data/gpt/gpt_sample_dataset_text_document"} 4 | seq_length=${ONEFLOW_GPT_SEQ_LENGTH:-"2048"} 5 | 6 | num_layers=${ONEFLOW_GPT_NUM_LAYERS:-"16"} 7 | hidden_size=${ONEFLOW_GPT_HIDDEN_SIZE:-"1536"} 8 | num_attn_heads=${ONEFLOW_GPT_NUM_ATTENTION_HEADS:-"16"} 9 | 10 | micro_batch_size=${ONEFLOW_GPT_MICRO_BATCH_SIZE:-"8"} 11 | global_batch_size=${ONEFLOW_GPT_GLOBAL_BATCH_SIZE} 12 | tensor_model_parallel_size=${ONEFLOW_GPT_TENSOR_MODEL_PARALLEL_SIZE} 13 | pipeline_model_parallel_size=${ONEFLOW_GPT_PIPELINE_MODEL_PARALLEL_SIZE} 14 | num_accumulation_steps=${ONEFLOW_GPT_NUM_ACCUMULATION_STEPS} 15 | 16 | num_gpus_per_node=${ONEFLOW_GPT_NUM_GPUS_PER_NODE:-"4"} 17 | num_nodes=${ONEFLOW_GPT_NUM_NODES:-"1"} 18 | node_ips=${ONEFLOW_GPT_NODE_IPS:-"10.11.0.2,10.11.0.3,10.11.0.4,10.11.0.5"} 19 | 20 | train_iters=${ONEFLOW_GPT_TRAIN_ITERS:-"500000"} 21 | log_interval=${ONEFLOW_GPT_LOG_INTERVAL:-"100"} 22 | 23 | init_loss_scale=${ONEFLOW_GPT_INIT_LOSS_SCALE:-"4294967296"} 24 | 25 | load_path=${ONEFLOW_GPT_LOAD_PATH:-"checkpoint"} 26 | save_path=${ONEFLOW_GPT_SAVE_PATH:-"checkpoint"} 27 | save_interval=${ONEFLOW_GPT_SAVE_INTERVAL:-"10000"} 28 | 29 | cmd="" 30 | 31 | if [[ ! -z "${ONEFLOW_GTP_PROFILE_FILE}" ]]; then 32 | cmd+="nsys profile --stats true --output ${ONEFLOW_GTP_PROFILE_FILE} " 33 | fi 34 | 35 | if [[ ! -z "${ONEFLOW_GTP_GDB}" ]]; then 36 | cmd+="gdb --args " 37 | fi 38 | 39 | cmd+="python3 -m oneflow_gpt.training" 40 | cmd+=" --num-layers ${num_layers}" 41 | cmd+=" --hidden-size ${hidden_size}" 42 | cmd+=" --num-attention-heads ${num_attn_heads}" 43 | cmd+=" --micro-batch-size ${micro_batch_size}" 44 | 45 | if [[ ! -z "${global_batch_size}" ]]; then 46 | cmd+=" --global-batch-size ${global_batch_size}" 47 | fi 48 | 49 | if [[ ! -z "${tensor_model_parallel_size}" ]]; then 50 | cmd+=" --tensor-model-parallel-size ${tensor_model_parallel_size}" 51 | fi 52 | 53 | if [[ ! -z "${pipeline_model_parallel_size}" ]]; then 54 | cmd+=" --pipeline-model-parallel-size ${pipeline_model_parallel_size}" 55 | fi 56 | 57 | if [[ ! -z "${num_accumulation_steps}" ]]; then 58 | cmd+=" --num-accumulation-steps ${num_accumulation_steps}" 59 | fi 60 | 61 | cmd+=" --num-gpus-per-node ${num_gpus_per_node}" 62 | cmd+=" --num-nodes ${num_nodes}" 63 | cmd+=" --node-ips ${node_ips}" 64 | cmd+=" --train-iters ${train_iters}" 65 | cmd+=" --dataset ${dataset}" 66 | cmd+=" --seq-length ${seq_length}" 67 | cmd+=" --vocab-size 50257" 68 | cmd+=" --split 949,50,1" 69 | cmd+=" --learning-rate 0.00015" 70 | cmd+=" --min-lr 1.0e-5" 71 | cmd+=" --lr-decay-style cosine" 72 | cmd+=" --lr-decay-iters 320000" 73 | cmd+=" --lr-warmup-fraction 0.01" 74 | cmd+=" --optimizer adamw" 75 | cmd+=" --initial-loss-scale ${init_loss_scale}" 76 | cmd+=" --weight-decay 1e-2" 77 | cmd+=" --clip-grad 1.0" 78 | cmd+=" --load ${load_path}" 79 | cmd+=" --save ${save_path}" 80 | cmd+=" --save-interval ${save_interval}" 81 | cmd+=" --log-interval ${log_interval}" 82 | cmd+=" --checkpoint-activations" 83 | cmd+=" --multihead-attention-fusion" 84 | cmd+=" --fp16" 85 | 86 | if [[ ${num_nodes} -gt 1 ]]; then 87 | export ONEFLOW_COMM_NET_IB_ENABLE=1 88 | fi 89 | 90 | if [[ ! 
-z "${ONEFLOW_GTP_PROFILE_FILE}" ]]; then 91 | cmd+=" --profile-transformer-layer" 92 | fi 93 | 94 | if [[ -z "${ONEFLOW_GTP_PRETRAIN_WITH_CONTAINER}" ]]; then 95 | ${cmd} 96 | else 97 | oneflow_gpt_src_dir=${ONEFLOW_GPT_SRC_DIR:-"$(dirname $(dirname $0))"} 98 | oneflow_dev_image=${ONEFLOW_DEV_IMAGE:-"oneflow-manylinux2014-cuda11.2:0.1"} 99 | python_version=${ONEFLOW_GPT_PYTHON_VERSION:-"3.7"} 100 | oneflow_gpt_data_dir=${ONEFLOW_GPT_DATA_DIR:-"/data"} 101 | 102 | if [[ -z "${ONEFLOW_WHEEL}" ]]; then 103 | echo "ONEFLOW_WHEEL env var not set" 104 | exit 1 105 | fi 106 | 107 | python3 ${oneflow_gpt_src_dir}/tools/launch_container.py \ 108 | --src ${oneflow_gpt_src_dir} \ 109 | --py ${python_version} \ 110 | --image ${oneflow_dev_image} \ 111 | --wheel ${ONEFLOW_WHEEL} \ 112 | --extra-mount ${oneflow_gpt_data_dir} \ 113 | --cmd "$cmd" 114 | fi 115 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/examples/pretrain_117M.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # set -ex 3 | 4 | export ONEFLOW_GPT_NUM_GPUS_PER_NODE=1 5 | 6 | # If you place training data on somewhere else, set this env 7 | # export ONEFLOW_GPT_DATASET=/data/gpt/gpt_sample_dataset_text_document 8 | export ONEFLOW_GPT_SEQ_LENGTH=1024 9 | 10 | export ONEFLOW_GPT_NUM_LAYERS=12 11 | export ONEFLOW_GPT_HIDDEN_SIZE=768 12 | export ONEFLOW_GPT_NUM_ATTENTION_HEADS=12 13 | 14 | export ONEFLOW_GPT_MICRO_BATCH_SIZE=8 15 | export ONEFLOW_GPT_GLOBAL_BATCH_SIZE=8 16 | 17 | source $(dirname $0)/pretrain.sh 18 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/examples/pretrain_1n8d_2x4x1_16_1536x16.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # set -ex 3 | 4 | export ONEFLOW_GPT_NUM_GPUS_PER_NODE=8 5 | 6 | # If you place training data on somewhere else, set this env 7 | # export ONEFLOW_GPT_DATASET=/data/gpt/gpt_sample_dataset_text_document 8 | export ONEFLOW_GPT_SEQ_LENGTH=2048 9 | 10 | export ONEFLOW_GPT_HIDDEN_SIZE=1536 11 | export ONEFLOW_GPT_NUM_ATTENTION_HEADS=16 12 | export ONEFLOW_GPT_NUM_LAYERS=16 13 | 14 | export ONEFLOW_GPT_TENSOR_MODEL_PARALLEL_SIZE=4 15 | export ONEFLOW_GPT_PIPELINE_MODEL_PARALLEL_SIZE=1 16 | 17 | export ONEFLOW_GPT_MICRO_BATCH_SIZE=8 18 | export ONEFLOW_GPT_GLOBAL_BATCH_SIZE=16 19 | 20 | source $(dirname $0)/pretrain.sh 21 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/examples/pretrain_345M.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # set -ex 3 | 4 | export ONEFLOW_GPT_NUM_GPUS_PER_NODE=1 5 | 6 | # If you place training data on somewhere else, set this env 7 | # export ONEFLOW_GPT_DATASET=/data/gpt/gpt_sample_dataset_text_document 8 | export ONEFLOW_GPT_SEQ_LENGTH=1024 9 | 10 | export ONEFLOW_GPT_NUM_LAYERS=24 11 | export ONEFLOW_GPT_HIDDEN_SIZE=1024 12 | export ONEFLOW_GPT_NUM_ATTENTION_HEADS=16 13 | 14 | export ONEFLOW_GPT_MICRO_BATCH_SIZE=8 15 | export ONEFLOW_GPT_GLOBAL_BATCH_SIZE=8 16 | export ONEFLOW_GPT_TRAIN_ITERS=500000 17 | 18 | source $(dirname $0)/pretrain.sh 19 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/examples/pretrain_with_container.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | # set -ex 3 | 4 | export ONEFLOW_GPT_NUM_GPUS_PER_NODE=8 5 | 6 | export ONEFLOW_GPT_SEQ_LENGTH=2048 7 | export ONEFLOW_GPT_HIDDEN_SIZE=2304 8 | export ONEFLOW_GPT_NUM_ATTENTION_HEADS=24 9 | export ONEFLOW_GPT_NUM_LAYERS=24 10 | 11 | export ONEFLOW_GPT_MICRO_BATCH_SIZE=16 12 | export ONEFLOW_GPT_GLOBAL_BATCH_SIZE=16 13 | 14 | export ONEFLOW_GPT_TENSOR_MODEL_PARALLEL_SIZE=8 15 | export ONEFLOW_GPT_PIPELINE_MODEL_PARALLEL_SIZE=1 16 | 17 | export ONEFLOW_GTP_PRETRAIN_WITH_CONTAINER=ON 18 | export ONEFLOW_GPT_SRC_DIR=$(realpath $(dirname $(dirname $0))) 19 | export ONEFLOW_DEV_IMAGE=oneflow-manylinux2014-cuda11.2:0.1 20 | export ONEFLOW_GPT_PYTHON_VERSION=3.7 21 | export ONEFLOW_WHEEL=$PWD/packages/oneflow-0.3.5+cu102.git.8b222eed2-cp37-cp37m-manylinux2014_x86_64.whl 22 | 23 | source $(dirname $0)/pretrain.sh 24 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/examples/pretrain_with_profile.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # set -ex 3 | 4 | export ONEFLOW_GPT_SEQ_LENGTH=1024 5 | export ONEFLOW_GPT_NUM_LAYERS=12 6 | export ONEFLOW_GPT_HIDDEN_SIZE=768 7 | export ONEFLOW_GPT_NUM_ATTENTION_HEADS=12 8 | export ONEFLOW_GPT_NUM_GPUS_PER_NODE=1 9 | export ONEFLOW_GPT_MICRO_BATCH_SIZE=8 10 | 11 | export PYTHONUNBUFFERED=1 12 | export NCCL_DEBUG=INFO 13 | export ONEFLOW_DEBUG_MODE=1 14 | export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1 15 | export ONEFLOW_GTP_PROFILE_FILE="117M_1n1d_bz8" 16 | 17 | source $(dirname $0)/pretrain.sh 18 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/oneflow_gpt/__init__.py: -------------------------------------------------------------------------------- 1 | MAJOR = 0 2 | MINOR = 0.1 3 | VERSION = (MAJOR, MINOR) 4 | 5 | __version__ = ".".join(map(str, VERSION)) 6 | __package_name__ = "oneflow_gpt" 7 | __description__ = "OneFlow GPT" 8 | __license__ = "" 9 | __keywords__ = "deep learning, Megatron, gpu, NLP, nvidia, cuda, oneflow" 10 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/oneflow_gpt/data.py: -------------------------------------------------------------------------------- 1 | import oneflow.compatible.single_client as flow 2 | 3 | from oneflow_gpt import distribute 4 | from oneflow_gpt.config import get_args 5 | 6 | 7 | def get_train_val_test_num_samples(split, num_samples): 8 | assert len(split) == 3 9 | total = sum(split) 10 | return [int((s / total) * num_samples) for s in split] 11 | 12 | 13 | class GPTDataLoader(object): 14 | def __init__(self, name): 15 | self.name = name 16 | args = get_args() 17 | assert args.dataset is not None 18 | self.dataset = args.dataset 19 | self.batch_size = args.global_batch_size // args.num_accumulation_steps 20 | self.seq_length = args.seq_length 21 | self.seed = args.seed 22 | self.split = args.split 23 | self.num_samples = args.train_samples 24 | 25 | def __call__(self): 26 | with distribute.data_placement_scope(): 27 | x = flow.data.megatron_gpt_mmap_data_loader( 28 | data_file_prefix=self.dataset, 29 | seq_length=self.seq_length, 30 | num_samples=self.num_samples, 31 | batch_size=self.batch_size, 32 | dtype=flow.int64, 33 | shuffle=True, 34 | random_seed=self.seed, 35 | split_sizes=self.split, 36 | split_index=0, 37 | nd_sbp=distribute.get_data_parallel_dist(), 38 | name=self.name, 39 | ) 40 | 41 | # embedding is on pipeline first stage 42 | with 
distribute.layer_placement_scope(0): 43 | data = flow.slice(x, begin=(None, 0), size=(None, self.seq_length)) 44 | 45 | # loss is on pipeline last stage 46 | with distribute.layer_placement_scope(-1): 47 | labels = flow.slice(x, begin=(None, 1), size=(None, self.seq_length)) 48 | 49 | return data, labels 50 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/oneflow_gpt/optimizer.py: -------------------------------------------------------------------------------- 1 | import oneflow.compatible.single_client as flow 2 | 3 | 4 | def get_lr_scheduler(args): 5 | # set up warmup strategy 6 | warmup = None 7 | if args.lr_warmup_iters is not None and args.lr_warmup_iters > 0: 8 | warmup = flow.optimizer.warmup.linear(args.lr_warmup_iters, 0) 9 | 10 | lr_decay_alpha = args.min_lr / args.lr 11 | # set up learning rate scheduler 12 | if args.lr_decay_style == "cosine" and args.lr_decay_iters is not None: 13 | lr_scheduler = flow.optimizer.CosineScheduler( 14 | base_lr=args.lr, 15 | steps=args.lr_decay_iters, 16 | alpha=lr_decay_alpha, 17 | warmup=warmup, 18 | ) 19 | else: 20 | raise NotImplementedError("not supported yet") 21 | 22 | return lr_scheduler 23 | 24 | 25 | def make_optimizer(args): 26 | lr_scheduler = get_lr_scheduler(args) 27 | 28 | loss_scale_policy = None 29 | if args.fp16: 30 | if args.loss_scale is not None: 31 | loss_scale_policy = flow.optimizer.loss_scale.static_loss_scale( 32 | args.loss_scale 33 | ) 34 | else: 35 | loss_scale_policy = flow.optimizer.loss_scale.dynamic_loss_scale( 36 | initial_loss_scale=args.initial_loss_scale, 37 | increment_period=args.loss_scale_window, 38 | ) 39 | 40 | if args.optimizer == "adamw": 41 | optimizer = flow.optimizer.AdamW( 42 | lr_scheduler, 43 | do_bias_correction=True, 44 | loss_scale_policy=loss_scale_policy, 45 | beta1=args.adam_beta1, 46 | beta2=args.adam_beta2, 47 | epsilon=args.adam_eps, 48 | weight_decay_excludes=["bias", "LayerNorm", "layernorm"], 49 | weight_decay=args.weight_decay, 50 | grad_clipping=flow.optimizer.grad_clipping.by_global_norm(args.clip_grad), 51 | ) 52 | else: 53 | raise NotImplementedError("not supported yet") 54 | 55 | return optimizer 56 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/oneflow_gpt/snapshot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import glob 4 | import operator 5 | import oneflow.compatible.single_client as flow 6 | 7 | 8 | class Snapshot(object): 9 | def __init__( 10 | self, 11 | load_dir=None, 12 | save_dir=None, 13 | save_interval=0, 14 | total_iters=0, 15 | save_last=False, 16 | save_init=False, 17 | ): 18 | self.load_dir_ = load_dir 19 | self.save_dir_ = save_dir 20 | self.save_interval_ = save_interval 21 | self.total_iters_ = total_iters 22 | self.save_last_ = save_last 23 | self.save_init_ = save_init 24 | self.checkpoint_ = flow.train.CheckPoint() 25 | 26 | self.iter_, snapshot_dir = self._find_max_iter_snapshot_from_load_dir() 27 | if snapshot_dir is None: 28 | self.checkpoint_.init() 29 | else: 30 | print(f"Loading model from {snapshot_dir}") 31 | self.checkpoint_.load(snapshot_dir) 32 | 33 | self._check_save_dir_snapshot_existence(self.iter_) 34 | 35 | def _extract_iter_from_snapshot_dirname(self, s): 36 | itr_str = re.findall(r"\d+", s) 37 | itr = list(map(int, itr_str)) 38 | assert len(itr) > 0 39 | return itr[0] 40 | 41 | def _collect_snapshot2iter(self, basedir): 42 | snapshot_dirs = 
glob.glob(f"{basedir}/iter*_snapshot") 43 | snapshot2iter = dict() 44 | for s_dir in snapshot_dirs: 45 | assert os.path.isdir(s_dir) 46 | s = os.path.basename(s_dir) 47 | snapshot2iter[s_dir] = self._extract_iter_from_snapshot_dirname(s) 48 | return snapshot2iter 49 | 50 | def _check_save_dir_snapshot_existence(self, start_iter): 51 | snapshot2iter = self._collect_snapshot2iter(self.save_dir_) 52 | for s, i in snapshot2iter.items(): 53 | if self.save_init_ and i == 0: 54 | raise ValueError(f"{s} already exist") 55 | 56 | if self.save_last_ and i == self.total_iters_: 57 | raise ValueError(f"{s} already exist") 58 | 59 | if ( 60 | i > start_iter 61 | and self.save_interval_ > 0 62 | and (i - start_iter) % self.save_interval_ == 0 63 | and i <= self.total_iters_ 64 | ): 65 | raise ValueError(f"{s} already exist") 66 | 67 | def _find_max_iter_snapshot_from_load_dir(self): 68 | if self.load_dir_ is None: 69 | return 0, None 70 | 71 | snapshot2iter = self._collect_snapshot2iter(self.load_dir_) 72 | if len(snapshot2iter) == 0: 73 | return 0, None 74 | 75 | s, i = max(snapshot2iter.items(), key=operator.itemgetter(1)) 76 | return i, s 77 | 78 | @property 79 | def iter(self): 80 | return self.iter_ 81 | 82 | def save(self, name): 83 | if self.save_dir_ is None: 84 | return 85 | 86 | save_path = os.path.join(self.save_dir_, name) 87 | if os.path.exists(save_path): 88 | return 89 | 90 | os.makedirs(save_path) 91 | print(f"Saving model to {save_path}") 92 | self.checkpoint_.save(save_path) 93 | 94 | def step(self): 95 | if self.iter_ == 0 and self.save_init_: 96 | self.save("iter0_snapshot") 97 | 98 | self.iter_ += 1 99 | 100 | if self.save_interval_ > 0 and self.iter_ % self.save_interval_ == 0: 101 | self.save(f"iter{self.iter_}_snapshot") 102 | 103 | if self.iter_ == self.total_iters_ and self.save_last_: 104 | self.save(f"iter{self.total_iters_}_snapshot") 105 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/oneflow_gpt/third_party/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/OneFlow-Benchmark/de4698a7acce3cca2eaf1564eaf25d59c9954b86/LanguageModeling/GPT/oneflow_gpt/third_party/__init__.py -------------------------------------------------------------------------------- /LanguageModeling/GPT/oneflow_gpt/third_party/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import indexed_dataset 2 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/oneflow_gpt/training.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.append( 5 | os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) 6 | ) 7 | 8 | import numpy as np 9 | import oneflow.compatible.single_client as flow 10 | 11 | from oneflow_gpt.config import get_args 12 | from oneflow_gpt import distribute 13 | from oneflow_gpt.data import GPTDataLoader, get_train_val_test_num_samples 14 | from oneflow_gpt.model import GPTModel, ParallelSparseSoftmaxCrossEntropyLoss 15 | from oneflow_gpt.optimizer import make_optimizer 16 | from oneflow_gpt.snapshot import Snapshot 17 | from oneflow_gpt.util import Metric 18 | from oneflow_gpt.third_party.data.gpt_dataset import build_train_valid_test_datasets 19 | 20 | 21 | def _init_env(args): 22 | if args.num_nodes > 1: 23 | if args.num_nodes > len(args.node_ips): 24 | raise ValueError( 25 | f"num_nodes {args.num_nodes} is greater than" 26 | f" the length of node ips {args.node_ips}" 27 | ) 28 | 29 | flow.env.ctrl_port(args.ctrl_port) 30 | nodes = [] 31 | for ip in args.node_ips[: args.num_nodes]: 32 | nodes.append({"addr": ip}) 33 | 34 | flow.env.machine(nodes) 35 | 36 | flow.env.log_dir(args.log) 37 | 38 | 39 | def _init_config(args): 40 | flow.config.gpu_device_num(args.num_gpus_per_node) 41 | if args.tensor_model_parallel_size * args.pipeline_model_parallel_size > 1: 42 | if hasattr(flow.config, "nccl_use_compute_stream"): 43 | flow.config.nccl_use_compute_stream(True) 44 | else: 45 | print( 46 | "WARNING: This version of OneFlow does not support placing nccl on the compute stream," 47 | " please try another version." 48 | ) 49 | 50 | 51 | flow.config.enable_legacy_model_io() 52 | flow.config.enable_model_io_v2(True) 53 | 54 | 55 | def _make_func_config(args): 56 | func_cfg = flow.function_config() 57 | if args.fp16: 58 | func_cfg.enable_auto_mixed_precision(True) 59 | func_cfg.prune_parallel_cast_ops(True) 60 | func_cfg.enable_fuse_add_to_output(True) 61 | func_cfg.enable_fuse_model_update_ops(True) 62 | func_cfg.enable_fuse_cast_scale(True) 63 | # turn on this flag to match ZeRO & DeepSpeed 64 | func_cfg.enable_non_distributed_optimizer(False) 65 | if args.num_accumulation_steps > 1: 66 | if hasattr(func_cfg.train, "num_gradient_accumulation_steps"): 67 | func_cfg.train.num_gradient_accumulation_steps(args.num_accumulation_steps) 68 | else: 69 | args.num_accumulation_steps = 1 70 | print( 71 | "WARNING: This version of OneFlow does not support gradient accumulation," 72 | " please try a newer version." 
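# note: the else-branch above already reset num_accumulation_steps to 1 as a fallback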
73 | ) 74 | 75 | return func_cfg 76 | 77 | 78 | def _make_gpt_train_func(args): 79 | model = GPTModel("model") 80 | loss = ParallelSparseSoftmaxCrossEntropyLoss() 81 | optimizer = make_optimizer(args) 82 | 83 | if args.use_external_dataset: 84 | 85 | @flow.global_function("train", _make_func_config(args)) 86 | def train( 87 | x: flow.typing.Numpy.Placeholder( 88 | (args.global_batch_size, args.seq_length + 1), dtype=flow.int64 89 | ) 90 | ): 91 | x = distribute.input_data_parallel_cast(x) 92 | with distribute.layer_placement_scope(0): 93 | data = flow.slice(x, begin=(None, 0), size=(None, args.seq_length)) 94 | with distribute.layer_placement_scope(-1): 95 | labels = flow.slice(x, begin=(None, 1), size=(None, args.seq_length)) 96 | 97 | logits = model(data) 98 | losses = loss(logits, labels) 99 | optimizer.minimize(losses) 100 | 101 | losses = distribute.output_parallel_cast(losses) 102 | return {"loss": losses} 103 | 104 | else: 105 | data_loader = GPTDataLoader("gpt_data_loader") 106 | 107 | @flow.global_function("train", _make_func_config(args)) 108 | def train(): 109 | data, labels = data_loader() 110 | logits = model(data) 111 | losses = loss(logits, labels) 112 | optimizer.minimize(losses) 113 | 114 | losses = distribute.output_parallel_cast(losses) 115 | return {"loss": losses} 116 | 117 | return train 118 | 119 | 120 | def train(): 121 | args = get_args() 122 | _init_env(args) 123 | _init_config(args) 124 | trainer = _make_gpt_train_func(args) 125 | snapshot = Snapshot( 126 | load_dir=args.load, 127 | save_dir=args.save, 128 | save_interval=args.save_interval, 129 | total_iters=args.train_iters, 130 | save_last=args.save_last, 131 | save_init=args.save_init, 132 | ) 133 | 134 | metric = Metric( 135 | print_steps=args.log_interval, 136 | start_step=snapshot.iter, 137 | max_step=args.train_iters, 138 | num_samples_per_batch=args.micro_batch_size * args.data_parallel_size, 139 | keys=["loss"], 140 | print_format=args.metric_print_format, 141 | nvidia_smi_report_step=10, 142 | nvidia_smi_report_file=None, 143 | ) 144 | 145 | if args.use_external_dataset: 146 | train_val_test_num_samples = get_train_val_test_num_samples( 147 | args.split, args.train_samples 148 | ) 149 | train_ds, _, _ = build_train_valid_test_datasets( 150 | data_prefix=[args.dataset], 151 | data_impl="mmap", 152 | splits_string=args.split, 153 | train_valid_test_num_samples=train_val_test_num_samples, 154 | seq_length=args.seq_length, 155 | seed=args.seed, 156 | skip_warmup=0, 157 | ) 158 | 159 | if args.train_iters is None and args.train_samples is None: 160 | raise ValueError("either train_iters or train_samples must be set") 161 | 162 | print("Training...") 163 | try: 164 | batch_size = args.micro_batch_size * args.num_accumulation_steps 165 | iteration = snapshot.iter 166 | while iteration < args.train_iters: 167 | if args.use_external_dataset: 168 | batch = [ 169 | train_ds[iteration * batch_size + i] for i in range(batch_size) 170 | ] 171 | data = np.stack(batch) 172 | trainer(data).async_get(metric.metric_cb()) 173 | else: 174 | trainer().async_get(metric.metric_cb()) 175 | 176 | snapshot.step() 177 | iteration = snapshot.iter 178 | 179 | except KeyboardInterrupt: 180 | print("interrupted") 181 | 182 | 183 | if __name__ == "__main__": 184 | train() 185 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/oneflow_gpt/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import numpy as np 4 
| 5 | 6 | class _Timer(object): 7 | def __init__(self): 8 | pass 9 | 10 | def start(self): 11 | now = time.perf_counter() 12 | self.start_ = now 13 | self.step_ = now 14 | 15 | def step(self): 16 | now = time.perf_counter() 17 | duration = now - self.step_ 18 | self.step_ = now 19 | return duration 20 | 21 | def stop(self): 22 | self.stop_ = time.perf_counter() 23 | 24 | def cur_step(self): 25 | return self.step_ 26 | 27 | def duration(self): 28 | return self.stop_ - self.start_ 29 | 30 | 31 | class Metric(object): 32 | def __init__( 33 | self, 34 | print_steps, 35 | start_step, 36 | max_step, 37 | num_samples_per_batch, 38 | keys=None, 39 | print_format="normal", 40 | nvidia_smi_report_step=10, 41 | nvidia_smi_report_file=None, 42 | ): 43 | r"""accumulate and calculate metrics 44 | 45 | Args: 46 | print_steps: `Int` print metrics every nth step 47 | num_samples_per_batch: `Int` number of samples per micro-batch 48 | keys: keys in callback outputs 49 | Returns: 50 | """ 51 | self.print_steps_ = print_steps 52 | self.max_step_ = max_step 53 | self.num_samples_per_batch_ = num_samples_per_batch 54 | 55 | self.nvidia_smi_report_step_ = nvidia_smi_report_step 56 | self.nvidia_smi_report_file_ = nvidia_smi_report_file 57 | 58 | self.step_ = start_step 59 | self.micro_batches_ = 0 60 | self.samples_ = 0 61 | self.throughput_ = 0.0 62 | self.latency_ = 0.0 63 | self.timestamp_ = 0.0 64 | 65 | self.kv_store_ = dict() 66 | if keys is None: 67 | self.keys_ = [] 68 | else: 69 | self.keys_ = list(keys) 70 | 71 | for key in self.keys_: 72 | self.kv_store_[key] = 0.0 73 | 74 | # need reset after every print 75 | self.acc_elapsed_time_ = 0.0 76 | self.acc_micro_batches_ = 0 77 | self.acc_samples_ = 0 78 | 79 | self.timer_ = _Timer() 80 | self.timer_.start() 81 | 82 | if print_format == "normal": 83 | self.print_fn_ = self.step_print 84 | elif print_format == "table": 85 | self.print_fn_ = self.step_print_by_table 86 | self.print_title_ = False 87 | else: 88 | raise ValueError("print_format must be 'normal' or 'table'") 89 | 90 | def step_print(self): 91 | record = ( 92 | f"step={self.step_}," 93 | f"micro_batches={self.micro_batches_}," 94 | f"samples={self.samples_}," 95 | f"throughput={self.throughput_:.5f}," 96 | f"latency={self.latency_:.5f}," 97 | ) 98 | for key in self.keys_: 99 | record += f"{key}={self.kv_store_[key]:.5f}," 100 | 101 | print(record) 102 | 103 | def step_print_by_table(self): 104 | title = ( 105 | f"| {'step'.ljust(8)} " 106 | f"| {'micro_batches'.ljust(15)} " 107 | f"| {'samples'.ljust(15)} " 108 | f"| {'throughput'.ljust(10)} " 109 | f"| {'latency'.ljust(10)} " 110 | ) 111 | sep = f"| {'-' * 8} | {'-' * 15} | {'-' * 15} | {'-' * 10} | {'-' * 10} " 112 | 113 | record = ( 114 | f"| {self.step_:<8d} " 115 | f"| {self.micro_batches_:<15d} " 116 | f"| {self.samples_:<15d} " 117 | f"| {self.throughput_:<10.5f} " 118 | f"| {self.latency_:<10.5f} " 119 | ) 120 | 121 | for key in self.keys_: 122 | title += f"| {key.ljust(10)} " 123 | sep += f"| {'-' * 10} " 124 | record += f"| {self.kv_store_[key]:<10.5f} " 125 | 126 | title += "|" 127 | sep += "|" 128 | record += "|" 129 | 130 | if not self.print_title_: 131 | print(title) 132 | print(sep) 133 | self.print_title_ = True 134 | 135 | print(record) 136 | 137 | def metric_cb(self): 138 | def callback(outputs): 139 | elapsed_time = self.timer_.step() 140 | self.timestamp_ = self.timer_.cur_step() 141 | self.acc_elapsed_time_ += elapsed_time 142 | 143 | micro_batches = None 144 | for key in self.keys_: 145 | output = outputs[key].numpy() 146 | assert isinstance(output, np.ndarray) 147 
| if micro_batches is None: 148 | micro_batches = output.shape[0] if output.shape else 1 149 | else: 150 | assert micro_batches == output.shape[0] 151 | self.kv_store_[key] += output.sum() 152 | 153 | self.step_ += 1 154 | self.acc_micro_batches_ += micro_batches 155 | self.acc_samples_ += micro_batches * self.num_samples_per_batch_ 156 | 157 | if self.step_ == self.nvidia_smi_report_step_: 158 | cmd = "nvidia-smi --query-gpu=utilization.gpu,memory.used --format=csv" 159 | if self.nvidia_smi_report_file_ is not None: 160 | cmd += f" -f {self.nvidia_smi_report_file_}" 161 | os.system(cmd) 162 | self.print_title_ = False 163 | 164 | if self.step_ % self.print_steps_ == 0 or self.step_ == self.max_step_: 165 | self.throughput_ = self.acc_samples_ / self.acc_elapsed_time_ 166 | self.latency_ = self.acc_elapsed_time_ / self.print_steps_ 167 | 168 | for key in self.keys_: 169 | value = self.kv_store_[key] / self.acc_micro_batches_ 170 | self.kv_store_[key] = value 171 | 172 | self.micro_batches_ += self.acc_micro_batches_ 173 | self.samples_ += self.acc_samples_ 174 | 175 | self.print_fn_() 176 | 177 | for key in self.keys_: 178 | self.kv_store_[key] = 0.0 179 | self.acc_elapsed_time_ = 0.0 180 | self.acc_micro_batches_ = 0 181 | self.acc_samples_ = 0 182 | 183 | return callback 184 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/requirements.txt: -------------------------------------------------------------------------------- 1 | oneflow 2 | numpy 3 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import setuptools 3 | 4 | from oneflow_gpt import ( 5 | __package_name__, 6 | __version__, 7 | __description__, 8 | __license__, 9 | __keywords__, 10 | ) 11 | 12 | 13 | if sys.version_info < (3,): 14 | raise Exception("Python 2 is not supported.") 15 | 16 | 17 | with open("README.md", "r") as fh: 18 | long_description = fh.read() 19 | 20 | 21 | def req_file(filename): 22 | with open(filename) as f: 23 | content = f.readlines() 24 | return [x.strip() for x in content] 25 | 26 | 27 | install_requires = req_file("requirements.txt") 28 | 29 | setuptools.setup( 30 | name=__package_name__, 31 | # Versions should comply with PEP440. For a discussion on single-sourcing 32 | # the version across setup.py and the project code, see 33 | # https://packaging.python.org/en/latest/single_source_version.html 34 | version=__version__, 35 | description=__description__, 36 | long_description=long_description, 37 | long_description_content_type="text/markdown", 38 | # The project's main homepage. 39 | # url=__url__, 40 | # author=__contact_names__, 41 | # maintainer=__contact_names__, 42 | # The license under which the project is released 43 | license=__license__, 44 | classifiers=[ 45 | "Development Status :: 4 - Beta", 46 | "Intended Audience :: Developers", 47 | "Operating System :: POSIX", 48 | "Operating System :: POSIX :: Linux", 49 | "License :: OSI Approved :: Apache Software License", 50 | "Programming Language :: Python :: 3.6", 51 | "Programming Language :: Python :: 3.7", 52 | "Programming Language :: Python :: 3.8", 53 | ], 54 | python_requires=">=3.6", 55 | packages=setuptools.find_packages(), 56 | install_requires=install_requires, 57 | # Add in any packaged data. 58 | include_package_data=True, 59 | zip_safe=False, 60 | # PyPI package information. 
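Name, version, description, license and keywords are imported from oneflow_gpt/__init__.py above.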
61 | keywords=__keywords__, 62 | ) 63 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/OneFlow-Benchmark/de4698a7acce3cca2eaf1564eaf25d59c9954b86/LanguageModeling/GPT/tasks/__init__.py -------------------------------------------------------------------------------- /LanguageModeling/GPT/tasks/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.append( 5 | os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) 6 | ) 7 | from oneflow_gpt.config import get_args 8 | 9 | 10 | def get_tasks_args(parser): 11 | """Provide extra arguments required for tasks.""" 12 | group = parser.add_argument_group(title="tasks") 13 | 14 | group.add_argument("--task", type=str, required=True, help="Task name.") 15 | group.add_argument( 16 | "--epochs", 17 | type=int, 18 | default=None, 19 | help="Number of fine-tuning epochs. Zero results in " "evaluation only.", 20 | ) 21 | group.add_argument( 22 | "--pretrained-checkpoint", 23 | type=str, 24 | default=None, 25 | help="Pretrained checkpoint used for fine-tuning.", 26 | ) 27 | group.add_argument( 28 | "--keep-last", 29 | action="store_true", 30 | help="Keep the last batch (maybe incomplete) in " "the data loader.", 31 | ) 32 | group.add_argument( 33 | "--train-data", 34 | nargs="+", 35 | default=None, 36 | help="Whitespace separated paths or corpora names " "for training.", 37 | ) 38 | group.add_argument( 39 | "--valid-data", nargs="*", default=None, help="Path(s) to the validation data." 40 | ) 41 | group.add_argument( 42 | "--overlapping-eval", 43 | type=int, 44 | default=32, 45 | help="Sliding window for overlapping evaluation.", 46 | ) 47 | group.add_argument( 48 | "--strict-lambada", 49 | action="store_true", 50 | help="Use more difficult formulation of lambada.", 51 | ) 52 | group.add_argument( 53 | "--vocab-file", type=str, default=None, help="Path to the vocab file." 54 | ) 55 | group.add_argument( 56 | "--merge-file", type=str, default=None, help="Path to the BPE merge file."
57 | ) 58 | group.add_argument( 59 | "--tokenizer-type", 60 | type=str, 61 | default=None, 62 | choices=["BertWordPieceLowerCase", "BertWordPieceCase", "GPT2BPETokenizer"], 63 | help="What type of tokenizer to use.", 64 | ) 65 | group.add_argument( 66 | "--reset-position-ids", 67 | action="store_true", 68 | help="Reset position ids after end-of-document token.", 69 | ) 70 | group.add_argument( 71 | "--reset-attention-mask", 72 | action="store_true", 73 | help="Reset self attention mask after " "end-of-document token.", 74 | ) 75 | group.add_argument( 76 | "--eod-mask-loss", 77 | action="store_true", 78 | help="Mask loss for the end of document tokens.", 79 | ) 80 | 81 | return parser 82 | 83 | 84 | if __name__ == "__main__": 85 | 86 | args = get_args(extra_args_provider=get_tasks_args) 87 | 88 | if args.task in ["LAMBADA"]: 89 | from zeroshot_gpt.evaluate import main 90 | else: 91 | raise NotImplementedError("Task {} is not implemented.".format(args.task)) 92 | 93 | main(args) 94 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/tasks/zeroshot_gpt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/OneFlow-Benchmark/de4698a7acce3cca2eaf1564eaf25d59c9954b86/LanguageModeling/GPT/tasks/zeroshot_gpt/__init__.py -------------------------------------------------------------------------------- /LanguageModeling/GPT/tasks/zeroshot_gpt/datasets.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | 16 | import json 17 | import math 18 | import numpy as np 19 | from tokenizer.tokenizer import build_tokenizer 20 | 21 | 22 | def build_dataset(args): 23 | """Helper function to select and build dataset.""" 24 | if args.task == "LAMBADA": 25 | return _build_lambada_dataset(args) 26 | 27 | raise NotImplementedError("dataset for {} task is not " "implemented.".format(args.task)) 28 | 29 | 30 | class _LambadaDataset: 31 | def __init__(self, path, pad_idx, tokenizer, seq_len, strict=False): 32 | print("> building lambada dataset from {} ...".format(path)) 33 | self.seq_len = seq_len 34 | self.pad_idx = pad_idx 35 | self.tokenizer = tokenizer 36 | self.strict = strict 37 | 38 | self.tokens = [] 39 | self.labels = [] 40 | with open(path, "r") as f: 41 | for line in f.readlines(): 42 | text = json.loads(line)["text"] 43 | tokens, labels = self.get_tokens(text) 44 | self.tokens.append(tokens) 45 | self.labels.append(labels) 46 | 47 | def get_tokens(self, text): 48 | if not self.strict: 49 | tokens = self.tokenizer.tokenize(text) 50 | return tokens[:-1], [tokens[-1]] 51 | last_token = text.split()[-1] 52 | start_idx = text.rfind(last_token) 53 | beginning_tokens = self.tokenizer.tokenize(text[:start_idx].strip()) 54 | last_token = self.tokenizer.tokenize(" " + last_token) 55 | return beginning_tokens, last_token 56 | 57 | def __len__(self): 58 | return len(self.tokens) 59 | 60 | def __getitem__(self, idx): 61 | tokens = self.tokens[idx] 62 | num_tokens = len(tokens) 63 | pad_mask = [0] * num_tokens 64 | labels = self.labels[idx] 65 | pad_mask += [1] * len(labels) 66 | tokens = tokens + labels 67 | num_tokens = len(tokens) 68 | if num_tokens < self.seq_len + 1: 69 | num_pad = self.seq_len + 1 - num_tokens 70 | pad_mask += [0] * (num_pad) 71 | tokens += [self.pad_idx] * num_pad 72 | pad_mask = np.array(pad_mask[1:]) 73 | 74 | return {"text": np.array(tokens), "pad_mask": pad_mask} 75 | 76 | 77 | def _build_lambada_dataset(args): 78 | """Build lambada dataset.""" 79 | tokenizer = build_tokenizer(args) 80 | 81 | assert len(args.valid_data) == 1 82 | val_dataset = _LambadaDataset( 83 | args.valid_data[0], 84 | tokenizer.eod, 85 | tokenizer, 86 | args.seq_length, 87 | args.strict_lambada, 88 | ) 89 | print(" > found {} samples.".format(len(val_dataset))) 90 | 91 | return val_dataset 92 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/tasks/zeroshot_gpt/evaluate.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import sys 4 | 5 | sys.path.append( 6 | os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) 7 | ) 8 | 9 | from oneflow_gpt.model import GPTModel, ParallelSparseSoftmaxCrossEntropyLoss 10 | from oneflow_gpt import util 11 | from .datasets import build_dataset 12 | import numpy as np 13 | import oneflow.compatible.single_client as flow 14 | 15 | 16 | def _init_env(args): 17 | if args.num_nodes > 1: 18 | if args.num_nodes > len(args.node_ips): 19 | raise ValueError( 20 | f"num_nodes {args.num_nodes} greater than" 21 | f" length of node ips {args.node_ips}" 22 | ) 23 | 24 | flow.env.ctrl_port(args.ctrl_port) 25 | nodes = [] 26 | for ip in args.node_ips[: args.num_nodes]: 27 | nodes.append({"addr": ip}) 28 | 29 | flow.env.machine(nodes) 30 | 31 | flow.env.log_dir(args.log) 32 | 33 | 34 | def _init_config(args): 35 | flow.config.gpu_device_num(args.num_gpus_per_node) 36 | flow.config.collective_boxing.nccl_fusion_reduce_scatter(True) 37 | flow.config.collective_boxing.nccl_fusion_all_gather(True)
38 | flow.config.collective_boxing.nccl_enable_mixed_fusion(True) 39 | if args.tensor_model_parallel_size > 1: 40 | if hasattr(flow.config, "nccl_use_compute_stream"): 41 | flow.config.nccl_use_compute_stream(True) 42 | else: 43 | print( 44 | "WARNING: This version of OneFlow does not support placing nccl on compute stream," 45 | " please try another version." 46 | ) 47 | 48 | flow.config.enable_legacy_model_io() 49 | flow.config.enable_model_io_v2(True) 50 | 51 | 52 | def _make_func_config(args): 53 | func_cfg = flow.function_config() 54 | if args.fp16: 55 | func_cfg.enable_auto_mixed_precision(True) 56 | func_cfg.prune_parallel_cast_ops(True) 57 | func_cfg.enable_fuse_add_to_output(True) 58 | func_cfg.enable_fuse_model_update_ops(True) 59 | func_cfg.enable_fuse_cast_scale(True) 60 | # turn on this flag when matching ZeRO & DeepSpeed 61 | func_cfg.enable_non_distributed_optimizer(False) 62 | if args.num_accumulation_steps > 1: 63 | if hasattr(func_cfg.train, "num_gradient_accumulation_steps"): 64 | func_cfg.train.num_gradient_accumulation_steps(args.num_accumulation_steps) 65 | else: 66 | args.num_accumulation_steps = 1 67 | print( 68 | "WARNING: This version of OneFlow does not support gradient accumulation," 69 | " please try a newer version." 70 | ) 71 | 72 | return func_cfg 73 | 74 | 75 | def make_gpt_eval_func(args): 76 | @flow.global_function("predict", _make_func_config(args)) 77 | def gpt_func( 78 | x: flow.typing.Numpy.Placeholder( 79 | (args.global_batch_size, args.seq_length), dtype=flow.int64 80 | ) 81 | ): 82 | gpt = GPTModel("model") 83 | return gpt(x) 84 | 85 | return gpt_func 86 | 87 | 88 | def process_batch(args, batch): 89 | """Process batch and produce inputs for the model.""" 90 | 91 | loss_mask = batch["pad_mask"] 92 | tokens_ = batch["text"] 93 | labels = tokens_[:, 1:] 94 | tokens = tokens_[:, :-1] 95 | 96 | return tokens, labels, None, None, loss_mask 97 | 98 | 99 | def forward_step(args, batch, model, eval_metric): 100 | """Forward step.""" 101 | 102 | # Get the batch. 103 | tokens, labels, attention_mask, position_ids, loss_mask = process_batch(args, batch) 104 | # Tell the model what our actual batch size will be 105 | # args.micro_batch_size = len(labels) 106 | 107 | 108 | 109 | # Forward pass through the model. 110 | logits = model(tokens).get() 111 | 112 | if eval_metric == "accuracy": 113 | bs, e = logits.numpy().shape 114 | outputs = np.argmax( 115 | logits.numpy().reshape( 116 | (args.micro_batch_size, int(bs / args.micro_batch_size), e) 117 | ), 118 | -1, 119 | ) 120 | correct = (outputs == labels).astype(np.float32) 121 | correct[(1 - loss_mask).astype(np.bool_)] = 1 122 | correct = np.prod(correct, -1) 123 | return np.sum(correct) 124 | 125 | raise NotImplementedError( 126 | "forward method for evaluation metric {} " 127 | "is not implemented.".format(eval_metric) 128 | ) 129 | 130 | 131 | 132 | 133 | def evaluate(args, data_sets, model, eval_metric): 134 | """Evaluation.""" 135 | total_output = 0.0 136 | 137 | # For all the batches in the dataset.
138 | for iteration in range(int(len(data_sets) / args.micro_batch_size)): 139 | text = [ 140 | data_sets[iteration * args.micro_batch_size + i]["text"] 141 | for i in range(args.micro_batch_size) 142 | ] 143 | text = np.stack(text) 144 | pad_mask = [ 145 | data_sets[iteration * args.micro_batch_size + i]["pad_mask"] 146 | for i in range(args.micro_batch_size) 147 | ] 148 | pad_mask = np.stack(pad_mask) 149 | if iteration % args.log_interval == 0: 150 | print("> working on iteration: {}".format(iteration)) 151 | # Forward evaluation. 152 | output = forward_step( 153 | args, {"text": text, "pad_mask": pad_mask}, model, eval_metric 154 | ) 155 | total_output += output 156 | 157 | return total_output 158 | 159 | 160 | def evaluate_and_print_results(args, data_sets, model, eval_metric): 161 | """Evaluate and print results on screen.""" 162 | # Evaluate and get results. 163 | output = evaluate(args, data_sets, model, eval_metric) 164 | 165 | string = " validation results on {} | ".format(args.task) 166 | if eval_metric == "accuracy": 167 | num_examples = ( 168 | int(len(data_sets) / args.micro_batch_size) * args.micro_batch_size 169 | ) 170 | acc = output / num_examples 171 | string += "number correct: {:.4E} | ".format(output) 172 | string += "total examples: {:.4E} | ".format(num_examples) 173 | string += "avg accuracy: {:.4E}".format(acc) 174 | print(string) 175 | else: 176 | raise NotImplementedError( 177 | "evaluation method for {} metric is not " 178 | "implemented yet.".format(eval_metric) 179 | ) 180 | 181 | 182 | def main(args): 183 | """Main program.""" 184 | 185 | if args.task == "LAMBADA": 186 | eval_metric = "accuracy" 187 | else: 188 | raise NotImplementedError("{} task is not implemented.".format(args.task)) 189 | 190 | # Set up model and load checkpoint. 191 | _init_env(args) 192 | _init_config(args) 193 | gpt_eval = make_gpt_eval_func(args) 194 | check_point = flow.train.CheckPoint() 195 | 196 | assert args.load is not None 197 | check_point.load(args.load) 198 | 199 | dataset = build_dataset(args) 200 | # Run evaluation. 201 | evaluate_and_print_results(args, dataset, gpt_eval, eval_metric) 202 | 203 | print("done :-)") 204 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/tokenizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Oneflow-Inc/OneFlow-Benchmark/de4698a7acce3cca2eaf1564eaf25d59c9954b86/LanguageModeling/GPT/tokenizer/__init__.py -------------------------------------------------------------------------------- /LanguageModeling/GPT/tokenizer/tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | from abc import ABC 17 | from abc import abstractmethod 18 | import math 19 | from .gpt2_tokenization import GPT2Tokenizer 20 | 21 | 22 | def build_tokenizer(args): 23 | """Initialize tokenizer.""" 24 | 25 | # Select and instantiate the tokenizer. 26 | assert args.vocab_file is not None 27 | assert args.merge_file is not None 28 | if args.tokenizer_type == "GPT2BPETokenizer": 29 | 30 | tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) 31 | else: 32 | raise NotImplementedError( 33 | "{} tokenizer is not " "implemented.".format(args.tokenizer_type) 34 | ) 35 | 36 | return tokenizer 37 | 38 | 39 | def initialize_model_parallel(args): 40 | device_num = args.gpu_num_per_node * args.num_nodes 41 | if device_num == 1: 42 | print("WARNING: there is only 1 device, setting model parallel size to 1") 43 | return [ 44 | 1, 45 | ] 46 | 47 | assert device_num % args.model_parallel_size == 0 48 | parallel_hierarchy = [ 49 | device_num // args.model_parallel_size, 50 | args.model_parallel_size, 51 | ] 52 | return parallel_hierarchy 53 | 54 | 55 | def pad_vocab_size(vocab_size, alignment, model_parallel_size): 56 | """Pad the vocab size so it is divisible by the model parallel size and 57 | still has a GPU-friendly size.""" 58 | assert isinstance(alignment, int) 59 | if alignment == 0: 60 | return vocab_size 61 | 62 | alignment *= model_parallel_size 63 | 64 | padded_vocab_size = int(math.ceil(vocab_size / alignment)) * alignment 65 | print( 66 | " > padded vocab (size: {}) with {} dummy tokens " 67 | "(new size: {})".format( 68 | vocab_size, padded_vocab_size - vocab_size, padded_vocab_size 69 | ) 70 | ) 71 | return padded_vocab_size 72 | 73 | 74 | class AbstractTokenizer(ABC): 75 | """Abstract class for tokenizer.""" 76 | 77 | def __init__(self, name): 78 | self.name = name 79 | super().__init__() 80 | 81 | @property 82 | @abstractmethod 83 | def vocab_size(self): 84 | pass 85 | 86 | @property 87 | @abstractmethod 88 | def vocab(self): 89 | """Dictionary from vocab text token to id token.""" 90 | pass 91 | 92 | @property 93 | @abstractmethod 94 | def inv_vocab(self): 95 | """Dictionary from vocab id token to text token.""" 96 | pass 97 | 98 | @abstractmethod 99 | def tokenize(self, text): 100 | pass 101 | 102 | def detokenize(self, token_ids): 103 | raise NotImplementedError( 104 | "detokenizer is not implemented for {} " "tokenizer".format(self.name) 105 | ) 106 | 107 | @property 108 | def cls(self): 109 | raise NotImplementedError( 110 | "CLS is not provided for {} " "tokenizer".format(self.name) 111 | ) 112 | 113 | @property 114 | def sep(self): 115 | raise NotImplementedError( 116 | "SEP is not provided for {} " "tokenizer".format(self.name) 117 | ) 118 | 119 | @property 120 | def pad(self): 121 | raise NotImplementedError( 122 | "PAD is not provided for {} " "tokenizer".format(self.name) 123 | ) 124 | 125 | @property 126 | def eod(self): 127 | raise NotImplementedError( 128 | "EOD is not provided for {} " "tokenizer".format(self.name) 129 | ) 130 | 131 | @property 132 | def mask(self): 133 | raise NotImplementedError( 134 | "MASK is not provided for {} " "tokenizer".format(self.name) 135 | ) 136 | 137 | 138 | class _GPT2BPETokenizer(AbstractTokenizer): 139 | """Original GPT2 BPE tokenizer.""" 140 | 141 | def __init__(self, vocab_file, merge_file): 142 | name = "GPT2 BPE" 143 | super().__init__(name) 144 | 145 | self.tokenizer = GPT2Tokenizer( 146 | vocab_file, merge_file, errors="replace", special_tokens=[], max_len=None 147 | ) 148 | self.eod_id = self.tokenizer.encoder["<|endoftext|>"]
149 | 150 | @property 151 | def vocab_size(self): 152 | return len(self.tokenizer.encoder) 153 | 154 | @property 155 | def vocab(self): 156 | return self.tokenizer.encoder 157 | 158 | @property 159 | def inv_vocab(self): 160 | return self.tokenizer.decoder 161 | 162 | def tokenize(self, text): 163 | return self.tokenizer.encode(text) 164 | 165 | def detokenize(self, token_ids): 166 | return self.tokenizer.decode(token_ids) 167 | 168 | @property 169 | def eod(self): 170 | return self.eod_id 171 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/tools/README.md: -------------------------------------------------------------------------------- 1 | # GPT Model Conversion 2 | 3 | ### Converting a PyTorch model to a OneFlow model 4 | - `meta.proto` defines the `meta` file placed in each model directory. Run `protoc --python_out=. meta.proto` to generate `meta_pb2.py`, which can then be imported with `import meta_pb2 as meta_pb`: 5 | ``` 6 | syntax = "proto2"; 7 | package gpt; 8 | 9 | message Shape { 10 | repeated int32 dim = 1; 11 | } 12 | 13 | enum DataType { 14 | kInvalidDataType = 0; 15 | kChar = 1; 16 | kFloat = 2; 17 | kDouble = 3; 18 | kInt8 = 4; 19 | kInt32 = 5; 20 | kInt64 = 6; 21 | kUInt8 = 7; 22 | kOFRecord = 8; 23 | kFloat16 = 9; 24 | kTensorBuffer = 10; 25 | } 26 | 27 | message Meta { 28 | required Shape shape = 1; 29 | required DataType data_type = 2 [default = kFloat16]; 30 | } 31 | ``` 32 | - The conversion script is `convert_pt_to_of_gpt.py`. Running `python3 convert_pt_to_of_gpt.py --py_model_dir /path/to/iter_0500000/mp_rank_00/model_optim_rng.pt` generates the OneFlow model under `convert_pt_to_of_gpt` in the current directory. 33 | - `--py_model_dir`, path of the PyTorch model 34 | - `--of_dump_path`, path where the converted model is saved 35 | 36 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/tools/ansible_inventory: -------------------------------------------------------------------------------- 1 | [local] 2 | localhost ansible_connection=local 3 | [of] 4 | of11 ansible_host=192.168.1.11 5 | of12 ansible_host=192.168.1.12 6 | of13 ansible_host=192.168.1.13 7 | of14 ansible_host=192.168.1.14 8 | of15 ansible_host=192.168.1.15 9 | of16 ansible_host=192.168.1.16 10 | [ln] 11 | vs002 ansible_host=10.11.0.2 12 | vs003 ansible_host=10.11.0.3 13 | vs004 ansible_host=10.11.0.4 14 | vs005 ansible_host=10.11.0.5 15 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/tools/compare_loss.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | 5 | 6 | def parse_losses_for_log_file( 7 | log_file, loss_pattern, step_pattern, max_step, verbose=False 8 | ): 9 | if not os.path.isfile(log_file): 10 | raise ValueError(f"log file {log_file} does not exist") 11 | 12 | loss_dict = {} 13 | with open(log_file, "rt") as f: 14 | for line in f: 15 | step = None 16 | loss = None 17 | 18 | m = re.search(loss_pattern, line.strip()) 19 | if m: 20 | loss = float(m.group(1)) 21 | elif verbose: 22 | print(f"loss not found in line: {line.strip()}") 23 | else: 24 | pass 25 | 26 | m = re.search(step_pattern, line.strip()) 27 | if m: 28 | step = int(m.group(1)) 29 | elif verbose: 30 | print(f"step not found in line: {line.strip()}") 31 | else: 32 | pass 33 | 34 | if loss is not None and step is not None: 35 | assert step not in loss_dict 36 | loss_dict[step] = loss 37 | if len(loss_dict) >= max_step: 38 | break 39 | 40 | return loss_dict 41 | 42 | 43 | def plot_losses_comparison(oneflow_log_file, openai_log_file, verbose=False): 44 | import matplotlib.pyplot as plt
45 | 46 | loss_pattern = r"loss=[+-]?((\d+(\.\d+)?)|(\.\d+))" 47 | of_step_pattern = r"step=(\d+)" 48 | of_loss_dict = parse_losses_for_log_file( 49 | oneflow_log_file, loss_pattern, of_step_pattern, 100, verbose 50 | ) 51 | 52 | oa_step_pattern = r"\[(\d+)\s\|\s\d+\.\d+\]" 53 | oa_loss_dict = parse_losses_for_log_file( 54 | openai_log_file, loss_pattern, oa_step_pattern, 100, verbose 55 | ) 56 | 57 | if verbose: 58 | print("of_loss_dict:", of_loss_dict) 59 | print("oa_loss_dict:", oa_loss_dict) 60 | 61 | plt.plot(*zip(*sorted(of_loss_dict.items()))) 62 | plt.plot(*zip(*sorted(oa_loss_dict.items()))) 63 | plt.show() 64 | 65 | 66 | if __name__ == "__main__": 67 | if len(sys.argv) <= 1: 68 | raise ValueError("usage: python3 compare_loss.py <log_file>") 69 | 70 | loss_pattern = r"loss=[+-]?((\d+(\.\d+)?)|(\.\d+))" 71 | # step_pattern = r"step=(\d+)" 72 | step_pattern = r"\[(\d+)\s\|\s\d+\.\d+\]" 73 | losses = parse_losses_for_log_file(sys.argv[1], loss_pattern, step_pattern, 100) 74 | print(losses) 75 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/tools/convert_py_model_to_of.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | import torch 5 | import meta_pb2 as meta_pb 6 | 7 | 8 | def get_args(): 9 | 10 | parser = argparse.ArgumentParser() 11 | 12 | ## Required parameters 13 | parser.add_argument( 14 | "--py_model_dir", 15 | type=str, 16 | default="/path/to/iter_0500000/mp_rank_00/model_optim_rng.pt", 17 | help="Path to the PyTorch checkpoint file.", 18 | ) 19 | parser.add_argument( 20 | "--of_dump_path", 21 | type=str, 22 | default="./convert_pt_to_of_gpt_release", 23 | help="Path to the output OneFlow model.", 24 | ) 25 | 26 | return parser.parse_args() 27 | 28 | 29 | def _SaveWeightBlob2File(blob, op_name, save_path, var="out", meta="meta"): 30 | folder = os.path.join(save_path, op_name) 31 | if not os.path.exists(folder): 32 | os.makedirs(folder) 33 | filename = os.path.join(folder, var) 34 | with open(filename, "wb") as f: 35 | f.write(blob.tobytes()) 36 | meta_info = meta_pb.Meta() 37 | meta_info.shape.dim[:] = blob.shape 38 | meta_info.data_type = meta_pb.kFloat 39 | filename = os.path.join(folder, meta) 40 | with open(filename, "w") as f: 41 | f.write(str(meta_info)) 42 | 43 | np.save(filename, blob)  # np.save appends ".npy", so this also dumps the raw blob next to the meta file 44 | 45 | 46 | def _SaveWeightBlob2FileExtend(blob, op_name, save_path, var="out", meta="meta"): 47 | _SaveWeightBlob2File(blob.numpy(), op_name, save_path, var=var, meta=meta) 48 | _SaveWeightBlob2File( 49 | np.ones_like(blob), op_name + "-v", save_path, var=var, meta=meta 50 | ) 51 | _SaveWeightBlob2File( 52 | np.zeros_like(blob), op_name + "-m", save_path, var=var, meta=meta 53 | ) 54 | 55 | 56 | def convert(args): 57 | path = args.py_model_dir 58 | state_dict = torch.load(path, map_location="cpu") 59 | for model_key, model_value in state_dict["model"]["language_model"][ 60 | "transformer" 61 | ].items(): 62 | if len(model_value.shape) > 1: 63 | model_value = torch.transpose(model_value, 0, 1) 64 | model_value = model_value.float() 65 | op_name_list = model_key.split(".") 66 | if "layers." in model_key: 67 | op_name = model_key.replace("layers.", "model-") 68 | op_name = op_name.replace( 69 | "-%s."
% (op_name_list[1]), "-h%s-" % (op_name_list[1]) 70 | ) 71 | else: 72 | op_name = model_key.replace("final_layernorm.", "model-layernorm_f-") 73 | op_name = op_name.replace("input_layernorm.", "layernorm_1-") 74 | op_name = op_name.replace("post_attention_layernorm.", "layernorm_2-") 75 | op_name = op_name.replace("attention.", "attn-") 76 | op_name = op_name.replace("query_key_value.", "c_attn-") 77 | op_name = op_name.replace("dense.", "c_proj-") 78 | op_name = op_name.replace("mlp.dense_h_to_4h.", "mlp-c_fc-") 79 | op_name = op_name.replace("mlp.dense_4h_to_h.", "mlp-c_proj-") 80 | 81 | if ( 82 | "layernorm_1" in op_name 83 | or "layernorm_2" in op_name 84 | or "layernorm_f" in op_name 85 | ): 86 | op_name = op_name.replace("-weight", "-gamma") 87 | op_name = op_name.replace("-bias", "-beta") 88 | 89 | print(model_key, "-" * 8, op_name) 90 | _SaveWeightBlob2FileExtend(model_value, op_name, args.of_dump_path) 91 | 92 | _SaveWeightBlob2FileExtend( 93 | state_dict["model"]["language_model"]["embedding"]["position_embeddings"][ 94 | "weight" 95 | ].float(), 96 | "model-wpe", 97 | args.of_dump_path, 98 | ) 99 | _SaveWeightBlob2FileExtend( 100 | state_dict["model"]["language_model"]["embedding"]["word_embeddings"][ 101 | "weight" 102 | ].float(), 103 | "model-wte", 104 | args.of_dump_path, 105 | ) 106 | 107 | 108 | if __name__ == "__main__": 109 | args = get_args() 110 | convert(args) 111 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/tools/launch_container.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import subprocess 4 | import tempfile 5 | 6 | 7 | def pwd(): 8 | return os.getcwd() 9 | 10 | 11 | def homepath(relative_path=None): 12 | if relative_path is None: 13 | return os.path.expanduser("~") 14 | 15 | return os.path.expanduser(f"~/{relative_path}") 16 | 17 | 18 | def py_bin_path(py_ver): 19 | py_ver_list = py_ver.split(".") 20 | major, minor = py_ver_list[:2] 21 | ver = f"{major}{minor}" 22 | return f"/opt/python/cp{ver}-cp{ver}m/bin" 23 | 24 | 25 | def launch_oneflow_gpt_container( 26 | cmd, 27 | src, 28 | image, 29 | wheel, 30 | extra_mount=None, 31 | py_ver="3.7", 32 | proxy=None, 33 | interactive=True, 34 | name="oneflow_gpt", 35 | ): 36 | bash_script = f"""set -ex 37 | export PATH={py_bin_path(py_ver)}:$PATH 38 | python3 -m pip install {wheel} 39 | python3 -m pip install -e {src} 40 | {cmd or 'bash'} 41 | """ 42 | 43 | docker_args = "" 44 | 45 | if proxy is not None: 46 | docker_args += f" -e http_proxy={proxy} -e https_proxy={proxy} -e HTTP_PROXY={proxy} -e HTTPS_PROXY={proxy}" 47 | 48 | if extra_mount is not None: 49 | docker_args += f" -v {extra_mount}:{extra_mount}" 50 | 51 | docker_cmd = "docker run" 52 | 53 | if interactive: 54 | docker_cmd += " -it" 55 | 56 | docker_cmd += " --rm" 57 | docker_cmd += " --runtime nvidia" 58 | docker_cmd += " --privileged" 59 | docker_cmd += " --network host" 60 | docker_cmd += " --shm-size=8g" 61 | 62 | docker_cmd += docker_args 63 | docker_cmd += f" -v {src}:{src}" 64 | docker_cmd += f" -v {homepath('var-cache')}:/var/cache" 65 | docker_cmd += " -v /tmp:/host/tmp" 66 | docker_cmd += f" -v {pwd()}:{pwd()}" 67 | docker_cmd += f" -w {pwd()}" 68 | docker_cmd += f" --name {name}" 69 | docker_cmd += f" {image}" 70 | 71 | with tempfile.NamedTemporaryFile(mode="w+", encoding="utf-8") as f: 72 | t_fname = f.name 73 | f.write(bash_script) 74 | f.flush() 75 | print("tempfile name:", t_fname) 76 | docker_cmd += f" bash 
/host{t_fname}" 77 | print(docker_cmd) 78 | subprocess.check_call(docker_cmd, shell=True) 79 | 80 | 81 | def parse_args(): 82 | parser = argparse.ArgumentParser() 83 | parser.add_argument("--cmd", type=str, default=None, help="") 84 | parser.add_argument("--src", type=str, default=f"{pwd()}/oneflow_gpt", help="") 85 | parser.add_argument( 86 | "--image", type=str, default="oneflow-manylinux2014-cuda11.2:0.1", help="", 87 | ) 88 | parser.add_argument( 89 | "--wheel", 90 | type=str, 91 | default="$PWD/packages/oneflow-0.3.5+cu112.git.4a4f032-cp37-cp37m-linux_x86_64.whl", 92 | help="", 93 | ) 94 | parser.add_argument("--extra-mount", type=str, default="/data", help="") 95 | parser.add_argument("--py", type=str, default="3.7", help="") 96 | parser.add_argument("--proxy", type=str, default=None, help="") 97 | parser.add_argument("--no-interactive", action="store_false", dest="interactive", help="") 98 | return parser.parse_args() 99 | 100 | 101 | if __name__ == "__main__": 102 | args = parse_args() 103 | launch_oneflow_gpt_container( 104 | args.cmd, 105 | args.src, 106 | args.image, 107 | args.wheel, 108 | args.extra_mount, 109 | args.py, 110 | args.proxy, 111 | args.interactive, 112 | ) 113 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/tools/meta.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | message Shape { 4 | repeated int32 dim = 1; 5 | } 6 | 7 | enum DataType { 8 | kInvalidDataType = 0; 9 | kChar = 1; 10 | kFloat = 2; 11 | kDouble = 3; 12 | kInt8 = 4; 13 | kInt32 = 5; 14 | kInt64 = 6; 15 | kUInt8 = 7; 16 | kOFRecord = 8; 17 | kFloat16 = 9; 18 | kTensorBuffer = 10; 19 | } 20 | 21 | message Meta { 22 | required Shape shape = 1; 23 | required DataType data_type = 2 [default = kFloat16]; 24 | } 25 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/tools/meta_pb2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by the protocol buffer compiler. DO NOT EDIT! 
3 | # source: meta.proto 4 | """Generated protocol buffer code.""" 5 | from google.protobuf.internal import enum_type_wrapper 6 | from google.protobuf import descriptor as _descriptor 7 | from google.protobuf import message as _message 8 | from google.protobuf import reflection as _reflection 9 | from google.protobuf import symbol_database as _symbol_database 10 | # @@protoc_insertion_point(imports) 11 | 12 | _sym_db = _symbol_database.Default() 13 | 14 | 15 | 16 | 17 | DESCRIPTOR = _descriptor.FileDescriptor( 18 | name='meta.proto', 19 | package='', 20 | syntax='proto2', 21 | serialized_options=None, 22 | create_key=_descriptor._internal_create_key, 23 | serialized_pb=b'\n\nmeta.proto\"\x14\n\x05Shape\x12\x0b\n\x03\x64im\x18\x01 \x03(\x05\"E\n\x04Meta\x12\x15\n\x05shape\x18\x01 \x02(\x0b\x32\x06.Shape\x12&\n\tdata_type\x18\x02 \x02(\x0e\x32\t.DataType:\x08kFloat16*\xa3\x01\n\x08\x44\x61taType\x12\x14\n\x10kInvalidDataType\x10\x00\x12\t\n\x05kChar\x10\x01\x12\n\n\x06kFloat\x10\x02\x12\x0b\n\x07kDouble\x10\x03\x12\t\n\x05kInt8\x10\x04\x12\n\n\x06kInt32\x10\x05\x12\n\n\x06kInt64\x10\x06\x12\n\n\x06kUInt8\x10\x07\x12\r\n\tkOFRecord\x10\x08\x12\x0c\n\x08kFloat16\x10\t\x12\x11\n\rkTensorBuffer\x10\n' 24 | ) 25 | 26 | _DATATYPE = _descriptor.EnumDescriptor( 27 | name='DataType', 28 | full_name='DataType', 29 | filename=None, 30 | file=DESCRIPTOR, 31 | create_key=_descriptor._internal_create_key, 32 | values=[ 33 | _descriptor.EnumValueDescriptor( 34 | name='kInvalidDataType', index=0, number=0, 35 | serialized_options=None, 36 | type=None, 37 | create_key=_descriptor._internal_create_key), 38 | _descriptor.EnumValueDescriptor( 39 | name='kChar', index=1, number=1, 40 | serialized_options=None, 41 | type=None, 42 | create_key=_descriptor._internal_create_key), 43 | _descriptor.EnumValueDescriptor( 44 | name='kFloat', index=2, number=2, 45 | serialized_options=None, 46 | type=None, 47 | create_key=_descriptor._internal_create_key), 48 | _descriptor.EnumValueDescriptor( 49 | name='kDouble', index=3, number=3, 50 | serialized_options=None, 51 | type=None, 52 | create_key=_descriptor._internal_create_key), 53 | _descriptor.EnumValueDescriptor( 54 | name='kInt8', index=4, number=4, 55 | serialized_options=None, 56 | type=None, 57 | create_key=_descriptor._internal_create_key), 58 | _descriptor.EnumValueDescriptor( 59 | name='kInt32', index=5, number=5, 60 | serialized_options=None, 61 | type=None, 62 | create_key=_descriptor._internal_create_key), 63 | _descriptor.EnumValueDescriptor( 64 | name='kInt64', index=6, number=6, 65 | serialized_options=None, 66 | type=None, 67 | create_key=_descriptor._internal_create_key), 68 | _descriptor.EnumValueDescriptor( 69 | name='kUInt8', index=7, number=7, 70 | serialized_options=None, 71 | type=None, 72 | create_key=_descriptor._internal_create_key), 73 | _descriptor.EnumValueDescriptor( 74 | name='kOFRecord', index=8, number=8, 75 | serialized_options=None, 76 | type=None, 77 | create_key=_descriptor._internal_create_key), 78 | _descriptor.EnumValueDescriptor( 79 | name='kFloat16', index=9, number=9, 80 | serialized_options=None, 81 | type=None, 82 | create_key=_descriptor._internal_create_key), 83 | _descriptor.EnumValueDescriptor( 84 | name='kTensorBuffer', index=10, number=10, 85 | serialized_options=None, 86 | type=None, 87 | create_key=_descriptor._internal_create_key), 88 | ], 89 | containing_type=None, 90 | serialized_options=None, 91 | serialized_start=108, 92 | serialized_end=271, 93 | ) 94 | _sym_db.RegisterEnumDescriptor(_DATATYPE) 95 | 96 | DataType 
= enum_type_wrapper.EnumTypeWrapper(_DATATYPE) 97 | kInvalidDataType = 0 98 | kChar = 1 99 | kFloat = 2 100 | kDouble = 3 101 | kInt8 = 4 102 | kInt32 = 5 103 | kInt64 = 6 104 | kUInt8 = 7 105 | kOFRecord = 8 106 | kFloat16 = 9 107 | kTensorBuffer = 10 108 | 109 | 110 | 111 | _SHAPE = _descriptor.Descriptor( 112 | name='Shape', 113 | full_name='Shape', 114 | filename=None, 115 | file=DESCRIPTOR, 116 | containing_type=None, 117 | create_key=_descriptor._internal_create_key, 118 | fields=[ 119 | _descriptor.FieldDescriptor( 120 | name='dim', full_name='Shape.dim', index=0, 121 | number=1, type=5, cpp_type=1, label=3, 122 | has_default_value=False, default_value=[], 123 | message_type=None, enum_type=None, containing_type=None, 124 | is_extension=False, extension_scope=None, 125 | serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), 126 | ], 127 | extensions=[ 128 | ], 129 | nested_types=[], 130 | enum_types=[ 131 | ], 132 | serialized_options=None, 133 | is_extendable=False, 134 | syntax='proto2', 135 | extension_ranges=[], 136 | oneofs=[ 137 | ], 138 | serialized_start=14, 139 | serialized_end=34, 140 | ) 141 | 142 | 143 | _META = _descriptor.Descriptor( 144 | name='Meta', 145 | full_name='Meta', 146 | filename=None, 147 | file=DESCRIPTOR, 148 | containing_type=None, 149 | create_key=_descriptor._internal_create_key, 150 | fields=[ 151 | _descriptor.FieldDescriptor( 152 | name='shape', full_name='Meta.shape', index=0, 153 | number=1, type=11, cpp_type=10, label=2, 154 | has_default_value=False, default_value=None, 155 | message_type=None, enum_type=None, containing_type=None, 156 | is_extension=False, extension_scope=None, 157 | serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), 158 | _descriptor.FieldDescriptor( 159 | name='data_type', full_name='Meta.data_type', index=1, 160 | number=2, type=14, cpp_type=8, label=2, 161 | has_default_value=True, default_value=9, 162 | message_type=None, enum_type=None, containing_type=None, 163 | is_extension=False, extension_scope=None, 164 | serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), 165 | ], 166 | extensions=[ 167 | ], 168 | nested_types=[], 169 | enum_types=[ 170 | ], 171 | serialized_options=None, 172 | is_extendable=False, 173 | syntax='proto2', 174 | extension_ranges=[], 175 | oneofs=[ 176 | ], 177 | serialized_start=36, 178 | serialized_end=105, 179 | ) 180 | 181 | _META.fields_by_name['shape'].message_type = _SHAPE 182 | _META.fields_by_name['data_type'].enum_type = _DATATYPE 183 | DESCRIPTOR.message_types_by_name['Shape'] = _SHAPE 184 | DESCRIPTOR.message_types_by_name['Meta'] = _META 185 | DESCRIPTOR.enum_types_by_name['DataType'] = _DATATYPE 186 | _sym_db.RegisterFileDescriptor(DESCRIPTOR) 187 | 188 | Shape = _reflection.GeneratedProtocolMessageType('Shape', (_message.Message,), { 189 | 'DESCRIPTOR' : _SHAPE, 190 | '__module__' : 'meta_pb2' 191 | # @@protoc_insertion_point(class_scope:Shape) 192 | }) 193 | _sym_db.RegisterMessage(Shape) 194 | 195 | Meta = _reflection.GeneratedProtocolMessageType('Meta', (_message.Message,), { 196 | 'DESCRIPTOR' : _META, 197 | '__module__' : 'meta_pb2' 198 | # @@protoc_insertion_point(class_scope:Meta) 199 | }) 200 | _sym_db.RegisterMessage(Meta) 201 | 202 | 203 | # @@protoc_insertion_point(module_scope) 204 | -------------------------------------------------------------------------------- /LanguageModeling/GPT/tools/prepare_distribute.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | inventory=$(dirname $0)/ansible_inventory 4 | hosts= 5 | wksp= 6 | oneflow_gpt_src_dir= 7 | wheel= 8 | pip_install=on 9 | 10 | function help() { 11 | echo "Usage: prepare_distribute.sh [ -h | --help ] 12 | [ -i | --inventory inventory_file ] 13 | [ -n | --hosts hosts] 14 | [ -s | --src oneflow_gpt_src_dir ] 15 | [ -w | --wheel wheel_file ] 16 | [ --no-install ] workspace_dir" 17 | exit 2 18 | } 19 | 20 | function parse_args() { 21 | args=$(getopt -o hi:n:s:w: -a -l help,inventory:,hosts:,src:,wheel:,no-install -- "$@") 22 | if [[ $? -ne 0 ]]; then 23 | help 24 | fi 25 | 26 | echo "parsed args is ${args}" 27 | eval set -- "${args}" 28 | 29 | while : 30 | do 31 | case "$1" in 32 | -h|--help) 33 | help 34 | ;; 35 | -i|--inventory) 36 | inventory="$2" 37 | shift 38 | ;; 39 | -n|--hosts) 40 | hosts="$2" 41 | shift 42 | ;; 43 | -s|--src) 44 | oneflow_gpt_src_dir="$2" 45 | shift 46 | ;; 47 | -w|--wheel) 48 | wheel="$2" 49 | shift 50 | ;; 51 | --no-install) 52 | pip_install= 53 | ;; 54 | --) 55 | shift 56 | break 57 | ;; 58 | *) 59 | echo "Unexpected option: $1" 60 | help 61 | ;; 62 | esac 63 | shift 64 | done 65 | 66 | echo "remaining args are: $@" 67 | echo "remaining args number are: $#" 68 | if [[ $# -ne 0 ]]; then 69 | wksp=$1 70 | else 71 | wksp=$PWD 72 | fi 73 | } 74 | 75 | parse_args "$@" 76 | 77 | if [[ -z "${hosts}" ]]; then 78 | echo "hosts is unset" 79 | exit 1 80 | fi 81 | 82 | ansible ${hosts} -i ${inventory} -m file -a "path=${wksp} state=directory" 83 | 84 | if [[ ! -z "${wheel}" ]]; then 85 | wheel=$(realpath "${wheel}") 86 | wheel_dir=$(realpath $(dirname "${wheel}")) 87 | ansible ${hosts} -i ${inventory} -m file -a "path=${wheel_dir} state=directory" 88 | ansible ${hosts} -i ${inventory} -m copy -a "src=${wheel} dest=${wheel}" 89 | if [[ ! -z "${pip_install}" ]]; then 90 | ansible ${hosts} -i ${inventory} -m shell -a "python3 -m pip install ${wheel} --user" 91 | fi 92 | fi 93 | 94 | if [[ ! -z "${oneflow_gpt_src_dir}" ]]; then 95 | ansible ${hosts} -i ${inventory} -m copy -a "src=${oneflow_gpt_src_dir} dest=${wksp}/oneflow_gpt" 96 | if [[ ! -z "${pip_install}" ]]; then 97 | ansible ${hosts} -i ${inventory} -m shell -a "python3 -m pip install -e ${wksp}/oneflow_gpt --user" 98 | fi 99 | fi 100 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OneFlow Deep Learning Benchmarks 2 | ## Introduction 3 | This repository provides OneFlow deep learning benchmark examples for CV, CTR and NLP; more models are on the way and will be added here when ready.
4 | 5 | ## [Convolutional Networks](./Classification/cnns) for Computer Vision Classification 6 | - [ResNet-50](./Classification/cnns) 7 | - [ResNeXt-50-32*4d](./Classification/cnns) 8 | - [VGG-16](./Classification/cnns) 9 | - [Inception-V3](./Classification/cnns) 10 | - [AlexNet](./Classification/cnns) 11 | - [MobileNet-V2](./Classification/cnns) 12 | 13 | ## [Wide Deep Learning](./ClickThroughRate/WideDeepLearning) for Click-Through-Rate (CTR) Recommender Systems 14 | - [OneFlow-WDL](./ClickThroughRate/WideDeepLearning) 15 | 16 | ## [BERT](./LanguageModeling/BERT) for Natural Language Processing 17 | - [BERT Pretrain for Language Modeling](./LanguageModeling/BERT) 18 | - [SQuAD for Question Answering](./LanguageModeling/BERT) 19 | - [CoLA and MRPC of GLUE](./LanguageModeling/BERT) 20 | 21 | ## [GPT](./LanguageModeling/GPT) for Generative Pre-trained Transformer 22 | - [Generative Pre-trained Transformer](./LanguageModeling/GPT) 23 | 24 | ## OneFlow Benchmark Test Reports 25 | 26 | | Model | DType | XLA | Throughput | Speedup on 32 devices | 27 | | ----- | ----- | --- | ---------- | ------- | 28 | | [ResNet50-V1.5](./reports/resnet50_v15_fp32_report.md) | Float32 | No | 11.6k images/sec | 30.4 | 29 | | [BERT base Pretrain](./reports/bert_fp32_report.md) | Float32 | No | 530k tokens/sec | 28.54 | 30 | --------------------------------------------------------------------------------
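For reference, here is a minimal sketch of how the zero-shot LAMBADA evaluation wired up in `tasks/main.py` and `tasks/zeroshot_gpt/evaluate.py` above might be launched. It assumes the GPT package has been installed (e.g. `python3 -m pip install -e .` inside `LanguageModeling/GPT`), that `--load` and the batch/sequence flags are provided by `get_args` in `oneflow_gpt/config.py` (not shown in this dump), and every path below is illustrative only:

```bash
# Hypothetical invocation of the zero-shot LAMBADA task; all paths are examples.
# --task, --valid-data, --tokenizer-type, --vocab-file, --merge-file and
# --strict-lambada are defined by get_tasks_args in tasks/main.py; --load is
# assumed to be the checkpoint flag from oneflow_gpt/config.py.
python3 tasks/main.py \
    --task LAMBADA \
    --load ./model_checkpoint \
    --valid-data ./data/lambada_test.jsonl \
    --tokenizer-type GPT2BPETokenizer \
    --vocab-file ./data/gpt2-vocab.json \
    --merge-file ./data/gpt2-merges.txt \
    --strict-lambada
```

Note that `evaluate.py` asserts `--load` points at a checkpoint and `_build_lambada_dataset` asserts exactly one `--valid-data` file is given, so both are required here.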