├── docker ├── __init__.py ├── build_artifacts │ ├── __init__.py │ ├── dockerd-entrypoint.py │ └── deep_learning_container.py ├── 1.15.0 │ ├── py3 │ │ ├── dockerd-entrypoint.py │ │ └── Dockerfile.cpu │ └── py2 │ │ ├── Dockerfile.cpu │ │ └── Dockerfile.gpu ├── 2.0.0 │ ├── py2 │ │ ├── dockerd-entrypoint.py │ │ ├── Dockerfile.cpu │ │ └── Dockerfile.gpu │ └── py3 │ │ ├── dockerd-entrypoint.py │ │ └── Dockerfile.cpu ├── 1.10.0 │ ├── Dockerfile.cpu │ └── Dockerfile.gpu ├── 1.11.0 │ ├── Dockerfile.cpu │ └── Dockerfile.gpu ├── 1.12.0 │ ├── Dockerfile.cpu │ └── Dockerfile.gpu ├── 2.1.0 │ ├── py2 │ │ ├── Dockerfile.cpu │ │ └── Dockerfile.gpu │ └── py3 │ │ └── Dockerfile.cpu ├── 1.13.1 │ ├── Dockerfile.cpu │ └── Dockerfile.gpu ├── 2.0.1 │ ├── py3 │ │ └── Dockerfile.cpu │ └── py2 │ │ ├── Dockerfile.cpu │ │ └── Dockerfile.gpu └── 1.14.0 │ ├── py2 │ └── Dockerfile.cpu │ └── py3 │ └── Dockerfile.cpu ├── VERSION ├── CODEOWNERS ├── test ├── resources │ ├── test_dir_wrong_model │ │ └── fake_model.h5 │ ├── test_dir_correct_model │ │ └── 12345 │ │ │ └── saved_model.pb │ ├── test_dir_wrong_parent_dir │ │ └── not-digit │ │ │ └── saved_model.pb │ ├── mnist │ │ ├── data │ │ │ ├── test │ │ │ │ ├── x_test.npy │ │ │ │ └── y_test.npy │ │ │ └── train │ │ │ │ ├── x_train.npy │ │ │ │ └── y_train.npy │ │ ├── data-distributed │ │ │ ├── eval_data.npy │ │ │ ├── eval_labels.npy │ │ │ ├── train_data.npy │ │ │ └── train_labels.npy │ │ ├── __init__.py │ │ ├── mnist.py │ │ ├── smdataparallel_mnist.py │ │ ├── horovod_mnist.py │ │ └── mnist_custom.py │ ├── hvdbasic │ │ ├── train_hvd_basic.py │ │ └── train_hvd_env_vars.py │ ├── __init__.py │ ├── multi_worker_mirrored │ │ ├── __init__.py │ │ └── train_dummy.py │ └── tuning_model_dir │ │ └── entry.py ├── container │ └── 2.7.1 │ │ ├── Dockerfile.dlc.cpu │ │ ├── Dockerfile.dlc.gpu │ │ ├── Dockerfile.tf.cpu │ │ └── Dockerfile.tf.gpu ├── unit │ ├── __init__.py │ └── test_s3_utils.py ├── utils │ ├── __init__.py │ └── image_utils.py └── integration │ ├── __init__.py │ ├── sagemaker │ ├── timeout.py │ ├── test_multi_worker_mirrored.py │ ├── test_tuning_model_dir.py │ ├── test_smdataparallel.py │ ├── test_horovod_sagemaker.py │ ├── test_mnist.py │ └── recordio_utils.py │ └── local │ ├── test_horovod_local.py │ └── test_training.py ├── NOTICE ├── .flake8 ├── .gitignore ├── .github └── PULL_REQUEST_TEMPLATE.md ├── MANIFEST.in ├── CODE_OF_CONDUCT.md ├── .coveragerc_py38 ├── .coveragerc_py39 ├── .coveragerc_py37 ├── benchmarks ├── tf_benchmarks │ ├── bench.sh │ ├── README.md │ └── execute_tensorflow_training.py └── horovod-resnet │ ├── bench.sh │ └── train.sh ├── src └── sagemaker_tensorflow_container │ ├── __init__.py │ ├── s3_utils.py │ └── deep_learning_container.py ├── scripts ├── publish_all.py └── build_all.py ├── tox.ini ├── setup.py └── README.rst /docker/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 20.5.1.dev0 2 | -------------------------------------------------------------------------------- /docker/build_artifacts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @aws/sagemaker-jobs-platform 2 | 
-------------------------------------------------------------------------------- /test/resources/test_dir_wrong_model/fake_model.h5: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/resources/test_dir_correct_model/12345/saved_model.pb: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/resources/test_dir_wrong_parent_dir/not-digit/saved_model.pb: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | application_import_names = image_utils, integration, sagemaker_tensorflow_container, test, timeout, utils 3 | import-order-style = google 4 | -------------------------------------------------------------------------------- /test/resources/mnist/data/test/x_test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/sagemaker-tensorflow-training-toolkit/HEAD/test/resources/mnist/data/test/x_test.npy -------------------------------------------------------------------------------- /test/resources/mnist/data/test/y_test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/sagemaker-tensorflow-training-toolkit/HEAD/test/resources/mnist/data/test/y_test.npy -------------------------------------------------------------------------------- /test/resources/mnist/data/train/x_train.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/sagemaker-tensorflow-training-toolkit/HEAD/test/resources/mnist/data/train/x_train.npy -------------------------------------------------------------------------------- /test/resources/mnist/data/train/y_train.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/sagemaker-tensorflow-training-toolkit/HEAD/test/resources/mnist/data/train/y_train.npy -------------------------------------------------------------------------------- /test/resources/mnist/data-distributed/eval_data.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/sagemaker-tensorflow-training-toolkit/HEAD/test/resources/mnist/data-distributed/eval_data.npy -------------------------------------------------------------------------------- /test/resources/mnist/data-distributed/eval_labels.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/sagemaker-tensorflow-training-toolkit/HEAD/test/resources/mnist/data-distributed/eval_labels.npy -------------------------------------------------------------------------------- /test/resources/mnist/data-distributed/train_data.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aws/sagemaker-tensorflow-training-toolkit/HEAD/test/resources/mnist/data-distributed/train_data.npy -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *pyc 2 | dist 3 | **/*.egg-info 4 | .DS_Store 5 | .idea/ 6 | .cache/ 7 | *.iml 8 | **/.ipynb_checkpoints 9 | **/.python-version 10 | .tox 11 | *~ 12 | .coverage 13 | -------------------------------------------------------------------------------- /test/resources/mnist/data-distributed/train_labels.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/sagemaker-tensorflow-training-toolkit/HEAD/test/resources/mnist/data-distributed/train_labels.npy -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | *Issue #, if available:* 2 | 3 | *Description of changes:* 4 | 5 | 6 | By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. 7 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include src/sagemaker_tensorflow_container * 2 | 3 | include VERSION 4 | include LICENSE 5 | include README.rst 6 | 7 | prune test 8 | 9 | recursive-exclude * __pycache__ 10 | recursive-exclude * *.py[co] 11 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 
5 | -------------------------------------------------------------------------------- /.coveragerc_py38: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | timid = True 4 | 5 | [report] 6 | exclude_lines = 7 | pragma: no cover 8 | pragma: py3 no cover 9 | if six.PY2 10 | elif six.PY2 11 | 12 | partial_branches = 13 | pragma: no cover 14 | pragma: py3 no cover 15 | if six.PY3 16 | elif six.PY3 17 | 18 | show_missing = True 19 | 20 | fail_under = 70 -------------------------------------------------------------------------------- /.coveragerc_py39: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | timid = True 4 | 5 | [report] 6 | exclude_lines = 7 | pragma: no cover 8 | pragma: py3 no cover 9 | if six.PY2 10 | elif six.PY2 11 | 12 | partial_branches = 13 | pragma: no cover 14 | pragma: py3 no cover 15 | if six.PY3 16 | elif six.PY3 17 | 18 | show_missing = True 19 | 20 | fail_under = 70 -------------------------------------------------------------------------------- /.coveragerc_py37: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | timid = True 4 | 5 | [report] 6 | exclude_lines = 7 | pragma: no cover 8 | pragma: py3 no cover 9 | if six.PY2 10 | elif six.PY2 11 | 12 | partial_branches = 13 | pragma: no cover 14 | pragma: py3 no cover 15 | if six.PY3 16 | elif six.PY3 17 | 18 | show_missing = True 19 | 20 | fail_under = 70 21 | -------------------------------------------------------------------------------- /test/container/2.7.1/Dockerfile.dlc.cpu: -------------------------------------------------------------------------------- 1 | ARG region 2 | FROM 763104351884.dkr.ecr.$region.amazonaws.com/tensorflow-training:2.7.1-cpu-py38-ubuntu20.04-sagemaker 3 | 4 | COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz 5 | RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \ 6 | rm /sagemaker_tensorflow_training.tar.gz 7 | -------------------------------------------------------------------------------- /test/container/2.7.1/Dockerfile.dlc.gpu: -------------------------------------------------------------------------------- 1 | ARG region 2 | FROM 763104351884.dkr.ecr.$region.amazonaws.com/tensorflow-training:2.7.1-gpu-py38-cu112-ubuntu20.04-sagemaker 3 | 4 | COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz 5 | RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \ 6 | rm /sagemaker_tensorflow_training.tar.gz 7 | -------------------------------------------------------------------------------- /test/resources/hvdbasic/train_hvd_basic.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import horovod.tensorflow as hvd 4 | 5 | hvd.init() 6 | 7 | with open( 8 | os.path.join("/opt/ml/model/local-rank-%s-rank-%s" % (hvd.local_rank(), hvd.rank())), "w+" 9 | ) as f: 10 | basic_info = {"local-rank": hvd.local_rank(), "rank": hvd.rank(), "size": hvd.size()} 11 | 12 | print(basic_info) 13 | json.dump(basic_info, f) 14 | -------------------------------------------------------------------------------- /test/container/2.7.1/Dockerfile.tf.cpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:2.7.1 2 | 3 | ENV SAGEMAKER_TRAINING_MODULE 
sagemaker_tensorflow_container.training:main 4 | 5 | COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz 6 | RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \ 7 | rm /sagemaker_tensorflow_training.tar.gz 8 | RUN pip install --no-cache-dir tensorflow-io 9 | RUN apt-get update && apt-get install -y --no-install-recommends openssh-server && mkdir -p /var/run/sshd -------------------------------------------------------------------------------- /benchmarks/tf_benchmarks/bench.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ./execute_tensorflow_training.py train \ 4 | --framework-version 1.12 \ 5 | --device gpu \ 6 | \ 7 | --instance-types ml.p3.16xlarge \ 8 | \ 9 | --instance-counts 1 \ 10 | --instance-counts 2 \ 11 | --instance-counts 4 \ 12 | \ 13 | --py-versions py3 \ 14 | \ 15 | --subnets subnet-125fb674 \ 16 | \ 17 | --security-groups sg-ce5dd1b4 \ 18 | \ 19 | --batch-sizes 64 \ 20 | \ 21 | -- --num_batches=1000 --model vgg16 \ 22 | --variable_update horovod --horovod_device gpu --use_fp16 --summary_verbosity 1 --save_summaries_steps 10 -------------------------------------------------------------------------------- /test/resources/hvdbasic/train_hvd_env_vars.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import horovod.tensorflow as hvd 4 | 5 | hvd.init() 6 | 7 | with open("/opt/ml/model/local-rank-%s-rank-%s" % (hvd.local_rank(), hvd.rank()), "w+") as f: 8 | basic_info = {"local-rank": hvd.local_rank(), "rank": hvd.rank(), "size": hvd.size()} 9 | 10 | print(basic_info) 11 | json.dump(basic_info, f) 12 | 13 | val = os.environ.get("AWS_CONTAINER_CREDENTIALS_RELATIVE_URI") 14 | host = os.environ.get("SM_CURRENT_HOST") 15 | 16 | assert val is not None 17 | assert host is not None 18 | 19 | print("host {}: AWS_CONTAINER_CREDENTIALS_RELATIVE_URI={}".format(host, val)) 20 | -------------------------------------------------------------------------------- /src/sagemaker_tensorflow_container/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # A copy of the License is located at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # or in the "license" file accompanying this file. This file is distributed 10 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | # express or implied. See the License for the specific language governing 12 | # permissions and limitations under the License. 13 | -------------------------------------------------------------------------------- /test/unit/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # A copy of the License is located at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # or in the "license" file accompanying this file. This file is distributed 10 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | # express or implied.
See the License for the specific language governing 12 | # permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | -------------------------------------------------------------------------------- /test/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # A copy of the License is located at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # or in the "license" file accompanying this file. This file is distributed 10 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | # express or implied. See the License for the specific language governing 12 | # permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | -------------------------------------------------------------------------------- /test/resources/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # A copy of the License is located at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # or in the "license" file accompanying this file. This file is distributed 10 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | # express or implied. See the License for the specific language governing 12 | # permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | -------------------------------------------------------------------------------- /test/resources/mnist/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # A copy of the License is located at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # or in the "license" file accompanying this file. This file is distributed 10 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | # express or implied. See the License for the specific language governing 12 | # permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | -------------------------------------------------------------------------------- /test/resources/multi_worker_mirrored/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # A copy of the License is located at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # or in the "license" file accompanying this file. This file is distributed 10 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | # express or implied. 
See the License for the specific language governing 12 | # permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | -------------------------------------------------------------------------------- /test/container/2.7.1/Dockerfile.tf.gpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:2.7.1-gpu 2 | 3 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 4 | 5 | COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz 6 | RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \ 7 | rm /sagemaker_tensorflow_training.tar.gz 8 | RUN pip install --no-cache-dir tensorflow-io 9 | RUN apt-key del 7fa2af80 \ 10 | && rm /etc/apt/sources.list.d/nvidia-ml.list \ 11 | && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub \ 12 | && apt-get update \ 13 | && apt-get install -y --no-install-recommends openssh-server && mkdir -p /var/run/sshd 14 | -------------------------------------------------------------------------------- /test/integration/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # A copy of the License is located at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # or in the "license" file accompanying this file. This file is distributed 10 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | # express or implied. See the License for the specific language governing 12 | # permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | 15 | import logging 16 | import os 17 | 18 | logging.getLogger("boto3").setLevel(logging.INFO) 19 | logging.getLogger("botocore").setLevel(logging.INFO) 20 | 21 | RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "resources") 22 | DEFAULT_TIMEOUT = 120 23 | -------------------------------------------------------------------------------- /docker/1.15.0/py3/dockerd-entrypoint.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
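# When the container is started outside a SageMaker training job (no config mounted at /opt/ml/input/config), launch the bundled deep_learning_container.py script in the background, then hand control to whatever command line was passed to the container.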
13 | from __future__ import absolute_import 14 | 15 | import os.path 16 | import shlex 17 | import subprocess 18 | import sys 19 | 20 | if not os.path.exists("/opt/ml/input/config"): 21 | subprocess.Popen(['python', '/usr/local/bin/deep_learning_container.py'], stdout=open(os.devnull, 'wb'), stderr=subprocess.STDOUT)  # backgrounded; '&>/dev/null' and '&' are shell syntax, not list arguments 22 | 23 | subprocess.check_call(shlex.split(' '.join(sys.argv[1:]))) 24 | -------------------------------------------------------------------------------- /docker/2.0.0/py2/dockerd-entrypoint.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | from __future__ import absolute_import 15 | 16 | import os.path 17 | import shlex 18 | import subprocess 19 | import sys 20 | 21 | if not os.path.exists("/opt/ml/input/config"): 22 | subprocess.Popen(['python', '/usr/local/bin/deep_learning_container.py'], stdout=open(os.devnull, 'wb'), stderr=subprocess.STDOUT)  # backgrounded; '&>/dev/null' and '&' are shell syntax, not list arguments 23 | 24 | subprocess.check_call(shlex.split(' '.join(sys.argv[1:]))) 25 | -------------------------------------------------------------------------------- /docker/2.0.0/py3/dockerd-entrypoint.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | from __future__ import absolute_import 15 | 16 | import os.path 17 | import shlex 18 | import subprocess 19 | import sys 20 | 21 | if not os.path.exists("/opt/ml/input/config"): 22 | subprocess.Popen(['python', '/usr/local/bin/deep_learning_container.py'], stdout=open(os.devnull, 'wb'), stderr=subprocess.STDOUT)  # backgrounded; '&>/dev/null' and '&' are shell syntax, not list arguments 23 | 24 | subprocess.check_call(shlex.split(' '.join(sys.argv[1:]))) 25 | -------------------------------------------------------------------------------- /docker/build_artifacts/dockerd-entrypoint.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License.
13 | from __future__ import absolute_import 14 | 15 | import os.path 16 | import shlex 17 | import subprocess 18 | import sys 19 | 20 | if not os.path.exists("/opt/ml/input/config"): 21 | subprocess.Popen(['python', '/usr/local/bin/deep_learning_container.py'], stdout=open(os.devnull, 'wb'), stderr=subprocess.STDOUT)  # backgrounded; '&>/dev/null' and '&' are shell syntax, not list arguments 22 | 23 | subprocess.check_call(shlex.split(' '.join(sys.argv[1:]))) 24 | -------------------------------------------------------------------------------- /benchmarks/horovod-resnet/bench.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"). You 5 | # may not use this file except in compliance with the License. A copy of 6 | # the License is located at 7 | # 8 | # http://aws.amazon.com/apache2.0/ 9 | # 10 | # or in the "license" file accompanying this file. This file is 11 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 12 | # ANY KIND, either express or implied. See the License for the specific 13 | # language governing permissions and limitations under the License. 14 | 15 | ./execute_tensorflow_training.py train \ 16 | --framework-version 1.12 \ 17 | --device gpu \ 18 | \ 19 | --instance-types ml.p3.16xlarge \ 20 | \ 21 | --instance-counts 1 \ 22 | --instance-counts 2 \ 23 | --instance-counts 4 \ 24 | --instance-counts 8 \ 25 | --instance-counts 16 \ 26 | \ 27 | --py-versions py3 \ 28 | \ 29 | --subnets # add subnet id here \ 30 | \ 31 | --security-groups # add security-group id here 32 | -------------------------------------------------------------------------------- /test/resources/tuning_model_dir/entry.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | 15 | import argparse 16 | import os 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--model_dir", type=str) 20 | parser.add_argument("--arbitrary_value", type=int, default=0) 21 | args = parser.parse_args() 22 | 23 | assert os.environ["TRAINING_JOB_NAME"] in args.model_dir, ( 24 | "model_dir not unique to training job: %s" % args.model_dir 25 | ) 26 | 27 | # For the "hyperparameter tuning" to work 28 | print("accuracy=1") 29 | -------------------------------------------------------------------------------- /benchmarks/horovod-resnet/train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"). You 5 | # may not use this file except in compliance with the License. A copy of 6 | # the License is located at 7 | # 8 | # http://aws.amazon.com/apache2.0/ 9 | # 10 | # or in the "license" file accompanying this file.
This file is 11 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 12 | # ANY KIND, either express or implied. See the License for the specific 13 | # language governing permissions and limitations under the License. 14 | 15 | set -ex 16 | 17 | echo "Launching training job using $SM_NUM_GPUS GPUs" 18 | 19 | # p3 instances have larger GPU memory, so a higher batch size can be used 20 | GPU_MEM=`nvidia-smi --query-gpu=memory.total --format=csv,noheader -i 0 | awk '{print $1}'` 21 | if [ $GPU_MEM -gt 15000 ] ; then BATCH_SIZE=256; else BATCH_SIZE=128; fi 22 | 23 | # Training 24 | python -W ignore train_imagenet_resnet_hvd.py --num_epochs 90 --synthetic -b $BATCH_SIZE \ 25 | --lr_decay_mode poly --warmup_epochs 10 --clear_log 26 | -------------------------------------------------------------------------------- /src/sagemaker_tensorflow_container/s3_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the 'license' file accompanying this file. This file is 10 | # distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | 15 | import os 16 | 17 | import boto3 18 | from six.moves.urllib.parse import urlparse 19 | 20 | 21 | def configure(model_dir, job_region): 22 | 23 | os.environ["S3_REGION"] = _s3_region(job_region, model_dir) 24 | 25 | # setting log level to WARNING 26 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1" 27 | os.environ["S3_USE_HTTPS"] = "1" 28 | 29 | 30 | def _s3_region(job_region, model_dir): 31 | if model_dir and model_dir.startswith("s3://"): 32 | s3 = boto3.client("s3", region_name=job_region) 33 | 34 | # We get the AWS region of the checkpoint bucket, which may be different from 35 | # the region this container is currently running in. 36 | parsed_url = urlparse(model_dir) 37 | bucket_name = parsed_url.netloc 38 | 39 | bucket_location = s3.get_bucket_location(Bucket=bucket_name)["LocationConstraint"] 40 | 41 | return bucket_location or job_region 42 | else: 43 | return job_region 44 | -------------------------------------------------------------------------------- /test/integration/sagemaker/timeout.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # A copy of the License is located at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # or in the "license" file accompanying this file. This file is distributed 10 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | # express or implied. See the License for the specific language governing 12 | # permissions and limitations under the License. 
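# SIGALRM-based guard used by the SageMaker integration tests to bound how long a block of code (typically an estimator.fit call) may run.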
13 | from __future__ import absolute_import 14 | 15 | from contextlib import contextmanager 16 | import logging 17 | import signal 18 | 19 | LOGGER = logging.getLogger("timeout") 20 | 21 | 22 | class TimeoutError(Exception): 23 | pass 24 | 25 | 26 | @contextmanager 27 | def timeout(seconds=0, minutes=0, hours=0): 28 | """Add a signal-based timeout to any block of code. 29 | If multiple time units are specified, they will be added together to determine time limit. 30 | Usage: 31 | with timeout(seconds=5): 32 | my_slow_function(...) 33 | Args: 34 | - seconds: The time limit, in seconds. 35 | - minutes: The time limit, in minutes. 36 | - hours: The time limit, in hours. 37 | """ 38 | 39 | limit = seconds + 60 * minutes + 3600 * hours 40 | 41 | def handler(signum, frame): 42 | raise TimeoutError("timed out after {} seconds".format(limit)) 43 | 44 | try: 45 | signal.signal(signal.SIGALRM, handler) 46 | signal.alarm(limit) 47 | 48 | yield 49 | finally: 50 | signal.alarm(0) 51 | -------------------------------------------------------------------------------- /test/unit/test_s3_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the 'license' file accompanying this file. This file is 10 | # distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | 15 | import os 16 | 17 | from mock import MagicMock, patch 18 | 19 | from sagemaker_tensorflow_container import s3_utils 20 | 21 | 22 | BUCKET_REGION = "us-west-2" 23 | JOB_REGION = "us-west-1" 24 | JOB_BUCKET = "sagemaker-us-west-2-000-00-1" 25 | PREFIX = "sagemaker/something" 26 | MODEL_DIR = "s3://{}/{}".format(JOB_BUCKET, PREFIX) 27 | 28 | 29 | @patch("boto3.client") 30 | def test_configure(client): 31 | s3 = MagicMock() 32 | client.return_value = s3 33 | loc = {"LocationConstraint": BUCKET_REGION} 34 | s3.get_bucket_location.return_value = loc 35 | 36 | s3_utils.configure(MODEL_DIR, JOB_REGION) 37 | 38 | assert os.environ["S3_REGION"] == BUCKET_REGION 39 | assert os.environ["TF_CPP_MIN_LOG_LEVEL"] == "1" 40 | assert os.environ["S3_USE_HTTPS"] == "1" 41 | 42 | 43 | def test_configure_local_dir(): 44 | s3_utils.configure("/opt/ml/model", JOB_REGION) 45 | 46 | assert os.environ["S3_REGION"] == JOB_REGION 47 | assert os.environ["TF_CPP_MIN_LOG_LEVEL"] == "1" 48 | assert os.environ["S3_USE_HTTPS"] == "1" 49 | -------------------------------------------------------------------------------- /test/integration/sagemaker/test_multi_worker_mirrored.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017-2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file.
This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | 15 | import os 16 | 17 | from sagemaker.tensorflow import TensorFlow 18 | from sagemaker.utils import unique_name_from_base 19 | 20 | 21 | RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources") 22 | 23 | 24 | def test_multi_node(sagemaker_session, instance_type, image_uri, tmpdir, framework_version, capsys): 25 | estimator = TensorFlow( 26 | entry_point=os.path.join(RESOURCE_PATH, "multi_worker_mirrored", "train_dummy.py"), 27 | role="SageMakerRole", 28 | instance_type=instance_type, 29 | instance_count=2, 30 | image_uri=image_uri, 31 | framework_version=framework_version, 32 | py_version="py3", 33 | hyperparameters={ 34 | "sagemaker_multi_worker_mirrored_strategy_enabled": True, 35 | }, 36 | sagemaker_session=sagemaker_session, 37 | ) 38 | estimator.fit(job_name=unique_name_from_base("test-tf-mwms")) 39 | captured = capsys.readouterr() 40 | logs = captured.out + captured.err 41 | assert "Running distributed training job with multi_worker_mirrored_strategy setup" in logs 42 | assert "TF_CONFIG=" in logs 43 | -------------------------------------------------------------------------------- /test/resources/multi_worker_mirrored/train_dummy.py: -------------------------------------------------------------------------------- 1 | # Please refer to https://github.com/tensorflow/docs/blob/master/site/en/tutorials/distribute/multi_worker_with_keras.ipynb 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | import os 6 | import json 7 | 8 | 9 | def mnist_dataset(batch_size): 10 | (x_train, y_train), _ = tf.keras.datasets.mnist.load_data() 11 | # The `x` arrays are in uint8 and have values in the [0, 255] range. 12 | # You need to convert them to float32 with values in the [0, 1] range. 13 | x_train = x_train / np.float32(255) 14 | y_train = y_train.astype(np.int64) 15 | train_dataset = tf.data.Dataset.from_tensor_slices( 16 | (x_train, y_train)).shuffle(60000).repeat().batch(batch_size) 17 | return train_dataset 18 | 19 | def build_and_compile_cnn_model(): 20 | model = tf.keras.Sequential([ 21 | tf.keras.layers.InputLayer(input_shape=(28, 28)), 22 | tf.keras.layers.Reshape(target_shape=(28, 28, 1)), 23 | tf.keras.layers.Conv2D(32, 3, activation='relu'), 24 | tf.keras.layers.Flatten(), 25 | tf.keras.layers.Dense(128, activation='relu'), 26 | tf.keras.layers.Dense(10) 27 | ]) 28 | model.compile( 29 | loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 30 | optimizer=tf.keras.optimizers.SGD(learning_rate=0.001), 31 | metrics=['accuracy']) 32 | return model 33 | 34 | 35 | per_worker_batch_size = 64 36 | tf_config = json.loads(os.environ['TF_CONFIG']) 37 | num_workers = len(tf_config['cluster']['worker']) 38 | 39 | strategy = tf.distribute.MultiWorkerMirroredStrategy() 40 | 41 | global_batch_size = per_worker_batch_size * num_workers 42 | multi_worker_dataset = mnist_dataset(global_batch_size) 43 | 44 | with strategy.scope(): 45 | # Model building/compiling needs to be within `strategy.scope()`.
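# Variables created under the scope are mirrored on every worker and kept in sync via collective ops, so each worker trains an identical replica of the model.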
46 | multi_worker_model = build_and_compile_cnn_model() 47 | 48 | multi_worker_model.fit(multi_worker_dataset, epochs=3, steps_per_epoch=70) 49 | -------------------------------------------------------------------------------- /test/resources/mnist/mnist.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import sys 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | 10 | def _parse_args(): 11 | 12 | parser = argparse.ArgumentParser() 13 | 14 | # hyperparameters sent by the client are passed as command-line arguments to the script. 15 | parser.add_argument("--epochs", type=int, default=1) 16 | # Data, model, and output directories 17 | parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"]) 18 | parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAINING"]) 19 | parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"])) 20 | parser.add_argument("--current-host", type=str, default=os.environ["SM_CURRENT_HOST"]) 21 | 22 | return parser.parse_known_args() 23 | 24 | 25 | def _load_training_data(base_dir): 26 | x_train = np.load(os.path.join(base_dir, "train", "x_train.npy")) 27 | y_train = np.load(os.path.join(base_dir, "train", "y_train.npy")) 28 | return x_train, y_train 29 | 30 | 31 | def _load_testing_data(base_dir): 32 | x_test = np.load(os.path.join(base_dir, "test", "x_test.npy")) 33 | y_test = np.load(os.path.join(base_dir, "test", "y_test.npy")) 34 | return x_test, y_test 35 | 36 | 37 | args, unknown = _parse_args() 38 | 39 | model = tf.keras.models.Sequential( 40 | [ 41 | tf.keras.layers.Flatten(input_shape=(28, 28)), 42 | tf.keras.layers.Dense(512, activation=tf.nn.relu), 43 | tf.keras.layers.Dropout(0.2), 44 | tf.keras.layers.Dense(10, activation=tf.nn.softmax), 45 | ] 46 | ) 47 | 48 | model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]) 49 | x_train, y_train = _load_training_data(args.train) 50 | x_test, y_test = _load_testing_data(args.train) 51 | model.fit(x_train, y_train, epochs=args.epochs) 52 | model.evaluate(x_test, y_test) 53 | 54 | if args.current_host == args.hosts[0]: 55 | model.save(os.path.join("/opt/ml/model", "my_model.h5")) 56 | -------------------------------------------------------------------------------- /test/integration/sagemaker/test_tuning_model_dir.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
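# Runs a single-job hyperparameter tuning round; entry.py itself asserts that the model_dir it receives embeds the training job name, so a successful fit is the pass condition.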
13 | from __future__ import absolute_import 14 | 15 | import os 16 | 17 | from sagemaker.tensorflow import TensorFlow 18 | from sagemaker.tuner import HyperparameterTuner, IntegerParameter 19 | from sagemaker.utils import unique_name_from_base 20 | 21 | 22 | def test_model_dir_with_training_job_name( 23 | sagemaker_session, image_uri, instance_type, framework_version 24 | ): 25 | resource_path = os.path.join(os.path.dirname(__file__), "../..", "resources") 26 | script = os.path.join(resource_path, "tuning_model_dir", "entry.py") 27 | 28 | estimator = TensorFlow( 29 | entry_point=script, 30 | role="SageMakerRole", 31 | instance_type=instance_type, 32 | instance_count=1, 33 | image_uri=image_uri, 34 | framework_version=framework_version, 35 | py_version="py3", 36 | sagemaker_session=sagemaker_session, 37 | ) 38 | 39 | tuner = HyperparameterTuner( 40 | estimator=estimator, 41 | objective_metric_name="accuracy", 42 | hyperparameter_ranges={"arbitrary_value": IntegerParameter(0, 1)}, 43 | metric_definitions=[{"Name": "accuracy", "Regex": "accuracy=([01])"}], 44 | max_jobs=1, 45 | max_parallel_jobs=1, 46 | ) 47 | 48 | # User script has logic to check for the correct model_dir 49 | tuner.fit(job_name=unique_name_from_base("test-tf-model-dir", max_length=32)) 50 | tuner.wait() 51 | -------------------------------------------------------------------------------- /benchmarks/tf_benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # TensorFlow benchmarking scripts 2 | 3 | This folder contains the TensorFlow benchmarking scripts from https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks. 4 | 5 | ## Basic usage 6 | **execute_tensorflow_training.py train** uses the SageMaker Python SDK to start a training job. 7 | 8 | ```bash 9 | ./execute_tensorflow_training.py train --help 10 | Usage: execute_tensorflow_training.py train [OPTIONS] [SCRIPT_ARGS]... 11 | 12 | Options: 13 | --framework-version [1.11.0|1.12.0] 14 | [required] 15 | --device [cpu|gpu] [required] 16 | --py-versions TEXT 17 | --training-input-mode [File|Pipe] 18 | --networking-isolation / --no-networking-isolation 19 | --wait / --no-wait 20 | --security-groups TEXT 21 | --subnets TEXT 22 | --role TEXT 23 | --instance-counts INTEGER 24 | --batch-sizes INTEGER 25 | --instance-types TEXT 26 | --help Show this message and exit. 27 | 28 | ``` 29 | **execute_tensorflow_training.py generate_reports** generates benchmark reports. 30 | 31 | ## Examples: 32 | 33 | ```bash 34 | #!/usr/bin/env bash 35 | 36 | ./execute_tensorflow_training.py train \ 37 | --framework-version 1.11.0 \ 38 | --device gpu \ 39 | \ 40 | --instance-types ml.p3.2xlarge \ 41 | --instance-types ml.p3.8xlarge \ 42 | --instance-types ml.p3.16xlarge \ 43 | --instance-types ml.p2.xlarge \ 44 | --instance-types ml.p2.8xlarge \ 45 | --instance-types ml.p2.16xlarge \ 46 | \ 47 | --instance-counts 1 \ 48 | \ 49 | --py-versions py3 \ 50 | --py-versions py2 \ 51 | \ 52 | --subnets subnet-125fb674 \ 53 | \ 54 | --security-groups sg-ce5dd1b4 \ 55 | \ 56 | --batch-sizes 32 \ 57 | --batch-sizes 64 \ 58 | --batch-sizes 128 \ 59 | --batch-sizes 256 \ 60 | --batch-sizes 512 \ 61 | \ 62 | -- --model resnet32 --num_epochs 10 --data_format NHWC --summary_verbosity 1 --save_summaries_steps 10 --data_name cifar10 63 | ``` 64 | 65 | ## Using other models, datasets and benchmark configurations 66 | ```python tf_cnn_benchmarks/tf_cnn_benchmarks.py --help``` lists all the options the script supports.
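For a quick local sanity check outside SageMaker, a minimal sketch (model, batch size, and batch count are illustrative; check `--help` for what your checkout of tf_cnn_benchmarks actually supports):

```bash
python tf_cnn_benchmarks/tf_cnn_benchmarks.py \
    --model resnet32 --data_name cifar10 \
    --num_batches 100 --batch_size 32 \
    --data_format NHWC --device cpu
```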
67 | -------------------------------------------------------------------------------- /scripts/publish_all.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | 15 | import argparse 16 | import subprocess 17 | 18 | DEV_ACCOUNT = "142577830533" 19 | VERSION = "1.13.1" 20 | REGION = "us-west-2" 21 | REPO = "sagemaker-tensorflow-scriptmode" 22 | 23 | 24 | def _parse_args(): 25 | 26 | parser = argparse.ArgumentParser() 27 | 28 | parser.add_argument("--account", type=str, default=DEV_ACCOUNT) 29 | parser.add_argument("--version", type=str, default=VERSION) 30 | parser.add_argument("--repo", type=str, default=REPO) 31 | parser.add_argument("--region", type=str, default=REGION) 32 | 33 | return parser.parse_args() 34 | 35 | 36 | args = _parse_args() 37 | 38 | for arch in ["cpu", "gpu"]: 39 | for py_version in ["2", "3"]: 40 | source = "{}:{}-{}-py{}".format(args.repo, args.version, arch, py_version) 41 | dest = "{}.dkr.ecr.{}.amazonaws.com/{}".format(args.account, args.region, source) 42 | tag_cmd = "docker tag {} {}".format(source, dest) 43 | print("Tagging image: {}".format(tag_cmd)) 44 | subprocess.check_call(tag_cmd.split()) 45 | login_cmd = subprocess.check_output( 46 | "aws ecr get-login --no-include-email --registry-id {} --region {}".format( 47 | args.account, args.region 48 | ).split() 49 | ) 50 | print("Executing docker login command: {}".format(login_cmd)) 51 | subprocess.check_call(login_cmd.split()) 52 | push_cmd = "docker push {}".format(dest) 53 | print("Pushing image: {}".format(push_cmd)) 54 | subprocess.check_call(push_cmd.split()) 55 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # Tox (http://tox.testrun.org/) is a tool for running tests 2 | # in multiple virtualenvs. This configuration file will run the 3 | # test suite on all supported python versions. To use it, "pip install tox" 4 | # and then run "tox" from this directory. 5 | 6 | [tox] 7 | envlist = py38, flake8 8 | skip_missing_interpreters = False 9 | 10 | [travis] 11 | python = 12 | 3.8: py38, flake8 13 | 14 | [flake8] 15 | max-line-length = 120 16 | exclude = 17 | build/ 18 | .git 19 | __pycache__ 20 | examples/ 21 | *pb2.py 22 | .tox 23 | tests/data/ 24 | test/resources 25 | venv/ 26 | sagemaker-tensorflow-extensions 27 | benchmarks/ 28 | max-complexity = 10 29 | ignore = 30 | C901, 31 | E203, 32 | FI10, 33 | FI12, 34 | FI13, 35 | FI14, 36 | FI15, 37 | FI16, 38 | FI17, 39 | FI18, 40 | FI50, 41 | FI51, 42 | FI52, 43 | FI53, 44 | FI54, 45 | FI55, 46 | FI56, 47 | FI57, 48 | W503 49 | 50 | require-code = True 51 | 52 | [testenv] 53 | # TEAMCITY_VERSION environment variable exists during build on Teamcity. teamcity-messages uses it in order to enable 54 | # reporting to TeamCity. 
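# The AWS credentials and region below are forwarded so the SageMaker integration tests can authenticate against AWS from the tox environment.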
55 | passenv = 56 | AWS_ACCESS_KEY_ID 57 | AWS_SECRET_ACCESS_KEY 58 | AWS_SESSION_TOKEN 59 | AWS_CONTAINER_CREDENTIALS_RELATIVE_URI 60 | AWS_DEFAULT_REGION 61 | # {posargs} can be passed in by additional arguments specified when invoking tox. 62 | # Can be used to specify which tests to run, e.g.: tox -- -s 63 | commands = 64 | coverage run --rcfile .coveragerc_{envname} --source sagemaker_tensorflow_container -m py.test {posargs} 65 | {env:IGNORE_COVERAGE:} coverage report --rcfile .coveragerc_{envname} 66 | {env:IGNORE_COVERAGE:} coverage html --rcfile .coveragerc_{envname} 67 | 68 | deps = .[test] 69 | 70 | [testenv:flake8] 71 | basepython = python 72 | deps = 73 | flake8 74 | flake8-future-import 75 | flake8-import-order 76 | commands = flake8 --append-config .flake8 77 | 78 | 79 | [testenv:twine] 80 | basepython = python3 81 | # https://github.com/pypa/twine/blob/master/docs/changelog.rst 82 | deps = 83 | twine>=1.12.0 84 | # https://packaging.python.org/guides/making-a-pypi-friendly-readme/#validating-restructuredtext-markup 85 | commands = 86 | python setup.py sdist 87 | twine check dist/*.tar.gz 88 | -------------------------------------------------------------------------------- /test/utils/image_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
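# Build/push helpers for the test images: package the toolkit sdist, docker-build the Dockerfiles under test/container (logging in to the DLC registry first when a DLC base image is used), and push images to ECR.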
13 | from __future__ import absolute_import 14 | 15 | import os 16 | import subprocess 17 | import sys 18 | 19 | CYAN_COLOR = "\033[36m" 20 | END_COLOR = "\033[0m" 21 | DLC_AWS_ID = "763104351884" 22 | 23 | 24 | def build_image(framework_version, dockerfile, image_uri, region, cwd="."): 25 | _check_call("python setup.py sdist") 26 | 27 | if "dlc" in dockerfile: 28 | ecr_login(region, DLC_AWS_ID) 29 | 30 | dockerfile_location = os.path.join("test", "container", framework_version, dockerfile) 31 | 32 | subprocess.check_call( 33 | [ 34 | "docker", 35 | "build", 36 | "-t", 37 | image_uri, 38 | "-f", 39 | dockerfile_location, 40 | "--build-arg", 41 | "region={}".format(region), 42 | cwd, 43 | ], 44 | cwd=cwd, 45 | ) 46 | print("created image {}".format(image_uri)) 47 | return image_uri 48 | 49 | 50 | def push_image(ecr_image, region, aws_id): 51 | ecr_login(region, aws_id) 52 | _check_call("docker push {}".format(ecr_image)) 53 | 54 | 55 | def ecr_login(region, aws_id): 56 | login = _check_call( 57 | "aws ecr get-login --registry-ids {} ".format(aws_id) 58 | + "--no-include-email --region {}".format(region) 59 | ) 60 | _check_call(login.decode("utf-8").rstrip("\n")) 61 | 62 | 63 | def _check_call(cmd, *popenargs, **kwargs): 64 | if isinstance(cmd, str): 65 | cmd = cmd.split(" ") 66 | _print_cmd(cmd) 67 | return subprocess.check_output(cmd, *popenargs, **kwargs) 68 | 69 | 70 | def _print_cmd(cmd): 71 | print("executing docker command: {}{}{}".format(CYAN_COLOR, " ".join(cmd), END_COLOR)) 72 | sys.stdout.flush() 73 | -------------------------------------------------------------------------------- /test/integration/sagemaker/test_smdataparallel.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
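# Trains MNIST with smdistributed.dataparallel across two ml.p3.16xlarge instances and verifies that a model.tar.gz artifact is produced.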
13 | from __future__ import absolute_import 14 | 15 | import os 16 | 17 | import pytest 18 | import sagemaker 19 | from sagemaker.tensorflow import TensorFlow 20 | from sagemaker.utils import unique_name_from_base 21 | 22 | from integration import DEFAULT_TIMEOUT, RESOURCE_PATH 23 | from integration.sagemaker.timeout import timeout 24 | 25 | 26 | @pytest.mark.skip_cpu 27 | @pytest.mark.skip_generic 28 | @pytest.mark.parametrize( 29 | "instances, instance_type", 30 | [(2, "ml.p3.16xlarge")], 31 | ) 32 | def test_smdataparallel_training(instances, instance_type, sagemaker_session, image_uri, framework_version, tmpdir): 33 | default_bucket = sagemaker_session.default_bucket() 34 | output_path = "s3://{}/{}/{}".format(default_bucket, "tensorflow", "smdataparallel") 35 | 36 | estimator = TensorFlow( 37 | entry_point=os.path.join(RESOURCE_PATH, "mnist", "smdataparallel_mnist.py"), 38 | role="SageMakerRole", 39 | instance_type=instance_type, 40 | sagemaker_session=sagemaker_session, 41 | instance_count=instances, 42 | image_uri=image_uri, 43 | output_path=output_path, 44 | framework_version=framework_version, 45 | py_version="py3", 46 | distribution={"smdistributed": {"dataparallel": {"enabled": True}}} 47 | ) 48 | 49 | with timeout(minutes=DEFAULT_TIMEOUT): 50 | estimator.fit(job_name=unique_name_from_base("test-tf-smdataparallel")) 51 | 52 | model_data_source = sagemaker.local.data.get_data_source_instance( 53 | estimator.model_data, sagemaker_session 54 | ) 55 | 56 | for filename in model_data_source.get_file_list(): 57 | assert os.path.basename(filename) == "model.tar.gz" 58 | -------------------------------------------------------------------------------- /docker/1.10.0/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | MAINTAINER Amazon AI 4 | 5 | ARG framework_installable 6 | ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz 7 | ARG py_version 8 | 9 | # Validate that arguments are specified 10 | RUN test $framework_installable || exit 1 \ 11 | && test $py_version || exit 1 12 | 13 | WORKDIR /root 14 | 15 | COPY $framework_installable . 16 | COPY $framework_support_installable . 
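# The two ARGs above point at installable archives in the build context (the TensorFlow package and this toolkit's sdist); both are supplied with --build-arg when the image is built.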
17 | 18 | RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common \ 19 | && add-apt-repository ppa:deadsnakes/ppa -y 20 | 21 | RUN buildDeps=" \ 22 | build-essential \ 23 | curl \ 24 | git \ 25 | libcurl3-dev \ 26 | libfreetype6-dev \ 27 | libpng12-dev \ 28 | libzmq3-dev \ 29 | pkg-config \ 30 | rsync \ 31 | unzip \ 32 | zip \ 33 | zlib1g-dev \ 34 | openjdk-8-jdk \ 35 | openjdk-8-jre-headless \ 36 | wget \ 37 | vim \ 38 | iputils-ping \ 39 | nginx \ 40 | " \ 41 | && apt-get update && apt-get install -y --no-install-recommends $buildDeps \ 42 | && apt-get clean \ 43 | && rm -rf /var/lib/apt/lists/* 44 | 45 | RUN if [ $py_version -eq 3 ]; \ 46 | then apt-get update && apt-get install -y --no-install-recommends python3.6-dev \ 47 | && ln -s -f /usr/bin/python3.6 /usr/bin/python; \ 48 | else apt-get update && apt-get install -y --no-install-recommends python-dev; fi 49 | 50 | RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \ 51 | python get-pip.py && \ 52 | rm get-pip.py 53 | 54 | RUN pip install --upgrade \ 55 | pip \ 56 | setuptools 57 | 58 | # Set environment variables for MKL 59 | # TODO: investigate the right value for OMP_NUM_THREADS 60 | # For more about MKL with TensorFlow see: 61 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 62 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 KMP_BLOCKTIME=1 KMP_SETTINGS=0 63 | 64 | RUN framework_installable_local=$(basename $framework_installable) \ 65 | && framework_support_installable_local=$(basename $framework_support_installable) \ 66 | && pip install --no-cache --upgrade \ 67 | $framework_installable_local \ 68 | $framework_support_installable_local \ 69 | "sagemaker-tensorflow>=1.10,<1.11" \ 70 | \ 71 | && rm $framework_installable_local \ 72 | && rm $framework_support_installable_local 73 | 74 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 75 | -------------------------------------------------------------------------------- /docker/1.10.0/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:9.0-base-ubuntu16.04 2 | 3 | MAINTAINER Amazon AI 4 | 5 | ARG framework_installable 6 | ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz 7 | ARG py_version 8 | 9 | # Validate that arguments are specified 10 | RUN test $framework_installable || exit 1 \ 11 | && test $py_version || exit 1 12 | 13 | RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common \ 14 | && add-apt-repository ppa:deadsnakes/ppa -y 15 | 16 | RUN buildDeps=" \ 17 | build-essential \ 18 | cuda-command-line-tools-9-0 \ 19 | cuda-cublas-dev-9-0 \ 20 | cuda-cudart-dev-9-0 \ 21 | cuda-cufft-dev-9-0 \ 22 | cuda-curand-dev-9-0 \ 23 | cuda-cusolver-dev-9-0 \ 24 | cuda-cusparse-dev-9-0 \ 25 | curl \ 26 | git \ 27 | libcudnn7=7.1.4.18-1+cuda9.0 \ 28 | libcudnn7-dev=7.1.4.18-1+cuda9.0 \ 29 | libcurl3-dev \ 30 | libfreetype6-dev \ 31 | libpng12-dev \ 32 | libzmq3-dev \ 33 | pkg-config \ 34 | rsync \ 35 | unzip \ 36 | zip \ 37 | zlib1g-dev \ 38 | wget \ 39 | vim \ 40 | nginx \ 41 | iputils-ping \ 42 | " \ 43 | && apt-get update && apt-get install -y --no-install-recommends $buildDeps \ 44 | && rm -rf /var/lib/apt/lists/* \ 45 | && find /usr/local/cuda-9.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete \ 46 | && rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a 47 | 48 | RUN if [ $py_version -eq 3 ]; \ 49 | then apt-get 
update && apt-get install -y --no-install-recommends python3.6-dev \ 50 | && ln -s -f /usr/bin/python3.6 /usr/bin/python; \ 51 | else apt-get update && apt-get install -y --no-install-recommends python-dev; fi 52 | 53 | RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \ 54 | python get-pip.py && \ 55 | rm get-pip.py 56 | 57 | WORKDIR /root 58 | 59 | COPY $framework_installable . 60 | COPY $framework_support_installable . 61 | 62 | RUN framework_installable_local=$(basename $framework_installable) && \ 63 | framework_support_installable_local=$(basename $framework_support_installable) && \ 64 | \ 65 | pip install --no-cache --upgrade $framework_installable_local \ 66 | $framework_support_installable_local \ 67 | "sagemaker-tensorflow>=1.10,<1.11" &&\ 68 | \ 69 | rm $framework_installable_local && \ 70 | rm $framework_support_installable_local 71 | 72 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 73 | -------------------------------------------------------------------------------- /docker/1.11.0/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | ARG framework_installable 6 | ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz 7 | ARG py_version 8 | 9 | # Validate that arguments are specified 10 | RUN test $framework_installable || exit 1 \ 11 | && test $py_version || exit 1 12 | 13 | RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common \ 14 | && add-apt-repository ppa:deadsnakes/ppa -y \ 15 | && rm -rf /var/lib/apt/lists/* 16 | 17 | RUN apt-get update && apt-get install -y --no-install-recommends \ 18 | ca-certificates \ 19 | build-essential \ 20 | curl \ 21 | nginx \ 22 | && if [ $py_version -eq 3 ]; \ 23 | then apt-get install -y --no-install-recommends python3.6-dev \ 24 | && ln -s -f /usr/bin/python3.6 /usr/bin/python; \ 25 | else apt-get install -y --no-install-recommends python-dev; fi \ 26 | && rm -rf /var/lib/apt/lists/* 27 | 28 | # Python won’t try to write .pyc or .pyo files on the import of source modules 29 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 30 | 31 | RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \ 32 | python get-pip.py \ 33 | --disable-pip-version-check \ 34 | --no-cache-dir \ 35 | "pip==18.1" \ 36 | ; \ 37 | pip --version; \ 38 | find /usr/local -depth \ 39 | \( \ 40 | \( -type d -a \( -name test -o -name tests \) \) \ 41 | -o \ 42 | \( -type f -a \( -name '*.pyc' -o -name '*.pyo' \) \) \ 43 | \) -exec rm -rf '{}' +; \ 44 | rm get-pip.py 45 | 46 | # Set environment variables for MKL 47 | # TODO: investigate the right value for OMP_NUM_THREADS 48 | # For more about MKL with TensorFlow see: 49 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 50 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 KMP_BLOCKTIME=1 KMP_SETTINGS=0 51 | 52 | WORKDIR / 53 | 54 | COPY $framework_installable . 55 | COPY $framework_support_installable . 
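# A hedged build sketch (the wheel file name is an assumption, not pinned by
# this Dockerfile): the two COPY lines above expect the TensorFlow wheel and
# the support sdist to sit in the build context, so a typical invocation is
#   docker build -f Dockerfile.cpu \
#     --build-arg py_version=3 \
#     --build-arg framework_installable=tensorflow-1.11.0-cp36-cp36m-linux_x86_64.whl \
#     -t sagemaker-tensorflow:1.11.0-cpu-py3 .
# framework_support_installable falls back to the default declared at the top.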
56 | 57 | RUN pip install --no-cache-dir -U \ 58 | keras==2.2.4 \ 59 | $framework_support_installable \ 60 | "sagemaker-tensorflow>=1.11,<1.12" \ 61 | # Install TensorFlow separately at the end to avoid 62 | # its pinned version being overwritten 63 | && pip install --force-reinstall --no-cache-dir -U $framework_installable \ 64 | \ 65 | && rm -f $framework_installable \ 66 | && rm -f $framework_support_installable \ 67 | && pip uninstall -y --no-cache-dir \ 68 | markdown \ 69 | tensorboard 70 | 71 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 72 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | 15 | from glob import glob 16 | import os 17 | from os.path import basename 18 | from os.path import splitext 19 | import sys 20 | 21 | from setuptools import find_packages, setup 22 | 23 | 24 | def read(fname): 25 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 26 | 27 | 28 | def read_version(): 29 | return read("VERSION").strip() 30 | 31 | 32 | test_dependencies = [ 33 | "tox", 34 | "flake8", 35 | "pytest", 36 | "pytest-cov", 37 | "pytest-xdist", 38 | "pytest-rerunfailures", 39 | "mock", 40 | "sagemaker[local]>=2", 41 | "tensorflow<2.4", 42 | "docker-compose", 43 | "boto3==1.16.34", 44 | "python-dateutil>=2.1,<2.8.1", 45 | "botocore==1.19.34", 46 | "requests-mock", 47 | "awscli==1.18.194", 48 | "protobuf>=3.9.2,<3.20" 49 | ] 50 | 51 | if sys.version_info.major > 2: 52 | test_dependencies.append("sagemaker-experiments==0.1.7") 53 | 54 | setup( 55 | name="sagemaker_tensorflow_training", 56 | version=read_version(), 57 | description="Open source library for using " 58 | "TensorFlow to train models on Amazon SageMaker.", 59 | packages=find_packages(where="src", exclude=("test",)), 60 | package_dir={"": "src"}, 61 | py_modules=[splitext(basename(path))[0] for path in glob("src/*.py")], 62 | long_description=read("README.rst"), 63 | author="Amazon Web Services", 64 | url="https://github.com/aws/sagemaker-tensorflow-training-toolkit", 65 | license="Apache License 2.0", 66 | classifiers=[ 67 | "Development Status :: 5 - Production/Stable", 68 | "Intended Audience :: Developers", 69 | "Natural Language :: English", 70 | "License :: OSI Approved :: Apache Software License", 71 | "Programming Language :: Python", 72 | "Programming Language :: Python :: 3.7", 73 | "Programming Language :: Python :: 3.8", 74 | "Programming Language :: Python :: 3.9", 75 | ], 76 | install_requires=[ 77 | "sagemaker-training>=4.3.0,<=4.8.3", 78 | "numpy < 1.24", 79 | "scipy", 80 | "scikit-learn", 81 | "pandas", 82 | "Pillow", 83 | "h5py", 84 | ], 85 | extras_require={"test": test_dependencies, "benchmark": ["click"], }, 86 | ) 87 | 
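# A hedged usage note for the packaging above: the "test" extra wires in tox,
# pytest, and the pinned SageMaker/boto3 tooling from test_dependencies, so a
# local development install is typically
#   pip install -e ".[test]"
# after which the suites are driven through tox (see tox.ini at the repo root).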
-------------------------------------------------------------------------------- /test/integration/sagemaker/test_horovod_sagemaker.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | 15 | import os 16 | 17 | import pytest 18 | import sagemaker 19 | from sagemaker.tensorflow import TensorFlow 20 | from sagemaker.utils import unique_name_from_base 21 | 22 | RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources") 23 | 24 | 25 | @pytest.mark.skip_generic 26 | def test_distributed_training_horovod( 27 | sagemaker_session, instance_type, image_uri, tmpdir, framework_version 28 | ): 29 | 30 | mpi_options = "-verbose -x orte_base_help_aggregate=0" 31 | estimator = TensorFlow( 32 | entry_point=os.path.join(RESOURCE_PATH, "mnist", "horovod_mnist.py"), 33 | role="SageMakerRole", 34 | instance_type=instance_type, 35 | instance_count=2, 36 | image_uri=image_uri, 37 | framework_version=framework_version, 38 | py_version="py3", 39 | hyperparameters={ 40 | "sagemaker_mpi_enabled": True, 41 | "sagemaker_mpi_custom_mpi_options": mpi_options, 42 | "sagemaker_mpi_num_of_processes_per_host": 1, 43 | }, 44 | sagemaker_session=sagemaker_session, 45 | ) 46 | 47 | estimator.fit(job_name=unique_name_from_base("test-tf-horovod")) 48 | 49 | model_data_source = sagemaker.local.data.get_data_source_instance( 50 | estimator.model_data, sagemaker_session 51 | ) 52 | 53 | for filename in model_data_source.get_file_list(): 54 | assert os.path.basename(filename) == "model.tar.gz" 55 | 56 | 57 | @pytest.mark.skip_generic 58 | def test_distributed_training_horovod_with_env_vars( 59 | sagemaker_session, instance_type, image_uri, tmpdir, framework_version 60 | ): 61 | 62 | mpi_options = "-verbose -x orte_base_help_aggregate=0" 63 | estimator = TensorFlow( 64 | entry_point=os.path.join(RESOURCE_PATH, "hvdbasic", "train_hvd_env_vars.py"), 65 | role="SageMakerRole", 66 | instance_type=instance_type, 67 | instance_count=2, 68 | image_uri=image_uri, 69 | framework_version=framework_version, 70 | py_version="py3", 71 | hyperparameters={ 72 | "sagemaker_mpi_enabled": True, 73 | "sagemaker_mpi_custom_mpi_options": mpi_options, 74 | "sagemaker_mpi_num_of_processes_per_host": 2, 75 | }, 76 | sagemaker_session=sagemaker_session, 77 | ) 78 | 79 | estimator.fit(job_name=unique_name_from_base("test-tf-horovod-env-vars")) 80 | -------------------------------------------------------------------------------- /test/resources/mnist/smdataparallel_mnist.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. 
A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | import tensorflow as tf 14 | 15 | import smdistributed.dataparallel.tensorflow as dist 16 | 17 | tf.random.set_seed(42) 18 | 19 | dist.init() 20 | 21 | gpus = tf.config.experimental.list_physical_devices("GPU") 22 | for gpu in gpus: 23 | tf.config.experimental.set_memory_growth(gpu, True) 24 | if gpus: 25 | tf.config.experimental.set_visible_devices(gpus[dist.local_rank()], "GPU") 26 | 27 | (mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data( 28 | path="mnist-%d.npz" % dist.rank() 29 | ) 30 | 31 | dataset = tf.data.Dataset.from_tensor_slices( 32 | (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), tf.cast(mnist_labels, tf.int64)) 33 | ) 34 | dataset = dataset.repeat().shuffle(10000).batch(128) 35 | 36 | mnist_model = tf.keras.Sequential( 37 | [ 38 | tf.keras.layers.Conv2D(32, [3, 3], activation="relu"), 39 | tf.keras.layers.Conv2D(64, [3, 3], activation="relu"), 40 | tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), 41 | tf.keras.layers.Dropout(0.25), 42 | tf.keras.layers.Flatten(), 43 | tf.keras.layers.Dense(128, activation="relu"), 44 | tf.keras.layers.Dropout(0.5), 45 | tf.keras.layers.Dense(10, activation="softmax"), 46 | ] 47 | ) 48 | loss = tf.losses.SparseCategoricalCrossentropy() 49 | # LR for 8 node run : 0.000125 50 | # LR for single node run : 0.001 51 | opt = tf.optimizers.Adam(0.000125 * dist.size()) 52 | 53 | checkpoint_dir = "./checkpoints" 54 | checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt) 55 | 56 | 57 | @tf.function 58 | def training_step(images, labels, first_batch): 59 | with tf.GradientTape() as tape: 60 | probs = mnist_model(images, training=True) 61 | loss_value = loss(labels, probs) 62 | 63 | tape = dist.DistributedGradientTape(tape) 64 | 65 | grads = tape.gradient(loss_value, mnist_model.trainable_variables) 66 | opt.apply_gradients(zip(grads, mnist_model.trainable_variables)) 67 | 68 | if first_batch: 69 | dist.broadcast_variables(mnist_model.variables, root_rank=0) 70 | dist.broadcast_variables(opt.variables(), root_rank=0) 71 | 72 | loss_value = dist.oob_allreduce(loss_value) # Average the loss across workers 73 | return loss_value 74 | 75 | 76 | for batch, (images, labels) in enumerate(dataset.take(10000 // dist.size())): 77 | loss_value = training_step(images, labels, batch == 0) 78 | 79 | if batch % 50 == 0 and dist.rank() == 0: 80 | print("Step #%d\tLoss: %.6f" % (batch, loss_value)) 81 | 82 | if dist.rank() == 0: 83 | checkpoint.save(checkpoint_dir) 84 | -------------------------------------------------------------------------------- /test/integration/local/test_horovod_local.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. 
This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | 15 | import json 16 | import os 17 | import tarfile 18 | 19 | import pytest 20 | from sagemaker.tensorflow import TensorFlow 21 | 22 | 23 | RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources") 24 | 25 | 26 | @pytest.mark.skip_cpu 27 | @pytest.mark.skip_generic 28 | def test_distributed_training_horovod_gpu( 29 | sagemaker_local_session, image_uri, tmpdir, framework_version 30 | ): 31 | _test_distributed_training_horovod( 32 | 1, 2, sagemaker_local_session, image_uri, tmpdir, framework_version, "local_gpu" 33 | ) 34 | 35 | 36 | @pytest.mark.skip_gpu 37 | @pytest.mark.skip_generic 38 | @pytest.mark.parametrize("instances, processes", [(2, 2)]) 39 | def test_distributed_training_horovod_cpu( 40 | instances, processes, sagemaker_local_session, image_uri, tmpdir, framework_version 41 | ): 42 | _test_distributed_training_horovod( 43 | instances, processes, sagemaker_local_session, image_uri, tmpdir, framework_version, "local" 44 | ) 45 | 46 | 47 | def _test_distributed_training_horovod( 48 | instances, processes, session, image_uri, tmpdir, framework_version, instance_type 49 | ): 50 | output_path = "file://%s" % tmpdir 51 | estimator = TensorFlow( 52 | entry_point=os.path.join(RESOURCE_PATH, "hvdbasic", "train_hvd_basic.py"), 53 | role="SageMakerRole", 54 | instance_type=instance_type, 55 | sagemaker_session=session, 56 | instance_count=instances, 57 | image_uri=image_uri, 58 | output_path=output_path, 59 | hyperparameters={ 60 | "sagemaker_mpi_enabled": True, 61 | "sagemaker_network_interface_name": "eth0", 62 | "sagemaker_mpi_num_of_processes_per_host": processes, 63 | }, 64 | ) 65 | 66 | estimator.fit("file://{}".format(os.path.join(RESOURCE_PATH, "mnist", "data-distributed"))) 67 | 68 | tmp = str(tmpdir) 69 | extract_files(output_path.replace("file://", ""), tmp) 70 | 71 | size = instances * processes 72 | 73 | for rank in range(size): 74 | local_rank = rank % processes 75 | assert read_json("local-rank-%s-rank-%s" % (local_rank, rank), tmp) == { 76 | "local-rank": local_rank, 77 | "rank": rank, 78 | "size": size, 79 | } 80 | 81 | 82 | def read_json(file, tmp): 83 | with open(os.path.join(tmp, file)) as f: 84 | return json.load(f) 85 | 86 | 87 | def assert_files_exist_in_tar(output_path, files): 88 | if output_path.startswith("file://"): 89 | output_path = output_path[7:] 90 | model_file = os.path.join(output_path, "model.tar.gz") 91 | with tarfile.open(model_file) as tar: 92 | for f in files: 93 | tar.getmember(f) 94 | 95 | 96 | def extract_files(output_path, tmpdir): 97 | with tarfile.open(os.path.join(output_path, "model.tar.gz")) as tar: 98 | tar.extractall(tmpdir) 99 | -------------------------------------------------------------------------------- /docker/1.11.0/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:9.0-base-ubuntu16.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | ARG framework_installable 6 | ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz 7 | ARG py_version 8 | 9 | # Validate that arguments are specified 10 | RUN test $framework_installable || exit 1 \ 11 | && test $py_version || exit 1 12 | 13 | RUN apt-get update && apt-get install -y --no-install-recommends 
software-properties-common \ 14 | && add-apt-repository ppa:deadsnakes/ppa -y \ 15 | && rm -rf /var/lib/apt/lists/* 16 | 17 | ENV NCCL_VERSION=2.3.5-2+cuda9.0 18 | ENV CUDNN_VERSION=7.3.1.20-1+cuda9.0 19 | ENV TF_TENSORRT_VERSION=4.1.2 20 | 21 | RUN apt-get update && apt-get install -y --no-install-recommends \ 22 | build-essential \ 23 | ca-certificates \ 24 | cuda-command-line-tools-9-0 \ 25 | cuda-cublas-dev-9-0 \ 26 | cuda-cudart-dev-9-0 \ 27 | cuda-cufft-dev-9-0 \ 28 | cuda-curand-dev-9-0 \ 29 | cuda-cusolver-dev-9-0 \ 30 | cuda-cusparse-dev-9-0 \ 31 | curl \ 32 | libcudnn7=${CUDNN_VERSION} \ 33 | libnccl2=${NCCL_VERSION} \ 34 | libgomp1 \ 35 | # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 36 | # adds a new list which contains libnvinfer library, so it needs another 37 | # 'apt-get update' to retrieve that list before it can actually install the 38 | # library. 39 | # We don't install libnvinfer-dev since we don't need to build against TensorRT, 40 | # and libnvinfer4 doesn't contain libnvinfer.a static library. 41 | && apt-get update && apt-get install -y --no-install-recommends \ 42 | nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 \ 43 | && apt-get update && apt-get install -y --no-install-recommends \ 44 | libnvinfer4=${TF_TENSORRT_VERSION}-1+cuda9.0 \ 45 | && apt-get clean \ 46 | && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \ 47 | && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \ 48 | && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \ 49 | && if [ $py_version -eq 3 ]; \ 50 | then apt-get install -y --no-install-recommends python3.6-dev \ 51 | && ln -s -f /usr/bin/python3.6 /usr/bin/python; \ 52 | else apt-get install -y --no-install-recommends python-dev; fi \ 53 | && rm -rf /var/lib/apt/lists/* 54 | 55 | # Python won’t try to write .pyc or .pyo files on the import of source modules 56 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 57 | 58 | RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \ 59 | python get-pip.py \ 60 | --disable-pip-version-check \ 61 | --no-cache-dir \ 62 | "pip==18.1" \ 63 | ; \ 64 | pip --version; \ 65 | find /usr/local -depth \ 66 | \( \ 67 | \( -type d -a \( -name test -o -name tests \) \) \ 68 | -o \ 69 | \( -type f -a \( -name '*.pyc' -o -name '*.pyo' \) \) \ 70 | \) -exec rm -rf '{}' +; \ 71 | rm get-pip.py 72 | 73 | WORKDIR / 74 | 75 | COPY $framework_installable . 76 | COPY $framework_support_installable . 77 | 78 | RUN pip install --no-cache-dir -U \ 79 | keras==2.2.4 \ 80 | $framework_support_installable \ 81 | "sagemaker-tensorflow>=1.11,<1.12" \ 82 | # Let's install TensorFlow separately in the end to avoid 83 | # the library version to be overwritten 84 | && pip install --force-reinstall --no-cache-dir -U $framework_installable \ 85 | \ 86 | && rm -f $framework_installable \ 87 | && rm -f $framework_support_installable \ 88 | && pip uninstall -y --no-cache-dir \ 89 | markdown \ 90 | tensorboard 91 | 92 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 93 | -------------------------------------------------------------------------------- /docker/build_artifacts/deep_learning_container.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. 
A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | 15 | import json 16 | import logging 17 | import re 18 | 19 | import requests 20 | 21 | 22 | def _validate_instance_id(instance_id): 23 | """ 24 | Validate instance ID 25 | """ 26 | instance_id_regex = r'^(i-\S{17})' 27 | compiled_regex = re.compile(instance_id_regex) 28 | match = compiled_regex.match(instance_id) 29 | 30 | if not match: 31 | return None 32 | 33 | return match.group(1) 34 | 35 | 36 | def _retrieve_instance_id(): 37 | """ 38 | Retrieve instance ID from instance metadata service 39 | """ 40 | instance_id = None 41 | url = "http://169.254.169.254/latest/meta-data/instance-id" 42 | response = requests_helper(url, timeout=0.1) 43 | 44 | if response is not None: 45 | instance_id = _validate_instance_id(response.text) 46 | 47 | return instance_id 48 | 49 | 50 | def _retrieve_instance_region(): 51 | """ 52 | Retrieve instance region from instance metadata service 53 | """ 54 | region = None 55 | valid_regions = ['ap-northeast-1', 'ap-northeast-2', 'ap-southeast-1', 'ap-southeast-2', 56 | 'ap-south-1', 'ca-central-1', 'eu-central-1', 'eu-north-1', 57 | 'eu-west-1', 'eu-west-2', 'eu-west-3', 'sa-east-1', 58 | 'us-east-1', 'us-east-2', 'us-west-1', 'us-west-2'] 59 | 60 | url = "http://169.254.169.254/latest/dynamic/instance-identity/document" 61 | response = requests_helper(url, timeout=0.1) 62 | 63 | if response is not None: 64 | response_json = json.loads(response.text) 65 | 66 | if response_json['region'] in valid_regions: 67 | region = response_json['region'] 68 | 69 | return region 70 | 71 | 72 | def query_bucket(): 73 | """ 74 | GET request on an empty object from an Amazon S3 bucket 75 | """ 76 | response = None 77 | instance_id = _retrieve_instance_id() 78 | region = _retrieve_instance_region() 79 | 80 | if instance_id is not None and region is not None: 81 | url = ("https://aws-deep-learning-containers-{0}.s3.{0}.amazonaws.com" 82 | "/dlc-containers.txt?x-instance-id={1}".format(region, instance_id)) 83 | response = requests_helper(url, timeout=0.2) 84 | 85 | logging.debug("Query bucket finished: {}".format(response)) 86 | 87 | return response 88 | 89 | 90 | def requests_helper(url, timeout): 91 | response = None 92 | try: 93 | response = requests.get(url, timeout=timeout) 94 | except requests.exceptions.RequestException as e: 95 | logging.error("Request exception: {}".format(e)) 96 | 97 | return response 98 | 99 | 100 | def main(): 101 | """ 102 | Invoke bucket query 103 | """ 104 | # Logs are not necessary for normal run. Remove this line while debugging. 
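    # (Hedged clarification: "this line" is the disabled-flag assignment just
    # below. Disabling the root logger silences both the logging.debug() call
    # in query_bucket() and the logging.error() path in requests_helper(), so
    # the telemetry ping stays quiet on hosts without the metadata service.)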
105 | logging.getLogger().disabled = True 106 | 107 | logging.basicConfig(level=logging.ERROR) 108 | query_bucket() 109 | 110 | 111 | if __name__ == '__main__': 112 | main() 113 | -------------------------------------------------------------------------------- /docker/1.12.0/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | RUN apt-get update && apt-get install -y --no-install-recommends \ 6 | software-properties-common \ 7 | build-essential \ 8 | openssh-client \ 9 | openssh-server \ 10 | ca-certificates \ 11 | curl \ 12 | && add-apt-repository ppa:deadsnakes/ppa -y \ 13 | && rm -rf /var/lib/apt/lists/* 14 | 15 | # Install Open MPI 16 | RUN mkdir /tmp/openmpi && \ 17 | cd /tmp/openmpi && \ 18 | curl -fSsL -O https://www.open-mpi.org/software/ompi/v3.1/downloads/openmpi-3.1.2.tar.gz && \ 19 | tar zxf openmpi-3.1.2.tar.gz && \ 20 | cd openmpi-3.1.2 && \ 21 | ./configure --enable-orterun-prefix-by-default && \ 22 | make -j $(nproc) all && \ 23 | make install && \ 24 | ldconfig && \ 25 | rm -rf /tmp/openmpi 26 | 27 | 28 | # Create a wrapper for OpenMPI to allow running as root by default 29 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \ 30 | echo '#!/bin/bash' > /usr/local/bin/mpirun && \ 31 | echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \ 32 | chmod a+x /usr/local/bin/mpirun 33 | 34 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \ 35 | echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 36 | 37 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 38 | 39 | ENV PATH /usr/local/openmpi/bin/:$PATH 40 | 41 | # SSH login fix. Otherwise user is kicked off after login 42 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 43 | 44 | # Create SSH key. 45 | RUN mkdir -p /root/.ssh/ && \ 46 | mkdir -p /var/run/sshd && \ 47 | ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \ 48 | cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \ 49 | printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 50 | 51 | # Set environment variables for MKL 52 | # For more about MKL with TensorFlow see: 53 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 54 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 KMP_BLOCKTIME=1 KMP_SETTINGS=0 55 | 56 | WORKDIR / 57 | 58 | ARG py_version 59 | 60 | RUN if [ $py_version -eq 3 ]; then PYTHON_VERSION=python3.6; else PYTHON_VERSION=python2.7; fi && \ 61 | apt-get update && apt-get install -y --no-install-recommends $PYTHON_VERSION-dev --allow-unauthenticated && \ 62 | ln -s -f /usr/bin/$PYTHON_VERSION /usr/bin/python && \ 63 | rm -rf /var/lib/apt/lists/* 64 | 65 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 66 | 67 | RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \ 68 | python get-pip.py --disable-pip-version-check --no-cache-dir "pip==18.1" && \ 69 | rm get-pip.py 70 | 71 | ARG framework_installable 72 | ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz 73 | 74 | COPY $framework_installable tensorflow-1.12.0-py2.py3-none-any.whl 75 | COPY $framework_support_installable . 
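# (Hedged note on the COPY above: the wheel is copied to the canonical name
# tensorflow-1.12.0-py2.py3-none-any.whl so pip treats it as the "tensorflow"
# distribution no matter what $framework_installable was called in the build
# context, and the horovod build in the next layer compiles against it.)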
76 | 77 | RUN pip install --no-cache-dir -U \ 78 | keras==2.2.4 \ 79 | mpi4py==3.0.1 \ 80 | "sagemaker-tensorflow>=1.12,<1.13" && \ 81 | # Let's install TensorFlow separately in the end to avoid 82 | # the library version to be overwritten 83 | pip install --force-reinstall --no-cache-dir -U \ 84 | tensorflow-1.12.0-py2.py3-none-any.whl \ 85 | horovod && \ 86 | pip install --no-cache-dir -U $framework_support_installable && \ 87 | rm -f tensorflow-1.12.0-py2.py3-none-any.whl && \ 88 | rm -f $framework_support_installable && \ 89 | pip uninstall -y --no-cache-dir \ 90 | markdown \ 91 | tensorboard 92 | 93 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 94 | -------------------------------------------------------------------------------- /test/resources/mnist/horovod_mnist.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | import os 14 | import tensorflow as tf 15 | import horovod.tensorflow as hvd 16 | 17 | # Horovod: initialize Horovod. 18 | hvd.init() 19 | 20 | # Horovod: pin GPU to be used to process local rank (one GPU per process) 21 | gpus = tf.config.experimental.list_physical_devices("GPU") 22 | for gpu in gpus: 23 | tf.config.experimental.set_memory_growth(gpu, True) 24 | if gpus: 25 | tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU") 26 | 27 | (mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data( 28 | path="mnist-%d.npz" % hvd.rank() 29 | ) 30 | 31 | dataset = tf.data.Dataset.from_tensor_slices( 32 | (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), tf.cast(mnist_labels, tf.int64)) 33 | ) 34 | dataset = dataset.repeat().shuffle(10000).batch(128) 35 | 36 | mnist_model = tf.keras.Sequential( 37 | [ 38 | tf.keras.layers.Conv2D(32, [3, 3], activation="relu"), 39 | tf.keras.layers.Conv2D(64, [3, 3], activation="relu"), 40 | tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), 41 | tf.keras.layers.Dropout(0.25), 42 | tf.keras.layers.Flatten(), 43 | tf.keras.layers.Dense(128, activation="relu"), 44 | tf.keras.layers.Dropout(0.5), 45 | tf.keras.layers.Dense(10, activation="softmax"), 46 | ] 47 | ) 48 | loss = tf.losses.SparseCategoricalCrossentropy() 49 | 50 | # Horovod: adjust learning rate based on number of GPUs. 51 | opt = tf.optimizers.Adam(0.001 * hvd.size()) 52 | 53 | checkpoint_dir = "./checkpoints" 54 | checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt) 55 | 56 | 57 | @tf.function 58 | def training_step(images, labels, first_batch): 59 | with tf.GradientTape() as tape: 60 | probs = mnist_model(images, training=True) 61 | loss_value = loss(labels, probs) 62 | 63 | # Horovod: add Horovod Distributed GradientTape. 
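    # (Hedged note: the wrapped tape allreduce-averages each gradient across
    # all ranks inside tape.gradient(), which is what justifies scaling the
    # Adam learning rate by hvd.size() above.)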
64 | tape = hvd.DistributedGradientTape(tape) 65 | 66 | grads = tape.gradient(loss_value, mnist_model.trainable_variables) 67 | opt.apply_gradients(zip(grads, mnist_model.trainable_variables)) 68 | 69 | # Horovod: broadcast initial variable states from rank 0 to all other processes. 70 | # This is necessary to ensure consistent initialization of all workers when 71 | # training is started with random weights or restored from a checkpoint. 72 | # 73 | # Note: broadcast should be done after the first gradient step to ensure optimizer 74 | # initialization. 75 | if first_batch: 76 | hvd.broadcast_variables(mnist_model.variables, root_rank=0) 77 | hvd.broadcast_variables(opt.variables(), root_rank=0) 78 | 79 | return loss_value 80 | 81 | 82 | # Horovod: adjust number of steps based on number of GPUs. 83 | for batch, (images, labels) in enumerate(dataset.take(600 // hvd.size())): 84 | loss_value = training_step(images, labels, batch == 0) 85 | 86 | if batch % 10 == 0 and hvd.local_rank() == 0: 87 | print("Step #%d\tLoss: %.6f" % (batch, loss_value)) 88 | 89 | # Horovod: save checkpoints only on worker 0 to prevent other workers from 90 | # corrupting it. 91 | if hvd.rank() == 0: 92 | # Export the keras model as Tensorflow SavedModelBundle 93 | mnist_model.save(os.path.join("/opt/ml/model/mnist/1"), save_format="tf") 94 | -------------------------------------------------------------------------------- /src/sagemaker_tensorflow_container/deep_learning_container.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
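# (Hedged note: this module mirrors docker/build_artifacts/deep_learning_container.py.
# Both hit the EC2 instance metadata service at 169.254.169.254 with sub-second
# timeouts, so on non-EC2 hosts the requests fail fast and the ping is a no-op.)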
13 | from __future__ import absolute_import 14 | 15 | import json 16 | import logging 17 | import re 18 | 19 | import requests 20 | 21 | 22 | def _validate_instance_id(instance_id): 23 | """ 24 | Validate instance ID 25 | """ 26 | compiled_regex = re.compile(r"^(i-\S{17})") 27 | match = compiled_regex.match(instance_id) 28 | 29 | if not match: 30 | return None 31 | 32 | return match.group(1) 33 | 34 | 35 | def _retrieve_instance_id(): 36 | """ 37 | Retrieve instance ID from instance metadata service 38 | """ 39 | instance_id = None 40 | url = "http://169.254.169.254/latest/meta-data/instance-id" 41 | response = requests_helper(url, timeout=0.1) 42 | 43 | if response is not None: 44 | instance_id = _validate_instance_id(response.text) 45 | 46 | return instance_id 47 | 48 | 49 | def _retrieve_instance_region(): 50 | """ 51 | Retrieve instance region from instance metadata service 52 | """ 53 | region = None 54 | valid_regions = [ 55 | "ap-northeast-1", 56 | "ap-northeast-2", 57 | "ap-southeast-1", 58 | "ap-southeast-2", 59 | "ap-south-1", 60 | "ca-central-1", 61 | "eu-central-1", 62 | "eu-north-1", 63 | "eu-west-1", 64 | "eu-west-2", 65 | "eu-west-3", 66 | "sa-east-1", 67 | "us-east-1", 68 | "us-east-2", 69 | "us-west-1", 70 | "us-west-2", 71 | ] 72 | 73 | url = "http://169.254.169.254/latest/dynamic/instance-identity/document" 74 | response = requests_helper(url, timeout=0.1) 75 | 76 | if response is not None: 77 | response_json = json.loads(response.text) 78 | 79 | if response_json["region"] in valid_regions: 80 | region = response_json["region"] 81 | 82 | return region 83 | 84 | 85 | def query_bucket(): 86 | """ 87 | GET request on an empty object from an Amazon S3 bucket 88 | """ 89 | response = None 90 | instance_id = _retrieve_instance_id() 91 | region = _retrieve_instance_region() 92 | 93 | if instance_id is not None and region is not None: 94 | url = "https://aws-deep-learning-containers-{0}.s3.{0}.amazonaws.com/dlc-containers.txt?x-instance-id={1}".format( # noqa: E501 95 | region, instance_id 96 | ) 97 | response = requests_helper(url, timeout=0.2) 98 | 99 | logging.debug("Query bucket finished: {}".format(response)) 100 | 101 | return response 102 | 103 | 104 | def requests_helper(url, timeout): 105 | response = None 106 | try: 107 | response = requests.get(url, timeout=timeout) 108 | except requests.exceptions.RequestException as e: 109 | logging.error("Request exception: {}".format(e)) 110 | 111 | return response 112 | 113 | 114 | def main(): 115 | """ 116 | Invoke bucket query 117 | """ 118 | # Logs are not necessary for normal run. Remove this line while debugging. 119 | logging.getLogger().disabled = True 120 | 121 | logging.basicConfig(level=logging.ERROR) 122 | query_bucket() 123 | 124 | 125 | if __name__ == "__main__": 126 | main() 127 | -------------------------------------------------------------------------------- /benchmarks/tf_benchmarks/execute_tensorflow_training.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"). You 5 | # may not use this file except in compliance with the License. A copy of 6 | # the License is located at 7 | # 8 | # http://aws.amazon.com/apache2.0/ 9 | # 10 | # or in the "license" file accompanying this file. 
This file is 11 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 12 | # ANY KIND, either express or implied. See the License for the specific 13 | # language governing permissions and limitations under the License. 14 | 15 | from __future__ import absolute_import 16 | 17 | import argparse 18 | import itertools 19 | import os 20 | 21 | from sagemaker import Session 22 | from sagemaker.estimator import Framework 23 | from sagemaker.tensorflow import TensorFlow 24 | 25 | default_bucket = Session().default_bucket 26 | dir_path = os.path.dirname(os.path.realpath(__file__)) 27 | 28 | _DEFAULT_HYPERPARAMETERS = { 29 | "batch_size": 32, 30 | "model": "resnet32", 31 | "num_epochs": 10, 32 | "data_format": "NHWC", 33 | "summary_verbosity": 1, 34 | "save_summaries_steps": 10, 35 | "data_name": "cifar10", 36 | } 37 | 38 | 39 | class ScriptModeTensorFlow(Framework): 40 | """This class is temporary until the final version of Script Mode is released. 41 | """ 42 | 43 | __framework_name__ = "tensorflow-scriptmode-beta" 44 | 45 | create_model = TensorFlow.create_model 46 | 47 | def __init__(self, py_version="py3", **kwargs): 48 | super(ScriptModeTensorFlow, self).__init__(**kwargs) 49 | self.py_version = py_version 50 | self.image_name = None 51 | self.framework_version = "1.10.0" 52 | 53 | 54 | def get_args(): 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument( 57 | "-t", "--instance-types", nargs="+", help=" Set flag", required=True 58 | ) 59 | parser.add_argument("-r", "--role", required=True) 60 | parser.add_argument("-w", "--wait", action="store_true") 61 | parser.add_argument("--region", default="us-west-2") 62 | parser.add_argument("--py-versions", nargs="+", help=" Set flag", default=["py3"]) 63 | parser.add_argument( 64 | "--checkpoint-path", 65 | default=os.path.join(default_bucket(), "benchmarks", "checkpoints"), 66 | help="The S3 location where the model checkpoints and tensorboard events are saved after training", 67 | ) 68 | 69 | return parser.parse_known_args() 70 | 71 | 72 | def main(args, script_args): 73 | for instance_type, py_version in itertools.product(args.instance_types, args.py_versions): 74 | base_name = "%s-%s-%s" % (py_version, instance_type[3:5], instance_type[6:]) 75 | model_dir = os.path.join(args.checkpoint_path, base_name) 76 | 77 | job_hps = create_hyperparameters(model_dir, script_args) 78 | 79 | print("hyperparameters:") 80 | print(job_hps) 81 | 82 | estimator = ScriptModeTensorFlow( 83 | entry_point="tf_cnn_benchmarks.py", 84 | role="SageMakerRole", 85 | source_dir=os.path.join(dir_path, "tf_cnn_benchmarks"), 86 | base_job_name=base_name, 87 | train_instance_count=1, 88 | hyperparameters=job_hps, 89 | train_instance_type=instance_type, 90 | ) 91 | 92 | input_dir = "s3://sagemaker-sample-data-%s/spark/mnist/train/" % args.region 93 | estimator.fit({"train": input_dir}, wait=args.wait) 94 | 95 | print("To use TensorBoard, execute the following command:") 96 | cmd = "S3_USE_HTTPS=0 S3_VERIFY_SSL=0 AWS_REGION=%s tensorboard --host localhost --port 6006 --logdir %s" 97 | print(cmd % (args.region, args.checkpoint_path)) 98 | 99 | 100 | def create_hyperparameters(model_dir, script_args): 101 | job_hps = _DEFAULT_HYPERPARAMETERS.copy() 102 | 103 | job_hps.update({"train_dir": model_dir, "eval_dir": model_dir}) 104 | 105 | script_arg_keys_without_dashes = [ 106 | key[2:] if key.startswith("--") else key[1:] for key in script_args[::2] 107 | ] 108 | script_arg_values = script_args[1::2] 109 | 
job_hps.update(dict(zip(script_arg_keys_without_dashes, script_arg_values))) 110 | 111 | return job_hps 112 | 113 | 114 | if __name__ == "__main__": 115 | args, script_args = get_args() 116 | main(args, script_args) 117 | -------------------------------------------------------------------------------- /docker/2.1.0/py2/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | # prevent stopping by user interaction 6 | ENV DEBIAN_FRONTEND noninteractive 7 | ENV DEBCONF_NONINTERACTIVE_SEEN true 8 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 9 | 10 | # Set environment variables for MKL 11 | # For more about MKL with TensorFlow see: 12 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 13 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 14 | ENV KMP_BLOCKTIME=1 15 | ENV KMP_SETTINGS=0 16 | 17 | ENV PYTHONDONTWRITEBYTECODE=1 18 | ENV PYTHONUNBUFFERED=1 19 | ENV PYTHONIOENCODING=UTF-8 20 | ENV LANG=C.UTF-8 21 | ENV LC_ALL=C.UTF-8 22 | 23 | ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/2.1/AmazonLinux/cpu/final/tensorflow-2.1.0-cp27-cp27mu-manylinux2010_x86_64.whl 24 | 25 | ARG PYTHON=python 26 | ARG PYTHON_PIP=python-pip 27 | ARG PIP=pip 28 | 29 | RUN apt-get update && apt-get install -y --no-install-recommends \ 30 | software-properties-common \ 31 | build-essential \ 32 | openssh-client \ 33 | openssh-server \ 34 | ca-certificates \ 35 | curl \ 36 | git \ 37 | wget \ 38 | vim \ 39 | zlib1g-dev \ 40 | # Install dependent library for OpenCV 41 | libgtk2.0-dev \ 42 | && rm -rf /var/lib/apt/lists/* 43 | 44 | # Install Open MPI 45 | RUN mkdir /tmp/openmpi \ 46 | && cd /tmp/openmpi \ 47 | && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 48 | && tar zxf openmpi-4.0.1.tar.gz \ 49 | && cd openmpi-4.0.1 \ 50 | && ./configure --enable-orterun-prefix-by-default \ 51 | && make -j $(nproc) all \ 52 | && make install \ 53 | && ldconfig \ 54 | && rm -rf /tmp/openmpi 55 | 56 | # Create a wrapper for OpenMPI to allow running as root by default 57 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 58 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 59 | && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ 60 | && chmod a+x /usr/local/bin/mpirun 61 | 62 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 63 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 64 | 65 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 66 | ENV PATH /usr/local/openmpi/bin/:$PATH 67 | 68 | # SSH login fix. Otherwise user is kicked off after login 69 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 70 | 71 | # Create SSH key. 
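# (Hedged note: the passwordless key pair created below, together with the
# StrictHostKeyChecking override, lets mpirun bootstrap its daemons on peer
# containers without interactive auth; this is tolerable only because SageMaker
# places the training containers on a private, per-job network.)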
72 | RUN mkdir -p /root/.ssh/ \ 73 | && mkdir -p /var/run/sshd \ 74 | && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ 75 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ 76 | && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 77 | 78 | WORKDIR / 79 | 80 | RUN apt-get update && apt-get install -y \ 81 | ${PYTHON} \ 82 | ${PYTHON_PIP} 83 | 84 | RUN ${PIP} --no-cache-dir install --upgrade \ 85 | pip \ 86 | setuptools 87 | 88 | # Some TF tools expect a "python" binary 89 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 90 | 91 | # install PyYAML==5.1.2 to avoid conflict with latest awscli 92 | # # python-dateutil==2.8.0 to satisfy botocore associated with latest awscli 93 | RUN ${PIP} install --no-cache-dir -U \ 94 | numpy==1.16.6 \ 95 | scipy==1.2.2 \ 96 | scikit-learn==0.20.4 \ 97 | pandas==0.24.2 \ 98 | Pillow==6.2.2 \ 99 | h5py==2.10.0 \ 100 | keras_applications==1.0.8 \ 101 | keras_preprocessing==1.1.0 \ 102 | keras==2.3.1 \ 103 | python-dateutil==2.8.1 \ 104 | pyYAML==5.3.1 \ 105 | requests==2.22.0 \ 106 | awscli \ 107 | mpi4py==3.0.3 \ 108 | opencv-python==4.2.0.32 \ 109 | "cryptography>=2.3" \ 110 | "sagemaker-tensorflow>=2.1,<2.2" \ 111 | "sagemaker-tensorflow-training>2,<4" \ 112 | # Let's install TensorFlow separately in the end to avoid 113 | # the library version to be overwritten 114 | && ${PIP} install --no-cache-dir -U \ 115 | ${TF_URL} \ 116 | && ${PIP} install --no-cache-dir -U \ 117 | horovod==0.18.2 118 | 119 | ADD https://raw.githubusercontent.com/aws/aws-deep-learning-containers-utils/master/deep_learning_container.py /usr/local/bin/deep_learning_container.py 120 | 121 | RUN chmod +x /usr/local/bin/deep_learning_container.py 122 | 123 | RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-2.1/license.txt -o /license.txt 124 | 125 | CMD ["bin/bash"] 126 | -------------------------------------------------------------------------------- /docker/2.1.0/py3/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | # prevent stopping by user interaction 6 | ENV DEBIAN_FRONTEND noninteractive 7 | ENV DEBCONF_NONINTERACTIVE_SEEN true 8 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 9 | 10 | # Set environment variables for MKL 11 | # For more about MKL with TensorFlow see: 12 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 13 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 14 | ENV KMP_BLOCKTIME=1 15 | ENV KMP_SETTINGS=0 16 | 17 | ENV PYTHONDONTWRITEBYTECODE=1 18 | ENV PYTHONUNBUFFERED=1 19 | ENV PYTHONIOENCODING=UTF-8 20 | ENV LANG=C.UTF-8 21 | ENV LC_ALL=C.UTF-8 22 | 23 | ARG PYTHON=python3 24 | ARG PYTHON_PIP=python3-pip 25 | ARG PIP=pip3 26 | 27 | ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/2.1/AmazonLinux/cpu/final/tensorflow-2.1.0-cp36-cp36m-manylinux2010_x86_64.whl 28 | 29 | RUN apt-get update && apt-get install -y --no-install-recommends \ 30 | python3-dev \ 31 | python3-pip \ 32 | python3-setuptools \ 33 | software-properties-common \ 34 | build-essential \ 35 | openssh-client \ 36 | openssh-server \ 37 | ca-certificates \ 38 | curl \ 39 | git \ 40 | wget \ 41 | vim \ 42 | zlib1g-dev \ 43 | # Install dependent library for OpenCV 44 | libgtk2.0-dev \ 45 | && rm -rf /var/lib/apt/lists/* 46 | 47 | # Install Open MPI 48 | RUN mkdir /tmp/openmpi && \ 49 | cd /tmp/openmpi && \ 50 | curl -fSsL -O 
https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 51 | && tar zxf openmpi-4.0.1.tar.gz \ 52 | && cd openmpi-4.0.1 \ 53 | && ./configure --enable-orterun-prefix-by-default \ 54 | && make -j $(nproc) all \ 55 | && make install \ 56 | && ldconfig \ 57 | && rm -rf /tmp/openmpi 58 | 59 | # Create a wrapper for OpenMPI to allow running as root by default 60 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 61 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 62 | && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ 63 | && chmod a+x /usr/local/bin/mpirun 64 | 65 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 66 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 67 | 68 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 69 | ENV PATH /usr/local/openmpi/bin/:$PATH 70 | 71 | # SSH login fix. Otherwise user is kicked off after login 72 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 73 | 74 | # Create SSH key. 75 | RUN mkdir -p /root/.ssh/ \ 76 | && mkdir -p /var/run/sshd \ 77 | && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ 78 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ 79 | && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 80 | 81 | WORKDIR / 82 | 83 | RUN ${PIP} --no-cache-dir install --upgrade \ 84 | pip \ 85 | setuptools 86 | 87 | # Some TF tools expect a "python" binary 88 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python \ 89 | && ln -s $(which ${PIP}) /usr/bin/pip 90 | 91 | # install PyYAML==5.1.2 to avoid conflict with latest awscli 92 | # # python-dateutil==2.8.0 to satisfy botocore associated with latest awscli 93 | RUN ${PIP} install --no-cache-dir -U \ 94 | numpy==1.18.1 \ 95 | scipy==1.2.2 \ 96 | scikit-learn==0.22 \ 97 | pandas==1.0.1 \ 98 | Pillow==7.0.0 \ 99 | h5py==2.10.0 \ 100 | keras_applications==1.0.8 \ 101 | keras_preprocessing==1.1.0 \ 102 | keras==2.3.1 \ 103 | smdebug==0.7.2 \ 104 | python-dateutil==2.8.1 \ 105 | pyYAML==5.3.1 \ 106 | requests==2.22.0 \ 107 | awscli \ 108 | mpi4py==3.0.3 \ 109 | opencv-python==4.2.0.32 \ 110 | sagemaker==1.50.17 \ 111 | sagemaker-experiments==0.1.7 \ 112 | "sagemaker-tensorflow>=2.1,<2.2" \ 113 | "sagemaker-tensorflow-training>2,<4" \ 114 | # Let's install TensorFlow separately in the end to avoid 115 | # the library version to be overwritten 116 | && ${PIP} install --no-cache-dir -U \ 117 | ${TF_URL} \ 118 | horovod==0.18.2 119 | 120 | ADD https://raw.githubusercontent.com/aws/aws-deep-learning-containers-utils/master/deep_learning_container.py /usr/local/bin/deep_learning_container.py 121 | 122 | RUN chmod +x /usr/local/bin/deep_learning_container.py 123 | 124 | RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-2.1/license.txt -o /license.txt 125 | 126 | CMD ["bin/bash"] 127 | -------------------------------------------------------------------------------- /docker/1.13.1/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | RUN apt-get update && apt-get install -y --no-install-recommends \ 6 | software-properties-common \ 7 | build-essential \ 8 | openssh-client \ 9 | openssh-server \ 10 | ca-certificates \ 11 | curl \ 12 | git \ 13 | wget \ 14 | vim \ 15 | zlib1g-dev \ 16 | && rm -rf /var/lib/apt/lists/* 17 | 18 | # Install Open MPI 19 | RUN mkdir /tmp/openmpi && \ 20 | cd 
/tmp/openmpi && \ 21 | curl -fSsL -O https://www.open-mpi.org/software/ompi/v3.1/downloads/openmpi-3.1.2.tar.gz && \ 22 | tar zxf openmpi-3.1.2.tar.gz && \ 23 | cd openmpi-3.1.2 && \ 24 | ./configure --enable-orterun-prefix-by-default && \ 25 | make -j $(nproc) all && \ 26 | make install && \ 27 | ldconfig && \ 28 | rm -rf /tmp/openmpi 29 | 30 | # Create a wrapper for OpenMPI to allow running as root by default 31 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \ 32 | echo '#!/bin/bash' > /usr/local/bin/mpirun && \ 33 | echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \ 34 | chmod a+x /usr/local/bin/mpirun 35 | 36 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \ 37 | echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 38 | 39 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 40 | 41 | ENV PATH /usr/local/openmpi/bin/:$PATH 42 | 43 | # SSH login fix. Otherwise user is kicked off after login 44 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 45 | 46 | # Create SSH key. 47 | RUN mkdir -p /root/.ssh/ && \ 48 | mkdir -p /var/run/sshd && \ 49 | ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \ 50 | cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \ 51 | printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 52 | 53 | # Set environment variables for MKL 54 | # For more about MKL with TensorFlow see: 55 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 56 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 KMP_BLOCKTIME=1 KMP_SETTINGS=0 57 | 58 | WORKDIR / 59 | 60 | ARG PYTHON=python3 61 | ARG PYTHON_PIP=python3-pip 62 | ARG PIP=pip3 63 | ARG PYTHON_VERSION=3.6.6 64 | 65 | RUN wget https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz && \ 66 | tar -xvf Python-$PYTHON_VERSION.tgz && cd Python-$PYTHON_VERSION && \ 67 | ./configure && make && make install && \ 68 | apt-get update && apt-get install -y --no-install-recommends libreadline-gplv2-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev && \ 69 | make && make install && rm -rf ../Python-$PYTHON_VERSION* && \ 70 | ln -s /usr/local/bin/pip3 /usr/bin/pip 71 | 72 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 73 | 74 | ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz 75 | COPY $framework_support_installable . 
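# (Hedged note: TF_URL below defaults to an AWS-built TensorFlow 1.13.1 wheel
# rather than the stock PyPI release; because it is an ARG, a different binary
# can be pinned at build time with --build-arg TF_URL=<alternate-wheel-url>
# without editing this file.)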
76 | ARG TF_URL="https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.13/AmazonLinux/cpu/latest-patch-latest-patch/tensorflow-1.13.1-cp36-cp36m-linux_x86_64.whl" 77 | 78 | RUN ${PIP} --no-cache-dir install --upgrade pip setuptools 79 | 80 | # Some TF tools expect a "python" binary 81 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 82 | 83 | RUN ${PIP} install --no-cache-dir -U \ 84 | numpy==1.16.2 \ 85 | scipy==1.2.1 \ 86 | scikit-learn==0.20.3 \ 87 | pandas==0.24.2 \ 88 | Pillow==5.4.1 \ 89 | h5py==2.9.0 \ 90 | keras_applications==1.0.7 \ 91 | keras_preprocessing==1.0.9 \ 92 | keras==2.2.4 \ 93 | requests==2.21.0 \ 94 | awscli==1.16.130 \ 95 | mpi4py==3.0.1 \ 96 | "sagemaker-tensorflow>=1.13,<1.14" && \ 97 | # Let's install TensorFlow separately in the end to avoid 98 | # the library version to be overwritten 99 | ${PIP} install --force-reinstall --no-cache-dir -U \ 100 | ${TF_URL} \ 101 | horovod==0.16.4 && \ 102 | ${PIP} install --no-cache-dir -U $framework_support_installable && \ 103 | rm -f $framework_support_installable && \ 104 | ${PIP} uninstall -y --no-cache-dir \ 105 | markdown \ 106 | tensorboard 107 | 108 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 109 | 110 | CMD ["bin/bash"] 111 | -------------------------------------------------------------------------------- /docker/2.0.1/py3/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | # prevent stopping by user interaction 6 | ENV DEBIAN_FRONTEND noninteractive 7 | ENV DEBCONF_NONINTERACTIVE_SEEN true 8 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 9 | 10 | # Set environment variables for MKL 11 | # For more about MKL with TensorFlow see: 12 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 13 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 14 | ENV KMP_BLOCKTIME=1 15 | ENV KMP_SETTINGS=0 16 | 17 | ENV PYTHONDONTWRITEBYTECODE=1 18 | ENV PYTHONUNBUFFERED=1 19 | ENV PYTHONIOENCODING=UTF-8 20 | ENV LANG=C.UTF-8 21 | ENV LC_ALL=C.UTF-8 22 | 23 | ARG PYTHON=python3 24 | ARG PYTHON_PIP=python3-pip 25 | ARG PIP=pip3 26 | 27 | ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_training*.tar.gz 28 | ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/2.0.1/AmazonLinux/cpu/final/tensorflow-2.0.1-cp36-cp36m-manylinux2010_x86_64.whl 29 | 30 | RUN apt-get update && apt-get install -y --no-install-recommends \ 31 | python3-dev \ 32 | python3-pip \ 33 | python3-setuptools \ 34 | software-properties-common \ 35 | build-essential \ 36 | openssh-client \ 37 | openssh-server \ 38 | ca-certificates \ 39 | curl \ 40 | git \ 41 | wget \ 42 | vim \ 43 | zlib1g-dev \ 44 | # Install dependent library for OpenCV 45 | libgtk2.0-dev \ 46 | && rm -rf /var/lib/apt/lists/* 47 | 48 | # Install Open MPI 49 | RUN mkdir /tmp/openmpi && \ 50 | cd /tmp/openmpi && \ 51 | curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 52 | && tar zxf openmpi-4.0.1.tar.gz \ 53 | && cd openmpi-4.0.1 \ 54 | && ./configure --enable-orterun-prefix-by-default \ 55 | && make -j $(nproc) all \ 56 | && make install \ 57 | && ldconfig \ 58 | && rm -rf /tmp/openmpi 59 | 60 | # Create a wrapper for OpenMPI to allow running as root by default 61 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 62 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 63 | && echo 'mpirun.real --allow-run-as-root "$@"' >> 
/usr/local/bin/mpirun \ 64 | && chmod a+x /usr/local/bin/mpirun 65 | 66 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 67 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 68 | 69 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 70 | ENV PATH /usr/local/openmpi/bin/:$PATH 71 | 72 | # SSH login fix. Otherwise user is kicked off after login 73 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 74 | 75 | # Create SSH key. 76 | RUN mkdir -p /root/.ssh/ \ 77 | && mkdir -p /var/run/sshd \ 78 | && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ 79 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ 80 | && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 81 | 82 | WORKDIR / 83 | 84 | COPY $FRAMEWORK_SUPPORT_INSTALLABLE . 85 | 86 | RUN ${PIP} --no-cache-dir install --upgrade \ 87 | pip \ 88 | setuptools 89 | 90 | # Some TF tools expect a "python" binary 91 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python \ 92 | && ln -s $(which ${PIP}) /usr/bin/pip 93 | 94 | # install PyYAML==5.1.2 to avoid conflict with latest awscli 95 | # # python-dateutil==2.8.0 to satisfy botocore associated with latest awscli 96 | RUN ${PIP} install --no-cache-dir -U \ 97 | numpy==1.17.4 \ 98 | scipy==1.2.2 \ 99 | scikit-learn==0.22 \ 100 | pandas==0.25.3 \ 101 | Pillow==6.2.1 \ 102 | h5py==2.10.0 \ 103 | keras_applications==1.0.8 \ 104 | keras_preprocessing==1.1.0 \ 105 | keras==2.3.1 \ 106 | python-dateutil==2.8.0 \ 107 | PyYAML==5.1.2 \ 108 | requests==2.22.0 \ 109 | awscli \ 110 | mpi4py==3.0.3 \ 111 | opencv-python==4.2.0.32 \ 112 | "sagemaker-tensorflow>=2.0,<2.1" \ 113 | # Let's install TensorFlow separately in the end to avoid 114 | # the library version to be overwritten 115 | && ${PIP} install --no-cache-dir -U \ 116 | ${TF_URL} \ 117 | horovod==0.18.2 \ 118 | && ${PIP} install --no-cache-dir -U \ 119 | $FRAMEWORK_SUPPORT_INSTALLABLE \ 120 | && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE 121 | 122 | ADD https://raw.githubusercontent.com/aws/aws-deep-learning-containers-utils/master/deep_learning_container.py /usr/local/bin/deep_learning_container.py 123 | 124 | RUN chmod +x /usr/local/bin/deep_learning_container.py 125 | 126 | RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-2.0.1/license.txt -o /license.txt 127 | 128 | CMD ["bin/bash"] 129 | -------------------------------------------------------------------------------- /test/integration/local/test_training.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
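# (Hedged note: these tests use SageMaker local mode, so the estimator runs the
# image via docker-compose on the build host; the file:// URIs keep the training
# data and the resulting model.tar.gz on the local filesystem, no S3 required.)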
13 | from __future__ import absolute_import 14 | 15 | import os 16 | import tarfile 17 | 18 | import pytest 19 | from sagemaker.tensorflow import TensorFlow 20 | 21 | RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources") 22 | TF_CHECKPOINT_FILES = ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"] 23 | 24 | 25 | @pytest.fixture # noqa: F811 26 | def py_full_version(py_version): # noqa: F811 27 | if py_version == "2": 28 | return "2.7" 29 | else: 30 | return "3.8" 31 | 32 | 33 | @pytest.mark.skip_gpu 34 | def test_mnist_cpu(sagemaker_local_session, image_uri, tmpdir, framework_version): 35 | output_path = "file://{}".format(tmpdir) 36 | run_tf_training( 37 | script=os.path.join(RESOURCE_PATH, "mnist", "mnist.py"), 38 | instance_type="local", 39 | instance_count=1, 40 | sagemaker_local_session=sagemaker_local_session, 41 | image_uri=image_uri, 42 | framework_version=framework_version, 43 | output_path=output_path, 44 | training_data_path="file://{}".format(os.path.join(RESOURCE_PATH, "mnist", "data")), 45 | ) 46 | _assert_files_exist_in_tar(output_path, ["my_model.h5"]) 47 | 48 | 49 | @pytest.mark.skip 50 | def test_distributed_training_cpu_no_ps( 51 | sagemaker_local_session, image_uri, tmpdir, framework_version 52 | ): 53 | output_path = "file://{}".format(tmpdir) 54 | run_tf_training( 55 | script=os.path.join(RESOURCE_PATH, "mnist", "mnist_custom.py"), 56 | instance_type="local", 57 | instance_count=2, 58 | sagemaker_local_session=sagemaker_local_session, 59 | image_uri=image_uri, 60 | framework_version=framework_version, 61 | output_path=output_path, 62 | training_data_path="file://{}".format( 63 | os.path.join(RESOURCE_PATH, "mnist", "data-distributed") 64 | ), 65 | ) 66 | _assert_files_exist_in_tar(output_path, TF_CHECKPOINT_FILES) 67 | 68 | 69 | @pytest.mark.skip 70 | def test_distributed_training_cpu_ps(sagemaker_local_session, image_uri, tmpdir, framework_version): 71 | output_path = "file://{}".format(tmpdir) 72 | run_tf_training( 73 | script=os.path.join(RESOURCE_PATH, "mnist", "mnist_custom.py"), 74 | instance_type="local", 75 | instance_count=2, 76 | sagemaker_local_session=sagemaker_local_session, 77 | image_uri=image_uri, 78 | framework_version=framework_version, 79 | output_path=output_path, 80 | hyperparameters={"sagemaker_parameter_server_enabled": True}, 81 | training_data_path="file://{}".format( 82 | os.path.join(RESOURCE_PATH, "mnist", "data-distributed") 83 | ), 84 | ) 85 | _assert_files_exist_in_tar(output_path, TF_CHECKPOINT_FILES) 86 | 87 | 88 | def run_tf_training( 89 | script, 90 | instance_type, 91 | instance_count, 92 | sagemaker_local_session, 93 | image_uri, 94 | framework_version, 95 | training_data_path, 96 | output_path=None, 97 | hyperparameters=None, 98 | ): 99 | 100 | hyperparameters = hyperparameters or {} 101 | 102 | estimator = TensorFlow( 103 | entry_point=script, 104 | role="SageMakerRole", 105 | instance_count=instance_count, 106 | instance_type=instance_type, 107 | sagemaker_session=sagemaker_local_session, 108 | image_uri=image_uri, 109 | model_dir="/opt/ml/model", 110 | output_path=output_path, 111 | hyperparameters=hyperparameters, 112 | base_job_name="test-tf", 113 | framework_version=framework_version, 114 | py_version="py3", 115 | ) 116 | 117 | estimator.fit(training_data_path) 118 | 119 | 120 | def _assert_files_exist_in_tar(output_path, files): 121 | if output_path.startswith("file://"): 122 | output_path = output_path[7:] 123 | model_file = os.path.join(output_path, "model.tar.gz") 124 | with 
tarfile.open(model_file) as tar: 125 | for f in files: 126 | tar.getmember(f) 127 | -------------------------------------------------------------------------------- /docker/2.0.1/py2/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | # prevent stopping by user interaction 6 | ENV DEBIAN_FRONTEND noninteractive 7 | ENV DEBCONF_NONINTERACTIVE_SEEN true 8 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 9 | 10 | # Set environment variables for MKL 11 | # For more about MKL with TensorFlow see: 12 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 13 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 14 | ENV KMP_BLOCKTIME=1 15 | ENV KMP_SETTINGS=0 16 | 17 | ENV PYTHONDONTWRITEBYTECODE=1 18 | ENV PYTHONUNBUFFERED=1 19 | ENV PYTHONIOENCODING=UTF-8 20 | ENV LANG=C.UTF-8 21 | ENV LC_ALL=C.UTF-8 22 | 23 | ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_training*.tar.gz 24 | ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/2.0.1/AmazonLinux/cpu/final/tensorflow-2.0.1-cp27-cp27mu-manylinux2010_x86_64.whl 25 | 26 | ARG PYTHON=python 27 | ARG PYTHON_PIP=python-pip 28 | ARG PIP=pip 29 | 30 | RUN apt-get update && apt-get install -y --no-install-recommends \ 31 | software-properties-common \ 32 | build-essential \ 33 | openssh-client \ 34 | openssh-server \ 35 | ca-certificates \ 36 | curl \ 37 | git \ 38 | wget \ 39 | vim \ 40 | zlib1g-dev \ 41 | # Install dependent library for OpenCV 42 | libgtk2.0-dev \ 43 | && rm -rf /var/lib/apt/lists/* 44 | 45 | # Install Open MPI 46 | RUN mkdir /tmp/openmpi \ 47 | && cd /tmp/openmpi \ 48 | && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 49 | && tar zxf openmpi-4.0.1.tar.gz \ 50 | && cd openmpi-4.0.1 \ 51 | && ./configure --enable-orterun-prefix-by-default \ 52 | && make -j $(nproc) all \ 53 | && make install \ 54 | && ldconfig \ 55 | && rm -rf /tmp/openmpi 56 | 57 | # Create a wrapper for OpenMPI to allow running as root by default 58 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 59 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 60 | && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ 61 | && chmod a+x /usr/local/bin/mpirun 62 | 63 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 64 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 65 | 66 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 67 | ENV PATH /usr/local/openmpi/bin/:$PATH 68 | 69 | # SSH login fix. Otherwise user is kicked off after login 70 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 71 | 72 | # Create SSH key. 73 | RUN mkdir -p /root/.ssh/ \ 74 | && mkdir -p /var/run/sshd \ 75 | && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ 76 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ 77 | && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 78 | 79 | WORKDIR / 80 | 81 | RUN apt-get update && apt-get install -y \ 82 | ${PYTHON} \ 83 | ${PYTHON_PIP} 84 | 85 | COPY $FRAMEWORK_SUPPORT_INSTALLABLE . 
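# (The toolkit sdist copied above is pip-installed and then deleted further down in this Dockerfile.)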
86 | 87 | RUN ${PIP} --no-cache-dir install --upgrade \ 88 | pip \ 89 | setuptools 90 | 91 | # Some TF tools expect a "python" binary 92 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 93 | 94 | # install PyYAML==5.1.2 to avoid conflict with latest awscli 95 | # # python-dateutil==2.8.0 to satisfy botocore associated with latest awscli 96 | RUN ${PIP} install --no-cache-dir -U \ 97 | numpy==1.16.5 \ 98 | scipy==1.2.2 \ 99 | scikit-learn==0.20.4 \ 100 | pandas==0.24.2 \ 101 | Pillow==6.2.1 \ 102 | h5py==2.10.0 \ 103 | keras_applications==1.0.8 \ 104 | keras_preprocessing==1.1.0 \ 105 | requests==2.22.0 \ 106 | keras==2.3.1 \ 107 | python-dateutil==2.8.0 \ 108 | PyYAML==5.1.2 \ 109 | awscli \ 110 | mpi4py==3.0.3 \ 111 | opencv-python==4.2.0.32 \ 112 | "cryptography>=2.3" \ 113 | "sagemaker-tensorflow>=2.0,<2.1" \ 114 | # Let's install TensorFlow separately in the end to avoid 115 | # the library version to be overwritten 116 | && ${PIP} install --no-cache-dir -U \ 117 | ${TF_URL} \ 118 | && ${PIP} install --no-cache-dir -U \ 119 | $FRAMEWORK_SUPPORT_INSTALLABLE \ 120 | && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE \ 121 | && ${PIP} install --no-cache-dir -U \ 122 | horovod==0.18.2 123 | 124 | ADD https://raw.githubusercontent.com/aws/aws-deep-learning-containers-utils/master/deep_learning_container.py /usr/local/bin/deep_learning_container.py 125 | 126 | RUN chmod +x /usr/local/bin/deep_learning_container.py 127 | 128 | RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-2.0.1/license.txt -o /license.txt 129 | 130 | CMD ["bin/bash"] 131 | -------------------------------------------------------------------------------- /scripts/build_all.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
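#
# Builds the TF 1.13.1 CPU and GPU images for both Python versions, using a
# development ECR account for the Docker build cache. Every flag is optional
# and defaults to the constants defined below, e.g. (account ID is a placeholder):
#
#     python scripts/build_all.py --account <account-id> --region us-west-2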
13 | from __future__ import absolute_import 14 | 15 | import argparse 16 | import os 17 | import subprocess 18 | 19 | VERSION = "1.13.1" 20 | REPO = "sagemaker-tensorflow-scriptmode" 21 | PY2_CPU_BINARY = "https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/cpu/latest-patch-latest-patch/tensorflow-1.13.1-cp27-cp27mu-linux_x86_64.whl" # noqa 22 | PY3_CPU_BINARY = "https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/cpu/latest-patch-latest-patch/tensorflow-1.13.1-cp36-cp36m-linux_x86_64.whl" # noqa 23 | PY2_GPU_BINARY = "https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/gpu/latest-patch-latest-patch/tensorflow-1.13.1-cp27-cp27mu-linux_x86_64.whl" # noqa 24 | PY3_GPU_BINARY = "https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/gpu/latest-patch-latest-patch/tensorflow-1.13.1-cp36-cp36m-linux_x86_64.whl" # noqa 25 | DEV_ACCOUNT = "142577830533" 26 | REGION = "us-west-2" 27 | 28 | 29 | def _parse_args(): 30 | 31 | parser = argparse.ArgumentParser() 32 | 33 | parser.add_argument("--account", type=str, default=DEV_ACCOUNT) 34 | parser.add_argument("--region", type=str, default=REGION) 35 | parser.add_argument("--version", type=str, default=VERSION) 36 | parser.add_argument("--py2-cpu-binary", type=str, default=PY2_CPU_BINARY) 37 | parser.add_argument("--py3-cpu-binary", type=str, default=PY3_CPU_BINARY) 38 | parser.add_argument("--py2-gpu-binary", type=str, default=PY2_GPU_BINARY) 39 | parser.add_argument("--py3-gpu-binary", type=str, default=PY3_GPU_BINARY) 40 | parser.add_argument("--repo", type=str, default=REPO) 41 | 42 | return parser.parse_args() 43 | 44 | 45 | args = _parse_args() 46 | binaries = { 47 | "py2-cpu": args.py2_cpu_binary, 48 | "py3-cpu": args.py3_cpu_binary, 49 | "py2-gpu": args.py2_gpu_binary, 50 | "py3-gpu": args.py3_gpu_binary, 51 | } 52 | build_dir = os.path.join("docker", args.version) 53 | 54 | # Run docker-login so we can pull the cached image 55 | login_cmd = subprocess.check_output( 56 | "aws ecr get-login --no-include-email --registry-id {}".format(args.account).split() 57 | ) 58 | print("Executing docker login command: {}".format(login_cmd)) 59 | subprocess.check_call(login_cmd.split()) 60 | 61 | for arch in ["cpu", "gpu"]: 62 | for py_version in ["2", "3"]: 63 | 64 | binary_url = binaries["py{}-{}".format(py_version, arch)] 65 | binary_file = os.path.basename(binary_url) 66 | cmd = "wget -O {}/{} {}".format(build_dir, binary_file, binary_url) 67 | print("Downloading binary file: {}".format(cmd)) 68 | subprocess.check_call(cmd.split()) 69 | 70 | tag = "{}-{}-py{}".format(args.version, arch, py_version) 71 | prev_image_uri = "{}.dkr.ecr.{}.amazonaws.com/{}:{}".format( 72 | args.account, args.region, args.repo, tag 73 | ) 74 | dockerfile = os.path.join(build_dir, "Dockerfile.{}".format(arch)) 75 | 76 | tar_file_name = ( 77 | subprocess.check_output( 78 | "ls {}/sagemaker_tensorflow_container*".format(build_dir), shell=True 79 | ) 80 | .strip() 81 | .decode("ascii") 82 | ) 83 | print("framework_support_installable is {}".format(os.path.basename(tar_file_name))) 84 | 85 | build_cmd = ( 86 | "docker build -f {} --cache-from {} --build-arg framework_support_installable={} " 87 | "--build-arg py_version={} --build-arg framework_installable={} " 88 | "-t {}:{} {}".format( 89 | dockerfile, 90 | prev_image_uri, 91 | os.path.basename(tar_file_name), 92 | py_version, 93 | binary_file, 94 | args.repo, 95 | tag, 96 | build_dir, 97 | ) 98 | ) 99 | print("Building docker image: {}".format(build_cmd)) 100 | 
subprocess.check_call(build_cmd.split()) 101 | 102 | print("Deleting binary file {}".format(binary_file)) 103 | subprocess.check_call("rm {}".format(os.path.join(build_dir, binary_file)).split()) 104 | -------------------------------------------------------------------------------- /docker/1.15.0/py2/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | # Prevent docker build get stopped by requesting user interaction 6 | ENV DEBIAN_FRONTEND=noninteractive 7 | ENV DEBCONF_NONINTERACTIVE_SEEN=true 8 | # Set environment variables for MKL 9 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 10 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 11 | ENV KMP_BLOCKTIME=1 12 | ENV KMP_SETTINGS=0 13 | # Python won’t try to write .pyc or .pyo files on the import of source modules 14 | ENV PYTHONDONTWRITEBYTECODE=1 15 | ENV PYTHONUNBUFFERED=1 16 | # See http://bugs.python.org/issue19846 17 | ENV PYTHONIOENCODING=UTF-8 18 | ENV LANG=C.UTF-8 19 | ENV LC_ALL=C.UTF-8 20 | # Specify the location of module that contains the training logic for SageMaker 21 | # https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html 22 | ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main 23 | 24 | # Define framework-related package sources 25 | ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_training*.tar.gz 26 | ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15/AmazonLinux/cpu/final/tensorflow-1.15.0-cp27-cp27mu-manylinux2010_x86_64.whl 27 | 28 | RUN apt-get update \ 29 | && apt-get install -y --no-install-recommends \ 30 | software-properties-common \ 31 | build-essential \ 32 | openssh-client \ 33 | openssh-server \ 34 | ca-certificates \ 35 | curl \ 36 | git \ 37 | wget \ 38 | vim \ 39 | zlib1g-dev \ 40 | && rm -rf /var/lib/apt/lists/* 41 | 42 | # Install Open MPI 43 | RUN mkdir /tmp/openmpi \ 44 | && cd /tmp/openmpi \ 45 | && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 46 | && tar zxf openmpi-4.0.1.tar.gz \ 47 | && cd openmpi-4.0.1 \ 48 | && ./configure --enable-orterun-prefix-by-default \ 49 | && make -j $(nproc) all \ 50 | && make install \ 51 | && ldconfig \ 52 | && rm -rf /tmp/openmpi 53 | 54 | # Create a wrapper for OpenMPI to allow running as root by default 55 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 56 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 57 | && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ 58 | && chmod a+x /usr/local/bin/mpirun 59 | 60 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 61 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 62 | 63 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 64 | ENV PATH=/usr/local/openmpi/bin/:$PATH 65 | 66 | # SSH login fix. Otherwise user is kicked off after login 67 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 68 | 69 | # Create SSH key. 
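# The generated key pair lets MPI (and Horovod on top of it) reach the other training containers over passwordless SSH.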
70 | RUN mkdir -p /root/.ssh/ \ 71 | && mkdir -p /var/run/sshd \ 72 | && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ 73 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ 74 | && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 75 | 76 | WORKDIR / 77 | 78 | RUN apt-get update \ 79 | && apt-get install -y \ 80 | python \ 81 | python-pip 82 | 83 | COPY $FRAMEWORK_SUPPORT_INSTALLABLE . 84 | 85 | RUN pip --no-cache-dir install --upgrade \ 86 | pip \ 87 | setuptools 88 | 89 | # Some TF tools expect a "python" binary 90 | RUN ln -s $(which python) /usr/local/bin/python 91 | 92 | RUN pip install --no-cache-dir -U \ 93 | numpy==1.16.5 \ 94 | scipy==1.2.2 \ 95 | scikit-learn==0.20.3 \ 96 | pandas==0.24.2 \ 97 | Pillow==6.2.1 \ 98 | h5py==2.9.0 \ 99 | keras_applications==1.0.8 \ 100 | keras_preprocessing==1.1.0 \ 101 | requests==2.22.0 \ 102 | keras==2.3.1 \ 103 | mpi4py==3.0.2 \ 104 | "cryptography>=2.3" \ 105 | "sagemaker-tensorflow>=1.15,<1.16" \ 106 | # Let's install TensorFlow separately in the end to avoid the library version to be overwritten 107 | && pip install --force-reinstall --no-cache-dir -U \ 108 | ${TF_URL} \ 109 | && pip install --no-cache-dir -U \ 110 | $FRAMEWORK_SUPPORT_INSTALLABLE \ 111 | awscli==1.17.7 \ 112 | && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE \ 113 | && pip install --no-cache-dir -U \ 114 | horovod==0.18.2 115 | 116 | COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py 117 | COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py 118 | 119 | RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \ 120 | && chmod +x /usr/local/bin/deep_learning_container.py 121 | 122 | RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt 123 | 124 | ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] 125 | CMD ["bin/bash"] 126 | -------------------------------------------------------------------------------- /docker/1.15.0/py3/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | # Prevent docker build get stopped by requesting user interaction 6 | ENV DEBIAN_FRONTEND=noninteractive 7 | ENV DEBCONF_NONINTERACTIVE_SEEN=true 8 | # Set environment variables for MKL 9 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 10 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 11 | ENV KMP_BLOCKTIME=1 12 | ENV KMP_SETTINGS=0 13 | # Python won’t try to write .pyc or .pyo files on the import of source modules 14 | ENV PYTHONDONTWRITEBYTECODE=1 15 | ENV PYTHONUNBUFFERED=1 16 | # See http://bugs.python.org/issue19846 17 | ENV PYTHONIOENCODING=UTF-8 18 | ENV LANG=C.UTF-8 19 | ENV LC_ALL=C.UTF-8 20 | # Specify the location of module that contains the training logic for SageMaker 21 | # https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html 22 | ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main 23 | 24 | # Define framework-related package sources 25 | ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_training*.tar.gz 26 | ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15/AmazonLinux/cpu/final/tensorflow-1.15.0-cp36-cp36m-manylinux2010_x86_64.whl 27 | 28 | RUN apt-get update \ 29 | && apt-get install -y --no-install-recommends \ 30 | python3-dev \ 31 | python3-pip \ 32 | python3-setuptools \ 33 | software-properties-common \ 34 | build-essential \ 35 | 
openssh-client \ 36 | openssh-server \ 37 | ca-certificates \ 38 | curl \ 39 | git \ 40 | wget \ 41 | vim \ 42 | zlib1g-dev \ 43 | && rm -rf /var/lib/apt/lists/* 44 | 45 | # Install Open MPI 46 | RUN mkdir /tmp/openmpi \ 47 | && cd /tmp/openmpi \ 48 | && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 49 | && tar zxf openmpi-4.0.1.tar.gz \ 50 | && cd openmpi-4.0.1 \ 51 | && ./configure --enable-orterun-prefix-by-default \ 52 | && make -j $(nproc) all \ 53 | && make install \ 54 | && ldconfig \ 55 | && rm -rf /tmp/openmpi 56 | 57 | # Create a wrapper for OpenMPI to allow running as root by default 58 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 59 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 60 | && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ 61 | && chmod a+x /usr/local/bin/mpirun 62 | 63 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 64 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 65 | 66 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 67 | ENV PATH=/usr/local/openmpi/bin/:$PATH 68 | 69 | # SSH login fix. Otherwise user is kicked off after login 70 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 71 | 72 | # Create SSH key. 73 | RUN mkdir -p /root/.ssh/ \ 74 | && mkdir -p /var/run/sshd \ 75 | && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ 76 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ 77 | && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 78 | 79 | WORKDIR / 80 | 81 | COPY $FRAMEWORK_SUPPORT_INSTALLABLE . 82 | 83 | RUN pip3 --no-cache-dir install --upgrade \ 84 | pip \ 85 | setuptools 86 | 87 | # Some TF tools expect a "python" binary 88 | RUN ln -s $(which python3) /usr/local/bin/python \ 89 | && ln -s $(which pip3) /usr/bin/pip 90 | 91 | RUN pip install --no-cache-dir -U \ 92 | numpy==1.17.4 \ 93 | scipy==1.2.2 \ 94 | scikit-learn==0.20.3 \ 95 | pandas==0.24.2 \ 96 | Pillow==6.2.1 \ 97 | h5py==2.9.0 \ 98 | keras_applications==1.0.8 \ 99 | keras_preprocessing==1.1.0 \ 100 | keras==2.3.1 \ 101 | requests==2.22.0 \ 102 | smdebug==0.5.0.post0 \ 103 | sagemaker-experiments==0.1.3 \ 104 | mpi4py==3.0.2 \ 105 | "cryptography>=2.3" \ 106 | "sagemaker-tensorflow>=1.15,<1.16" \ 107 | # Let's install TensorFlow separately in the end to avoid 108 | # the library version to be overwritten 109 | && pip install --force-reinstall --no-cache-dir -U \ 110 | ${TF_URL} \ 111 | && pip install --force-reinstall --no-cache-dir -U \ 112 | horovod==0.18.2 \ 113 | && pip install --no-cache-dir -U \ 114 | $FRAMEWORK_SUPPORT_INSTALLABLE \ 115 | awscli==1.17.7 \ 116 | && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE 117 | 118 | COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py 119 | COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py 120 | 121 | RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \ 122 | && chmod +x /usr/local/bin/deep_learning_container.py 123 | 124 | RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt 125 | 126 | ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] 127 | CMD ["bin/bash"] 128 | -------------------------------------------------------------------------------- /docker/2.0.0/py2/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 
5 | # prevent stopping by user interaction 6 | ENV DEBIAN_FRONTEND noninteractive 7 | ENV DEBCONF_NONINTERACTIVE_SEEN true 8 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 9 | 10 | # Set environment variables for MKL 11 | # For more about MKL with TensorFlow see: 12 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 13 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 14 | ENV KMP_BLOCKTIME=1 15 | ENV KMP_SETTINGS=0 16 | 17 | ENV PYTHONDONTWRITEBYTECODE=1 18 | ENV PYTHONUNBUFFERED=1 19 | ENV PYTHONIOENCODING=UTF-8 20 | ENV LANG=C.UTF-8 21 | ENV LC_ALL=C.UTF-8 22 | 23 | ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_training.tar.gz 24 | ARG TENSORFLOW_WHL=tensorflow-2.0.0-cp27-cp27mu-manylinux2010_x86_64.whl 25 | ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/2.0/AmazonLinux/cpu/final/$TENSORFLOW_WHL 26 | 27 | ARG PYTHON=python 28 | ARG PYTHON_PIP=python-pip 29 | ARG PIP=pip 30 | 31 | RUN apt-get update && apt-get install -y --no-install-recommends \ 32 | software-properties-common \ 33 | build-essential \ 34 | openssh-client \ 35 | openssh-server \ 36 | ca-certificates \ 37 | curl \ 38 | git \ 39 | wget \ 40 | vim \ 41 | zlib1g-dev \ 42 | && rm -rf /var/lib/apt/lists/* 43 | 44 | # Install Open MPI 45 | RUN mkdir /tmp/openmpi \ 46 | && cd /tmp/openmpi \ 47 | && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 48 | && tar zxf openmpi-4.0.1.tar.gz \ 49 | && cd openmpi-4.0.1 \ 50 | && ./configure --enable-orterun-prefix-by-default \ 51 | && make -j $(nproc) all \ 52 | && make install \ 53 | && ldconfig \ 54 | && rm -rf /tmp/openmpi 55 | 56 | # Create a wrapper for OpenMPI to allow running as root by default 57 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 58 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 59 | && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ 60 | && chmod a+x /usr/local/bin/mpirun 61 | 62 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 63 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 64 | 65 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 66 | ENV PATH /usr/local/openmpi/bin/:$PATH 67 | 68 | # SSH login fix. Otherwise user is kicked off after login 69 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 70 | 71 | # Create SSH key. 72 | RUN mkdir -p /root/.ssh/ \ 73 | && mkdir -p /var/run/sshd \ 74 | && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ 75 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ 76 | && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 77 | 78 | WORKDIR / 79 | 80 | RUN apt-get update && apt-get install -y \ 81 | ${PYTHON} \ 82 | ${PYTHON_PIP} 83 | 84 | COPY $FRAMEWORK_SUPPORT_INSTALLABLE . 
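# (Staged into the image root here; the pip layer below installs it and removes the file afterwards.)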
85 | 86 | RUN ${PIP} --no-cache-dir install --upgrade \ 87 | pip \ 88 | setuptools 89 | 90 | # Some TF tools expect a "python" binary 91 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 92 | 93 | # Setup TF Wheel 94 | RUN wget $TF_URL -O /tmp/$TENSORFLOW_WHL 95 | 96 | # install PyYAML==5.1.2 to avoid conflict with latest awscli 97 | # # python-dateutil==2.8.0 to satisfy botocore associated with latest awscli 98 | RUN ${PIP} install --no-cache-dir -U \ 99 | numpy==1.16.5 \ 100 | scipy==1.2.2 \ 101 | scikit-learn==0.20.4 \ 102 | pandas==0.24.2 \ 103 | Pillow==6.2.1 \ 104 | h5py==2.10.0 \ 105 | keras_applications==1.0.8 \ 106 | keras_preprocessing==1.1.0 \ 107 | requests==2.22.0 \ 108 | keras==2.3.1 \ 109 | python-dateutil==2.8.0 \ 110 | PyYAML==5.1.2 \ 111 | awscli==1.16.303 \ 112 | mpi4py==3.0.3 \ 113 | "cryptography>=2.3" \ 114 | "sagemaker-tensorflow>=2.0,<2.1" \ 115 | # Let's install TensorFlow separately in the end to avoid 116 | # the library version to be overwritten 117 | # ${PIP} install --no-cache-dir -U ${TF_URL} \ 118 | && ${PIP} install --no-cache-dir -U \ 119 | /tmp/$TENSORFLOW_WHL \ 120 | && rm -f /tmp/$TENSORFLOW_WHL \ 121 | && ${PIP} install --no-cache-dir -U \ 122 | $FRAMEWORK_SUPPORT_INSTALLABLE \ 123 | && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE \ 124 | && ${PIP} install --no-cache-dir -U \ 125 | horovod==0.18.2 126 | 127 | COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py 128 | COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py 129 | 130 | RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \ 131 | && chmod +x /usr/local/bin/deep_learning_container.py 132 | 133 | RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-2.0/license.txt -o /license.txt 134 | 135 | ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] 136 | CMD ["bin/bash"] 137 | -------------------------------------------------------------------------------- /docker/2.0.0/py3/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | # prevent stopping by user interaction 6 | ENV DEBIAN_FRONTEND noninteractive 7 | ENV DEBCONF_NONINTERACTIVE_SEEN true 8 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 9 | 10 | # Set environment variables for MKL 11 | # For more about MKL with TensorFlow see: 12 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 13 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 14 | ENV KMP_BLOCKTIME=1 15 | ENV KMP_SETTINGS=0 16 | 17 | ENV PYTHONDONTWRITEBYTECODE=1 18 | ENV PYTHONUNBUFFERED=1 19 | ENV PYTHONIOENCODING=UTF-8 20 | ENV LANG=C.UTF-8 21 | ENV LC_ALL=C.UTF-8 22 | 23 | ARG PYTHON=python3 24 | ARG PYTHON_PIP=python3-pip 25 | ARG PIP=pip3 26 | 27 | # Use TENSORFLOW_WHL instead of TF_URL before releasing 28 | # ARG TF_URL="https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.14/AmazonLinux/cpu/final/tensorflow-1.14.0-cp36-cp36m-linux_x86_64.whl" 29 | 30 | ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_training.tar.gz 31 | ARG TENSORFLOW_WHL=tensorflow-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl 32 | ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/2.0/AmazonLinux/cpu/final/$TENSORFLOW_WHL 33 | 34 | RUN apt-get update && apt-get install -y --no-install-recommends \ 35 | python3-dev \ 36 | python3-pip \ 37 | python3-setuptools \ 38 | software-properties-common \ 39 | build-essential \ 40 | openssh-client \ 41 | openssh-server \ 42 | 
ca-certificates \ 43 | curl \ 44 | git \ 45 | wget \ 46 | vim \ 47 | zlib1g-dev \ 48 | && rm -rf /var/lib/apt/lists/* 49 | 50 | # Install Open MPI 51 | RUN mkdir /tmp/openmpi && \ 52 | cd /tmp/openmpi && \ 53 | curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 54 | && tar zxf openmpi-4.0.1.tar.gz \ 55 | && cd openmpi-4.0.1 \ 56 | && ./configure --enable-orterun-prefix-by-default \ 57 | && make -j $(nproc) all \ 58 | && make install \ 59 | && ldconfig \ 60 | && rm -rf /tmp/openmpi 61 | 62 | # Create a wrapper for OpenMPI to allow running as root by default 63 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 64 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 65 | && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ 66 | && chmod a+x /usr/local/bin/mpirun 67 | 68 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 69 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 70 | 71 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 72 | ENV PATH /usr/local/openmpi/bin/:$PATH 73 | 74 | # SSH login fix. Otherwise user is kicked off after login 75 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 76 | 77 | # Create SSH key. 78 | RUN mkdir -p /root/.ssh/ \ 79 | && mkdir -p /var/run/sshd \ 80 | && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ 81 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ 82 | && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 83 | 84 | WORKDIR / 85 | 86 | COPY $FRAMEWORK_SUPPORT_INSTALLABLE . 87 | 88 | RUN ${PIP} --no-cache-dir install --upgrade \ 89 | pip \ 90 | setuptools 91 | 92 | # Some TF tools expect a "python" binary 93 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python \ 94 | && ln -s $(which ${PIP}) /usr/bin/pip 95 | 96 | # Setup TF Wheel 97 | RUN wget $TF_URL -O /tmp/$TENSORFLOW_WHL 98 | 99 | # install PyYAML==5.1.2 to avoid conflict with latest awscli 100 | # # python-dateutil==2.8.0 to satisfy botocore associated with latest awscli 101 | RUN ${PIP} install --no-cache-dir -U \ 102 | numpy==1.17.4 \ 103 | scipy==1.2.2 \ 104 | scikit-learn==0.22 \ 105 | pandas==0.25.3 \ 106 | Pillow==6.2.1 \ 107 | h5py==2.10.0 \ 108 | keras_applications==1.0.8 \ 109 | keras_preprocessing==1.1.0 \ 110 | keras==2.3.1 \ 111 | python-dateutil==2.8.0 \ 112 | PyYAML==5.1.2 \ 113 | requests==2.22.0 \ 114 | awscli==1.16.303 \ 115 | mpi4py==3.0.3 \ 116 | "sagemaker-tensorflow>=2.0,<2.1" \ 117 | # Let's install TensorFlow separately in the end to avoid 118 | # the library version to be overwritten 119 | # ${PIP} install --no-cache-dir -U ${TF_URL} \ 120 | && ${PIP} install --no-cache-dir -U \ 121 | /tmp/$TENSORFLOW_WHL \ 122 | horovod==0.18.2 \ 123 | && rm -f /tmp/$TENSORFLOW_WHL \ 124 | && ${PIP} install --no-cache-dir -U \ 125 | $FRAMEWORK_SUPPORT_INSTALLABLE \ 126 | && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE 127 | 128 | COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py 129 | COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py 130 | 131 | RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \ 132 | && chmod +x /usr/local/bin/deep_learning_container.py 133 | 134 | RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-2.0/license.txt -o /license.txt 135 | 136 | ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] 137 | CMD ["bin/bash"] 138 | 
-------------------------------------------------------------------------------- /docker/1.14.0/py2/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | RUN apt-get update && apt-get install -y --no-install-recommends \ 6 | software-properties-common \ 7 | build-essential \ 8 | openssh-client \ 9 | openssh-server \ 10 | ca-certificates \ 11 | curl \ 12 | git \ 13 | wget \ 14 | vim \ 15 | gcc-4.9 \ 16 | g++-4.9 \ 17 | gcc-4.9-base \ 18 | zlib1g-dev \ 19 | && rm -rf /var/lib/apt/lists/* 20 | 21 | # Install Open MPI 22 | RUN mkdir /tmp/openmpi && \ 23 | cd /tmp/openmpi && \ 24 | curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz && \ 25 | tar zxf openmpi-4.0.1.tar.gz && \ 26 | cd openmpi-4.0.1 && \ 27 | ./configure --enable-orterun-prefix-by-default && \ 28 | make -j $(nproc) all && \ 29 | make install && \ 30 | ldconfig && \ 31 | rm -rf /tmp/openmpi 32 | 33 | # Create a wrapper for OpenMPI to allow running as root by default 34 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \ 35 | echo '#!/bin/bash' > /usr/local/bin/mpirun && \ 36 | echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \ 37 | chmod a+x /usr/local/bin/mpirun 38 | 39 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \ 40 | echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 41 | 42 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 43 | 44 | ENV PATH /usr/local/openmpi/bin/:$PATH 45 | 46 | # SSH login fix. Otherwise user is kicked off after login 47 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 48 | 49 | # Create SSH key. 50 | RUN mkdir -p /root/.ssh/ && \ 51 | mkdir -p /var/run/sshd && \ 52 | ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \ 53 | cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \ 54 | printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 55 | 56 | # Set environment variables for MKL 57 | # For more about MKL with TensorFlow see: 58 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 59 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 KMP_BLOCKTIME=1 KMP_SETTINGS=0 60 | 61 | WORKDIR / 62 | 63 | ARG PYTHON=python 64 | ARG PYTHON_PIP=python-pip 65 | ARG PIP=pip 66 | 67 | RUN apt-get update && apt-get install -y \ 68 | ${PYTHON} \ 69 | ${PYTHON_PIP} 70 | 71 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 72 | 73 | ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz 74 | ARG sagemaker_tensorflow_extensions=sagemaker_tensorflow-1.14.0.1.0.0-cp27-cp27mu-manylinux1_x86_64.whl 75 | COPY $framework_support_installable . 76 | COPY $sagemaker_tensorflow_extensions . 77 | ARG TF_URL="https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.14/AmazonLinux/cpu/final/tensorflow-1.14.0-cp27-cp27mu-linux_x86_64.whl" 78 | 79 | # Pin GCC to 4.9 (priority 200) to compile correctly against TensorFlow, PyTorch, and MXNet with horovod 80 | # Backup existing GCC installation as priority 100, so that it can be recovered later. 
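# (update-alternatives selects the highest-priority entry, so the 4.9 toolchain registered at 200 below wins while pinned, and removing those entries near the end of the build falls back to the stock toolchain at 100.)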
81 | RUN update-alternatives --install /usr/bin/gcc gcc $(readlink -f $(which gcc)) 100 && \ 82 | update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc $(readlink -f $(which gcc)) 100 && \ 83 | update-alternatives --install /usr/bin/g++ g++ $(readlink -f $(which g++)) 100 && \ 84 | update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ $(readlink -f $(which g++)) 100 85 | RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 200 && \ 86 | update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 200 && \ 87 | update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 200 && \ 88 | update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 200 89 | 90 | RUN ${PIP} --no-cache-dir install --upgrade pip setuptools 91 | 92 | # Some TF tools expect a "python" binary 93 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 94 | 95 | RUN ${PIP} install --no-cache-dir -U \ 96 | numpy==1.16.4 \ 97 | scipy==1.2.2 \ 98 | scikit-learn==0.20.3 \ 99 | pandas==0.24.2 \ 100 | Pillow==6.1.0 \ 101 | h5py==2.9.0 \ 102 | keras_applications==1.0.8 \ 103 | keras_preprocessing==1.1.0 \ 104 | requests==2.22.0 \ 105 | keras==2.2.4 \ 106 | awscli==1.16.196 \ 107 | mpi4py==3.0.2 \ 108 | $sagemaker_tensorflow_extensions \ 109 | # Let's install TensorFlow separately in the end to avoid 110 | # the library version to be overwritten 111 | && ${PIP} install --force-reinstall --no-cache-dir -U ${TF_URL} \ 112 | && ${PIP} install --no-cache-dir -U $framework_support_installable && \ 113 | rm -f $framework_support_installable \ 114 | && ${PIP} install --no-cache-dir -U horovod==0.16.4 \ 115 | && ${PIP} uninstall -y --no-cache-dir \ 116 | markdown 117 | 118 | # Remove GCC pinning 119 | RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \ 120 | update-alternatives --remove x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 && \ 121 | update-alternatives --remove g++ /usr/bin/g++-4.9 && \ 122 | update-alternatives --remove x86_64-linux-gnu-g++ /usr/bin/g++-4.9 123 | 124 | 125 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 126 | 127 | CMD ["bin/bash"] 128 | -------------------------------------------------------------------------------- /docker/1.14.0/py3/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | RUN apt-get update && apt-get install -y --no-install-recommends \ 6 | software-properties-common \ 7 | build-essential \ 8 | openssh-client \ 9 | openssh-server \ 10 | ca-certificates \ 11 | curl \ 12 | git \ 13 | wget \ 14 | vim \ 15 | gcc-4.9 \ 16 | g++-4.9 \ 17 | gcc-4.9-base \ 18 | zlib1g-dev \ 19 | && rm -rf /var/lib/apt/lists/* 20 | 21 | # Install Open MPI 22 | RUN mkdir /tmp/openmpi && \ 23 | cd /tmp/openmpi && \ 24 | curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz && \ 25 | tar zxf openmpi-4.0.1.tar.gz && \ 26 | cd openmpi-4.0.1 && \ 27 | ./configure --enable-orterun-prefix-by-default && \ 28 | make -j $(nproc) all && \ 29 | make install && \ 30 | ldconfig && \ 31 | rm -rf /tmp/openmpi 32 | 33 | # Create a wrapper for OpenMPI to allow running as root by default 34 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \ 35 | echo '#!/bin/bash' > /usr/local/bin/mpirun && \ 36 | echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \ 37 | chmod a+x /usr/local/bin/mpirun 38 | 39 
| RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \ 40 | echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 41 | 42 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 43 | 44 | ENV PATH /usr/local/openmpi/bin/:$PATH 45 | 46 | # SSH login fix. Otherwise user is kicked off after login 47 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 48 | 49 | # Create SSH key. 50 | RUN mkdir -p /root/.ssh/ && \ 51 | mkdir -p /var/run/sshd && \ 52 | ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \ 53 | cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \ 54 | printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 55 | 56 | # Set environment variables for MKL 57 | # For more about MKL with TensorFlow see: 58 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 59 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 KMP_BLOCKTIME=1 KMP_SETTINGS=0 60 | 61 | WORKDIR / 62 | 63 | ARG PYTHON=python3 64 | ARG PYTHON_PIP=python3-pip 65 | ARG PIP=pip3 66 | ARG PYTHON_VERSION=3.6.6 67 | 68 | RUN wget https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz && \ 69 | tar -xvf Python-$PYTHON_VERSION.tgz && cd Python-$PYTHON_VERSION && \ 70 | ./configure && make && make install && \ 71 | apt-get update && apt-get install -y --no-install-recommends libreadline-gplv2-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev && \ 72 | make && make install && rm -rf ../Python-$PYTHON_VERSION* && \ 73 | ln -s /usr/local/bin/pip3 /usr/bin/pip 74 | 75 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 76 | 77 | ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz 78 | COPY $framework_support_installable . 79 | ARG TF_URL="https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.14/AmazonLinux/cpu/final/tensorflow-1.14.0-cp36-cp36m-linux_x86_64.whl" 80 | 81 | # Pin GCC to 4.9 (priority 200) to compile correctly against TensorFlow, PyTorch, and MXNet with horovod 82 | # Backup existing GCC installation as priority 100, so that it can be recovered later. 
83 | RUN update-alternatives --install /usr/bin/gcc gcc $(readlink -f $(which gcc)) 100 && \ 84 | update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc $(readlink -f $(which gcc)) 100 && \ 85 | update-alternatives --install /usr/bin/g++ g++ $(readlink -f $(which g++)) 100 && \ 86 | update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ $(readlink -f $(which g++)) 100 87 | RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 200 && \ 88 | update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 200 && \ 89 | update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 200 && \ 90 | update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 200 91 | 92 | RUN ${PIP} --no-cache-dir install --upgrade pip setuptools 93 | 94 | # Some TF tools expect a "python" binary 95 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 96 | 97 | RUN ${PIP} install --no-cache-dir -U \ 98 | numpy==1.16.4 \ 99 | scipy==1.2.2 \ 100 | scikit-learn==0.20.3 \ 101 | pandas==0.24.2 \ 102 | Pillow==6.1.0 \ 103 | h5py==2.9.0 \ 104 | keras_applications==1.0.8 \ 105 | keras_preprocessing==1.1.0 \ 106 | keras==2.2.4 \ 107 | requests==2.22.0 \ 108 | awscli==1.16.196 \ 109 | mpi4py==3.0.2 \ 110 | "sagemaker-tensorflow>=1.14,<1.15" && \ 111 | # Let's install TensorFlow separately in the end to avoid 112 | # the library version to be overwritten 113 | ${PIP} install --force-reinstall --no-cache-dir -U \ 114 | ${TF_URL} \ 115 | horovod==0.16.4 && \ 116 | ${PIP} install --no-cache-dir -U $framework_support_installable && \ 117 | rm -f $framework_support_installable && \ 118 | ${PIP} uninstall -y --no-cache-dir \ 119 | markdown 120 | 121 | # Remove GCC pinning 122 | RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \ 123 | update-alternatives --remove x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 && \ 124 | update-alternatives --remove g++ /usr/bin/g++-4.9 && \ 125 | update-alternatives --remove x86_64-linux-gnu-g++ /usr/bin/g++-4.9 126 | 127 | 128 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 129 | 130 | CMD ["bin/bash"] 131 | -------------------------------------------------------------------------------- /docker/1.12.0/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:9.0-base-ubuntu16.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | ENV NCCL_VERSION=2.3.5-2+cuda9.0 6 | ENV CUDNN_VERSION=7.3.1.20-1+cuda9.0 7 | ENV TF_TENSORRT_VERSION=4.1.2 8 | 9 | RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 10 | software-properties-common && \ 11 | add-apt-repository ppa:deadsnakes/ppa -y && \ 12 | rm -rf /var/lib/apt/lists/* 13 | 14 | RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 15 | ca-certificates \ 16 | cuda-command-line-tools-9-0 \ 17 | cuda-cublas-dev-9-0 \ 18 | cuda-cudart-dev-9-0 \ 19 | cuda-cufft-dev-9-0 \ 20 | cuda-curand-dev-9-0 \ 21 | cuda-cusolver-dev-9-0 \ 22 | cuda-cusparse-dev-9-0 \ 23 | curl \ 24 | libcudnn7=${CUDNN_VERSION} \ 25 | libnccl2=${NCCL_VERSION} \ 26 | libnccl-dev=${NCCL_VERSION} \ 27 | libgomp1 \ 28 | wget \ 29 | openssh-client \ 30 | openssh-server \ 31 | build-essential && \ 32 | # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 33 | # adds a new list which contains libnvinfer library, so it needs another 34 | # 'apt-get update' to retrieve that list 
before it can actually install the 35 | # library. 36 | # We don't install libnvinfer-dev since we don't need to build against TensorRT, 37 | # and libnvinfer4 doesn't contain libnvinfer.a static library. 38 | apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 39 | nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \ 40 | apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 41 | libnvinfer4=${TF_TENSORRT_VERSION}-1+cuda9.0 && \ 42 | rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* && \ 43 | rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* && \ 44 | rm /usr/lib/x86_64-linux-gnu/libnvparsers* && \ 45 | rm -rf /var/lib/apt/lists/* 46 | 47 | ########################################################################### 48 | # Horovod & its dependencies 49 | ########################################################################### 50 | 51 | # Install Open MPI 52 | RUN mkdir /tmp/openmpi && \ 53 | cd /tmp/openmpi && \ 54 | curl -fSsL -O https://www.open-mpi.org/software/ompi/v3.1/downloads/openmpi-3.1.2.tar.gz && \ 55 | tar zxf openmpi-3.1.2.tar.gz && \ 56 | cd openmpi-3.1.2 && \ 57 | ./configure --enable-orterun-prefix-by-default && \ 58 | make -j $(nproc) all && \ 59 | make install && \ 60 | ldconfig && \ 61 | rm -rf /tmp/openmpi 62 | 63 | ARG py_version 64 | 65 | RUN if [ $py_version -eq 3 ]; then PYTHON_VERSION=python3.6; else PYTHON_VERSION=python2.7; fi && \ 66 | apt-get update && apt-get install -y --no-install-recommends $PYTHON_VERSION-dev --allow-unauthenticated && \ 67 | ln -s -f /usr/bin/$PYTHON_VERSION /usr/bin/python && \ 68 | rm -rf /var/lib/apt/lists/* 69 | 70 | # Create a wrapper for OpenMPI to allow running as root by default 71 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \ 72 | echo '#!/bin/bash' > /usr/local/bin/mpirun && \ 73 | echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \ 74 | chmod a+x /usr/local/bin/mpirun 75 | 76 | # Configure OpenMPI to run good defaults: 77 | # --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 78 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \ 79 | echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 80 | 81 | # Set default NCCL parameters 82 | RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf 83 | 84 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 85 | ENV PATH /usr/local/openmpi/bin/:$PATH 86 | ENV PATH=/usr/local/nvidia/bin:$PATH 87 | 88 | # SSH login fix. Otherwise user is kicked off after login 89 | RUN mkdir -p /var/run/sshd && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 90 | 91 | # Create SSH key. 
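# Horovod launches remote ranks through Open MPI over SSH, so each container trusts its own public key and disables strict host-key checking.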
92 | RUN mkdir -p /root/.ssh/ && \ 93 | ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \ 94 | cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \ 95 | printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 96 | 97 | ########################################################################### 98 | # Python won’t try to write .pyc or .pyo files on the import of source modules 99 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 100 | 101 | RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \ 102 | python get-pip.py --disable-pip-version-check --no-cache-dir "pip==18.1" && \ 103 | rm get-pip.py 104 | 105 | WORKDIR / 106 | 107 | ARG framework_installable 108 | ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz 109 | 110 | COPY $framework_installable tensorflow-1.12.0-py2.py3-none-any.whl 111 | COPY $framework_support_installable . 112 | 113 | RUN pip install --no-cache-dir -U \ 114 | keras==2.2.4 \ 115 | mpi4py==3.0.1 \ 116 | $framework_support_installable \ 117 | "sagemaker-tensorflow>=1.12,<1.13" \ 118 | # Let's install TensorFlow separately in the end to avoid 119 | # the library version to be overwritten 120 | && pip install --force-reinstall --no-cache-dir -U tensorflow-1.12.0-py2.py3-none-any.whl \ 121 | \ 122 | && rm -f tensorflow-1.12.0-py2.py3-none-any.whl \ 123 | && rm -f $framework_support_installable \ 124 | && pip uninstall -y --no-cache-dir \ 125 | markdown \ 126 | tensorboard 127 | 128 | # Install Horovod, temporarily using CUDA stubs 129 | RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \ 130 | HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod && \ 131 | ldconfig 132 | 133 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 134 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ===================================== 2 | SageMaker TensorFlow Training Toolkit 3 | ===================================== 4 | 5 | The SageMaker TensorFlow Training Toolkit is an open source library for making the 6 | TensorFlow framework run on `Amazon SageMaker `__. 7 | 8 | This repository also contains Dockerfiles which install this library, TensorFlow, and dependencies 9 | for building SageMaker TensorFlow images. 10 | 11 | For information on running TensorFlow jobs on SageMaker: 12 | 13 | - `SageMaker Python SDK documentation `__ 14 | - `SageMaker Notebook Examples `__ 15 | 16 | Table of Contents 17 | ----------------- 18 | 19 | #. `Getting Started <#getting-started>`__ 20 | #. `Building your Image <#building-your-image>`__ 21 | #. `Running the tests <#running-the-tests>`__ 22 | 23 | Getting Started 24 | --------------- 25 | 26 | Prerequisites 27 | ~~~~~~~~~~~~~ 28 | 29 | Make sure you have installed all of the following prerequisites on your 30 | development machine: 31 | 32 | - `Docker `__ 33 | 34 | For Testing on GPU 35 | ^^^^^^^^^^^^^^^^^^ 36 | 37 | - `Nvidia-Docker `__ 38 | 39 | Recommended 40 | ^^^^^^^^^^^ 41 | 42 | - A Python environment management tool. (e.g. 43 | `PyEnv `__, 44 | `VirtualEnv `__) 45 | 46 | Building your Image 47 | ------------------- 48 | 49 | `Amazon SageMaker `__ 50 | utilizes Docker containers to run all training jobs & inference endpoints. 51 | 52 | The Docker images are built from the Dockerfiles specified in 53 | `docker/ `__. 
54 | 55 | The Dockerfiles are grouped based on TensorFlow version and separated 56 | based on Python version and processor type. 57 |
58 | The Dockerfiles for TensorFlow 2.0+ are available in the 59 | `tf-2 `__ branch. 60 |
61 | To build the images, first copy the files under 62 | `docker/build_artifacts/ `__ 63 | to the folder containing the Dockerfile you wish to build. 64 |
65 | :: 66 | 67 | # Example for building a TF 2.1 image with Python 3 68 | cp docker/build_artifacts/* docker/2.1.0/py3/. 69 |
70 | After that, go to the directory containing the Dockerfile you wish to build, 71 | and run ``docker build`` to build the image. 72 |
73 | :: 74 | 75 | # Example for building a TF 2.1 image for CPU with Python 3 76 | cd docker/2.1.0/py3 77 | docker build -t tensorflow-training:2.1.0-cpu-py3 -f Dockerfile.cpu . 78 |
79 | Don't forget the period at the end of the ``docker build`` command! 80 |
81 | Running the tests 82 | ----------------- 83 |
84 | Running the tests requires installation of the SageMaker TensorFlow Training Toolkit code and its test 85 | dependencies. 86 |
87 | :: 88 | 89 | git clone https://github.com/aws/sagemaker-tensorflow-container.git 90 | cd sagemaker-tensorflow-container 91 | pip install -e .[test] 92 |
93 | Tests are defined in 94 | `test/ `__ 95 | and include unit, integration and functional tests. 96 |
97 | Unit Tests 98 | ~~~~~~~~~~ 99 |
100 | If you want to run unit tests, then use: 101 |
102 | :: 103 | 104 | # All test instructions should be run from the top level directory 105 | pytest test/unit 106 |
107 | Integration Tests 108 | ~~~~~~~~~~~~~~~~~ 109 |
110 | Running integration tests requires `Docker `__ and `AWS 111 | credentials `__, 112 | as the integration tests make calls to a couple of AWS services. The integration and functional 113 | tests require configurations specified within their respective 114 | `conftest.py `__. Make sure to update the account ID and region at a minimum. 115 |
116 | Integration tests on GPU require `Nvidia-Docker `__. 117 |
118 | Before running integration tests: 119 | 120 | #. Build your Docker image. 121 | #. Pass in the correct pytest arguments to run tests against your Docker image. 122 |
123 | If you want to run local integration tests, then use: 124 |
125 | :: 126 | 127 | # Required arguments for integration tests are found in test/integ/conftest.py 128 | pytest test/integration --docker-base-name <image_name> \ 129 | --tag <image_tag> \ 130 | --framework-version <tensorflow_version> \ 131 | --processor <cpu_or_gpu> 132 |
133 | :: 134 | 135 | # Example 136 | pytest test/integration --docker-base-name preprod-tensorflow \ 137 | --tag 1.0 \ 138 | --framework-version 1.4.1 \ 139 | --processor cpu 140 |
141 | Functional Tests 142 | ~~~~~~~~~~~~~~~~ 143 |
144 | Functional tests have been removed from the current branch; see the older branch `r1.0 `__ for them. 145 |
146 | Contributing 147 | ------------ 148 |
149 | Please read 150 | `CONTRIBUTING.md `__ 151 | for details on our code of conduct, and the process for submitting pull 152 | requests to us. 153 |
154 | License 155 | ------- 156 |
157 | SageMaker TensorFlow Containers is licensed under the Apache 2.0 License. It is copyright 2018 158 | Amazon.com, Inc. or its affiliates. All Rights Reserved. The license is available at: 159 | http://aws.amazon.com/apache2.0/ 160 | -------------------------------------------------------------------------------- /test/integration/sagemaker/test_mnist.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017-2020 Amazon.com, Inc. or its affiliates.
All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | 15 | import os 16 | 17 | import boto3 18 | import pytest 19 | from sagemaker.tensorflow import TensorFlow 20 | from sagemaker.tuner import HyperparameterTuner, IntegerParameter 21 | from sagemaker.utils import unique_name_from_base 22 | from six.moves.urllib.parse import urlparse 23 | 24 | from timeout import timeout 25 | 26 | 27 | @pytest.mark.deploy_test 28 | def test_mnist(sagemaker_session, image_uri, instance_type, framework_version): 29 | resource_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources") 30 | script = os.path.join(resource_path, "mnist", "mnist.py") 31 | estimator = TensorFlow( 32 | entry_point=script, 33 | role="SageMakerRole", 34 | instance_type=instance_type, 35 | instance_count=1, 36 | sagemaker_session=sagemaker_session, 37 | image_uri=image_uri, 38 | framework_version=framework_version, 39 | ) 40 | inputs = estimator.sagemaker_session.upload_data( 41 | path=os.path.join(resource_path, "mnist", "data"), key_prefix="scriptmode/mnist" 42 | ) 43 | estimator.fit(inputs, job_name=unique_name_from_base("test-sagemaker-mnist")) 44 | _assert_s3_file_exists(sagemaker_session.boto_region_name, estimator.model_data) 45 | 46 | 47 | def test_distributed_mnist_no_ps(sagemaker_session, image_uri, instance_type, framework_version): 48 | resource_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources") 49 | script = os.path.join(resource_path, "mnist", "mnist.py") 50 | estimator = TensorFlow( 51 | entry_point=script, 52 | role="SageMakerRole", 53 | instance_count=2, 54 | instance_type=instance_type, 55 | sagemaker_session=sagemaker_session, 56 | image_uri=image_uri, 57 | framework_version=framework_version, 58 | ) 59 | inputs = estimator.sagemaker_session.upload_data( 60 | path=os.path.join(resource_path, "mnist", "data"), key_prefix="scriptmode/mnist" 61 | ) 62 | estimator.fit(inputs, job_name=unique_name_from_base("test-tf-sm-distributed-mnist")) 63 | _assert_s3_file_exists(sagemaker_session.boto_region_name, estimator.model_data) 64 | 65 | 66 | def test_distributed_mnist_ps(sagemaker_session, image_uri, instance_type, framework_version): 67 | resource_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources") 68 | script = os.path.join(resource_path, "mnist", "mnist_custom.py") 69 | estimator = TensorFlow( 70 | entry_point=script, 71 | role="SageMakerRole", 72 | hyperparameters={"sagemaker_parameter_server_enabled": True}, 73 | instance_count=2, 74 | instance_type=instance_type, 75 | sagemaker_session=sagemaker_session, 76 | image_uri=image_uri, 77 | framework_version=framework_version, 78 | ) 79 | inputs = estimator.sagemaker_session.upload_data( 80 | path=os.path.join(resource_path, "mnist", "data-distributed"), 81 | key_prefix="scriptmode/mnist-distributed", 82 | ) 83 | estimator.fit(inputs, job_name=unique_name_from_base("test-tf-sm-distributed-mnist")) 84 | _assert_checkpoint_exists_v2(sagemaker_session.boto_region_name, 
estimator.model_dir, 10) 85 | 86 | 87 | def test_tuning(sagemaker_session, image_uri, instance_type, framework_version): 88 | resource_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources") 89 | script = os.path.join(resource_path, "mnist", "mnist.py") 90 | 91 | estimator = TensorFlow( 92 | entry_point=script, 93 | role="SageMakerRole", 94 | instance_type=instance_type, 95 | instance_count=1, 96 | sagemaker_session=sagemaker_session, 97 | image_uri=image_uri, 98 | framework_version=framework_version, 99 | script_mode=True, 100 | ) 101 | 102 | hyperparameter_ranges = {"epochs": IntegerParameter(1, 2)} 103 | objective_metric_name = "accuracy" 104 | metric_definitions = [{"Name": objective_metric_name, "Regex": "accuracy = ([0-9\\.]+)"}] 105 | 106 | tuner = HyperparameterTuner( 107 | estimator, 108 | objective_metric_name, 109 | hyperparameter_ranges, 110 | metric_definitions, 111 | max_jobs=2, 112 | max_parallel_jobs=2, 113 | ) 114 | 115 | with timeout(minutes=20): 116 | inputs = estimator.sagemaker_session.upload_data( 117 | path=os.path.join(resource_path, "mnist", "data"), key_prefix="scriptmode/mnist" 118 | ) 119 | 120 | tuning_job_name = unique_name_from_base("test-tf-sm-tuning", max_length=32) 121 | tuner.fit(inputs, job_name=tuning_job_name) 122 | tuner.wait() 123 | 124 | 125 | def _assert_checkpoint_exists_v2(region, model_dir, checkpoint_number): 126 | """ 127 | Checking for v2 style checkpoints i.e. checkpoint and .index files 128 | """ 129 | _assert_s3_file_exists(region, os.path.join(model_dir, 'checkpoint')) 130 | _assert_s3_file_exists(region, 131 | os.path.join(model_dir, 'model.ckpt-{}.index'.format(checkpoint_number))) 132 | 133 | 134 | def _assert_checkpoint_exists(region, model_dir, checkpoint_number): 135 | _assert_s3_file_exists(region, os.path.join(model_dir, "graph.pbtxt")) 136 | _assert_s3_file_exists( 137 | region, os.path.join(model_dir, "model.ckpt-{}.index".format(checkpoint_number)) 138 | ) 139 | _assert_s3_file_exists( 140 | region, os.path.join(model_dir, "model.ckpt-{}.meta".format(checkpoint_number)) 141 | ) 142 | 143 | 144 | def _assert_s3_file_exists(region, s3_url): 145 | parsed_url = urlparse(s3_url) 146 | s3 = boto3.resource("s3", region_name=region) 147 | s3.Object(parsed_url.netloc, parsed_url.path.lstrip("/")).load() 148 | -------------------------------------------------------------------------------- /test/integration/sagemaker/recordio_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | 15 | import argparse 16 | from random import randint 17 | import struct 18 | import sys 19 | 20 | import numpy as np 21 | import tensorflow as tf 22 | 23 | # Utility functions for generating a recordio encoded file of labeled numpy data 24 | # for testing. Each file contains one or more records. Each record is a TensorFlow 25 | # protobuf Example object. 
Each object contains an integer label and a numpy array
26 | # encoded as a byte list.
27 | 
28 | # This file can be used in script mode to generate a single file or be used
29 | # as a module to generate files via build_record_file.
30 | 
31 | _kmagic = 0xCED7230A
32 | 
33 | padding = {}
34 | for amount in range(4):
35 |     if sys.version_info >= (3,):
36 |         padding[amount] = bytes([0x00 for _ in range(amount)])
37 |     else:
38 |         padding[amount] = bytearray([0x00 for _ in range(amount)])
39 | 
40 | 
41 | def write_recordio(f, data, header_flag=0):
42 |     """Writes a single data point as a RecordIO record to the given file."""
43 |     length = len(data)
44 |     f.write(struct.pack("I", _kmagic))
45 |     header = (header_flag << 29) | length
46 |     f.write(struct.pack("I", header))
47 |     pad = (((length + 3) >> 2) << 2) - length
48 |     f.write(data)
49 |     f.write(padding[pad])
50 | 
51 | 
52 | def write_recordio_multipart(f, data):
53 |     """Writes a single data point into three multipart records."""
54 |     length = len(data)
55 |     stride = int(length / 3)
56 | 
57 |     data_start = data[0:stride]
58 |     data_middle = data[stride : 2 * stride]
59 |     data_end = data[2 * stride :]
60 | 
61 |     write_recordio(f, data_start, 1)
62 |     write_recordio(f, data_middle, 2)
63 |     write_recordio(f, data_end, 3)
64 | 
65 | 
66 | def string_feature(value):
67 |     return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.tostring()]))
68 | 
69 | 
70 | def label_feature(value):
71 |     return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
72 | 
73 | 
74 | def write_numpy_array(f, feature_name, label, arr, multipart=False):
75 |     feature = {"labels": label_feature(label), feature_name: string_feature(arr)}
76 |     example = tf.train.Example(features=tf.train.Features(feature=feature))
77 |     if multipart:
78 |         write_recordio_multipart(f, example.SerializeToString())
79 |     else:
80 |         write_recordio(f, example.SerializeToString())
81 | 
82 | 
83 | def build_record_file(
84 |     filename, num_records, dimension, classes=2, data_feature_name="data", multipart=False
85 | ):
86 |     """Builds a recordio encoded file of TF protobuf Example objects. Each object
87 |     is a labeled numpy array. Each example has two fields - a single int64 'label'
88 |     field and a single bytes list field, containing a serialized numpy array.
89 | 
90 |     Each generated numpy array is a multidimensional normal with
91 |     the specified dimension. The normal distribution is class-specific: each class
92 |     has a different mean for the distribution, so it should be possible to learn
93 |     a multiclass classifier on this data. Class means are deterministic - so multiple
94 |     calls to this function with the same number of classes will produce samples drawn
95 |     from the same distribution for each class.
96 | 
97 |     Args:
98 |         filename - the file to write to
99 |         num_records - how many labeled numpy arrays to generate
100 |         dimension - the size of each numpy array
101 |         classes - the cardinality of labels
102 |         data_feature_name - the name to give the numpy array in the Example object
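        multipart - whether to split each example across three multipart records
        (via write_recordio_multipart above) instead of writing a single record

    For reference, restating write_recordio above rather than assuming anything new:
    each record on disk is a 4-byte magic number, a 4-byte header equal to
    (flag << 29) | length, the payload bytes, and zero padding up to the next
    4-byte boundary.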
103 | """ 104 | with open(filename, "wb") as f: 105 | for i in range(num_records): 106 | cur_class = i % classes 107 | loc = int(cur_class - (classes / 2)) 108 | write_numpy_array( 109 | f, 110 | data_feature_name, 111 | cur_class, 112 | np.random.normal(loc=loc, size=(dimension,)), 113 | multipart, 114 | ) 115 | 116 | 117 | def build_single_record_file(filename, dimension, classes=2, data_feature_name="data"): 118 | cur_class = randint(0, classes - 1) 119 | loc = int(cur_class - (classes / 2)) 120 | 121 | arr = np.random.normal(loc=loc, size=(dimension,)) 122 | feature = {"labels": label_feature(cur_class), data_feature_name: string_feature(arr)} 123 | example = tf.train.Example(features=tf.train.Features(feature=feature)) 124 | with open(filename, "wb") as f: 125 | f.write(example.SerializeToString()) 126 | 127 | 128 | def validate_record_file(filename, dimension): 129 | data = open(filename, "rb").read() 130 | magic_number, length = struct.unpack("II", data[0:8]) 131 | encoded = data[8 : 8 + length] 132 | 133 | features = { 134 | "data": tf.io.FixedLenFeature([], tf.string), 135 | "labels": tf.io.FixedLenFeature([], tf.int64), 136 | } 137 | parsed = tf.io.parse_single_example(encoded, features) 138 | array = tf.io.decode_raw(parsed["data"], tf.float64) 139 | 140 | assert array.shape[0] == dimension 141 | 142 | 143 | if __name__ == "__main__": 144 | parser = argparse.ArgumentParser(description="Generate synthetic multi-class training data") 145 | parser.add_argument("--dimension", default=65536, type=int) 146 | parser.add_argument("--classes", default=2, type=int) 147 | parser.add_argument("--num-records", default=4, type=int) 148 | parser.add_argument("--data-feature-name", default="data") 149 | parser.add_argument("filename", type=str) 150 | args = parser.parse_args() 151 | build_record_file( 152 | args.filename, args.num_records, args.dimension, args.classes, args.data_feature_name 153 | ) 154 | validate_record_file(args.filename, args.dimension) 155 | -------------------------------------------------------------------------------- /docker/2.0.1/py2/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.0-base-ubuntu18.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | # prevent stopping by user interaction 6 | ENV DEBIAN_FRONTEND noninteractive 7 | ENV DEBCONF_NONINTERACTIVE_SEEN true 8 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 9 | 10 | ENV PYTHONDONTWRITEBYTECODE=1 11 | ENV PYTHONUNBUFFERED=1 12 | ENV PYTHONIOENCODING=UTF-8 13 | ENV LANG=C.UTF-8 14 | ENV LC_ALL=C.UTF-8 15 | 16 | ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_training*.tar.gz 17 | ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/2.0.1/AmazonLinux/gpu/final/tensorflow_gpu-2.0.1-cp27-cp27mu-manylinux2010_x86_64.whl 18 | 19 | ARG PYTHON=python 20 | ARG PYTHON_PIP=python-pip 21 | ARG PIP=pip 22 | 23 | RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 24 | ca-certificates \ 25 | cuda-command-line-tools-10-0 \ 26 | cuda-cublas-dev-10-0 \ 27 | cuda-cudart-dev-10-0 \ 28 | cuda-cufft-dev-10-0 \ 29 | cuda-curand-dev-10-0 \ 30 | cuda-cusolver-dev-10-0 \ 31 | cuda-cusparse-dev-10-0 \ 32 | curl \ 33 | libcudnn7=7.5.1.10-1+cuda10.0 \ 34 | # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it 35 | libnccl2=2.4.7-1+cuda10.0 \ 36 | libgomp1 \ 37 | libnccl-dev=2.4.7-1+cuda10.0 \ 38 | libfreetype6-dev \ 39 | libhdf5-serial-dev \ 40 | libpng-dev \ 41 | 
libzmq3-dev \ 42 | git \ 43 | wget \ 44 | vim \ 45 | build-essential \ 46 | openssh-client \ 47 | openssh-server \ 48 | zlib1g-dev \ 49 | # Install dependent library for OpenCV 50 | libgtk2.0-dev \ 51 | # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 52 | # adds a new list which contains libnvinfer library, so it needs another 53 | # 'apt-get update' to retrieve that list before it can actually install the 54 | # library. 55 | # We don't install libnvinfer-dev since we don't need to build against TensorRT, 56 | # and libnvinfer4 doesn't contain libnvinfer.a static library. 57 | && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 58 | nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \ 59 | && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 60 | libnvinfer5=5.0.2-1+cuda10.0 \ 61 | && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \ 62 | && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \ 63 | && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \ 64 | && rm -rf /var/lib/apt/lists/* \ 65 | && mkdir -p /var/run/sshd 66 | 67 | # Install Open MPI 68 | RUN mkdir /tmp/openmpi \ 69 | && cd /tmp/openmpi \ 70 | && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 71 | && tar zxf openmpi-4.0.1.tar.gz \ 72 | && cd openmpi-4.0.1 \ 73 | && ./configure --enable-orterun-prefix-by-default \ 74 | && make -j $(nproc) all \ 75 | && make install \ 76 | && ldconfig \ 77 | && rm -rf /tmp/openmpi 78 | 79 | RUN apt-get update && apt-get install -y \ 80 | ${PYTHON} \ 81 | ${PYTHON_PIP} 82 | 83 | # Create a wrapper for OpenMPI to allow running as root by default 84 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 85 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 86 | && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ 87 | && chmod a+x /usr/local/bin/mpirun 88 | 89 | # Configure OpenMPI to run good defaults: 90 | # --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 91 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 92 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 93 | 94 | # Set default NCCL parameters 95 | RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf 96 | 97 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 98 | ENV PATH /usr/local/openmpi/bin/:$PATH 99 | ENV PATH=/usr/local/nvidia/bin:$PATH 100 | 101 | # SSH login fix. Otherwise user is kicked off after login 102 | RUN mkdir -p /var/run/sshd \ 103 | && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 104 | 105 | # Create SSH key. 106 | RUN mkdir -p /root/.ssh/ \ 107 | && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ 108 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ 109 | && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 110 | 111 | WORKDIR / 112 | 113 | RUN ${PIP} --no-cache-dir install --upgrade \ 114 | pip \ 115 | setuptools 116 | 117 | # Some TF tools expect a "python" binary 118 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 119 | 120 | COPY $FRAMEWORK_SUPPORT_INSTALLABLE . 
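# A hypothetical build invocation for this image, for illustration only. It
# assumes the sagemaker_tensorflow_training sdist has already been copied next
# to this Dockerfile (see the README); the image tag and tarball name below are
# placeholders, not values defined in this repository:
#   docker build -t tensorflow-training:2.0.1-gpu-py2 \
#     --build-arg FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_training-20.0.0.tar.gz \
#     -f Dockerfile.gpu .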
121 | 122 | # install PyYAML==5.1.2 to avoid conflict with latest awscli 123 | # # python-dateutil==2.8.0 to satisfy botocore associated with latest awscli 124 | RUN ${PIP} install --no-cache-dir -U \ 125 | numpy==1.16.5 \ 126 | scipy==1.2.2 \ 127 | scikit-learn==0.20.4 \ 128 | pandas==0.24.2 \ 129 | Pillow==6.2.1 \ 130 | h5py==2.10.0 \ 131 | keras_applications==1.0.8 \ 132 | keras_preprocessing==1.1.0 \ 133 | requests==2.22.0 \ 134 | keras==2.3.1 \ 135 | python-dateutil==2.8.0 \ 136 | PyYAML==5.1.2 \ 137 | awscli \ 138 | mpi4py==3.0.3 \ 139 | opencv-python==4.2.0.32 \ 140 | "cryptography>=2.3" \ 141 | "sagemaker-tensorflow>=2.0,<2.1" \ 142 | # Let's install TensorFlow separately in the end to avoid 143 | # the library version to be overwritten 144 | && ${PIP} install --no-cache-dir -U \ 145 | ${TF_URL} \ 146 | && ${PIP} install --no-cache-dir -U \ 147 | $FRAMEWORK_SUPPORT_INSTALLABLE \ 148 | && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE 149 | 150 | # Install Horovod, temporarily using CUDA stubs 151 | RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \ 152 | && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 ${PIP} install --no-cache-dir horovod==0.18.2 \ 153 | && ldconfig 154 | 155 | # Allow OpenSSH to talk to containers without asking for confirmation 156 | RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ 157 | && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ 158 | && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config 159 | 160 | ADD https://raw.githubusercontent.com/aws/aws-deep-learning-containers-utils/master/deep_learning_container.py /usr/local/bin/deep_learning_container.py 161 | 162 | RUN chmod +x /usr/local/bin/deep_learning_container.py 163 | 164 | RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-2.0.1/license.txt -o /license.txt 165 | 166 | CMD ["bin/bash"] 167 | -------------------------------------------------------------------------------- /docker/2.1.0/py2/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.1-base-ubuntu18.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | # prevent stopping by user interaction 6 | ENV DEBIAN_FRONTEND noninteractive 7 | ENV DEBCONF_NONINTERACTIVE_SEEN true 8 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 9 | 10 | ENV PYTHONDONTWRITEBYTECODE=1 11 | ENV PYTHONUNBUFFERED=1 12 | ENV PYTHONIOENCODING=UTF-8 13 | ENV LANG=C.UTF-8 14 | ENV LC_ALL=C.UTF-8 15 | 16 | ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/2.1/AmazonLinux/gpu/final/tensorflow_gpu-2.1.0-cp27-cp27mu-manylinux2010_x86_64.whl 17 | 18 | ARG PYTHON=python 19 | ARG PYTHON_PIP=python-pip 20 | ARG PIP=pip 21 | 22 | RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 23 | ca-certificates \ 24 | cuda-command-line-tools-10-1 \ 25 | cuda-cudart-dev-10-1 \ 26 | cuda-cufft-dev-10-1 \ 27 | cuda-curand-dev-10-1 \ 28 | cuda-cusolver-dev-10-1 \ 29 | cuda-cusparse-dev-10-1 \ 30 | curl \ 31 | libcudnn7=7.6.2.24-1+cuda10.1 \ 32 | # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it 33 | libnccl2=2.4.7-1+cuda10.1 \ 34 | libgomp1 \ 35 | libnccl-dev=2.4.7-1+cuda10.1 \ 36 | libfreetype6-dev \ 37 | libhdf5-serial-dev \ 38 | libpng-dev \ 39 | libzmq3-dev \ 40 | git \ 41 | wget \ 42 | vim \ 43 | build-essential \ 44 | openssh-client \ 45 | openssh-server \ 46 | zlib1g-dev \ 47 | # Install dependent library for OpenCV 48 | libgtk2.0-dev \ 49 | #cuda-cublas-dev not 
available with 10-1, install libcublas instead 50 | #it will downgrade the cublas from 10-2 to 10-1 51 | #adding an extra flag --allow-downgrades for it 52 | && apt-get update \ 53 | && apt-get install -y --no-install-recommends --allow-unauthenticated --allow-downgrades \ 54 | libcublas10=10.1.0.105-1 \ 55 | libcublas-dev=10.1.0.105-1 \ 56 | # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 57 | # adds a new list which contains libnvinfer library, so it needs another 58 | # 'apt-get update' to retrieve that list before it can actually install the 59 | # library. 60 | # We don't install libnvinfer-dev since we don't need to build against TensorRT, 61 | # and libnvinfer4 doesn't contain libnvinfer.a static library. 62 | # nvinfer-runtime-trt-repo doesn't have a 1804-cuda10.1 version yet. see: 63 | # https://developer.download.nvidia.cn/compute/machine-learning/repos/ubuntu1804/x86_64/ 64 | && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 65 | nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \ 66 | && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 67 | libnvinfer6=6.0.1-1+cuda10.1 \ 68 | && rm -rf /var/lib/apt/lists/* \ 69 | && mkdir -p /var/run/sshd 70 | 71 | # Install Open MPI 72 | RUN mkdir /tmp/openmpi \ 73 | && cd /tmp/openmpi \ 74 | && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 75 | && tar zxf openmpi-4.0.1.tar.gz \ 76 | && cd openmpi-4.0.1 \ 77 | && ./configure --enable-orterun-prefix-by-default \ 78 | && make -j $(nproc) all \ 79 | && make install \ 80 | && ldconfig \ 81 | && rm -rf /tmp/openmpi 82 | 83 | RUN apt-get update && apt-get install -y \ 84 | ${PYTHON} \ 85 | ${PYTHON_PIP} 86 | 87 | # Create a wrapper for OpenMPI to allow running as root by default 88 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 89 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 90 | && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ 91 | && chmod a+x /usr/local/bin/mpirun 92 | 93 | # Configure OpenMPI to run good defaults: 94 | # --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 95 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 96 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 97 | 98 | # Set default NCCL parameters 99 | RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf 100 | 101 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 102 | ENV PATH /usr/local/openmpi/bin/:$PATH 103 | ENV PATH=/usr/local/nvidia/bin:$PATH 104 | 105 | # SSH login fix. Otherwise user is kicked off after login 106 | RUN mkdir -p /var/run/sshd \ 107 | && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 108 | 109 | # Create SSH key. 
110 | RUN mkdir -p /root/.ssh/ \ 111 | && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ 112 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ 113 | && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 114 | 115 | WORKDIR / 116 | 117 | RUN ${PIP} --no-cache-dir install --upgrade \ 118 | pip \ 119 | setuptools 120 | 121 | # Some TF tools expect a "python" binary 122 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 123 | 124 | # install PyYAML==5.1.2 to avoid conflict with latest awscli 125 | # # python-dateutil==2.8.0 to satisfy botocore associated with latest awscli 126 | RUN ${PIP} install --no-cache-dir -U \ 127 | numpy==1.16.6 \ 128 | scipy==1.2.2 \ 129 | scikit-learn==0.20.4 \ 130 | pandas==0.24.2 \ 131 | Pillow==6.2.2 \ 132 | h5py==2.10.0 \ 133 | keras_applications==1.0.8 \ 134 | keras_preprocessing==1.1.0 \ 135 | keras==2.3.1 \ 136 | python-dateutil==2.8.1 \ 137 | pyYAML==5.3.1 \ 138 | requests==2.22.0 \ 139 | awscli \ 140 | mpi4py==3.0.3 \ 141 | opencv-python==4.2.0.32 \ 142 | "cryptography>=2.3" \ 143 | "sagemaker-tensorflow>=2.1,<2.2" \ 144 | "sagemaker-tensorflow-training>2,<4" \ 145 | # Let's install TensorFlow separately in the end to avoid 146 | # the library version to be overwritten 147 | && ${PIP} install --no-cache-dir -U \ 148 | ${TF_URL} 149 | 150 | # Install Horovod, temporarily using CUDA stubs 151 | RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \ 152 | && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 ${PIP} install --no-cache-dir horovod==0.18.2 \ 153 | && ldconfig 154 | 155 | # Allow OpenSSH to talk to containers without asking for confirmation 156 | RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ 157 | && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ 158 | && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config 159 | 160 | ADD https://raw.githubusercontent.com/aws/aws-deep-learning-containers-utils/master/deep_learning_container.py /usr/local/bin/deep_learning_container.py 161 | 162 | RUN chmod +x /usr/local/bin/deep_learning_container.py 163 | 164 | RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-2.1/license.txt -o /license.txt 165 | 166 | CMD ["bin/bash"] 167 | -------------------------------------------------------------------------------- /docker/2.0.0/py2/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.0-base-ubuntu18.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | # prevent stopping by user interaction 6 | ENV DEBIAN_FRONTEND noninteractive 7 | ENV DEBCONF_NONINTERACTIVE_SEEN true 8 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 9 | 10 | ENV PYTHONDONTWRITEBYTECODE=1 11 | ENV PYTHONUNBUFFERED=1 12 | ENV PYTHONIOENCODING=UTF-8 13 | ENV LANG=C.UTF-8 14 | ENV LC_ALL=C.UTF-8 15 | 16 | ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_training.tar.gz 17 | ARG TENSORFLOW_WHL=tensorflow_gpu-2.0.0-cp27-cp27mu-manylinux2010_x86_64.whl 18 | ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/2.0/AmazonLinux/gpu/final/$TENSORFLOW_WHL 19 | 20 | ARG PYTHON=python 21 | ARG PYTHON_PIP=python-pip 22 | ARG PIP=pip 23 | 24 | RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 25 | ca-certificates \ 26 | cuda-command-line-tools-10-0 \ 27 | cuda-cublas-dev-10-0 \ 28 | cuda-cudart-dev-10-0 \ 29 | cuda-cufft-dev-10-0 \ 30 | cuda-curand-dev-10-0 \ 31 | cuda-cusolver-dev-10-0 \ 32 | cuda-cusparse-dev-10-0 \ 33 | 
curl \ 34 | libcudnn7=7.5.1.10-1+cuda10.0 \ 35 | # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it 36 | libnccl2=2.4.7-1+cuda10.0 \ 37 | libgomp1 \ 38 | libnccl-dev=2.4.7-1+cuda10.0 \ 39 | libfreetype6-dev \ 40 | libhdf5-serial-dev \ 41 | libpng-dev \ 42 | libzmq3-dev \ 43 | git \ 44 | wget \ 45 | vim \ 46 | build-essential \ 47 | openssh-client \ 48 | openssh-server \ 49 | zlib1g-dev \ 50 | # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 51 | # adds a new list which contains libnvinfer library, so it needs another 52 | # 'apt-get update' to retrieve that list before it can actually install the 53 | # library. 54 | # We don't install libnvinfer-dev since we don't need to build against TensorRT, 55 | # and libnvinfer4 doesn't contain libnvinfer.a static library. 56 | && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 57 | nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \ 58 | && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 59 | libnvinfer5=5.0.2-1+cuda10.0 \ 60 | && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \ 61 | && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \ 62 | && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \ 63 | && rm -rf /var/lib/apt/lists/* \ 64 | && mkdir -p /var/run/sshd 65 | 66 | # Install Open MPI 67 | RUN mkdir /tmp/openmpi \ 68 | && cd /tmp/openmpi \ 69 | && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 70 | && tar zxf openmpi-4.0.1.tar.gz \ 71 | && cd openmpi-4.0.1 \ 72 | && ./configure --enable-orterun-prefix-by-default \ 73 | && make -j $(nproc) all \ 74 | && make install \ 75 | && ldconfig \ 76 | && rm -rf /tmp/openmpi 77 | 78 | RUN apt-get update && apt-get install -y \ 79 | ${PYTHON} \ 80 | ${PYTHON_PIP} 81 | 82 | # Create a wrapper for OpenMPI to allow running as root by default 83 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 84 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 85 | && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ 86 | && chmod a+x /usr/local/bin/mpirun 87 | 88 | # Configure OpenMPI to run good defaults: 89 | # --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 90 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 91 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 92 | 93 | # Set default NCCL parameters 94 | RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf 95 | 96 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 97 | ENV PATH /usr/local/openmpi/bin/:$PATH 98 | ENV PATH=/usr/local/nvidia/bin:$PATH 99 | 100 | # SSH login fix. Otherwise user is kicked off after login 101 | RUN mkdir -p /var/run/sshd \ 102 | && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 103 | 104 | # Create SSH key. 105 | RUN mkdir -p /root/.ssh/ \ 106 | && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ 107 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ 108 | && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 109 | 110 | WORKDIR / 111 | 112 | RUN ${PIP} --no-cache-dir install --upgrade \ 113 | pip \ 114 | setuptools 115 | 116 | # Some TF tools expect a "python" binary 117 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 118 | 119 | COPY $FRAMEWORK_SUPPORT_INSTALLABLE . 
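# Note: TENSORFLOW_WHL and TF_URL above are build args, so the wheel fetched
# below can be swapped without editing this file. A hypothetical override, for
# illustration only (the URL is a placeholder for any compatible cp27
# manylinux2010 wheel):
#   docker build --build-arg TF_URL=https://example.com/tensorflow_gpu-2.0.0-cp27-cp27mu-manylinux2010_x86_64.whl ...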
120 | 121 | # Setup TF Wheel 122 | RUN wget $TF_URL -O /tmp/$TENSORFLOW_WHL 123 | 124 | # install PyYAML==5.1.2 to avoid conflict with latest awscli 125 | # # python-dateutil==2.8.0 to satisfy botocore associated with latest awscli 126 | RUN ${PIP} install --no-cache-dir -U \ 127 | numpy==1.16.5 \ 128 | scipy==1.2.2 \ 129 | scikit-learn==0.20.4 \ 130 | pandas==0.24.2 \ 131 | Pillow==6.2.1 \ 132 | h5py==2.10.0 \ 133 | keras_applications==1.0.8 \ 134 | keras_preprocessing==1.1.0 \ 135 | requests==2.22.0 \ 136 | keras==2.3.1 \ 137 | python-dateutil==2.8.0 \ 138 | PyYAML==5.1.2 \ 139 | awscli==1.16.303 \ 140 | mpi4py==3.0.3 \ 141 | "cryptography>=2.3" \ 142 | "sagemaker-tensorflow>=2.0,<2.1" \ 143 | # Let's install TensorFlow separately in the end to avoid 144 | # the library version to be overwritten 145 | # ${PIP} install --no-cache-dir -U ${TF_URL} \ 146 | && ${PIP} install --no-cache-dir -U \ 147 | /tmp/$TENSORFLOW_WHL \ 148 | && rm -f /tmp/$TENSORFLOW_WHL \ 149 | && ${PIP} install --no-cache-dir -U \ 150 | $FRAMEWORK_SUPPORT_INSTALLABLE \ 151 | && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE 152 | 153 | # Install Horovod, temporarily using CUDA stubs 154 | RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \ 155 | && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 ${PIP} install --no-cache-dir horovod==0.18.2 \ 156 | && ldconfig 157 | 158 | # Allow OpenSSH to talk to containers without asking for confirmation 159 | RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ 160 | && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ 161 | && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config 162 | 163 | COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py 164 | COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py 165 | 166 | RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \ 167 | && chmod +x /usr/local/bin/deep_learning_container.py 168 | 169 | RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-2.0/license.txt -o /license.txt 170 | 171 | ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] 172 | CMD ["bin/bash"] 173 | -------------------------------------------------------------------------------- /docker/1.13.1/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.0-base-ubuntu16.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 6 | ca-certificates \ 7 | cuda-command-line-tools-10-0 \ 8 | cuda-cublas-dev-10-0 \ 9 | cuda-cudart-dev-10-0 \ 10 | cuda-cufft-dev-10-0 \ 11 | cuda-curand-dev-10-0 \ 12 | cuda-cusolver-dev-10-0 \ 13 | cuda-cusparse-dev-10-0 \ 14 | curl \ 15 | libcudnn7=7.5.1.10-1+cuda10.0 \ 16 | # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it 17 | libnccl2=2.4.7-1+cuda10.0 \ 18 | libgomp1 \ 19 | libnccl-dev=2.4.7-1+cuda10.0 \ 20 | libfreetype6-dev \ 21 | libhdf5-serial-dev \ 22 | libpng12-dev \ 23 | libzmq3-dev \ 24 | git \ 25 | wget \ 26 | vim \ 27 | build-essential \ 28 | openssh-client \ 29 | openssh-server \ 30 | zlib1g-dev && \ 31 | # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 32 | # adds a new list which contains libnvinfer library, so it needs another 33 | # 'apt-get update' to retrieve that list before it can actually install the 34 | # library. 
35 | # We don't install libnvinfer-dev since we don't need to build against TensorRT, 36 | # and libnvinfer4 doesn't contain libnvinfer.a static library. 37 | apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 38 | nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 && \ 39 | apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 40 | libnvinfer5=5.0.2-1+cuda10.0 && \ 41 | rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* && \ 42 | rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* && \ 43 | rm /usr/lib/x86_64-linux-gnu/libnvparsers* && \ 44 | rm -rf /var/lib/apt/lists/* && \ 45 | mkdir -p /var/run/sshd 46 | 47 | ########################################################################### 48 | # Horovod & its dependencies 49 | ########################################################################### 50 | 51 | # Install Open MPI 52 | RUN mkdir /tmp/openmpi && \ 53 | cd /tmp/openmpi && \ 54 | curl -fSsL -O https://www.open-mpi.org/software/ompi/v3.1/downloads/openmpi-3.1.2.tar.gz && \ 55 | tar zxf openmpi-3.1.2.tar.gz && \ 56 | cd openmpi-3.1.2 && \ 57 | ./configure --enable-orterun-prefix-by-default && \ 58 | make -j $(nproc) all && \ 59 | make install && \ 60 | ldconfig && \ 61 | rm -rf /tmp/openmpi 62 | 63 | ARG PYTHON=python3 64 | ARG PYTHON_PIP=python3-pip 65 | ARG PIP=pip3 66 | ARG PYTHON_VERSION=3.6.6 67 | 68 | RUN wget https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz && \ 69 | tar -xvf Python-$PYTHON_VERSION.tgz && cd Python-$PYTHON_VERSION && \ 70 | ./configure && make && make install && \ 71 | apt-get update && apt-get install -y --no-install-recommends libreadline-gplv2-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev && \ 72 | make && make install && rm -rf ../Python-$PYTHON_VERSION* && \ 73 | ln -s /usr/local/bin/pip3 /usr/bin/pip 74 | 75 | # Create a wrapper for OpenMPI to allow running as root by default 76 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \ 77 | echo '#!/bin/bash' > /usr/local/bin/mpirun && \ 78 | echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \ 79 | chmod a+x /usr/local/bin/mpirun 80 | 81 | # Configure OpenMPI to run good defaults: 82 | # --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 83 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \ 84 | echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 85 | 86 | # Set default NCCL parameters 87 | RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf 88 | 89 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 90 | ENV PATH /usr/local/openmpi/bin/:$PATH 91 | ENV PATH=/usr/local/nvidia/bin:$PATH 92 | 93 | # SSH login fix. Otherwise user is kicked off after login 94 | RUN mkdir -p /var/run/sshd && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 95 | 96 | # Create SSH key. 
97 | RUN mkdir -p /root/.ssh/ && \ 98 | ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \ 99 | cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \ 100 | printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 101 | 102 | ########################################################################### 103 | # Python won’t try to write .pyc or .pyo files on the import of source modules 104 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 105 | 106 | WORKDIR / 107 | 108 | ARG TF_URL="https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.13/AmazonLinux/gpu/latest-patch-latest-patch/tensorflow-1.13.1-cp36-cp36m-linux_x86_64.whl" 109 | 110 | RUN ${PIP} --no-cache-dir install --upgrade pip setuptools 111 | 112 | # Some TF tools expect a "python" binary 113 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 114 | 115 | ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz 116 | COPY $framework_support_installable . 117 | 118 | RUN ${PIP} install --no-cache-dir -U \ 119 | numpy==1.16.2 \ 120 | scipy==1.2.1 \ 121 | scikit-learn==0.20.3 \ 122 | pandas==0.24.2 \ 123 | Pillow==5.4.1 \ 124 | h5py==2.9.0 \ 125 | keras_applications==1.0.7 \ 126 | keras_preprocessing==1.0.9 \ 127 | requests==2.21.0 \ 128 | keras==2.2.4 \ 129 | awscli==1.16.130 \ 130 | mpi4py==3.0.1 \ 131 | "sagemaker-tensorflow>=1.13,<1.14" \ 132 | # Let's install TensorFlow separately in the end to avoid 133 | # the library version to be overwritten 134 | && ${PIP} install --force-reinstall --no-cache-dir -U ${TF_URL} \ 135 | && ${PIP} install --no-cache-dir -U $framework_support_installable && \ 136 | rm -f $framework_support_installable \ 137 | && ${PIP} uninstall -y --no-cache-dir \ 138 | markdown \ 139 | tensorboard 140 | 141 | # Install Horovod, temporarily using CUDA stubs 142 | RUN ldconfig /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs && \ 143 | HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 ${PIP} install --no-cache-dir horovod==0.16.4 && \ 144 | ldconfig 145 | 146 | # Allow OpenSSH to talk to containers without asking for confirmation 147 | RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \ 148 | echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \ 149 | mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config 150 | 151 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 152 | 153 | CMD ["bin/bash"] 154 | -------------------------------------------------------------------------------- /test/resources/mnist/mnist_custom.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
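# A hypothetical local invocation of this script, for illustration only. The
# SM_* environment variables read by parse_args below are normally injected by
# the SageMaker training platform, and the paths here are placeholders:
#   SM_CHANNEL_TRAINING=/tmp/mnist SM_HOSTS='["algo-1"]' SM_CURRENT_HOST=algo-1 \
#   python mnist_custom.py --model_dir /tmp/checkpoints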
13 | import argparse
14 | import numpy as np
15 | import os
16 | import json
17 | import tensorflow as tf
18 | import tensorflow_io as tfio  # noqa: F401 -- not referenced below; presumably imported to verify the package is installed
19 | from tensorflow.keras.layers import Conv2D, BatchNormalization, Dense, Flatten
20 | 
21 | """
22 | This script uses custom training loops to train an MNIST model and saves the checkpoints
23 | using a checkpoint manager.
24 | """
25 | 
26 | # define a model
27 | class LeNet(tf.keras.Model):
28 |     def __init__(self):
29 |         super(LeNet, self).__init__()
30 |         self.conv1 = Conv2D(
31 |             filters=16, kernel_size=3, padding='valid',
32 |             strides=(2, 2), input_shape=(None, 28, 28, 1),
33 |             data_format='channels_last', trainable=True,
34 |         )
35 | 
36 |         self.bn1 = BatchNormalization()
37 |         self.conv2 = Conv2D(
38 |             filters=16, kernel_size=3, strides=(2, 2),
39 |             data_format='channels_last', padding='valid',
40 |             trainable=True
41 |         )
42 |         self.bn2 = BatchNormalization()
43 |         self.flatten = Flatten()
44 |         self.fc = Dense(10, trainable=True)
45 | 
46 |     def call(self, x):
47 |         x = self.conv1(x)
48 |         x = self.bn1(x)
49 |         x = tf.nn.relu(x)
50 |         x = self.conv2(x)
51 |         x = self.bn2(x)
52 |         x = tf.nn.relu(x)
53 |         x = self.flatten(x)
54 |         x = self.fc(x)
55 |         return x
56 | 
57 | 
58 | @tf.function
59 | def train_step(x, y, net, optimizer, loss_summary, accuracy_summary):
60 |     """
61 |     x: input batch
62 |     y: true labels
63 |     net: model object
64 |     optimizer: optimizer
65 |     loss_summary: metric object that accumulates the loss
66 |     accuracy_summary: metric object that accumulates the accuracy
67 |     """
68 |     with tf.GradientTape() as tape:
69 |         z = net(x)
70 |         loss = tf.keras.losses.sparse_categorical_crossentropy(
71 |             y_true=y, y_pred=z, from_logits=True, axis=-1
72 |         )
73 |         loss = tf.reduce_mean(loss)
74 |     grads = tape.gradient(loss, net.trainable_variables)
75 |     optimizer.apply_gradients(zip(grads, net.trainable_variables))
76 | 
77 |     # instrument loss
78 |     loss_summary(loss)
79 | 
80 |     # instrument accuracy
81 |     accuracy_summary(y, z)
82 |     return
83 | 
84 | 
85 | @tf.function
86 | def eval_step(x, y, net, loss_summary, accuracy_summary):
87 |     # training=False is only needed if there are layers with different
88 |     # behavior during training versus inference (e.g. Dropout).
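    # BatchNormalization is one such layer, and this model uses it; calling
    # net(x, training=False) here would make the inference behavior explicit,
    # though Keras typically resolves the default to inference mode when a
    # model is called outside of fit().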
89 |     z = net(x)
90 | 
91 |     loss = tf.keras.losses.sparse_categorical_crossentropy(
92 |         y_true=y, y_pred=z, from_logits=True, axis=-1
93 |     )
94 |     loss = tf.reduce_mean(loss)
95 | 
96 |     loss_summary(loss)
97 |     accuracy_summary(y, z)
98 |     return
99 | 
100 | 
101 | def load_data(data_dir):
102 |     """Load the training and eval datasets.
103 |     """
104 |     x, y = np.load(os.path.join(data_dir, 'train_data.npy')), \
105 |         np.load(os.path.join(data_dir, 'train_labels.npy'))
106 | 
107 |     vx, vy = np.load(os.path.join(data_dir, 'eval_data.npy')), \
108 |         np.load(os.path.join(data_dir, 'eval_labels.npy'))
109 | 
110 |     print('==== train tensor shape ====')
111 |     print(x.shape, y.shape)
112 | 
113 |     print('==== eval tensor shape ====')
114 |     print(vx.shape, vy.shape)
115 |     # x.shape = (1000, 784), y.shape = (1000, )
116 | 
117 |     x, y = x.astype(np.float32), y.astype(np.int64)  # np.int64 instead of the deprecated np.int alias
118 |     vx, vy = vx.astype(np.float32), vy.astype(np.int64)
119 |     x /= 255.0
120 |     vx /= 255.0
121 | 
122 |     dtrain = tf.data.Dataset.from_tensor_slices((x, y))
123 |     dtrain = dtrain.map(lambda x, y: (tf.reshape(x, (28, 28, 1)), y))
124 |     dtrain = dtrain.shuffle(10000).batch(512)
125 | 
126 |     deval = tf.data.Dataset.from_tensor_slices((vx, vy))
127 |     deval = deval.map(lambda x, y: (tf.reshape(x, (28, 28, 1)), y))
128 |     deval = deval.batch(10)
129 |     return dtrain, deval
130 | 
131 | 
132 | def parse_args():
133 |     parser = argparse.ArgumentParser()
134 |     parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAINING'])
135 |     parser.add_argument('--model_dir', type=str)
136 |     parser.add_argument('--max-steps', type=int, default=200)
137 |     parser.add_argument('--save-checkpoint-steps', type=int, default=200)
138 |     parser.add_argument('--throttle-secs', type=int, default=60)
139 |     parser.add_argument('--hosts', type=list, default=json.loads(os.environ['SM_HOSTS']))  # NOTE: type=list would split a string into characters; in practice the default parsed from SM_HOSTS is used
140 |     parser.add_argument('--current-host', type=str, default=os.environ['SM_CURRENT_HOST'])
141 |     parser.add_argument('--batch-size', type=int, default=100)
142 |     parser.add_argument('--export-model-during-training', type=bool, default=False)  # NOTE: argparse's type=bool treats any non-empty string as True; this flag is not used below
143 |     return parser.parse_args()
144 | 
145 | 
146 | def main(args):
147 |     net = LeNet()
148 |     net.build(input_shape=(None, 28, 28, 1))
149 | 
150 |     optimizer = tf.keras.optimizers.Adam()
151 | 
152 |     train_loss = tf.keras.metrics.Mean(name='train_loss')
153 |     train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
154 | 
155 |     test_loss = tf.keras.metrics.Mean(name='test_loss')
156 |     test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')
157 | 
158 |     ckpt = tf.train.Checkpoint(optimizer=optimizer, model=net)
159 |     ckpt_manager = tf.train.CheckpointManager(
160 |         ckpt, args.model_dir, max_to_keep=5, checkpoint_name='model.ckpt'
161 |     )
162 | 
163 |     dtrain, deval = load_data(args.train)
164 |     num_epochs = 10
165 |     for i in range(num_epochs):
166 |         for x, y in dtrain:
167 |             train_step(x, y, net, optimizer, train_loss, train_accuracy)
168 | 
169 |         for x, y in deval:
170 |             eval_step(x, y, net, test_loss, test_accuracy)
171 | 
172 |         print(
173 |             f"Epoch {i+1}",
174 |             f"Train Loss: {train_loss.result()}",
175 |             f"Train Accuracy: {train_accuracy.result()}",
176 |             f"Test Loss: {test_loss.result()}",
177 |             f"Test Accuracy: {test_accuracy.result()}"
178 |         )
179 | 
180 |         if args.current_host == args.hosts[0]:  # only the first host writes checkpoints
181 |             ckpt_manager.save()
182 | 
183 | if __name__ == '__main__':
184 |     main(parse_args())
185 | -------------------------------------------------------------------------------- /docker/1.15.0/py2/Dockerfile.gpu: 
-------------------------------------------------------------------------------- 1 | # Nvidia does not publish a TensorRT Runtime library for Ubuntu 18.04 with Cuda 10.1 support, so we stick with cuda 10.0. 2 | # https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/ 3 | FROM nvidia/cuda:10.0-base-ubuntu18.04 4 | 5 | LABEL maintainer="Amazon AI" 6 | 7 | # Prevent docker build get stopped by requesting user interaction 8 | ENV DEBIAN_FRONTEND=noninteractive 9 | ENV DEBCONF_NONINTERACTIVE_SEEN=true 10 | # Python won’t try to write .pyc or .pyo files on the import of source modules 11 | ENV PYTHONDONTWRITEBYTECODE=1 12 | ENV PYTHONUNBUFFERED=1 13 | # See http://bugs.python.org/issue19846 14 | ENV PYTHONIOENCODING=UTF-8 15 | ENV LANG=C.UTF-8 16 | ENV LC_ALL=C.UTF-8 17 | # Specify the location of module that contains the training logic for SageMaker 18 | # https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html 19 | ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main 20 | 21 | # Define framework-related package sources 22 | ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_training*.tar.gz 23 | ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15/AmazonLinux/gpu/final/tensorflow_gpu-1.15.0-cp27-cp27mu-manylinux2010_x86_64.whl 24 | 25 | RUN apt-get update \ 26 | && apt-get install -y --no-install-recommends --allow-unauthenticated \ 27 | ca-certificates \ 28 | cuda-command-line-tools-10-0 \ 29 | cuda-cublas-dev-10-0 \ 30 | cuda-cudart-dev-10-0 \ 31 | cuda-cufft-dev-10-0 \ 32 | cuda-curand-dev-10-0 \ 33 | cuda-cusolver-dev-10-0 \ 34 | cuda-cusparse-dev-10-0 \ 35 | curl \ 36 | libcudnn7=7.5.1.10-1+cuda10.0 \ 37 | # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it 38 | libnccl2=2.4.7-1+cuda10.0 \ 39 | libgomp1 \ 40 | libnccl-dev=2.4.7-1+cuda10.0 \ 41 | libfreetype6-dev \ 42 | libhdf5-serial-dev \ 43 | libpng-dev \ 44 | libzmq3-dev \ 45 | git \ 46 | wget \ 47 | vim \ 48 | build-essential \ 49 | openssh-client \ 50 | openssh-server \ 51 | zlib1g-dev \ 52 | # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 53 | # adds a new list which contains libnvinfer library, so it needs another 54 | # 'apt-get update' to retrieve that list before it can actually install the library. 55 | # We don't install libnvinfer-dev since we don't need to build against TensorRT, 56 | # and libnvinfer4 doesn't contain libnvinfer.a static library. 
57 | && apt-get update \ 58 | && apt-get install -y --no-install-recommends --allow-unauthenticated \ 59 | nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \ 60 | && apt-get update \ 61 | && apt-get install -y --no-install-recommends --allow-unauthenticated \ 62 | libnvinfer5=5.0.2-1+cuda10.0 \ 63 | && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \ 64 | && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \ 65 | && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \ 66 | && rm -rf /var/lib/apt/lists/* \ 67 | && mkdir -p /var/run/sshd 68 | 69 | # Install Open MPI 70 | RUN mkdir /tmp/openmpi \ 71 | && cd /tmp/openmpi \ 72 | && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 73 | && tar zxf openmpi-4.0.1.tar.gz \ 74 | && cd openmpi-4.0.1 \ 75 | && ./configure --enable-orterun-prefix-by-default \ 76 | && make -j $(nproc) all \ 77 | && make install \ 78 | && ldconfig \ 79 | && rm -rf /tmp/openmpi 80 | 81 | RUN apt-get update \ 82 | && apt-get install -y \ 83 | python \ 84 | python-pip 85 | 86 | # Create a wrapper for OpenMPI to allow running as root by default 87 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 88 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 89 | && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ 90 | && chmod a+x /usr/local/bin/mpirun 91 | 92 | # Configure OpenMPI to run good defaults: 93 | # --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 94 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 95 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 96 | 97 | # Set default NCCL parameters 98 | RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf 99 | 100 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 101 | ENV PATH /usr/local/openmpi/bin/:$PATH 102 | ENV PATH=/usr/local/nvidia/bin:$PATH 103 | 104 | # SSH login fix. Otherwise user is kicked off after login 105 | RUN mkdir -p /var/run/sshd \ 106 | && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 107 | 108 | # Create SSH key. 109 | RUN mkdir -p /root/.ssh/ \ 110 | && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ 111 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ 112 | && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 113 | 114 | WORKDIR / 115 | 116 | RUN pip --no-cache-dir install --upgrade \ 117 | pip \ 118 | setuptools 119 | 120 | # Some TF tools expect a "python" binary 121 | RUN ln -s $(which python) /usr/local/bin/python 122 | 123 | COPY $FRAMEWORK_SUPPORT_INSTALLABLE . 
124 | 125 | RUN pip install --no-cache-dir -U \ 126 | numpy==1.16.5 \ 127 | scipy==1.2.2 \ 128 | scikit-learn==0.20.3 \ 129 | pandas==0.24.2 \ 130 | Pillow==6.2.1 \ 131 | h5py==2.9.0 \ 132 | keras_applications==1.0.8 \ 133 | keras_preprocessing==1.1.0 \ 134 | requests==2.22.0 \ 135 | keras==2.3.1 \ 136 | mpi4py==3.0.2 \ 137 | "cryptography>=2.3" \ 138 | "sagemaker-tensorflow>=1.15,<1.16" \ 139 | # Let's install TensorFlow separately in the end to avoid the library version to be overwritten 140 | && pip install --force-reinstall --no-cache-dir -U \ 141 | ${TF_URL} \ 142 | && pip install --no-cache-dir -U \ 143 | $FRAMEWORK_SUPPORT_INSTALLABLE \ 144 | awscli==1.17.7 \ 145 | && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE 146 | 147 | # Install Horovod, temporarily using CUDA stubs 148 | RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \ 149 | && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir \ 150 | horovod==0.18.2 \ 151 | && ldconfig 152 | 153 | # Allow OpenSSH to talk to containers without asking for confirmation 154 | RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ 155 | && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ 156 | && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config 157 | 158 | COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py 159 | COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py 160 | 161 | RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \ 162 | && chmod +x /usr/local/bin/deep_learning_container.py 163 | 164 | RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt 165 | 166 | ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] 167 | CMD ["bin/bash"] 168 | --------------------------------------------------------------------------------