├── docker ├── __init__.py ├── build_artifacts │ ├── __init__.py │ ├── dockerd-entrypoint.py │ └── deep_learning_container.py ├── 1.15.0 │ ├── py3 │ │ ├── dockerd-entrypoint.py │ │ └── Dockerfile.cpu │ └── py2 │ │ ├── Dockerfile.cpu │ │ └── Dockerfile.gpu ├── 2.0.0 │ ├── py2 │ │ ├── dockerd-entrypoint.py │ │ ├── Dockerfile.cpu │ │ └── Dockerfile.gpu │ └── py3 │ │ ├── dockerd-entrypoint.py │ │ └── Dockerfile.cpu ├── 1.10.0 │ ├── Dockerfile.cpu │ └── Dockerfile.gpu ├── 1.11.0 │ ├── Dockerfile.cpu │ └── Dockerfile.gpu ├── 1.12.0 │ ├── Dockerfile.cpu │ └── Dockerfile.gpu ├── 2.1.0 │ ├── py2 │ │ ├── Dockerfile.cpu │ │ └── Dockerfile.gpu │ └── py3 │ │ └── Dockerfile.cpu ├── 1.13.1 │ ├── Dockerfile.cpu │ └── Dockerfile.gpu ├── 2.0.1 │ ├── py3 │ │ └── Dockerfile.cpu │ └── py2 │ │ ├── Dockerfile.cpu │ │ └── Dockerfile.gpu └── 1.14.0 │ ├── py2 │ └── Dockerfile.cpu │ └── py3 │ └── Dockerfile.cpu ├── VERSION ├── CODEOWNERS ├── test ├── resources │ ├── test_dir_wrong_model │ │ └── fake_model.h5 │ ├── test_dir_correct_model │ │ └── 12345 │ │ │ └── saved_model.pb │ ├── test_dir_wrong_parent_dir │ │ └── not-digit │ │ │ └── saved_model.pb │ ├── mnist │ │ ├── data │ │ │ ├── test │ │ │ │ ├── x_test.npy │ │ │ │ └── y_test.npy │ │ │ └── train │ │ │ │ ├── x_train.npy │ │ │ │ └── y_train.npy │ │ ├── data-distributed │ │ │ ├── eval_data.npy │ │ │ ├── eval_labels.npy │ │ │ ├── train_data.npy │ │ │ └── train_labels.npy │ │ ├── __init__.py │ │ ├── mnist.py │ │ ├── smdataparallel_mnist.py │ │ ├── horovod_mnist.py │ │ └── mnist_custom.py │ ├── hvdbasic │ │ ├── train_hvd_basic.py │ │ └── train_hvd_env_vars.py │ ├── __init__.py │ ├── multi_worker_mirrored │ │ ├── __init__.py │ │ └── train_dummy.py │ └── tuning_model_dir │ │ └── entry.py ├── container │ └── 2.7.1 │ │ ├── Dockerfile.dlc.cpu │ │ ├── Dockerfile.dlc.gpu │ │ ├── Dockerfile.tf.cpu │ │ └── Dockerfile.tf.gpu ├── unit │ ├── __init__.py │ └── test_s3_utils.py ├── utils │ ├── __init__.py │ └── image_utils.py └── integration │ ├── __init__.py │ ├── sagemaker │ ├── timeout.py │ ├── test_multi_worker_mirrored.py │ ├── test_tuning_model_dir.py │ ├── test_smdataparallel.py │ ├── test_horovod_sagemaker.py │ ├── test_mnist.py │ └── recordio_utils.py │ └── local │ ├── test_horovod_local.py │ └── test_training.py ├── NOTICE ├── .flake8 ├── .gitignore ├── .github └── PULL_REQUEST_TEMPLATE.md ├── MANIFEST.in ├── CODE_OF_CONDUCT.md ├── .coveragerc_py38 ├── .coveragerc_py39 ├── .coveragerc_py37 ├── benchmarks ├── tf_benchmarks │ ├── bench.sh │ ├── README.md │ └── execute_tensorflow_training.py └── horovod-resnet │ ├── bench.sh │ └── train.sh ├── src └── sagemaker_tensorflow_container │ ├── __init__.py │ ├── s3_utils.py │ └── deep_learning_container.py ├── scripts ├── publish_all.py └── build_all.py ├── tox.ini ├── setup.py └── README.rst /docker/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 20.5.1.dev0 2 | -------------------------------------------------------------------------------- /docker/build_artifacts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @aws/sagemaker-jobs-platform 2 | 
-------------------------------------------------------------------------------- /test/resources/test_dir_wrong_model/fake_model.h5: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/resources/test_dir_correct_model/12345/saved_model.pb: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/resources/test_dir_wrong_parent_dir/not-digit/saved_model.pb: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | application_import_names = image_utils, integration, sagemaker_tensorflow_container, test, timeout, utils 3 | import-order-style = google 4 | -------------------------------------------------------------------------------- /test/resources/mnist/data/test/x_test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/sagemaker-tensorflow-training-toolkit/HEAD/test/resources/mnist/data/test/x_test.npy -------------------------------------------------------------------------------- /test/resources/mnist/data/test/y_test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/sagemaker-tensorflow-training-toolkit/HEAD/test/resources/mnist/data/test/y_test.npy -------------------------------------------------------------------------------- /test/resources/mnist/data/train/x_train.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/sagemaker-tensorflow-training-toolkit/HEAD/test/resources/mnist/data/train/x_train.npy -------------------------------------------------------------------------------- /test/resources/mnist/data/train/y_train.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/sagemaker-tensorflow-training-toolkit/HEAD/test/resources/mnist/data/train/y_train.npy -------------------------------------------------------------------------------- /test/resources/mnist/data-distributed/eval_data.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/sagemaker-tensorflow-training-toolkit/HEAD/test/resources/mnist/data-distributed/eval_data.npy -------------------------------------------------------------------------------- /test/resources/mnist/data-distributed/eval_labels.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/sagemaker-tensorflow-training-toolkit/HEAD/test/resources/mnist/data-distributed/eval_labels.npy -------------------------------------------------------------------------------- /test/resources/mnist/data-distributed/train_data.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aws/sagemaker-tensorflow-training-toolkit/HEAD/test/resources/mnist/data-distributed/train_data.npy -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *pyc 2 | dist 3 | **/*.egg-info 4 | .DS_Store 5 | .idea/ 6 | .cache/ 7 | *.iml 8 | **/.ipynb_checkpoints 9 | **/.python-version 10 | .tox 11 | *~ 12 | .coverage 13 | -------------------------------------------------------------------------------- /test/resources/mnist/data-distributed/train_labels.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/sagemaker-tensorflow-training-toolkit/HEAD/test/resources/mnist/data-distributed/train_labels.npy -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | *Issue #, if available:* 2 | 3 | *Description of changes:* 4 | 5 | 6 | By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. 7 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include src/sagemaker_tensorflow_container * 2 | 3 | include VERSION 4 | include LICENSE 5 | include README.rst 6 | 7 | prune test 8 | 9 | recursive-exclude * __pycache__ 10 | recursive-exclude * *.py[co] 11 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 
5 | -------------------------------------------------------------------------------- /.coveragerc_py38: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | timid = True 4 | 5 | [report] 6 | exclude_lines = 7 | pragma: no cover 8 | pragma: py3 no cover 9 | if six.PY2 10 | elif six.PY2 11 | 12 | partial_branches = 13 | pragma: no cover 14 | pragma: py3 no cover 15 | if six.PY3 16 | elif six.PY3 17 | 18 | show_missing = True 19 | 20 | fail_under = 70 -------------------------------------------------------------------------------- /.coveragerc_py39: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | timid = True 4 | 5 | [report] 6 | exclude_lines = 7 | pragma: no cover 8 | pragma: py3 no cover 9 | if six.PY2 10 | elif six.PY2 11 | 12 | partial_branches = 13 | pragma: no cover 14 | pragma: py3 no cover 15 | if six.PY3 16 | elif six.PY3 17 | 18 | show_missing = True 19 | 20 | fail_under = 70 -------------------------------------------------------------------------------- /.coveragerc_py37: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | timid = True 4 | 5 | [report] 6 | exclude_lines = 7 | pragma: no cover 8 | pragma: py3 no cover 9 | if six.PY2 10 | elif six.PY2 11 | 12 | partial_branches = 13 | pragma: no cover 14 | pragma: py3 no cover 15 | if six.PY3 16 | elif six.PY3 17 | 18 | show_missing = True 19 | 20 | fail_under = 70 21 | -------------------------------------------------------------------------------- /test/container/2.7.1/Dockerfile.dlc.cpu: -------------------------------------------------------------------------------- 1 | ARG region 2 | FROM 763104351884.dkr.ecr.$region.amazonaws.com/tensorflow-training:2.7.1-cpu-py38-ubuntu20.04-sagemaker 3 | 4 | COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz 5 | RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \ 6 | rm /sagemaker_tensorflow_training.tar.gz 7 | -------------------------------------------------------------------------------- /test/container/2.7.1/Dockerfile.dlc.gpu: -------------------------------------------------------------------------------- 1 | ARG region 2 | FROM 763104351884.dkr.ecr.$region.amazonaws.com/tensorflow-training:2.7.1-gpu-py38-cu112-ubuntu20.04-sagemaker 3 | 4 | COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz 5 | RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \ 6 | rm /sagemaker_tensorflow_training.tar.gz 7 | -------------------------------------------------------------------------------- /test/resources/hvdbasic/train_hvd_basic.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import horovod.tensorflow as hvd 4 | 5 | hvd.init() 6 | 7 | with open( 8 | os.path.join("/opt/ml/model/local-rank-%s-rank-%s" % (hvd.local_rank(), hvd.rank())), "w+" 9 | ) as f: 10 | basic_info = {"local-rank": hvd.local_rank(), "rank": hvd.rank(), "size": hvd.size()} 11 | 12 | print(basic_info) 13 | json.dump(basic_info, f) 14 | -------------------------------------------------------------------------------- /test/container/2.7.1/Dockerfile.tf.cpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:2.7.1 2 | 3 | ENV SAGEMAKER_TRAINING_MODULE 
sagemaker_tensorflow_container.training:main 4 | 5 | COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz 6 | RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \ 7 | rm /sagemaker_tensorflow_training.tar.gz 8 | RUN pip install --no-cache-dir tensorflow-io 9 | RUN apt-get update && apt-get install -y --no-install-recommends openssh-server && mkdir -p /var/run/sshd -------------------------------------------------------------------------------- /benchmarks/tf_benchmarks/bench.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ./execute_tensorflow_training.py train \ 4 | --framework-version 1.12 \ 5 | --device gpu \ 6 | \ 7 | --instance-types ml.p3.16xlarge \ 8 | \ 9 | --instance-counts 1 \ 10 | --instance-counts 2 \ 11 | --instance-counts 4 \ 12 | \ 13 | --py-versions py3 \ 14 | \ 15 | --subnets subnet-125fb674 \ 16 | \ 17 | --security-groups sg-ce5dd1b4 \ 18 | \ 19 | --batch-sizes 64 \ 20 | \ 21 | -- --num_batches=1000 --model vgg16 \ 22 | --variable_update horovod --horovod_device gpu --use_fp16 --summary_verbosity 1 --save_summaries_steps 10 -------------------------------------------------------------------------------- /test/resources/hvdbasic/train_hvd_env_vars.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import horovod.tensorflow as hvd 4 | 5 | hvd.init() 6 | 7 | with open("/opt/ml/model/local-rank-%s-rank-%s" % (hvd.local_rank(), hvd.rank()), "w+") as f: 8 | basic_info = {"local-rank": hvd.local_rank(), "rank": hvd.rank(), "size": hvd.size()} 9 | 10 | print(basic_info) 11 | json.dump(basic_info, f) 12 | 13 | val = os.environ.get("AWS_CONTAINER_CREDENTIALS_RELATIVE_URI") 14 | host = os.environ.get("SM_CURRENT_HOST") 15 | 16 | assert val is not None 17 | assert host is not None 18 | 19 | print("host {}: AWS_CONTAINER_CREDENTIALS_RELATIVE_URI={}".format(host, val)) 20 | -------------------------------------------------------------------------------- /src/sagemaker_tensorflow_container/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # A copy of the License is located at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # or in the "license" file accompanying this file. This file is distributed 10 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | # express or implied. See the License for the specific language governing 12 | # permissions and limitations under the License. 13 | -------------------------------------------------------------------------------- /test/unit/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # A copy of the License is located at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # or in the "license" file accompanying this file. This file is distributed 10 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | # express or implied.
See the License for the specific language governing 12 | # permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | -------------------------------------------------------------------------------- /test/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # A copy of the License is located at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # or in the "license" file accompanying this file. This file is distributed 10 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | # express or implied. See the License for the specific language governing 12 | # permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | -------------------------------------------------------------------------------- /test/resources/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # A copy of the License is located at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # or in the "license" file accompanying this file. This file is distributed 10 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | # express or implied. See the License for the specific language governing 12 | # permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | -------------------------------------------------------------------------------- /test/resources/mnist/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # A copy of the License is located at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # or in the "license" file accompanying this file. This file is distributed 10 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | # express or implied. See the License for the specific language governing 12 | # permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | -------------------------------------------------------------------------------- /test/resources/multi_worker_mirrored/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # A copy of the License is located at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # or in the "license" file accompanying this file. This file is distributed 10 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | # express or implied. 
See the License for the specific language governing 12 | # permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | -------------------------------------------------------------------------------- /test/container/2.7.1/Dockerfile.tf.gpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:2.7.1-gpu 2 | 3 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 4 | 5 | COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz 6 | RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \ 7 | rm /sagemaker_tensorflow_training.tar.gz 8 | RUN pip install --no-cache-dir tensorflow-io 9 | RUN apt-key del 7fa2af80 \ 10 | && rm /etc/apt/sources.list.d/nvidia-ml.list \ 11 | && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub \ 12 | && apt-get update \ 13 | && apt-get install -y --no-install-recommends openssh-server && mkdir -p /var/run/sshd 14 | -------------------------------------------------------------------------------- /test/integration/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # A copy of the License is located at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # or in the "license" file accompanying this file. This file is distributed 10 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | # express or implied. See the License for the specific language governing 12 | # permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | 15 | import logging 16 | import os 17 | 18 | logging.getLogger("boto3").setLevel(logging.INFO) 19 | logging.getLogger("botocore").setLevel(logging.INFO) 20 | 21 | RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "resources") 22 | DEFAULT_TIMEOUT = 120 23 | -------------------------------------------------------------------------------- /docker/1.15.0/py3/dockerd-entrypoint.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
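# When the container is started outside a SageMaker training job (no config mounted at /opt/ml/input/config), launch the bundled deep_learning_container.py script in the background, then hand control to whatever command line was passed to the container.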
13 | from __future__ import absolute_import 14 | 15 | import os.path 16 | import shlex 17 | import subprocess 18 | import sys 19 | 20 | if not os.path.exists("/opt/ml/input/config"): 21 | subprocess.Popen(['python', '/usr/local/bin/deep_learning_container.py'], stdout=open(os.devnull, 'wb'), stderr=subprocess.STDOUT)  # backgrounded; '&>/dev/null' and '&' are shell syntax, not list arguments 22 | 23 | subprocess.check_call(shlex.split(' '.join(sys.argv[1:]))) 24 | -------------------------------------------------------------------------------- /docker/2.0.0/py2/dockerd-entrypoint.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | from __future__ import absolute_import 15 | 16 | import os.path 17 | import shlex 18 | import subprocess 19 | import sys 20 | 21 | if not os.path.exists("/opt/ml/input/config"): 22 | subprocess.Popen(['python', '/usr/local/bin/deep_learning_container.py'], stdout=open(os.devnull, 'wb'), stderr=subprocess.STDOUT)  # backgrounded; '&>/dev/null' and '&' are shell syntax, not list arguments 23 | 24 | subprocess.check_call(shlex.split(' '.join(sys.argv[1:]))) 25 | -------------------------------------------------------------------------------- /docker/2.0.0/py3/dockerd-entrypoint.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | from __future__ import absolute_import 15 | 16 | import os.path 17 | import shlex 18 | import subprocess 19 | import sys 20 | 21 | if not os.path.exists("/opt/ml/input/config"): 22 | subprocess.Popen(['python', '/usr/local/bin/deep_learning_container.py'], stdout=open(os.devnull, 'wb'), stderr=subprocess.STDOUT)  # backgrounded; '&>/dev/null' and '&' are shell syntax, not list arguments 23 | 24 | subprocess.check_call(shlex.split(' '.join(sys.argv[1:]))) 25 | -------------------------------------------------------------------------------- /docker/build_artifacts/dockerd-entrypoint.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License.
13 | from __future__ import absolute_import 14 | 15 | import os.path 16 | import shlex 17 | import subprocess 18 | import sys 19 | 20 | if not os.path.exists("/opt/ml/input/config"): 21 | subprocess.Popen(['python', '/usr/local/bin/deep_learning_container.py'], stdout=open(os.devnull, 'wb'), stderr=subprocess.STDOUT)  # backgrounded; '&>/dev/null' and '&' are shell syntax, not list arguments 22 | 23 | subprocess.check_call(shlex.split(' '.join(sys.argv[1:]))) 24 | -------------------------------------------------------------------------------- /benchmarks/horovod-resnet/bench.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"). You 5 | # may not use this file except in compliance with the License. A copy of 6 | # the License is located at 7 | # 8 | # http://aws.amazon.com/apache2.0/ 9 | # 10 | # or in the "license" file accompanying this file. This file is 11 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 12 | # ANY KIND, either express or implied. See the License for the specific 13 | # language governing permissions and limitations under the License. 14 | 15 | ./execute_tensorflow_training.py train \ 16 | --framework-version 1.12 \ 17 | --device gpu \ 18 | \ 19 | --instance-types ml.p3.16xlarge \ 20 | \ 21 | --instance-counts 1 \ 22 | --instance-counts 2 \ 23 | --instance-counts 4 \ 24 | --instance-counts 8 \ 25 | --instance-counts 16 \ 26 | \ 27 | --py-versions py3 \ 28 | \ 29 | --subnets # add subnet id here \ 30 | \ 31 | --security-groups # add security-group id here 32 | -------------------------------------------------------------------------------- /test/resources/tuning_model_dir/entry.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | 15 | import argparse 16 | import os 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--model_dir", type=str) 20 | parser.add_argument("--arbitrary_value", type=int, default=0) 21 | args = parser.parse_args() 22 | 23 | assert os.environ["TRAINING_JOB_NAME"] in args.model_dir, ( 24 | "model_dir not unique to training job: %s" % args.model_dir 25 | ) 26 | 27 | # For the "hyperparameter tuning" to work 28 | print("accuracy=1") 29 | -------------------------------------------------------------------------------- /benchmarks/horovod-resnet/train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"). You 5 | # may not use this file except in compliance with the License. A copy of 6 | # the License is located at 7 | # 8 | # http://aws.amazon.com/apache2.0/ 9 | # 10 | # or in the "license" file accompanying this file.
This file is 11 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 12 | # ANY KIND, either express or implied. See the License for the specific 13 | # language governing permissions and limitations under the License. 14 | 15 | set -ex 16 | 17 | echo "Launching training job using $SM_NUM_GPUS GPUs" 18 | 19 | # p3 instances have larger GPU memory, so a higher batch size can be used 20 | GPU_MEM=`nvidia-smi --query-gpu=memory.total --format=csv,noheader -i 0 | awk '{print $1}'` 21 | if [ $GPU_MEM -gt 15000 ] ; then BATCH_SIZE=256; else BATCH_SIZE=128; fi 22 | 23 | # Training 24 | python -W ignore train_imagenet_resnet_hvd.py --num_epochs 90 --synthetic -b $BATCH_SIZE \ 25 | --lr_decay_mode poly --warmup_epochs 10 --clear_log 26 | -------------------------------------------------------------------------------- /src/sagemaker_tensorflow_container/s3_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the 'license' file accompanying this file. This file is 10 | # distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | 15 | import os 16 | 17 | import boto3 18 | from six.moves.urllib.parse import urlparse 19 | 20 | 21 | def configure(model_dir, job_region): 22 | 23 | os.environ["S3_REGION"] = _s3_region(job_region, model_dir) 24 | 25 | # setting log level to WARNING 26 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1" 27 | os.environ["S3_USE_HTTPS"] = "1" 28 | 29 | 30 | def _s3_region(job_region, model_dir): 31 | if model_dir and model_dir.startswith("s3://"): 32 | s3 = boto3.client("s3", region_name=job_region) 33 | 34 | # We get the AWS region of the checkpoint bucket, which may be different from 35 | # the region this container is currently running in. 36 | parsed_url = urlparse(model_dir) 37 | bucket_name = parsed_url.netloc 38 | 39 | bucket_location = s3.get_bucket_location(Bucket=bucket_name)["LocationConstraint"] 40 | 41 | return bucket_location or job_region 42 | else: 43 | return job_region 44 | -------------------------------------------------------------------------------- /test/integration/sagemaker/timeout.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # A copy of the License is located at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # or in the "license" file accompanying this file. This file is distributed 10 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 11 | # express or implied. See the License for the specific language governing 12 | # permissions and limitations under the License. 
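# SIGALRM-based guard used by the SageMaker integration tests to bound how long a block of code (typically an estimator.fit call) may run.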
13 | from __future__ import absolute_import 14 | 15 | from contextlib import contextmanager 16 | import logging 17 | import signal 18 | 19 | LOGGER = logging.getLogger("timeout") 20 | 21 | 22 | class TimeoutError(Exception): 23 | pass 24 | 25 | 26 | @contextmanager 27 | def timeout(seconds=0, minutes=0, hours=0): 28 | """Add a signal-based timeout to any block of code. 29 | If multiple time units are specified, they will be added together to determine time limit. 30 | Usage: 31 | with timeout(seconds=5): 32 | my_slow_function(...) 33 | Args: 34 | - seconds: The time limit, in seconds. 35 | - minutes: The time limit, in minutes. 36 | - hours: The time limit, in hours. 37 | """ 38 | 39 | limit = seconds + 60 * minutes + 3600 * hours 40 | 41 | def handler(signum, frame): 42 | raise TimeoutError("timed out after {} seconds".format(limit)) 43 | 44 | try: 45 | signal.signal(signal.SIGALRM, handler) 46 | signal.alarm(limit) 47 | 48 | yield 49 | finally: 50 | signal.alarm(0) 51 | -------------------------------------------------------------------------------- /test/unit/test_s3_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the 'license' file accompanying this file. This file is 10 | # distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | 15 | import os 16 | 17 | from mock import MagicMock, patch 18 | 19 | from sagemaker_tensorflow_container import s3_utils 20 | 21 | 22 | BUCKET_REGION = "us-west-2" 23 | JOB_REGION = "us-west-1" 24 | JOB_BUCKET = "sagemaker-us-west-2-000-00-1" 25 | PREFIX = "sagemaker/something" 26 | MODEL_DIR = "s3://{}/{}".format(JOB_BUCKET, PREFIX) 27 | 28 | 29 | @patch("boto3.client") 30 | def test_configure(client): 31 | s3 = MagicMock() 32 | client.return_value = s3 33 | loc = {"LocationConstraint": BUCKET_REGION} 34 | s3.get_bucket_location.return_value = loc 35 | 36 | s3_utils.configure(MODEL_DIR, JOB_REGION) 37 | 38 | assert os.environ["S3_REGION"] == BUCKET_REGION 39 | assert os.environ["TF_CPP_MIN_LOG_LEVEL"] == "1" 40 | assert os.environ["S3_USE_HTTPS"] == "1" 41 | 42 | 43 | def test_configure_local_dir(): 44 | s3_utils.configure("/opt/ml/model", JOB_REGION) 45 | 46 | assert os.environ["S3_REGION"] == JOB_REGION 47 | assert os.environ["TF_CPP_MIN_LOG_LEVEL"] == "1" 48 | assert os.environ["S3_USE_HTTPS"] == "1" 49 | -------------------------------------------------------------------------------- /test/integration/sagemaker/test_multi_worker_mirrored.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017-2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file.
This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | 15 | import os 16 | 17 | from sagemaker.tensorflow import TensorFlow 18 | from sagemaker.utils import unique_name_from_base 19 | 20 | 21 | RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources") 22 | 23 | 24 | def test_multi_node(sagemaker_session, instance_type, image_uri, tmpdir, framework_version, capsys): 25 | estimator = TensorFlow( 26 | entry_point=os.path.join(RESOURCE_PATH, "multi_worker_mirrored", "train_dummy.py"), 27 | role="SageMakerRole", 28 | instance_type=instance_type, 29 | instance_count=2, 30 | image_uri=image_uri, 31 | framework_version=framework_version, 32 | py_version="py3", 33 | hyperparameters={ 34 | "sagemaker_multi_worker_mirrored_strategy_enabled": True, 35 | }, 36 | sagemaker_session=sagemaker_session, 37 | ) 38 | estimator.fit(job_name=unique_name_from_base("test-tf-mwms")) 39 | captured = capsys.readouterr() 40 | logs = captured.out + captured.err 41 | assert "Running distributed training job with multi_worker_mirrored_strategy setup" in logs 42 | assert "TF_CONFIG=" in logs 43 | -------------------------------------------------------------------------------- /test/resources/multi_worker_mirrored/train_dummy.py: -------------------------------------------------------------------------------- 1 | # Please refer to https://github.com/tensorflow/docs/blob/master/site/en/tutorials/distribute/multi_worker_with_keras.ipynb 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | import os 6 | import json 7 | 8 | 9 | def mnist_dataset(batch_size): 10 | (x_train, y_train), _ = tf.keras.datasets.mnist.load_data() 11 | # The `x` arrays are in uint8 and have values in the [0, 255] range. 12 | # You need to convert them to float32 with values in the [0, 1] range. 13 | x_train = x_train / np.float32(255) 14 | y_train = y_train.astype(np.int64) 15 | train_dataset = tf.data.Dataset.from_tensor_slices( 16 | (x_train, y_train)).shuffle(60000).repeat().batch(batch_size) 17 | return train_dataset 18 | 19 | def build_and_compile_cnn_model(): 20 | model = tf.keras.Sequential([ 21 | tf.keras.layers.InputLayer(input_shape=(28, 28)), 22 | tf.keras.layers.Reshape(target_shape=(28, 28, 1)), 23 | tf.keras.layers.Conv2D(32, 3, activation='relu'), 24 | tf.keras.layers.Flatten(), 25 | tf.keras.layers.Dense(128, activation='relu'), 26 | tf.keras.layers.Dense(10) 27 | ]) 28 | model.compile( 29 | loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 30 | optimizer=tf.keras.optimizers.SGD(learning_rate=0.001), 31 | metrics=['accuracy']) 32 | return model 33 | 34 | 35 | per_worker_batch_size = 64 36 | tf_config = json.loads(os.environ['TF_CONFIG']) 37 | num_workers = len(tf_config['cluster']['worker']) 38 | 39 | strategy = tf.distribute.MultiWorkerMirroredStrategy() 40 | 41 | global_batch_size = per_worker_batch_size * num_workers 42 | multi_worker_dataset = mnist_dataset(global_batch_size) 43 | 44 | with strategy.scope(): 45 | # Model building/compiling needs to be within `strategy.scope()`.
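# Variables created under the scope are mirrored on every worker and kept in sync via collective ops, so each worker trains an identical replica of the model.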
46 | multi_worker_model = build_and_compile_cnn_model() 47 | 48 | multi_worker_model.fit(multi_worker_dataset, epochs=3, steps_per_epoch=70) 49 | -------------------------------------------------------------------------------- /test/resources/mnist/mnist.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import sys 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | 10 | def _parse_args(): 11 | 12 | parser = argparse.ArgumentParser() 13 | 14 | # hyperparameters sent by the client are passed as command-line arguments to the script. 15 | parser.add_argument("--epochs", type=int, default=1) 16 | # Data, model, and output directories 17 | parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"]) 18 | parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAINING"]) 19 | parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"])) 20 | parser.add_argument("--current-host", type=str, default=os.environ["SM_CURRENT_HOST"]) 21 | 22 | return parser.parse_known_args() 23 | 24 | 25 | def _load_training_data(base_dir): 26 | x_train = np.load(os.path.join(base_dir, "train", "x_train.npy")) 27 | y_train = np.load(os.path.join(base_dir, "train", "y_train.npy")) 28 | return x_train, y_train 29 | 30 | 31 | def _load_testing_data(base_dir): 32 | x_test = np.load(os.path.join(base_dir, "test", "x_test.npy")) 33 | y_test = np.load(os.path.join(base_dir, "test", "y_test.npy")) 34 | return x_test, y_test 35 | 36 | 37 | args, unknown = _parse_args() 38 | 39 | model = tf.keras.models.Sequential( 40 | [ 41 | tf.keras.layers.Flatten(input_shape=(28, 28)), 42 | tf.keras.layers.Dense(512, activation=tf.nn.relu), 43 | tf.keras.layers.Dropout(0.2), 44 | tf.keras.layers.Dense(10, activation=tf.nn.softmax), 45 | ] 46 | ) 47 | 48 | model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]) 49 | x_train, y_train = _load_training_data(args.train) 50 | x_test, y_test = _load_testing_data(args.train) 51 | model.fit(x_train, y_train, epochs=args.epochs) 52 | model.evaluate(x_test, y_test) 53 | 54 | if args.current_host == args.hosts[0]: 55 | model.save(os.path.join("/opt/ml/model", "my_model.h5")) 56 | -------------------------------------------------------------------------------- /test/integration/sagemaker/test_tuning_model_dir.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
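# Runs a single-job hyperparameter tuning round; entry.py itself asserts that the model_dir it receives embeds the training job name, so a successful fit is the pass condition.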
13 | from __future__ import absolute_import 14 | 15 | import os 16 | 17 | from sagemaker.tensorflow import TensorFlow 18 | from sagemaker.tuner import HyperparameterTuner, IntegerParameter 19 | from sagemaker.utils import unique_name_from_base 20 | 21 | 22 | def test_model_dir_with_training_job_name( 23 | sagemaker_session, image_uri, instance_type, framework_version 24 | ): 25 | resource_path = os.path.join(os.path.dirname(__file__), "../..", "resources") 26 | script = os.path.join(resource_path, "tuning_model_dir", "entry.py") 27 | 28 | estimator = TensorFlow( 29 | entry_point=script, 30 | role="SageMakerRole", 31 | instance_type=instance_type, 32 | instance_count=1, 33 | image_uri=image_uri, 34 | framework_version=framework_version, 35 | py_version="py3", 36 | sagemaker_session=sagemaker_session, 37 | ) 38 | 39 | tuner = HyperparameterTuner( 40 | estimator=estimator, 41 | objective_metric_name="accuracy", 42 | hyperparameter_ranges={"arbitrary_value": IntegerParameter(0, 1)}, 43 | metric_definitions=[{"Name": "accuracy", "Regex": "accuracy=([01])"}], 44 | max_jobs=1, 45 | max_parallel_jobs=1, 46 | ) 47 | 48 | # User script has logic to check for the correct model_dir 49 | tuner.fit(job_name=unique_name_from_base("test-tf-model-dir", max_length=32)) 50 | tuner.wait() 51 | -------------------------------------------------------------------------------- /benchmarks/tf_benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # TensorFlow benchmarking scripts 2 | 3 | This folder contains the TensorFlow benchmarking scripts from https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks. 4 | 5 | ## Basic usage 6 | **execute_tensorflow_training.py train** uses the SageMaker Python SDK to start a training job. 7 | 8 | ```bash 9 | ./execute_tensorflow_training.py train --help 10 | Usage: execute_tensorflow_training.py train [OPTIONS] [SCRIPT_ARGS]... 11 | 12 | Options: 13 | --framework-version [1.11.0|1.12.0] 14 | [required] 15 | --device [cpu|gpu] [required] 16 | --py-versions TEXT 17 | --training-input-mode [File|Pipe] 18 | --networking-isolation / --no-networking-isolation 19 | --wait / --no-wait 20 | --security-groups TEXT 21 | --subnets TEXT 22 | --role TEXT 23 | --instance-counts INTEGER 24 | --batch-sizes INTEGER 25 | --instance-types TEXT 26 | --help Show this message and exit. 27 | 28 | ``` 29 | **execute_tensorflow_training.py generate_reports** generates benchmark reports. 30 | 31 | ## Examples: 32 | 33 | ```bash 34 | #!/usr/bin/env bash 35 | 36 | ./execute_tensorflow_training.py train \ 37 | --framework-version 1.11.0 \ 38 | --device gpu \ 39 | \ 40 | --instance-types ml.p3.2xlarge \ 41 | --instance-types ml.p3.8xlarge \ 42 | --instance-types ml.p3.16xlarge \ 43 | --instance-types ml.p2.xlarge \ 44 | --instance-types ml.p2.8xlarge \ 45 | --instance-types ml.p2.16xlarge \ 46 | \ 47 | --instance-counts 1 \ 48 | \ 49 | --py-versions py3 \ 50 | --py-versions py2 \ 51 | \ 52 | --subnets subnet-125fb674 \ 53 | \ 54 | --security-groups sg-ce5dd1b4 \ 55 | \ 56 | --batch-sizes 32 \ 57 | --batch-sizes 64 \ 58 | --batch-sizes 128 \ 59 | --batch-sizes 256 \ 60 | --batch-sizes 512 \ 61 | \ 62 | -- --model resnet32 --num_epochs 10 --data_format NHWC --summary_verbosity 1 --save_summaries_steps 10 --data_name cifar10 63 | ``` 64 | 65 | ## Using other models, datasets and benchmark configurations 66 | ```python tf_cnn_benchmarks/tf_cnn_benchmarks.py --help``` lists all the options the script supports.
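For a quick local sanity check outside SageMaker, a minimal sketch (model, batch size, and batch count are illustrative; check `--help` for what your checkout of tf_cnn_benchmarks actually supports):

```bash
python tf_cnn_benchmarks/tf_cnn_benchmarks.py \
    --model resnet32 --data_name cifar10 \
    --num_batches 100 --batch_size 32 \
    --data_format NHWC --device cpu
```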
67 | -------------------------------------------------------------------------------- /scripts/publish_all.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | 15 | import argparse 16 | import subprocess 17 | 18 | DEV_ACCOUNT = "142577830533" 19 | VERSION = "1.13.1" 20 | REGION = "us-west-2" 21 | REPO = "sagemaker-tensorflow-scriptmode" 22 | 23 | 24 | def _parse_args(): 25 | 26 | parser = argparse.ArgumentParser() 27 | 28 | parser.add_argument("--account", type=str, default=DEV_ACCOUNT) 29 | parser.add_argument("--version", type=str, default=VERSION) 30 | parser.add_argument("--repo", type=str, default=REPO) 31 | parser.add_argument("--region", type=str, default=REGION) 32 | 33 | return parser.parse_args() 34 | 35 | 36 | args = _parse_args() 37 | 38 | for arch in ["cpu", "gpu"]: 39 | for py_version in ["2", "3"]: 40 | source = "{}:{}-{}-py{}".format(args.repo, args.version, arch, py_version) 41 | dest = "{}.dkr.ecr.{}.amazonaws.com/{}".format(args.account, args.region, source) 42 | tag_cmd = "docker tag {} {}".format(source, dest) 43 | print("Tagging image: {}".format(tag_cmd)) 44 | subprocess.check_call(tag_cmd.split()) 45 | login_cmd = subprocess.check_output( 46 | "aws ecr get-login --no-include-email --registry-id {} --region {}".format( 47 | args.account, args.region 48 | ).split() 49 | ) 50 | print("Executing docker login command: {}".format(login_cmd)) 51 | subprocess.check_call(login_cmd.split()) 52 | push_cmd = "docker push {}".format(dest) 53 | print("Pushing image: {}".format(push_cmd)) 54 | subprocess.check_call(push_cmd.split()) 55 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # Tox (http://tox.testrun.org/) is a tool for running tests 2 | # in multiple virtualenvs. This configuration file will run the 3 | # test suite on all supported python versions. To use it, "pip install tox" 4 | # and then run "tox" from this directory. 5 | 6 | [tox] 7 | envlist = py38, flake8 8 | skip_missing_interpreters = False 9 | 10 | [travis] 11 | python = 12 | 3.8: py38, flake8 13 | 14 | [flake8] 15 | max-line-length = 120 16 | exclude = 17 | build/ 18 | .git 19 | __pycache__ 20 | examples/ 21 | *pb2.py 22 | .tox 23 | tests/data/ 24 | test/resources 25 | venv/ 26 | sagemaker-tensorflow-extensions 27 | benchmarks/ 28 | max-complexity = 10 29 | ignore = 30 | C901, 31 | E203, 32 | FI10, 33 | FI12, 34 | FI13, 35 | FI14, 36 | FI15, 37 | FI16, 38 | FI17, 39 | FI18, 40 | FI50, 41 | FI51, 42 | FI52, 43 | FI53, 44 | FI54, 45 | FI55, 46 | FI56, 47 | FI57, 48 | W503 49 | 50 | require-code = True 51 | 52 | [testenv] 53 | # TEAMCITY_VERSION environment variable exists during build on Teamcity. teamcity-messages uses it in order to enable 54 | # reporting to TeamCity. 
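# The AWS credentials and region below are forwarded so the SageMaker integration tests can authenticate against AWS from the tox environment.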
55 | passenv = 56 | AWS_ACCESS_KEY_ID 57 | AWS_SECRET_ACCESS_KEY 58 | AWS_SESSION_TOKEN 59 | AWS_CONTAINER_CREDENTIALS_RELATIVE_URI 60 | AWS_DEFAULT_REGION 61 | # {posargs} can be passed in by additional arguments specified when invoking tox. 62 | # Can be used to specify which tests to run, e.g.: tox -- -s 63 | commands = 64 | coverage run --rcfile .coveragerc_{envname} --source sagemaker_tensorflow_container -m py.test {posargs} 65 | {env:IGNORE_COVERAGE:} coverage report --rcfile .coveragerc_{envname} 66 | {env:IGNORE_COVERAGE:} coverage html --rcfile .coveragerc_{envname} 67 | 68 | deps = .[test] 69 | 70 | [testenv:flake8] 71 | basepython = python 72 | deps = 73 | flake8 74 | flake8-future-import 75 | flake8-import-order 76 | commands = flake8 --append-config .flake8 77 | 78 | 79 | [testenv:twine] 80 | basepython = python3 81 | # https://github.com/pypa/twine/blob/master/docs/changelog.rst 82 | deps = 83 | twine>=1.12.0 84 | # https://packaging.python.org/guides/making-a-pypi-friendly-readme/#validating-restructuredtext-markup 85 | commands = 86 | python setup.py sdist 87 | twine check dist/*.tar.gz 88 | -------------------------------------------------------------------------------- /test/utils/image_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
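# Build/push helpers for the test images: package the toolkit sdist, docker-build the Dockerfiles under test/container (logging in to the DLC registry first when a DLC base image is used), and push images to ECR.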
13 | from __future__ import absolute_import 14 | 15 | import os 16 | import subprocess 17 | import sys 18 | 19 | CYAN_COLOR = "\033[36m" 20 | END_COLOR = "\033[0m" 21 | DLC_AWS_ID = "763104351884" 22 | 23 | 24 | def build_image(framework_version, dockerfile, image_uri, region, cwd="."): 25 | _check_call("python setup.py sdist") 26 | 27 | if "dlc" in dockerfile: 28 | ecr_login(region, DLC_AWS_ID) 29 | 30 | dockerfile_location = os.path.join("test", "container", framework_version, dockerfile) 31 | 32 | subprocess.check_call( 33 | [ 34 | "docker", 35 | "build", 36 | "-t", 37 | image_uri, 38 | "-f", 39 | dockerfile_location, 40 | "--build-arg", 41 | "region={}".format(region), 42 | cwd, 43 | ], 44 | cwd=cwd, 45 | ) 46 | print("created image {}".format(image_uri)) 47 | return image_uri 48 | 49 | 50 | def push_image(ecr_image, region, aws_id): 51 | ecr_login(region, aws_id) 52 | _check_call("docker push {}".format(ecr_image)) 53 | 54 | 55 | def ecr_login(region, aws_id): 56 | login = _check_call( 57 | "aws ecr get-login --registry-ids {} ".format(aws_id) 58 | + "--no-include-email --region {}".format(region) 59 | ) 60 | _check_call(login.decode("utf-8").rstrip("\n")) 61 | 62 | 63 | def _check_call(cmd, *popenargs, **kwargs): 64 | if isinstance(cmd, str): 65 | cmd = cmd.split(" ") 66 | _print_cmd(cmd) 67 | return subprocess.check_output(cmd, *popenargs, **kwargs) 68 | 69 | 70 | def _print_cmd(cmd): 71 | print("executing docker command: {}{}{}".format(CYAN_COLOR, " ".join(cmd), END_COLOR)) 72 | sys.stdout.flush() 73 | -------------------------------------------------------------------------------- /test/integration/sagemaker/test_smdataparallel.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
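# Trains MNIST with smdistributed.dataparallel across two ml.p3.16xlarge instances and verifies that a model.tar.gz artifact is produced.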
13 | from __future__ import absolute_import 14 | 15 | import os 16 | 17 | import pytest 18 | import sagemaker 19 | from sagemaker.tensorflow import TensorFlow 20 | from sagemaker.utils import unique_name_from_base 21 | 22 | from integration import DEFAULT_TIMEOUT, RESOURCE_PATH 23 | from integration.sagemaker.timeout import timeout 24 | 25 | 26 | @pytest.mark.skip_cpu 27 | @pytest.mark.skip_generic 28 | @pytest.mark.parametrize( 29 | "instances, instance_type", 30 | [(2, "ml.p3.16xlarge")], 31 | ) 32 | def test_smdataparallel_training(instances, instance_type, sagemaker_session, image_uri, framework_version, tmpdir): 33 | default_bucket = sagemaker_session.default_bucket() 34 | output_path = "s3://{}/{}/{}".format(default_bucket, "tensorflow", "smdataparallel") 35 | 36 | estimator = TensorFlow( 37 | entry_point=os.path.join(RESOURCE_PATH, "mnist", "smdataparallel_mnist.py"), 38 | role="SageMakerRole", 39 | instance_type=instance_type, 40 | sagemaker_session=sagemaker_session, 41 | instance_count=instances, 42 | image_uri=image_uri, 43 | output_path=output_path, 44 | framework_version=framework_version, 45 | py_version="py3", 46 | distribution={"smdistributed": {"dataparallel": {"enabled": True}}} 47 | ) 48 | 49 | with timeout(minutes=DEFAULT_TIMEOUT): 50 | estimator.fit(job_name=unique_name_from_base("test-tf-smdataparallel")) 51 | 52 | model_data_source = sagemaker.local.data.get_data_source_instance( 53 | estimator.model_data, sagemaker_session 54 | ) 55 | 56 | for filename in model_data_source.get_file_list(): 57 | assert os.path.basename(filename) == "model.tar.gz" 58 | -------------------------------------------------------------------------------- /docker/1.10.0/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | MAINTAINER Amazon AI 4 | 5 | ARG framework_installable 6 | ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz 7 | ARG py_version 8 | 9 | # Validate that arguments are specified 10 | RUN test $framework_installable || exit 1 \ 11 | && test $py_version || exit 1 12 | 13 | WORKDIR /root 14 | 15 | COPY $framework_installable . 16 | COPY $framework_support_installable . 
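# The two ARGs above point at installable archives in the build context (the TensorFlow package and this toolkit's sdist); both are supplied with --build-arg when the image is built.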
17 | 18 | RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common \ 19 | && add-apt-repository ppa:deadsnakes/ppa -y 20 | 21 | RUN buildDeps=" \ 22 | build-essential \ 23 | curl \ 24 | git \ 25 | libcurl3-dev \ 26 | libfreetype6-dev \ 27 | libpng12-dev \ 28 | libzmq3-dev \ 29 | pkg-config \ 30 | rsync \ 31 | unzip \ 32 | zip \ 33 | zlib1g-dev \ 34 | openjdk-8-jdk \ 35 | openjdk-8-jre-headless \ 36 | wget \ 37 | vim \ 38 | iputils-ping \ 39 | nginx \ 40 | " \ 41 | && apt-get update && apt-get install -y --no-install-recommends $buildDeps \ 42 | && apt-get clean \ 43 | && rm -rf /var/lib/apt/lists/* 44 | 45 | RUN if [ $py_version -eq 3 ]; \ 46 | then apt-get update && apt-get install -y --no-install-recommends python3.6-dev \ 47 | && ln -s -f /usr/bin/python3.6 /usr/bin/python; \ 48 | else apt-get update && apt-get install -y --no-install-recommends python-dev; fi 49 | 50 | RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \ 51 | python get-pip.py && \ 52 | rm get-pip.py 53 | 54 | RUN pip install --upgrade \ 55 | pip \ 56 | setuptools 57 | 58 | # Set environment variables for MKL 59 | # TODO: investigate the right value for OMP_NUM_THREADS 60 | # For more about MKL with TensorFlow see: 61 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 62 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 KMP_BLOCKTIME=1 KMP_SETTINGS=0 63 | 64 | RUN framework_installable_local=$(basename $framework_installable) \ 65 | && framework_support_installable_local=$(basename $framework_support_installable) \ 66 | && pip install --no-cache --upgrade \ 67 | $framework_installable_local \ 68 | $framework_support_installable_local \ 69 | "sagemaker-tensorflow>=1.10,<1.11" \ 70 | \ 71 | && rm $framework_installable_local \ 72 | && rm $framework_support_installable_local 73 | 74 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 75 | -------------------------------------------------------------------------------- /docker/1.10.0/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:9.0-base-ubuntu16.04 2 | 3 | MAINTAINER Amazon AI 4 | 5 | ARG framework_installable 6 | ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz 7 | ARG py_version 8 | 9 | # Validate that arguments are specified 10 | RUN test $framework_installable || exit 1 \ 11 | && test $py_version || exit 1 12 | 13 | RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common \ 14 | && add-apt-repository ppa:deadsnakes/ppa -y 15 | 16 | RUN buildDeps=" \ 17 | build-essential \ 18 | cuda-command-line-tools-9-0 \ 19 | cuda-cublas-dev-9-0 \ 20 | cuda-cudart-dev-9-0 \ 21 | cuda-cufft-dev-9-0 \ 22 | cuda-curand-dev-9-0 \ 23 | cuda-cusolver-dev-9-0 \ 24 | cuda-cusparse-dev-9-0 \ 25 | curl \ 26 | git \ 27 | libcudnn7=7.1.4.18-1+cuda9.0 \ 28 | libcudnn7-dev=7.1.4.18-1+cuda9.0 \ 29 | libcurl3-dev \ 30 | libfreetype6-dev \ 31 | libpng12-dev \ 32 | libzmq3-dev \ 33 | pkg-config \ 34 | rsync \ 35 | unzip \ 36 | zip \ 37 | zlib1g-dev \ 38 | wget \ 39 | vim \ 40 | nginx \ 41 | iputils-ping \ 42 | " \ 43 | && apt-get update && apt-get install -y --no-install-recommends $buildDeps \ 44 | && rm -rf /var/lib/apt/lists/* \ 45 | && find /usr/local/cuda-9.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete \ 46 | && rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a 47 | 48 | RUN if [ $py_version -eq 3 ]; \ 49 | then apt-get 
update && apt-get install -y --no-install-recommends python3.6-dev \ 50 | && ln -s -f /usr/bin/python3.6 /usr/bin/python; \ 51 | else apt-get update && apt-get install -y --no-install-recommends python-dev; fi 52 | 53 | RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \ 54 | python get-pip.py && \ 55 | rm get-pip.py 56 | 57 | WORKDIR /root 58 | 59 | COPY $framework_installable . 60 | COPY $framework_support_installable . 61 | 62 | RUN framework_installable_local=$(basename $framework_installable) && \ 63 | framework_support_installable_local=$(basename $framework_support_installable) && \ 64 | \ 65 | pip install --no-cache --upgrade $framework_installable_local \ 66 | $framework_support_installable_local \ 67 | "sagemaker-tensorflow>=1.10,<1.11" &&\ 68 | \ 69 | rm $framework_installable_local && \ 70 | rm $framework_support_installable_local 71 | 72 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 73 | -------------------------------------------------------------------------------- /docker/1.11.0/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | ARG framework_installable 6 | ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz 7 | ARG py_version 8 | 9 | # Validate that arguments are specified 10 | RUN test $framework_installable || exit 1 \ 11 | && test $py_version || exit 1 12 | 13 | RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common \ 14 | && add-apt-repository ppa:deadsnakes/ppa -y \ 15 | && rm -rf /var/lib/apt/lists/* 16 | 17 | RUN apt-get update && apt-get install -y --no-install-recommends \ 18 | ca-certificates \ 19 | build-essential \ 20 | curl \ 21 | nginx \ 22 | && if [ $py_version -eq 3 ]; \ 23 | then apt-get install -y --no-install-recommends python3.6-dev \ 24 | && ln -s -f /usr/bin/python3.6 /usr/bin/python; \ 25 | else apt-get install -y --no-install-recommends python-dev; fi \ 26 | && rm -rf /var/lib/apt/lists/* 27 | 28 | # Python won’t try to write .pyc or .pyo files on the import of source modules 29 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 30 | 31 | RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \ 32 | python get-pip.py \ 33 | --disable-pip-version-check \ 34 | --no-cache-dir \ 35 | "pip==18.1" \ 36 | ; \ 37 | pip --version; \ 38 | find /usr/local -depth \ 39 | \( \ 40 | \( -type d -a \( -name test -o -name tests \) \) \ 41 | -o \ 42 | \( -type f -a \( -name '*.pyc' -o -name '*.pyo' \) \) \ 43 | \) -exec rm -rf '{}' +; \ 44 | rm get-pip.py 45 | 46 | # Set environment variables for MKL 47 | # TODO: investigate the right value for OMP_NUM_THREADS 48 | # For more about MKL with TensorFlow see: 49 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 50 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 KMP_BLOCKTIME=1 KMP_SETTINGS=0 51 | 52 | WORKDIR / 53 | 54 | COPY $framework_installable . 55 | COPY $framework_support_installable . 
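# A hedged build sketch (the wheel file name is an assumption, not pinned by
# this Dockerfile): the two COPY lines above expect the TensorFlow wheel and
# the support sdist to sit in the build context, so a typical invocation is
#   docker build -f Dockerfile.cpu \
#     --build-arg py_version=3 \
#     --build-arg framework_installable=tensorflow-1.11.0-cp36-cp36m-linux_x86_64.whl \
#     -t sagemaker-tensorflow:1.11.0-cpu-py3 .
# framework_support_installable falls back to the default declared at the top.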
56 | 57 | RUN pip install --no-cache-dir -U \ 58 | keras==2.2.4 \ 59 | $framework_support_installable \ 60 | "sagemaker-tensorflow>=1.11,<1.12" \ 61 | # Install TensorFlow separately at the end to avoid 62 | # its pinned version being overwritten 63 | && pip install --force-reinstall --no-cache-dir -U $framework_installable \ 64 | \ 65 | && rm -f $framework_installable \ 66 | && rm -f $framework_support_installable \ 67 | && pip uninstall -y --no-cache-dir \ 68 | markdown \ 69 | tensorboard 70 | 71 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 72 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | 15 | from glob import glob 16 | import os 17 | from os.path import basename 18 | from os.path import splitext 19 | import sys 20 | 21 | from setuptools import find_packages, setup 22 | 23 | 24 | def read(fname): 25 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 26 | 27 | 28 | def read_version(): 29 | return read("VERSION").strip() 30 | 31 | 32 | test_dependencies = [ 33 | "tox", 34 | "flake8", 35 | "pytest", 36 | "pytest-cov", 37 | "pytest-xdist", 38 | "pytest-rerunfailures", 39 | "mock", 40 | "sagemaker[local]>=2", 41 | "tensorflow<2.4", 42 | "docker-compose", 43 | "boto3==1.16.34", 44 | "python-dateutil>=2.1,<2.8.1", 45 | "botocore==1.19.34", 46 | "requests-mock", 47 | "awscli==1.18.194", 48 | "protobuf>=3.9.2,<3.20" 49 | ] 50 | 51 | if sys.version_info.major > 2: 52 | test_dependencies.append("sagemaker-experiments==0.1.7") 53 | 54 | setup( 55 | name="sagemaker_tensorflow_training", 56 | version=read_version(), 57 | description="Open source library for using " 58 | "TensorFlow to train models on Amazon SageMaker.", 59 | packages=find_packages(where="src", exclude=("test",)), 60 | package_dir={"": "src"}, 61 | py_modules=[splitext(basename(path))[0] for path in glob("src/*.py")], 62 | long_description=read("README.rst"), 63 | author="Amazon Web Services", 64 | url="https://github.com/aws/sagemaker-tensorflow-training-toolkit", 65 | license="Apache License 2.0", 66 | classifiers=[ 67 | "Development Status :: 5 - Production/Stable", 68 | "Intended Audience :: Developers", 69 | "Natural Language :: English", 70 | "License :: OSI Approved :: Apache Software License", 71 | "Programming Language :: Python", 72 | "Programming Language :: Python :: 3.7", 73 | "Programming Language :: Python :: 3.8", 74 | "Programming Language :: Python :: 3.9", 75 | ], 76 | install_requires=[ 77 | "sagemaker-training>=4.3.0,<=4.8.3", 78 | "numpy < 1.24", 79 | "scipy", 80 | "scikit-learn", 81 | "pandas", 82 | "Pillow", 83 | "h5py", 84 | ], 85 | extras_require={"test": test_dependencies, "benchmark": ["click"], }, 86 | ) 87 | 
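# A hedged usage note for the packaging above: the "test" extra wires in tox,
# pytest, and the pinned SageMaker/boto3 tooling from test_dependencies, so a
# local development install is typically
#   pip install -e ".[test]"
# after which the suites are driven through tox (see tox.ini at the repo root).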
-------------------------------------------------------------------------------- /test/integration/sagemaker/test_horovod_sagemaker.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | 15 | import os 16 | 17 | import pytest 18 | import sagemaker 19 | from sagemaker.tensorflow import TensorFlow 20 | from sagemaker.utils import unique_name_from_base 21 | 22 | RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources") 23 | 24 | 25 | @pytest.mark.skip_generic 26 | def test_distributed_training_horovod( 27 | sagemaker_session, instance_type, image_uri, tmpdir, framework_version 28 | ): 29 | 30 | mpi_options = "-verbose -x orte_base_help_aggregate=0" 31 | estimator = TensorFlow( 32 | entry_point=os.path.join(RESOURCE_PATH, "mnist", "horovod_mnist.py"), 33 | role="SageMakerRole", 34 | instance_type=instance_type, 35 | instance_count=2, 36 | image_uri=image_uri, 37 | framework_version=framework_version, 38 | py_version="py3", 39 | hyperparameters={ 40 | "sagemaker_mpi_enabled": True, 41 | "sagemaker_mpi_custom_mpi_options": mpi_options, 42 | "sagemaker_mpi_num_of_processes_per_host": 1, 43 | }, 44 | sagemaker_session=sagemaker_session, 45 | ) 46 | 47 | estimator.fit(job_name=unique_name_from_base("test-tf-horovod")) 48 | 49 | model_data_source = sagemaker.local.data.get_data_source_instance( 50 | estimator.model_data, sagemaker_session 51 | ) 52 | 53 | for filename in model_data_source.get_file_list(): 54 | assert os.path.basename(filename) == "model.tar.gz" 55 | 56 | 57 | @pytest.mark.skip_generic 58 | def test_distributed_training_horovod_with_env_vars( 59 | sagemaker_session, instance_type, image_uri, tmpdir, framework_version 60 | ): 61 | 62 | mpi_options = "-verbose -x orte_base_help_aggregate=0" 63 | estimator = TensorFlow( 64 | entry_point=os.path.join(RESOURCE_PATH, "hvdbasic", "train_hvd_env_vars.py"), 65 | role="SageMakerRole", 66 | instance_type=instance_type, 67 | instance_count=2, 68 | image_uri=image_uri, 69 | framework_version=framework_version, 70 | py_version="py3", 71 | hyperparameters={ 72 | "sagemaker_mpi_enabled": True, 73 | "sagemaker_mpi_custom_mpi_options": mpi_options, 74 | "sagemaker_mpi_num_of_processes_per_host": 2, 75 | }, 76 | sagemaker_session=sagemaker_session, 77 | ) 78 | 79 | estimator.fit(job_name=unique_name_from_base("test-tf-horovod-env-vars")) 80 | -------------------------------------------------------------------------------- /test/resources/mnist/smdataparallel_mnist.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. 
A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | import tensorflow as tf 14 | 15 | import smdistributed.dataparallel.tensorflow as dist 16 | 17 | tf.random.set_seed(42) 18 | 19 | dist.init() 20 | 21 | gpus = tf.config.experimental.list_physical_devices("GPU") 22 | for gpu in gpus: 23 | tf.config.experimental.set_memory_growth(gpu, True) 24 | if gpus: 25 | tf.config.experimental.set_visible_devices(gpus[dist.local_rank()], "GPU") 26 | 27 | (mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data( 28 | path="mnist-%d.npz" % dist.rank() 29 | ) 30 | 31 | dataset = tf.data.Dataset.from_tensor_slices( 32 | (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), tf.cast(mnist_labels, tf.int64)) 33 | ) 34 | dataset = dataset.repeat().shuffle(10000).batch(128) 35 | 36 | mnist_model = tf.keras.Sequential( 37 | [ 38 | tf.keras.layers.Conv2D(32, [3, 3], activation="relu"), 39 | tf.keras.layers.Conv2D(64, [3, 3], activation="relu"), 40 | tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), 41 | tf.keras.layers.Dropout(0.25), 42 | tf.keras.layers.Flatten(), 43 | tf.keras.layers.Dense(128, activation="relu"), 44 | tf.keras.layers.Dropout(0.5), 45 | tf.keras.layers.Dense(10, activation="softmax"), 46 | ] 47 | ) 48 | loss = tf.losses.SparseCategoricalCrossentropy() 49 | # LR for 8 node run : 0.000125 50 | # LR for single node run : 0.001 51 | opt = tf.optimizers.Adam(0.000125 * dist.size()) 52 | 53 | checkpoint_dir = "./checkpoints" 54 | checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt) 55 | 56 | 57 | @tf.function 58 | def training_step(images, labels, first_batch): 59 | with tf.GradientTape() as tape: 60 | probs = mnist_model(images, training=True) 61 | loss_value = loss(labels, probs) 62 | 63 | tape = dist.DistributedGradientTape(tape) 64 | 65 | grads = tape.gradient(loss_value, mnist_model.trainable_variables) 66 | opt.apply_gradients(zip(grads, mnist_model.trainable_variables)) 67 | 68 | if first_batch: 69 | dist.broadcast_variables(mnist_model.variables, root_rank=0) 70 | dist.broadcast_variables(opt.variables(), root_rank=0) 71 | 72 | loss_value = dist.oob_allreduce(loss_value) # Average the loss across workers 73 | return loss_value 74 | 75 | 76 | for batch, (images, labels) in enumerate(dataset.take(10000 // dist.size())): 77 | loss_value = training_step(images, labels, batch == 0) 78 | 79 | if batch % 50 == 0 and dist.rank() == 0: 80 | print("Step #%d\tLoss: %.6f" % (batch, loss_value)) 81 | 82 | if dist.rank() == 0: 83 | checkpoint.save(checkpoint_dir) 84 | -------------------------------------------------------------------------------- /test/integration/local/test_horovod_local.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. 
This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | 15 | import json 16 | import os 17 | import tarfile 18 | 19 | import pytest 20 | from sagemaker.tensorflow import TensorFlow 21 | 22 | 23 | RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources") 24 | 25 | 26 | @pytest.mark.skip_cpu 27 | @pytest.mark.skip_generic 28 | def test_distributed_training_horovod_gpu( 29 | sagemaker_local_session, image_uri, tmpdir, framework_version 30 | ): 31 | _test_distributed_training_horovod( 32 | 1, 2, sagemaker_local_session, image_uri, tmpdir, framework_version, "local_gpu" 33 | ) 34 | 35 | 36 | @pytest.mark.skip_gpu 37 | @pytest.mark.skip_generic 38 | @pytest.mark.parametrize("instances, processes", [(2, 2)]) 39 | def test_distributed_training_horovod_cpu( 40 | instances, processes, sagemaker_local_session, image_uri, tmpdir, framework_version 41 | ): 42 | _test_distributed_training_horovod( 43 | instances, processes, sagemaker_local_session, image_uri, tmpdir, framework_version, "local" 44 | ) 45 | 46 | 47 | def _test_distributed_training_horovod( 48 | instances, processes, session, image_uri, tmpdir, framework_version, instance_type 49 | ): 50 | output_path = "file://%s" % tmpdir 51 | estimator = TensorFlow( 52 | entry_point=os.path.join(RESOURCE_PATH, "hvdbasic", "train_hvd_basic.py"), 53 | role="SageMakerRole", 54 | instance_type=instance_type, 55 | sagemaker_session=session, 56 | instance_count=instances, 57 | image_uri=image_uri, 58 | output_path=output_path, 59 | hyperparameters={ 60 | "sagemaker_mpi_enabled": True, 61 | "sagemaker_network_interface_name": "eth0", 62 | "sagemaker_mpi_num_of_processes_per_host": processes, 63 | }, 64 | ) 65 | 66 | estimator.fit("file://{}".format(os.path.join(RESOURCE_PATH, "mnist", "data-distributed"))) 67 | 68 | tmp = str(tmpdir) 69 | extract_files(output_path.replace("file://", ""), tmp) 70 | 71 | size = instances * processes 72 | 73 | for rank in range(size): 74 | local_rank = rank % processes 75 | assert read_json("local-rank-%s-rank-%s" % (local_rank, rank), tmp) == { 76 | "local-rank": local_rank, 77 | "rank": rank, 78 | "size": size, 79 | } 80 | 81 | 82 | def read_json(file, tmp): 83 | with open(os.path.join(tmp, file)) as f: 84 | return json.load(f) 85 | 86 | 87 | def assert_files_exist_in_tar(output_path, files): 88 | if output_path.startswith("file://"): 89 | output_path = output_path[7:] 90 | model_file = os.path.join(output_path, "model.tar.gz") 91 | with tarfile.open(model_file) as tar: 92 | for f in files: 93 | tar.getmember(f) 94 | 95 | 96 | def extract_files(output_path, tmpdir): 97 | with tarfile.open(os.path.join(output_path, "model.tar.gz")) as tar: 98 | tar.extractall(tmpdir) 99 | -------------------------------------------------------------------------------- /docker/1.11.0/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:9.0-base-ubuntu16.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | ARG framework_installable 6 | ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz 7 | ARG py_version 8 | 9 | # Validate that arguments are specified 10 | RUN test $framework_installable || exit 1 \ 11 | && test $py_version || exit 1 12 | 13 | RUN apt-get update && apt-get install -y --no-install-recommends 
software-properties-common \ 14 | && add-apt-repository ppa:deadsnakes/ppa -y \ 15 | && rm -rf /var/lib/apt/lists/* 16 | 17 | ENV NCCL_VERSION=2.3.5-2+cuda9.0 18 | ENV CUDNN_VERSION=7.3.1.20-1+cuda9.0 19 | ENV TF_TENSORRT_VERSION=4.1.2 20 | 21 | RUN apt-get update && apt-get install -y --no-install-recommends \ 22 | build-essential \ 23 | ca-certificates \ 24 | cuda-command-line-tools-9-0 \ 25 | cuda-cublas-dev-9-0 \ 26 | cuda-cudart-dev-9-0 \ 27 | cuda-cufft-dev-9-0 \ 28 | cuda-curand-dev-9-0 \ 29 | cuda-cusolver-dev-9-0 \ 30 | cuda-cusparse-dev-9-0 \ 31 | curl \ 32 | libcudnn7=${CUDNN_VERSION} \ 33 | libnccl2=${NCCL_VERSION} \ 34 | libgomp1 \ 35 | # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 36 | # adds a new list which contains libnvinfer library, so it needs another 37 | # 'apt-get update' to retrieve that list before it can actually install the 38 | # library. 39 | # We don't install libnvinfer-dev since we don't need to build against TensorRT, 40 | # and libnvinfer4 doesn't contain libnvinfer.a static library. 41 | && apt-get update && apt-get install -y --no-install-recommends \ 42 | nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 \ 43 | && apt-get update && apt-get install -y --no-install-recommends \ 44 | libnvinfer4=${TF_TENSORRT_VERSION}-1+cuda9.0 \ 45 | && apt-get clean \ 46 | && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \ 47 | && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \ 48 | && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \ 49 | && if [ $py_version -eq 3 ]; \ 50 | then apt-get install -y --no-install-recommends python3.6-dev \ 51 | && ln -s -f /usr/bin/python3.6 /usr/bin/python; \ 52 | else apt-get install -y --no-install-recommends python-dev; fi \ 53 | && rm -rf /var/lib/apt/lists/* 54 | 55 | # Python won’t try to write .pyc or .pyo files on the import of source modules 56 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 57 | 58 | RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \ 59 | python get-pip.py \ 60 | --disable-pip-version-check \ 61 | --no-cache-dir \ 62 | "pip==18.1" \ 63 | ; \ 64 | pip --version; \ 65 | find /usr/local -depth \ 66 | \( \ 67 | \( -type d -a \( -name test -o -name tests \) \) \ 68 | -o \ 69 | \( -type f -a \( -name '*.pyc' -o -name '*.pyo' \) \) \ 70 | \) -exec rm -rf '{}' +; \ 71 | rm get-pip.py 72 | 73 | WORKDIR / 74 | 75 | COPY $framework_installable . 76 | COPY $framework_support_installable . 77 | 78 | RUN pip install --no-cache-dir -U \ 79 | keras==2.2.4 \ 80 | $framework_support_installable \ 81 | "sagemaker-tensorflow>=1.11,<1.12" \ 82 | # Let's install TensorFlow separately in the end to avoid 83 | # the library version to be overwritten 84 | && pip install --force-reinstall --no-cache-dir -U $framework_installable \ 85 | \ 86 | && rm -f $framework_installable \ 87 | && rm -f $framework_support_installable \ 88 | && pip uninstall -y --no-cache-dir \ 89 | markdown \ 90 | tensorboard 91 | 92 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 93 | -------------------------------------------------------------------------------- /docker/build_artifacts/deep_learning_container.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. 
A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | 15 | import json 16 | import logging 17 | import re 18 | 19 | import requests 20 | 21 | 22 | def _validate_instance_id(instance_id): 23 | """ 24 | Validate instance ID 25 | """ 26 | instance_id_regex = r'^(i-\S{17})' 27 | compiled_regex = re.compile(instance_id_regex) 28 | match = compiled_regex.match(instance_id) 29 | 30 | if not match: 31 | return None 32 | 33 | return match.group(1) 34 | 35 | 36 | def _retrieve_instance_id(): 37 | """ 38 | Retrieve instance ID from instance metadata service 39 | """ 40 | instance_id = None 41 | url = "http://169.254.169.254/latest/meta-data/instance-id" 42 | response = requests_helper(url, timeout=0.1) 43 | 44 | if response is not None: 45 | instance_id = _validate_instance_id(response.text) 46 | 47 | return instance_id 48 | 49 | 50 | def _retrieve_instance_region(): 51 | """ 52 | Retrieve instance region from instance metadata service 53 | """ 54 | region = None 55 | valid_regions = ['ap-northeast-1', 'ap-northeast-2', 'ap-southeast-1', 'ap-southeast-2', 56 | 'ap-south-1', 'ca-central-1', 'eu-central-1', 'eu-north-1', 57 | 'eu-west-1', 'eu-west-2', 'eu-west-3', 'sa-east-1', 58 | 'us-east-1', 'us-east-2', 'us-west-1', 'us-west-2'] 59 | 60 | url = "http://169.254.169.254/latest/dynamic/instance-identity/document" 61 | response = requests_helper(url, timeout=0.1) 62 | 63 | if response is not None: 64 | response_json = json.loads(response.text) 65 | 66 | if response_json['region'] in valid_regions: 67 | region = response_json['region'] 68 | 69 | return region 70 | 71 | 72 | def query_bucket(): 73 | """ 74 | GET request on an empty object from an Amazon S3 bucket 75 | """ 76 | response = None 77 | instance_id = _retrieve_instance_id() 78 | region = _retrieve_instance_region() 79 | 80 | if instance_id is not None and region is not None: 81 | url = ("https://aws-deep-learning-containers-{0}.s3.{0}.amazonaws.com" 82 | "/dlc-containers.txt?x-instance-id={1}".format(region, instance_id)) 83 | response = requests_helper(url, timeout=0.2) 84 | 85 | logging.debug("Query bucket finished: {}".format(response)) 86 | 87 | return response 88 | 89 | 90 | def requests_helper(url, timeout): 91 | response = None 92 | try: 93 | response = requests.get(url, timeout=timeout) 94 | except requests.exceptions.RequestException as e: 95 | logging.error("Request exception: {}".format(e)) 96 | 97 | return response 98 | 99 | 100 | def main(): 101 | """ 102 | Invoke bucket query 103 | """ 104 | # Logs are not necessary for normal run. Remove this line while debugging. 
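    # (Hedged clarification: "this line" is the disabled-flag assignment just
    # below. Disabling the root logger silences both the logging.debug() call
    # in query_bucket() and the logging.error() path in requests_helper(), so
    # the telemetry ping stays quiet on hosts without the metadata service.)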
105 | logging.getLogger().disabled = True 106 | 107 | logging.basicConfig(level=logging.ERROR) 108 | query_bucket() 109 | 110 | 111 | if __name__ == '__main__': 112 | main() 113 | -------------------------------------------------------------------------------- /docker/1.12.0/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | RUN apt-get update && apt-get install -y --no-install-recommends \ 6 | software-properties-common \ 7 | build-essential \ 8 | openssh-client \ 9 | openssh-server \ 10 | ca-certificates \ 11 | curl \ 12 | && add-apt-repository ppa:deadsnakes/ppa -y \ 13 | && rm -rf /var/lib/apt/lists/* 14 | 15 | # Install Open MPI 16 | RUN mkdir /tmp/openmpi && \ 17 | cd /tmp/openmpi && \ 18 | curl -fSsL -O https://www.open-mpi.org/software/ompi/v3.1/downloads/openmpi-3.1.2.tar.gz && \ 19 | tar zxf openmpi-3.1.2.tar.gz && \ 20 | cd openmpi-3.1.2 && \ 21 | ./configure --enable-orterun-prefix-by-default && \ 22 | make -j $(nproc) all && \ 23 | make install && \ 24 | ldconfig && \ 25 | rm -rf /tmp/openmpi 26 | 27 | 28 | # Create a wrapper for OpenMPI to allow running as root by default 29 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \ 30 | echo '#!/bin/bash' > /usr/local/bin/mpirun && \ 31 | echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \ 32 | chmod a+x /usr/local/bin/mpirun 33 | 34 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \ 35 | echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 36 | 37 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 38 | 39 | ENV PATH /usr/local/openmpi/bin/:$PATH 40 | 41 | # SSH login fix. Otherwise user is kicked off after login 42 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 43 | 44 | # Create SSH key. 45 | RUN mkdir -p /root/.ssh/ && \ 46 | mkdir -p /var/run/sshd && \ 47 | ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \ 48 | cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \ 49 | printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 50 | 51 | # Set environment variables for MKL 52 | # For more about MKL with TensorFlow see: 53 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 54 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 KMP_BLOCKTIME=1 KMP_SETTINGS=0 55 | 56 | WORKDIR / 57 | 58 | ARG py_version 59 | 60 | RUN if [ $py_version -eq 3 ]; then PYTHON_VERSION=python3.6; else PYTHON_VERSION=python2.7; fi && \ 61 | apt-get update && apt-get install -y --no-install-recommends $PYTHON_VERSION-dev --allow-unauthenticated && \ 62 | ln -s -f /usr/bin/$PYTHON_VERSION /usr/bin/python && \ 63 | rm -rf /var/lib/apt/lists/* 64 | 65 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 66 | 67 | RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \ 68 | python get-pip.py --disable-pip-version-check --no-cache-dir "pip==18.1" && \ 69 | rm get-pip.py 70 | 71 | ARG framework_installable 72 | ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz 73 | 74 | COPY $framework_installable tensorflow-1.12.0-py2.py3-none-any.whl 75 | COPY $framework_support_installable . 
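# (Hedged note on the COPY above: the wheel is copied to the canonical name
# tensorflow-1.12.0-py2.py3-none-any.whl so pip treats it as the "tensorflow"
# distribution no matter what $framework_installable was called in the build
# context, and the horovod build in the next layer compiles against it.)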
76 | 77 | RUN pip install --no-cache-dir -U \ 78 | keras==2.2.4 \ 79 | mpi4py==3.0.1 \ 80 | "sagemaker-tensorflow>=1.12,<1.13" && \ 81 | # Let's install TensorFlow separately in the end to avoid 82 | # the library version to be overwritten 83 | pip install --force-reinstall --no-cache-dir -U \ 84 | tensorflow-1.12.0-py2.py3-none-any.whl \ 85 | horovod && \ 86 | pip install --no-cache-dir -U $framework_support_installable && \ 87 | rm -f tensorflow-1.12.0-py2.py3-none-any.whl && \ 88 | rm -f $framework_support_installable && \ 89 | pip uninstall -y --no-cache-dir \ 90 | markdown \ 91 | tensorboard 92 | 93 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 94 | -------------------------------------------------------------------------------- /test/resources/mnist/horovod_mnist.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | import os 14 | import tensorflow as tf 15 | import horovod.tensorflow as hvd 16 | 17 | # Horovod: initialize Horovod. 18 | hvd.init() 19 | 20 | # Horovod: pin GPU to be used to process local rank (one GPU per process) 21 | gpus = tf.config.experimental.list_physical_devices("GPU") 22 | for gpu in gpus: 23 | tf.config.experimental.set_memory_growth(gpu, True) 24 | if gpus: 25 | tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU") 26 | 27 | (mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data( 28 | path="mnist-%d.npz" % hvd.rank() 29 | ) 30 | 31 | dataset = tf.data.Dataset.from_tensor_slices( 32 | (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), tf.cast(mnist_labels, tf.int64)) 33 | ) 34 | dataset = dataset.repeat().shuffle(10000).batch(128) 35 | 36 | mnist_model = tf.keras.Sequential( 37 | [ 38 | tf.keras.layers.Conv2D(32, [3, 3], activation="relu"), 39 | tf.keras.layers.Conv2D(64, [3, 3], activation="relu"), 40 | tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), 41 | tf.keras.layers.Dropout(0.25), 42 | tf.keras.layers.Flatten(), 43 | tf.keras.layers.Dense(128, activation="relu"), 44 | tf.keras.layers.Dropout(0.5), 45 | tf.keras.layers.Dense(10, activation="softmax"), 46 | ] 47 | ) 48 | loss = tf.losses.SparseCategoricalCrossentropy() 49 | 50 | # Horovod: adjust learning rate based on number of GPUs. 51 | opt = tf.optimizers.Adam(0.001 * hvd.size()) 52 | 53 | checkpoint_dir = "./checkpoints" 54 | checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt) 55 | 56 | 57 | @tf.function 58 | def training_step(images, labels, first_batch): 59 | with tf.GradientTape() as tape: 60 | probs = mnist_model(images, training=True) 61 | loss_value = loss(labels, probs) 62 | 63 | # Horovod: add Horovod Distributed GradientTape. 
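    # (Hedged note: the wrapped tape allreduce-averages each gradient across
    # all ranks inside tape.gradient(), which is what justifies scaling the
    # Adam learning rate by hvd.size() above.)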
64 | tape = hvd.DistributedGradientTape(tape) 65 | 66 | grads = tape.gradient(loss_value, mnist_model.trainable_variables) 67 | opt.apply_gradients(zip(grads, mnist_model.trainable_variables)) 68 | 69 | # Horovod: broadcast initial variable states from rank 0 to all other processes. 70 | # This is necessary to ensure consistent initialization of all workers when 71 | # training is started with random weights or restored from a checkpoint. 72 | # 73 | # Note: broadcast should be done after the first gradient step to ensure optimizer 74 | # initialization. 75 | if first_batch: 76 | hvd.broadcast_variables(mnist_model.variables, root_rank=0) 77 | hvd.broadcast_variables(opt.variables(), root_rank=0) 78 | 79 | return loss_value 80 | 81 | 82 | # Horovod: adjust number of steps based on number of GPUs. 83 | for batch, (images, labels) in enumerate(dataset.take(600 // hvd.size())): 84 | loss_value = training_step(images, labels, batch == 0) 85 | 86 | if batch % 10 == 0 and hvd.local_rank() == 0: 87 | print("Step #%d\tLoss: %.6f" % (batch, loss_value)) 88 | 89 | # Horovod: save checkpoints only on worker 0 to prevent other workers from 90 | # corrupting it. 91 | if hvd.rank() == 0: 92 | # Export the keras model as Tensorflow SavedModelBundle 93 | mnist_model.save(os.path.join("/opt/ml/model/mnist/1"), save_format="tf") 94 | -------------------------------------------------------------------------------- /src/sagemaker_tensorflow_container/deep_learning_container.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
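# (Hedged note: this module mirrors docker/build_artifacts/deep_learning_container.py.
# Both hit the EC2 instance metadata service at 169.254.169.254 with sub-second
# timeouts, so on non-EC2 hosts the requests fail fast and the ping is a no-op.)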
13 | from __future__ import absolute_import 14 | 15 | import json 16 | import logging 17 | import re 18 | 19 | import requests 20 | 21 | 22 | def _validate_instance_id(instance_id): 23 | """ 24 | Validate instance ID 25 | """ 26 | compiled_regex = re.compile(r"^(i-\S{17})") 27 | match = compiled_regex.match(instance_id) 28 | 29 | if not match: 30 | return None 31 | 32 | return match.group(1) 33 | 34 | 35 | def _retrieve_instance_id(): 36 | """ 37 | Retrieve instance ID from instance metadata service 38 | """ 39 | instance_id = None 40 | url = "http://169.254.169.254/latest/meta-data/instance-id" 41 | response = requests_helper(url, timeout=0.1) 42 | 43 | if response is not None: 44 | instance_id = _validate_instance_id(response.text) 45 | 46 | return instance_id 47 | 48 | 49 | def _retrieve_instance_region(): 50 | """ 51 | Retrieve instance region from instance metadata service 52 | """ 53 | region = None 54 | valid_regions = [ 55 | "ap-northeast-1", 56 | "ap-northeast-2", 57 | "ap-southeast-1", 58 | "ap-southeast-2", 59 | "ap-south-1", 60 | "ca-central-1", 61 | "eu-central-1", 62 | "eu-north-1", 63 | "eu-west-1", 64 | "eu-west-2", 65 | "eu-west-3", 66 | "sa-east-1", 67 | "us-east-1", 68 | "us-east-2", 69 | "us-west-1", 70 | "us-west-2", 71 | ] 72 | 73 | url = "http://169.254.169.254/latest/dynamic/instance-identity/document" 74 | response = requests_helper(url, timeout=0.1) 75 | 76 | if response is not None: 77 | response_json = json.loads(response.text) 78 | 79 | if response_json["region"] in valid_regions: 80 | region = response_json["region"] 81 | 82 | return region 83 | 84 | 85 | def query_bucket(): 86 | """ 87 | GET request on an empty object from an Amazon S3 bucket 88 | """ 89 | response = None 90 | instance_id = _retrieve_instance_id() 91 | region = _retrieve_instance_region() 92 | 93 | if instance_id is not None and region is not None: 94 | url = "https://aws-deep-learning-containers-{0}.s3.{0}.amazonaws.com/dlc-containers.txt?x-instance-id={1}".format( # noqa: E501 95 | region, instance_id 96 | ) 97 | response = requests_helper(url, timeout=0.2) 98 | 99 | logging.debug("Query bucket finished: {}".format(response)) 100 | 101 | return response 102 | 103 | 104 | def requests_helper(url, timeout): 105 | response = None 106 | try: 107 | response = requests.get(url, timeout=timeout) 108 | except requests.exceptions.RequestException as e: 109 | logging.error("Request exception: {}".format(e)) 110 | 111 | return response 112 | 113 | 114 | def main(): 115 | """ 116 | Invoke bucket query 117 | """ 118 | # Logs are not necessary for normal run. Remove this line while debugging. 119 | logging.getLogger().disabled = True 120 | 121 | logging.basicConfig(level=logging.ERROR) 122 | query_bucket() 123 | 124 | 125 | if __name__ == "__main__": 126 | main() 127 | -------------------------------------------------------------------------------- /benchmarks/tf_benchmarks/execute_tensorflow_training.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"). You 5 | # may not use this file except in compliance with the License. A copy of 6 | # the License is located at 7 | # 8 | # http://aws.amazon.com/apache2.0/ 9 | # 10 | # or in the "license" file accompanying this file. 
This file is 11 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 12 | # ANY KIND, either express or implied. See the License for the specific 13 | # language governing permissions and limitations under the License. 14 | 15 | from __future__ import absolute_import 16 | 17 | import argparse 18 | import itertools 19 | import os 20 | 21 | from sagemaker import Session 22 | from sagemaker.estimator import Framework 23 | from sagemaker.tensorflow import TensorFlow 24 | 25 | default_bucket = Session().default_bucket 26 | dir_path = os.path.dirname(os.path.realpath(__file__)) 27 | 28 | _DEFAULT_HYPERPARAMETERS = { 29 | "batch_size": 32, 30 | "model": "resnet32", 31 | "num_epochs": 10, 32 | "data_format": "NHWC", 33 | "summary_verbosity": 1, 34 | "save_summaries_steps": 10, 35 | "data_name": "cifar10", 36 | } 37 | 38 | 39 | class ScriptModeTensorFlow(Framework): 40 | """This class is temporary until the final version of Script Mode is released. 41 | """ 42 | 43 | __framework_name__ = "tensorflow-scriptmode-beta" 44 | 45 | create_model = TensorFlow.create_model 46 | 47 | def __init__(self, py_version="py3", **kwargs): 48 | super(ScriptModeTensorFlow, self).__init__(**kwargs) 49 | self.py_version = py_version 50 | self.image_name = None 51 | self.framework_version = "1.10.0" 52 | 53 | 54 | def get_args(): 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument( 57 | "-t", "--instance-types", nargs="+", help=" Set flag", required=True 58 | ) 59 | parser.add_argument("-r", "--role", required=True) 60 | parser.add_argument("-w", "--wait", action="store_true") 61 | parser.add_argument("--region", default="us-west-2") 62 | parser.add_argument("--py-versions", nargs="+", help=" Set flag", default=["py3"]) 63 | parser.add_argument( 64 | "--checkpoint-path", 65 | default=os.path.join(default_bucket(), "benchmarks", "checkpoints"), 66 | help="The S3 location where the model checkpoints and tensorboard events are saved after training", 67 | ) 68 | 69 | return parser.parse_known_args() 70 | 71 | 72 | def main(args, script_args): 73 | for instance_type, py_version in itertools.product(args.instance_types, args.py_versions): 74 | base_name = "%s-%s-%s" % (py_version, instance_type[3:5], instance_type[6:]) 75 | model_dir = os.path.join(args.checkpoint_path, base_name) 76 | 77 | job_hps = create_hyperparameters(model_dir, script_args) 78 | 79 | print("hyperparameters:") 80 | print(job_hps) 81 | 82 | estimator = ScriptModeTensorFlow( 83 | entry_point="tf_cnn_benchmarks.py", 84 | role="SageMakerRole", 85 | source_dir=os.path.join(dir_path, "tf_cnn_benchmarks"), 86 | base_job_name=base_name, 87 | train_instance_count=1, 88 | hyperparameters=job_hps, 89 | train_instance_type=instance_type, 90 | ) 91 | 92 | input_dir = "s3://sagemaker-sample-data-%s/spark/mnist/train/" % args.region 93 | estimator.fit({"train": input_dir}, wait=args.wait) 94 | 95 | print("To use TensorBoard, execute the following command:") 96 | cmd = "S3_USE_HTTPS=0 S3_VERIFY_SSL=0 AWS_REGION=%s tensorboard --host localhost --port 6006 --logdir %s" 97 | print(cmd % (args.region, args.checkpoint_path)) 98 | 99 | 100 | def create_hyperparameters(model_dir, script_args): 101 | job_hps = _DEFAULT_HYPERPARAMETERS.copy() 102 | 103 | job_hps.update({"train_dir": model_dir, "eval_dir": model_dir}) 104 | 105 | script_arg_keys_without_dashes = [ 106 | key[2:] if key.startswith("--") else key[1:] for key in script_args[::2] 107 | ] 108 | script_arg_values = script_args[1::2] 109 | 
job_hps.update(dict(zip(script_arg_keys_without_dashes, script_arg_values))) 110 | 111 | return job_hps 112 | 113 | 114 | if __name__ == "__main__": 115 | args, script_args = get_args() 116 | main(args, script_args) 117 | -------------------------------------------------------------------------------- /docker/2.1.0/py2/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | # prevent stopping by user interaction 6 | ENV DEBIAN_FRONTEND noninteractive 7 | ENV DEBCONF_NONINTERACTIVE_SEEN true 8 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 9 | 10 | # Set environment variables for MKL 11 | # For more about MKL with TensorFlow see: 12 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 13 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 14 | ENV KMP_BLOCKTIME=1 15 | ENV KMP_SETTINGS=0 16 | 17 | ENV PYTHONDONTWRITEBYTECODE=1 18 | ENV PYTHONUNBUFFERED=1 19 | ENV PYTHONIOENCODING=UTF-8 20 | ENV LANG=C.UTF-8 21 | ENV LC_ALL=C.UTF-8 22 | 23 | ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/2.1/AmazonLinux/cpu/final/tensorflow-2.1.0-cp27-cp27mu-manylinux2010_x86_64.whl 24 | 25 | ARG PYTHON=python 26 | ARG PYTHON_PIP=python-pip 27 | ARG PIP=pip 28 | 29 | RUN apt-get update && apt-get install -y --no-install-recommends \ 30 | software-properties-common \ 31 | build-essential \ 32 | openssh-client \ 33 | openssh-server \ 34 | ca-certificates \ 35 | curl \ 36 | git \ 37 | wget \ 38 | vim \ 39 | zlib1g-dev \ 40 | # Install dependent library for OpenCV 41 | libgtk2.0-dev \ 42 | && rm -rf /var/lib/apt/lists/* 43 | 44 | # Install Open MPI 45 | RUN mkdir /tmp/openmpi \ 46 | && cd /tmp/openmpi \ 47 | && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 48 | && tar zxf openmpi-4.0.1.tar.gz \ 49 | && cd openmpi-4.0.1 \ 50 | && ./configure --enable-orterun-prefix-by-default \ 51 | && make -j $(nproc) all \ 52 | && make install \ 53 | && ldconfig \ 54 | && rm -rf /tmp/openmpi 55 | 56 | # Create a wrapper for OpenMPI to allow running as root by default 57 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 58 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 59 | && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ 60 | && chmod a+x /usr/local/bin/mpirun 61 | 62 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 63 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 64 | 65 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 66 | ENV PATH /usr/local/openmpi/bin/:$PATH 67 | 68 | # SSH login fix. Otherwise user is kicked off after login 69 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 70 | 71 | # Create SSH key. 
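# (Hedged note: the passwordless key pair created below, together with the
# StrictHostKeyChecking override, lets mpirun bootstrap its daemons on peer
# containers without interactive auth; this is tolerable only because SageMaker
# places the training containers on a private, per-job network.)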
72 | RUN mkdir -p /root/.ssh/ \ 73 | && mkdir -p /var/run/sshd \ 74 | && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ 75 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ 76 | && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 77 | 78 | WORKDIR / 79 | 80 | RUN apt-get update && apt-get install -y \ 81 | ${PYTHON} \ 82 | ${PYTHON_PIP} 83 | 84 | RUN ${PIP} --no-cache-dir install --upgrade \ 85 | pip \ 86 | setuptools 87 | 88 | # Some TF tools expect a "python" binary 89 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 90 | 91 | # install PyYAML==5.1.2 to avoid conflict with latest awscli 92 | # # python-dateutil==2.8.0 to satisfy botocore associated with latest awscli 93 | RUN ${PIP} install --no-cache-dir -U \ 94 | numpy==1.16.6 \ 95 | scipy==1.2.2 \ 96 | scikit-learn==0.20.4 \ 97 | pandas==0.24.2 \ 98 | Pillow==6.2.2 \ 99 | h5py==2.10.0 \ 100 | keras_applications==1.0.8 \ 101 | keras_preprocessing==1.1.0 \ 102 | keras==2.3.1 \ 103 | python-dateutil==2.8.1 \ 104 | pyYAML==5.3.1 \ 105 | requests==2.22.0 \ 106 | awscli \ 107 | mpi4py==3.0.3 \ 108 | opencv-python==4.2.0.32 \ 109 | "cryptography>=2.3" \ 110 | "sagemaker-tensorflow>=2.1,<2.2" \ 111 | "sagemaker-tensorflow-training>2,<4" \ 112 | # Let's install TensorFlow separately in the end to avoid 113 | # the library version to be overwritten 114 | && ${PIP} install --no-cache-dir -U \ 115 | ${TF_URL} \ 116 | && ${PIP} install --no-cache-dir -U \ 117 | horovod==0.18.2 118 | 119 | ADD https://raw.githubusercontent.com/aws/aws-deep-learning-containers-utils/master/deep_learning_container.py /usr/local/bin/deep_learning_container.py 120 | 121 | RUN chmod +x /usr/local/bin/deep_learning_container.py 122 | 123 | RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-2.1/license.txt -o /license.txt 124 | 125 | CMD ["bin/bash"] 126 | -------------------------------------------------------------------------------- /docker/2.1.0/py3/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | # prevent stopping by user interaction 6 | ENV DEBIAN_FRONTEND noninteractive 7 | ENV DEBCONF_NONINTERACTIVE_SEEN true 8 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 9 | 10 | # Set environment variables for MKL 11 | # For more about MKL with TensorFlow see: 12 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 13 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 14 | ENV KMP_BLOCKTIME=1 15 | ENV KMP_SETTINGS=0 16 | 17 | ENV PYTHONDONTWRITEBYTECODE=1 18 | ENV PYTHONUNBUFFERED=1 19 | ENV PYTHONIOENCODING=UTF-8 20 | ENV LANG=C.UTF-8 21 | ENV LC_ALL=C.UTF-8 22 | 23 | ARG PYTHON=python3 24 | ARG PYTHON_PIP=python3-pip 25 | ARG PIP=pip3 26 | 27 | ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/2.1/AmazonLinux/cpu/final/tensorflow-2.1.0-cp36-cp36m-manylinux2010_x86_64.whl 28 | 29 | RUN apt-get update && apt-get install -y --no-install-recommends \ 30 | python3-dev \ 31 | python3-pip \ 32 | python3-setuptools \ 33 | software-properties-common \ 34 | build-essential \ 35 | openssh-client \ 36 | openssh-server \ 37 | ca-certificates \ 38 | curl \ 39 | git \ 40 | wget \ 41 | vim \ 42 | zlib1g-dev \ 43 | # Install dependent library for OpenCV 44 | libgtk2.0-dev \ 45 | && rm -rf /var/lib/apt/lists/* 46 | 47 | # Install Open MPI 48 | RUN mkdir /tmp/openmpi && \ 49 | cd /tmp/openmpi && \ 50 | curl -fSsL -O 
https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 51 | && tar zxf openmpi-4.0.1.tar.gz \ 52 | && cd openmpi-4.0.1 \ 53 | && ./configure --enable-orterun-prefix-by-default \ 54 | && make -j $(nproc) all \ 55 | && make install \ 56 | && ldconfig \ 57 | && rm -rf /tmp/openmpi 58 | 59 | # Create a wrapper for OpenMPI to allow running as root by default 60 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 61 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 62 | && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ 63 | && chmod a+x /usr/local/bin/mpirun 64 | 65 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 66 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 67 | 68 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 69 | ENV PATH /usr/local/openmpi/bin/:$PATH 70 | 71 | # SSH login fix. Otherwise user is kicked off after login 72 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 73 | 74 | # Create SSH key. 75 | RUN mkdir -p /root/.ssh/ \ 76 | && mkdir -p /var/run/sshd \ 77 | && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ 78 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ 79 | && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 80 | 81 | WORKDIR / 82 | 83 | RUN ${PIP} --no-cache-dir install --upgrade \ 84 | pip \ 85 | setuptools 86 | 87 | # Some TF tools expect a "python" binary 88 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python \ 89 | && ln -s $(which ${PIP}) /usr/bin/pip 90 | 91 | # install PyYAML==5.1.2 to avoid conflict with latest awscli 92 | # # python-dateutil==2.8.0 to satisfy botocore associated with latest awscli 93 | RUN ${PIP} install --no-cache-dir -U \ 94 | numpy==1.18.1 \ 95 | scipy==1.2.2 \ 96 | scikit-learn==0.22 \ 97 | pandas==1.0.1 \ 98 | Pillow==7.0.0 \ 99 | h5py==2.10.0 \ 100 | keras_applications==1.0.8 \ 101 | keras_preprocessing==1.1.0 \ 102 | keras==2.3.1 \ 103 | smdebug==0.7.2 \ 104 | python-dateutil==2.8.1 \ 105 | pyYAML==5.3.1 \ 106 | requests==2.22.0 \ 107 | awscli \ 108 | mpi4py==3.0.3 \ 109 | opencv-python==4.2.0.32 \ 110 | sagemaker==1.50.17 \ 111 | sagemaker-experiments==0.1.7 \ 112 | "sagemaker-tensorflow>=2.1,<2.2" \ 113 | "sagemaker-tensorflow-training>2,<4" \ 114 | # Let's install TensorFlow separately in the end to avoid 115 | # the library version to be overwritten 116 | && ${PIP} install --no-cache-dir -U \ 117 | ${TF_URL} \ 118 | horovod==0.18.2 119 | 120 | ADD https://raw.githubusercontent.com/aws/aws-deep-learning-containers-utils/master/deep_learning_container.py /usr/local/bin/deep_learning_container.py 121 | 122 | RUN chmod +x /usr/local/bin/deep_learning_container.py 123 | 124 | RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-2.1/license.txt -o /license.txt 125 | 126 | CMD ["bin/bash"] 127 | -------------------------------------------------------------------------------- /docker/1.13.1/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | RUN apt-get update && apt-get install -y --no-install-recommends \ 6 | software-properties-common \ 7 | build-essential \ 8 | openssh-client \ 9 | openssh-server \ 10 | ca-certificates \ 11 | curl \ 12 | git \ 13 | wget \ 14 | vim \ 15 | zlib1g-dev \ 16 | && rm -rf /var/lib/apt/lists/* 17 | 18 | # Install Open MPI 19 | RUN mkdir /tmp/openmpi && \ 20 | cd 
/tmp/openmpi && \ 21 | curl -fSsL -O https://www.open-mpi.org/software/ompi/v3.1/downloads/openmpi-3.1.2.tar.gz && \ 22 | tar zxf openmpi-3.1.2.tar.gz && \ 23 | cd openmpi-3.1.2 && \ 24 | ./configure --enable-orterun-prefix-by-default && \ 25 | make -j $(nproc) all && \ 26 | make install && \ 27 | ldconfig && \ 28 | rm -rf /tmp/openmpi 29 | 30 | # Create a wrapper for OpenMPI to allow running as root by default 31 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \ 32 | echo '#!/bin/bash' > /usr/local/bin/mpirun && \ 33 | echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \ 34 | chmod a+x /usr/local/bin/mpirun 35 | 36 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \ 37 | echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 38 | 39 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 40 | 41 | ENV PATH /usr/local/openmpi/bin/:$PATH 42 | 43 | # SSH login fix. Otherwise user is kicked off after login 44 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 45 | 46 | # Create SSH key. 47 | RUN mkdir -p /root/.ssh/ && \ 48 | mkdir -p /var/run/sshd && \ 49 | ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \ 50 | cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \ 51 | printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 52 | 53 | # Set environment variables for MKL 54 | # For more about MKL with TensorFlow see: 55 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 56 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 KMP_BLOCKTIME=1 KMP_SETTINGS=0 57 | 58 | WORKDIR / 59 | 60 | ARG PYTHON=python3 61 | ARG PYTHON_PIP=python3-pip 62 | ARG PIP=pip3 63 | ARG PYTHON_VERSION=3.6.6 64 | 65 | RUN wget https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz && \ 66 | tar -xvf Python-$PYTHON_VERSION.tgz && cd Python-$PYTHON_VERSION && \ 67 | ./configure && make && make install && \ 68 | apt-get update && apt-get install -y --no-install-recommends libreadline-gplv2-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev && \ 69 | make && make install && rm -rf ../Python-$PYTHON_VERSION* && \ 70 | ln -s /usr/local/bin/pip3 /usr/bin/pip 71 | 72 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 73 | 74 | ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz 75 | COPY $framework_support_installable . 
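# (Hedged note: TF_URL below defaults to an AWS-built TensorFlow 1.13.1 wheel
# rather than the stock PyPI release; because it is an ARG, a different binary
# can be pinned at build time with --build-arg TF_URL=<alternate-wheel-url>
# without editing this file.)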
76 | ARG TF_URL="https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.13/AmazonLinux/cpu/latest-patch-latest-patch/tensorflow-1.13.1-cp36-cp36m-linux_x86_64.whl" 77 | 78 | RUN ${PIP} --no-cache-dir install --upgrade pip setuptools 79 | 80 | # Some TF tools expect a "python" binary 81 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 82 | 83 | RUN ${PIP} install --no-cache-dir -U \ 84 | numpy==1.16.2 \ 85 | scipy==1.2.1 \ 86 | scikit-learn==0.20.3 \ 87 | pandas==0.24.2 \ 88 | Pillow==5.4.1 \ 89 | h5py==2.9.0 \ 90 | keras_applications==1.0.7 \ 91 | keras_preprocessing==1.0.9 \ 92 | keras==2.2.4 \ 93 | requests==2.21.0 \ 94 | awscli==1.16.130 \ 95 | mpi4py==3.0.1 \ 96 | "sagemaker-tensorflow>=1.13,<1.14" && \ 97 | # Let's install TensorFlow separately in the end to avoid 98 | # the library version to be overwritten 99 | ${PIP} install --force-reinstall --no-cache-dir -U \ 100 | ${TF_URL} \ 101 | horovod==0.16.4 && \ 102 | ${PIP} install --no-cache-dir -U $framework_support_installable && \ 103 | rm -f $framework_support_installable && \ 104 | ${PIP} uninstall -y --no-cache-dir \ 105 | markdown \ 106 | tensorboard 107 | 108 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 109 | 110 | CMD ["bin/bash"] 111 | -------------------------------------------------------------------------------- /docker/2.0.1/py3/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | # prevent stopping by user interaction 6 | ENV DEBIAN_FRONTEND noninteractive 7 | ENV DEBCONF_NONINTERACTIVE_SEEN true 8 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 9 | 10 | # Set environment variables for MKL 11 | # For more about MKL with TensorFlow see: 12 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 13 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 14 | ENV KMP_BLOCKTIME=1 15 | ENV KMP_SETTINGS=0 16 | 17 | ENV PYTHONDONTWRITEBYTECODE=1 18 | ENV PYTHONUNBUFFERED=1 19 | ENV PYTHONIOENCODING=UTF-8 20 | ENV LANG=C.UTF-8 21 | ENV LC_ALL=C.UTF-8 22 | 23 | ARG PYTHON=python3 24 | ARG PYTHON_PIP=python3-pip 25 | ARG PIP=pip3 26 | 27 | ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_training*.tar.gz 28 | ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/2.0.1/AmazonLinux/cpu/final/tensorflow-2.0.1-cp36-cp36m-manylinux2010_x86_64.whl 29 | 30 | RUN apt-get update && apt-get install -y --no-install-recommends \ 31 | python3-dev \ 32 | python3-pip \ 33 | python3-setuptools \ 34 | software-properties-common \ 35 | build-essential \ 36 | openssh-client \ 37 | openssh-server \ 38 | ca-certificates \ 39 | curl \ 40 | git \ 41 | wget \ 42 | vim \ 43 | zlib1g-dev \ 44 | # Install dependent library for OpenCV 45 | libgtk2.0-dev \ 46 | && rm -rf /var/lib/apt/lists/* 47 | 48 | # Install Open MPI 49 | RUN mkdir /tmp/openmpi && \ 50 | cd /tmp/openmpi && \ 51 | curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 52 | && tar zxf openmpi-4.0.1.tar.gz \ 53 | && cd openmpi-4.0.1 \ 54 | && ./configure --enable-orterun-prefix-by-default \ 55 | && make -j $(nproc) all \ 56 | && make install \ 57 | && ldconfig \ 58 | && rm -rf /tmp/openmpi 59 | 60 | # Create a wrapper for OpenMPI to allow running as root by default 61 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 62 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 63 | && echo 'mpirun.real --allow-run-as-root "$@"' >> 
/usr/local/bin/mpirun \ 64 | && chmod a+x /usr/local/bin/mpirun 65 | 66 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 67 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 68 | 69 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 70 | ENV PATH /usr/local/openmpi/bin/:$PATH 71 | 72 | # SSH login fix. Otherwise user is kicked off after login 73 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 74 | 75 | # Create SSH key. 76 | RUN mkdir -p /root/.ssh/ \ 77 | && mkdir -p /var/run/sshd \ 78 | && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ 79 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ 80 | && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 81 | 82 | WORKDIR / 83 | 84 | COPY $FRAMEWORK_SUPPORT_INSTALLABLE . 85 | 86 | RUN ${PIP} --no-cache-dir install --upgrade \ 87 | pip \ 88 | setuptools 89 | 90 | # Some TF tools expect a "python" binary 91 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python \ 92 | && ln -s $(which ${PIP}) /usr/bin/pip 93 | 94 | # install PyYAML==5.1.2 to avoid conflict with latest awscli 95 | # # python-dateutil==2.8.0 to satisfy botocore associated with latest awscli 96 | RUN ${PIP} install --no-cache-dir -U \ 97 | numpy==1.17.4 \ 98 | scipy==1.2.2 \ 99 | scikit-learn==0.22 \ 100 | pandas==0.25.3 \ 101 | Pillow==6.2.1 \ 102 | h5py==2.10.0 \ 103 | keras_applications==1.0.8 \ 104 | keras_preprocessing==1.1.0 \ 105 | keras==2.3.1 \ 106 | python-dateutil==2.8.0 \ 107 | PyYAML==5.1.2 \ 108 | requests==2.22.0 \ 109 | awscli \ 110 | mpi4py==3.0.3 \ 111 | opencv-python==4.2.0.32 \ 112 | "sagemaker-tensorflow>=2.0,<2.1" \ 113 | # Let's install TensorFlow separately in the end to avoid 114 | # the library version to be overwritten 115 | && ${PIP} install --no-cache-dir -U \ 116 | ${TF_URL} \ 117 | horovod==0.18.2 \ 118 | && ${PIP} install --no-cache-dir -U \ 119 | $FRAMEWORK_SUPPORT_INSTALLABLE \ 120 | && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE 121 | 122 | ADD https://raw.githubusercontent.com/aws/aws-deep-learning-containers-utils/master/deep_learning_container.py /usr/local/bin/deep_learning_container.py 123 | 124 | RUN chmod +x /usr/local/bin/deep_learning_container.py 125 | 126 | RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-2.0.1/license.txt -o /license.txt 127 | 128 | CMD ["bin/bash"] 129 | -------------------------------------------------------------------------------- /test/integration/local/test_training.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
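# (Hedged note: these tests use SageMaker local mode, so the estimator runs the
# image via docker-compose on the build host; the file:// URIs keep the training
# data and the resulting model.tar.gz on the local filesystem, no S3 required.)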
13 | from __future__ import absolute_import 14 | 15 | import os 16 | import tarfile 17 | 18 | import pytest 19 | from sagemaker.tensorflow import TensorFlow 20 | 21 | RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources") 22 | TF_CHECKPOINT_FILES = ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"] 23 | 24 | 25 | @pytest.fixture # noqa: F811 26 | def py_full_version(py_version): # noqa: F811 27 | if py_version == "2": 28 | return "2.7" 29 | else: 30 | return "3.8" 31 | 32 | 33 | @pytest.mark.skip_gpu 34 | def test_mnist_cpu(sagemaker_local_session, image_uri, tmpdir, framework_version): 35 | output_path = "file://{}".format(tmpdir) 36 | run_tf_training( 37 | script=os.path.join(RESOURCE_PATH, "mnist", "mnist.py"), 38 | instance_type="local", 39 | instance_count=1, 40 | sagemaker_local_session=sagemaker_local_session, 41 | image_uri=image_uri, 42 | framework_version=framework_version, 43 | output_path=output_path, 44 | training_data_path="file://{}".format(os.path.join(RESOURCE_PATH, "mnist", "data")), 45 | ) 46 | _assert_files_exist_in_tar(output_path, ["my_model.h5"]) 47 | 48 | 49 | @pytest.mark.skip 50 | def test_distributed_training_cpu_no_ps( 51 | sagemaker_local_session, image_uri, tmpdir, framework_version 52 | ): 53 | output_path = "file://{}".format(tmpdir) 54 | run_tf_training( 55 | script=os.path.join(RESOURCE_PATH, "mnist", "mnist_custom.py"), 56 | instance_type="local", 57 | instance_count=2, 58 | sagemaker_local_session=sagemaker_local_session, 59 | image_uri=image_uri, 60 | framework_version=framework_version, 61 | output_path=output_path, 62 | training_data_path="file://{}".format( 63 | os.path.join(RESOURCE_PATH, "mnist", "data-distributed") 64 | ), 65 | ) 66 | _assert_files_exist_in_tar(output_path, TF_CHECKPOINT_FILES) 67 | 68 | 69 | @pytest.mark.skip 70 | def test_distributed_training_cpu_ps(sagemaker_local_session, image_uri, tmpdir, framework_version): 71 | output_path = "file://{}".format(tmpdir) 72 | run_tf_training( 73 | script=os.path.join(RESOURCE_PATH, "mnist", "mnist_custom.py"), 74 | instance_type="local", 75 | instance_count=2, 76 | sagemaker_local_session=sagemaker_local_session, 77 | image_uri=image_uri, 78 | framework_version=framework_version, 79 | output_path=output_path, 80 | hyperparameters={"sagemaker_parameter_server_enabled": True}, 81 | training_data_path="file://{}".format( 82 | os.path.join(RESOURCE_PATH, "mnist", "data-distributed") 83 | ), 84 | ) 85 | _assert_files_exist_in_tar(output_path, TF_CHECKPOINT_FILES) 86 | 87 | 88 | def run_tf_training( 89 | script, 90 | instance_type, 91 | instance_count, 92 | sagemaker_local_session, 93 | image_uri, 94 | framework_version, 95 | training_data_path, 96 | output_path=None, 97 | hyperparameters=None, 98 | ): 99 | 100 | hyperparameters = hyperparameters or {} 101 | 102 | estimator = TensorFlow( 103 | entry_point=script, 104 | role="SageMakerRole", 105 | instance_count=instance_count, 106 | instance_type=instance_type, 107 | sagemaker_session=sagemaker_local_session, 108 | image_uri=image_uri, 109 | model_dir="/opt/ml/model", 110 | output_path=output_path, 111 | hyperparameters=hyperparameters, 112 | base_job_name="test-tf", 113 | framework_version=framework_version, 114 | py_version="py3", 115 | ) 116 | 117 | estimator.fit(training_data_path) 118 | 119 | 120 | def _assert_files_exist_in_tar(output_path, files): 121 | if output_path.startswith("file://"): 122 | output_path = output_path[7:] 123 | model_file = os.path.join(output_path, "model.tar.gz") 124 | with 
tarfile.open(model_file) as tar: 125 | for f in files: 126 | tar.getmember(f) 127 | -------------------------------------------------------------------------------- /docker/2.0.1/py2/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | # prevent stopping by user interaction 6 | ENV DEBIAN_FRONTEND noninteractive 7 | ENV DEBCONF_NONINTERACTIVE_SEEN true 8 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 9 | 10 | # Set environment variables for MKL 11 | # For more about MKL with TensorFlow see: 12 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 13 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 14 | ENV KMP_BLOCKTIME=1 15 | ENV KMP_SETTINGS=0 16 | 17 | ENV PYTHONDONTWRITEBYTECODE=1 18 | ENV PYTHONUNBUFFERED=1 19 | ENV PYTHONIOENCODING=UTF-8 20 | ENV LANG=C.UTF-8 21 | ENV LC_ALL=C.UTF-8 22 | 23 | ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_training*.tar.gz 24 | ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/2.0.1/AmazonLinux/cpu/final/tensorflow-2.0.1-cp27-cp27mu-manylinux2010_x86_64.whl 25 | 26 | ARG PYTHON=python 27 | ARG PYTHON_PIP=python-pip 28 | ARG PIP=pip 29 | 30 | RUN apt-get update && apt-get install -y --no-install-recommends \ 31 | software-properties-common \ 32 | build-essential \ 33 | openssh-client \ 34 | openssh-server \ 35 | ca-certificates \ 36 | curl \ 37 | git \ 38 | wget \ 39 | vim \ 40 | zlib1g-dev \ 41 | # Install dependent library for OpenCV 42 | libgtk2.0-dev \ 43 | && rm -rf /var/lib/apt/lists/* 44 | 45 | # Install Open MPI 46 | RUN mkdir /tmp/openmpi \ 47 | && cd /tmp/openmpi \ 48 | && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 49 | && tar zxf openmpi-4.0.1.tar.gz \ 50 | && cd openmpi-4.0.1 \ 51 | && ./configure --enable-orterun-prefix-by-default \ 52 | && make -j $(nproc) all \ 53 | && make install \ 54 | && ldconfig \ 55 | && rm -rf /tmp/openmpi 56 | 57 | # Create a wrapper for OpenMPI to allow running as root by default 58 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 59 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 60 | && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ 61 | && chmod a+x /usr/local/bin/mpirun 62 | 63 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 64 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 65 | 66 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 67 | ENV PATH /usr/local/openmpi/bin/:$PATH 68 | 69 | # SSH login fix. Otherwise user is kicked off after login 70 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 71 | 72 | # Create SSH key. 73 | RUN mkdir -p /root/.ssh/ \ 74 | && mkdir -p /var/run/sshd \ 75 | && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ 76 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ 77 | && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 78 | 79 | WORKDIR / 80 | 81 | RUN apt-get update && apt-get install -y \ 82 | ${PYTHON} \ 83 | ${PYTHON_PIP} 84 | 85 | COPY $FRAMEWORK_SUPPORT_INSTALLABLE . 
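# (The toolkit sdist copied above is pip-installed and then deleted further down in this Dockerfile.)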
86 | 87 | RUN ${PIP} --no-cache-dir install --upgrade \ 88 | pip \ 89 | setuptools 90 | 91 | # Some TF tools expect a "python" binary 92 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 93 | 94 | # install PyYAML==5.1.2 to avoid conflict with latest awscli 95 | # # python-dateutil==2.8.0 to satisfy botocore associated with latest awscli 96 | RUN ${PIP} install --no-cache-dir -U \ 97 | numpy==1.16.5 \ 98 | scipy==1.2.2 \ 99 | scikit-learn==0.20.4 \ 100 | pandas==0.24.2 \ 101 | Pillow==6.2.1 \ 102 | h5py==2.10.0 \ 103 | keras_applications==1.0.8 \ 104 | keras_preprocessing==1.1.0 \ 105 | requests==2.22.0 \ 106 | keras==2.3.1 \ 107 | python-dateutil==2.8.0 \ 108 | PyYAML==5.1.2 \ 109 | awscli \ 110 | mpi4py==3.0.3 \ 111 | opencv-python==4.2.0.32 \ 112 | "cryptography>=2.3" \ 113 | "sagemaker-tensorflow>=2.0,<2.1" \ 114 | # Let's install TensorFlow separately in the end to avoid 115 | # the library version to be overwritten 116 | && ${PIP} install --no-cache-dir -U \ 117 | ${TF_URL} \ 118 | && ${PIP} install --no-cache-dir -U \ 119 | $FRAMEWORK_SUPPORT_INSTALLABLE \ 120 | && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE \ 121 | && ${PIP} install --no-cache-dir -U \ 122 | horovod==0.18.2 123 | 124 | ADD https://raw.githubusercontent.com/aws/aws-deep-learning-containers-utils/master/deep_learning_container.py /usr/local/bin/deep_learning_container.py 125 | 126 | RUN chmod +x /usr/local/bin/deep_learning_container.py 127 | 128 | RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-2.0.1/license.txt -o /license.txt 129 | 130 | CMD ["bin/bash"] 131 | -------------------------------------------------------------------------------- /scripts/build_all.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
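#
# Builds the TF 1.13.1 CPU and GPU images for both Python versions, using a
# development ECR account for the Docker build cache. Every flag is optional
# and defaults to the constants defined below, e.g. (account ID is a placeholder):
#
#     python scripts/build_all.py --account <account-id> --region us-west-2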
13 | from __future__ import absolute_import 14 | 15 | import argparse 16 | import os 17 | import subprocess 18 | 19 | VERSION = "1.13.1" 20 | REPO = "sagemaker-tensorflow-scriptmode" 21 | PY2_CPU_BINARY = "https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/cpu/latest-patch-latest-patch/tensorflow-1.13.1-cp27-cp27mu-linux_x86_64.whl" # noqa 22 | PY3_CPU_BINARY = "https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/cpu/latest-patch-latest-patch/tensorflow-1.13.1-cp36-cp36m-linux_x86_64.whl" # noqa 23 | PY2_GPU_BINARY = "https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/gpu/latest-patch-latest-patch/tensorflow-1.13.1-cp27-cp27mu-linux_x86_64.whl" # noqa 24 | PY3_GPU_BINARY = "https://s3-us-west-2.amazonaws.com/tensorflow-aws/1.13/AmazonLinux/gpu/latest-patch-latest-patch/tensorflow-1.13.1-cp36-cp36m-linux_x86_64.whl" # noqa 25 | DEV_ACCOUNT = "142577830533" 26 | REGION = "us-west-2" 27 | 28 | 29 | def _parse_args(): 30 | 31 | parser = argparse.ArgumentParser() 32 | 33 | parser.add_argument("--account", type=str, default=DEV_ACCOUNT) 34 | parser.add_argument("--region", type=str, default=REGION) 35 | parser.add_argument("--version", type=str, default=VERSION) 36 | parser.add_argument("--py2-cpu-binary", type=str, default=PY2_CPU_BINARY) 37 | parser.add_argument("--py3-cpu-binary", type=str, default=PY3_CPU_BINARY) 38 | parser.add_argument("--py2-gpu-binary", type=str, default=PY2_GPU_BINARY) 39 | parser.add_argument("--py3-gpu-binary", type=str, default=PY3_GPU_BINARY) 40 | parser.add_argument("--repo", type=str, default=REPO) 41 | 42 | return parser.parse_args() 43 | 44 | 45 | args = _parse_args() 46 | binaries = { 47 | "py2-cpu": args.py2_cpu_binary, 48 | "py3-cpu": args.py3_cpu_binary, 49 | "py2-gpu": args.py2_gpu_binary, 50 | "py3-gpu": args.py3_gpu_binary, 51 | } 52 | build_dir = os.path.join("docker", args.version) 53 | 54 | # Run docker-login so we can pull the cached image 55 | login_cmd = subprocess.check_output( 56 | "aws ecr get-login --no-include-email --registry-id {}".format(args.account).split() 57 | ) 58 | print("Executing docker login command: {}".format(login_cmd)) 59 | subprocess.check_call(login_cmd.split()) 60 | 61 | for arch in ["cpu", "gpu"]: 62 | for py_version in ["2", "3"]: 63 | 64 | binary_url = binaries["py{}-{}".format(py_version, arch)] 65 | binary_file = os.path.basename(binary_url) 66 | cmd = "wget -O {}/{} {}".format(build_dir, binary_file, binary_url) 67 | print("Downloading binary file: {}".format(cmd)) 68 | subprocess.check_call(cmd.split()) 69 | 70 | tag = "{}-{}-py{}".format(args.version, arch, py_version) 71 | prev_image_uri = "{}.dkr.ecr.{}.amazonaws.com/{}:{}".format( 72 | args.account, args.region, args.repo, tag 73 | ) 74 | dockerfile = os.path.join(build_dir, "Dockerfile.{}".format(arch)) 75 | 76 | tar_file_name = ( 77 | subprocess.check_output( 78 | "ls {}/sagemaker_tensorflow_container*".format(build_dir), shell=True 79 | ) 80 | .strip() 81 | .decode("ascii") 82 | ) 83 | print("framework_support_installable is {}".format(os.path.basename(tar_file_name))) 84 | 85 | build_cmd = ( 86 | "docker build -f {} --cache-from {} --build-arg framework_support_installable={} " 87 | "--build-arg py_version={} --build-arg framework_installable={} " 88 | "-t {}:{} {}".format( 89 | dockerfile, 90 | prev_image_uri, 91 | os.path.basename(tar_file_name), 92 | py_version, 93 | binary_file, 94 | args.repo, 95 | tag, 96 | build_dir, 97 | ) 98 | ) 99 | print("Building docker image: {}".format(build_cmd)) 100 | 
subprocess.check_call(build_cmd.split()) 101 | 102 | print("Deleting binary file {}".format(binary_file)) 103 | subprocess.check_call("rm {}".format(os.path.join(build_dir, binary_file)).split()) 104 | -------------------------------------------------------------------------------- /docker/1.15.0/py2/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | # Prevent docker build get stopped by requesting user interaction 6 | ENV DEBIAN_FRONTEND=noninteractive 7 | ENV DEBCONF_NONINTERACTIVE_SEEN=true 8 | # Set environment variables for MKL 9 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 10 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 11 | ENV KMP_BLOCKTIME=1 12 | ENV KMP_SETTINGS=0 13 | # Python won’t try to write .pyc or .pyo files on the import of source modules 14 | ENV PYTHONDONTWRITEBYTECODE=1 15 | ENV PYTHONUNBUFFERED=1 16 | # See http://bugs.python.org/issue19846 17 | ENV PYTHONIOENCODING=UTF-8 18 | ENV LANG=C.UTF-8 19 | ENV LC_ALL=C.UTF-8 20 | # Specify the location of module that contains the training logic for SageMaker 21 | # https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html 22 | ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main 23 | 24 | # Define framework-related package sources 25 | ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_training*.tar.gz 26 | ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15/AmazonLinux/cpu/final/tensorflow-1.15.0-cp27-cp27mu-manylinux2010_x86_64.whl 27 | 28 | RUN apt-get update \ 29 | && apt-get install -y --no-install-recommends \ 30 | software-properties-common \ 31 | build-essential \ 32 | openssh-client \ 33 | openssh-server \ 34 | ca-certificates \ 35 | curl \ 36 | git \ 37 | wget \ 38 | vim \ 39 | zlib1g-dev \ 40 | && rm -rf /var/lib/apt/lists/* 41 | 42 | # Install Open MPI 43 | RUN mkdir /tmp/openmpi \ 44 | && cd /tmp/openmpi \ 45 | && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 46 | && tar zxf openmpi-4.0.1.tar.gz \ 47 | && cd openmpi-4.0.1 \ 48 | && ./configure --enable-orterun-prefix-by-default \ 49 | && make -j $(nproc) all \ 50 | && make install \ 51 | && ldconfig \ 52 | && rm -rf /tmp/openmpi 53 | 54 | # Create a wrapper for OpenMPI to allow running as root by default 55 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 56 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 57 | && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ 58 | && chmod a+x /usr/local/bin/mpirun 59 | 60 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 61 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 62 | 63 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 64 | ENV PATH=/usr/local/openmpi/bin/:$PATH 65 | 66 | # SSH login fix. Otherwise user is kicked off after login 67 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 68 | 69 | # Create SSH key. 
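# The generated key pair lets MPI (and Horovod on top of it) reach the other training containers over passwordless SSH.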
70 | RUN mkdir -p /root/.ssh/ \ 71 | && mkdir -p /var/run/sshd \ 72 | && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ 73 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ 74 | && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 75 | 76 | WORKDIR / 77 | 78 | RUN apt-get update \ 79 | && apt-get install -y \ 80 | python \ 81 | python-pip 82 | 83 | COPY $FRAMEWORK_SUPPORT_INSTALLABLE . 84 | 85 | RUN pip --no-cache-dir install --upgrade \ 86 | pip \ 87 | setuptools 88 | 89 | # Some TF tools expect a "python" binary 90 | RUN ln -s $(which python) /usr/local/bin/python 91 | 92 | RUN pip install --no-cache-dir -U \ 93 | numpy==1.16.5 \ 94 | scipy==1.2.2 \ 95 | scikit-learn==0.20.3 \ 96 | pandas==0.24.2 \ 97 | Pillow==6.2.1 \ 98 | h5py==2.9.0 \ 99 | keras_applications==1.0.8 \ 100 | keras_preprocessing==1.1.0 \ 101 | requests==2.22.0 \ 102 | keras==2.3.1 \ 103 | mpi4py==3.0.2 \ 104 | "cryptography>=2.3" \ 105 | "sagemaker-tensorflow>=1.15,<1.16" \ 106 | # Let's install TensorFlow separately in the end to avoid the library version to be overwritten 107 | && pip install --force-reinstall --no-cache-dir -U \ 108 | ${TF_URL} \ 109 | && pip install --no-cache-dir -U \ 110 | $FRAMEWORK_SUPPORT_INSTALLABLE \ 111 | awscli==1.17.7 \ 112 | && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE \ 113 | && pip install --no-cache-dir -U \ 114 | horovod==0.18.2 115 | 116 | COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py 117 | COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py 118 | 119 | RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \ 120 | && chmod +x /usr/local/bin/deep_learning_container.py 121 | 122 | RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt 123 | 124 | ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] 125 | CMD ["bin/bash"] 126 | -------------------------------------------------------------------------------- /docker/1.15.0/py3/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | # Prevent docker build get stopped by requesting user interaction 6 | ENV DEBIAN_FRONTEND=noninteractive 7 | ENV DEBCONF_NONINTERACTIVE_SEEN=true 8 | # Set environment variables for MKL 9 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 10 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 11 | ENV KMP_BLOCKTIME=1 12 | ENV KMP_SETTINGS=0 13 | # Python won’t try to write .pyc or .pyo files on the import of source modules 14 | ENV PYTHONDONTWRITEBYTECODE=1 15 | ENV PYTHONUNBUFFERED=1 16 | # See http://bugs.python.org/issue19846 17 | ENV PYTHONIOENCODING=UTF-8 18 | ENV LANG=C.UTF-8 19 | ENV LC_ALL=C.UTF-8 20 | # Specify the location of module that contains the training logic for SageMaker 21 | # https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html 22 | ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main 23 | 24 | # Define framework-related package sources 25 | ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_training*.tar.gz 26 | ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15/AmazonLinux/cpu/final/tensorflow-1.15.0-cp36-cp36m-manylinux2010_x86_64.whl 27 | 28 | RUN apt-get update \ 29 | && apt-get install -y --no-install-recommends \ 30 | python3-dev \ 31 | python3-pip \ 32 | python3-setuptools \ 33 | software-properties-common \ 34 | build-essential \ 35 | 
openssh-client \ 36 | openssh-server \ 37 | ca-certificates \ 38 | curl \ 39 | git \ 40 | wget \ 41 | vim \ 42 | zlib1g-dev \ 43 | && rm -rf /var/lib/apt/lists/* 44 | 45 | # Install Open MPI 46 | RUN mkdir /tmp/openmpi \ 47 | && cd /tmp/openmpi \ 48 | && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 49 | && tar zxf openmpi-4.0.1.tar.gz \ 50 | && cd openmpi-4.0.1 \ 51 | && ./configure --enable-orterun-prefix-by-default \ 52 | && make -j $(nproc) all \ 53 | && make install \ 54 | && ldconfig \ 55 | && rm -rf /tmp/openmpi 56 | 57 | # Create a wrapper for OpenMPI to allow running as root by default 58 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 59 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 60 | && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ 61 | && chmod a+x /usr/local/bin/mpirun 62 | 63 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 64 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 65 | 66 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 67 | ENV PATH=/usr/local/openmpi/bin/:$PATH 68 | 69 | # SSH login fix. Otherwise user is kicked off after login 70 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 71 | 72 | # Create SSH key. 73 | RUN mkdir -p /root/.ssh/ \ 74 | && mkdir -p /var/run/sshd \ 75 | && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ 76 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ 77 | && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 78 | 79 | WORKDIR / 80 | 81 | COPY $FRAMEWORK_SUPPORT_INSTALLABLE . 82 | 83 | RUN pip3 --no-cache-dir install --upgrade \ 84 | pip \ 85 | setuptools 86 | 87 | # Some TF tools expect a "python" binary 88 | RUN ln -s $(which python3) /usr/local/bin/python \ 89 | && ln -s $(which pip3) /usr/bin/pip 90 | 91 | RUN pip install --no-cache-dir -U \ 92 | numpy==1.17.4 \ 93 | scipy==1.2.2 \ 94 | scikit-learn==0.20.3 \ 95 | pandas==0.24.2 \ 96 | Pillow==6.2.1 \ 97 | h5py==2.9.0 \ 98 | keras_applications==1.0.8 \ 99 | keras_preprocessing==1.1.0 \ 100 | keras==2.3.1 \ 101 | requests==2.22.0 \ 102 | smdebug==0.5.0.post0 \ 103 | sagemaker-experiments==0.1.3 \ 104 | mpi4py==3.0.2 \ 105 | "cryptography>=2.3" \ 106 | "sagemaker-tensorflow>=1.15,<1.16" \ 107 | # Let's install TensorFlow separately in the end to avoid 108 | # the library version to be overwritten 109 | && pip install --force-reinstall --no-cache-dir -U \ 110 | ${TF_URL} \ 111 | && pip install --force-reinstall --no-cache-dir -U \ 112 | horovod==0.18.2 \ 113 | && pip install --no-cache-dir -U \ 114 | $FRAMEWORK_SUPPORT_INSTALLABLE \ 115 | awscli==1.17.7 \ 116 | && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE 117 | 118 | COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py 119 | COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py 120 | 121 | RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \ 122 | && chmod +x /usr/local/bin/deep_learning_container.py 123 | 124 | RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt 125 | 126 | ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] 127 | CMD ["bin/bash"] 128 | -------------------------------------------------------------------------------- /docker/2.0.0/py2/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 
5 | # prevent stopping by user interaction 6 | ENV DEBIAN_FRONTEND noninteractive 7 | ENV DEBCONF_NONINTERACTIVE_SEEN true 8 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 9 | 10 | # Set environment variables for MKL 11 | # For more about MKL with TensorFlow see: 12 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 13 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 14 | ENV KMP_BLOCKTIME=1 15 | ENV KMP_SETTINGS=0 16 | 17 | ENV PYTHONDONTWRITEBYTECODE=1 18 | ENV PYTHONUNBUFFERED=1 19 | ENV PYTHONIOENCODING=UTF-8 20 | ENV LANG=C.UTF-8 21 | ENV LC_ALL=C.UTF-8 22 | 23 | ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_training.tar.gz 24 | ARG TENSORFLOW_WHL=tensorflow-2.0.0-cp27-cp27mu-manylinux2010_x86_64.whl 25 | ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/2.0/AmazonLinux/cpu/final/$TENSORFLOW_WHL 26 | 27 | ARG PYTHON=python 28 | ARG PYTHON_PIP=python-pip 29 | ARG PIP=pip 30 | 31 | RUN apt-get update && apt-get install -y --no-install-recommends \ 32 | software-properties-common \ 33 | build-essential \ 34 | openssh-client \ 35 | openssh-server \ 36 | ca-certificates \ 37 | curl \ 38 | git \ 39 | wget \ 40 | vim \ 41 | zlib1g-dev \ 42 | && rm -rf /var/lib/apt/lists/* 43 | 44 | # Install Open MPI 45 | RUN mkdir /tmp/openmpi \ 46 | && cd /tmp/openmpi \ 47 | && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 48 | && tar zxf openmpi-4.0.1.tar.gz \ 49 | && cd openmpi-4.0.1 \ 50 | && ./configure --enable-orterun-prefix-by-default \ 51 | && make -j $(nproc) all \ 52 | && make install \ 53 | && ldconfig \ 54 | && rm -rf /tmp/openmpi 55 | 56 | # Create a wrapper for OpenMPI to allow running as root by default 57 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 58 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 59 | && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ 60 | && chmod a+x /usr/local/bin/mpirun 61 | 62 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 63 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 64 | 65 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 66 | ENV PATH /usr/local/openmpi/bin/:$PATH 67 | 68 | # SSH login fix. Otherwise user is kicked off after login 69 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 70 | 71 | # Create SSH key. 72 | RUN mkdir -p /root/.ssh/ \ 73 | && mkdir -p /var/run/sshd \ 74 | && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ 75 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ 76 | && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 77 | 78 | WORKDIR / 79 | 80 | RUN apt-get update && apt-get install -y \ 81 | ${PYTHON} \ 82 | ${PYTHON_PIP} 83 | 84 | COPY $FRAMEWORK_SUPPORT_INSTALLABLE . 
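# (Staged into the image root here; the pip layer below installs it and removes the file afterwards.)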
85 | 86 | RUN ${PIP} --no-cache-dir install --upgrade \ 87 | pip \ 88 | setuptools 89 | 90 | # Some TF tools expect a "python" binary 91 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 92 | 93 | # Setup TF Wheel 94 | RUN wget $TF_URL -O /tmp/$TENSORFLOW_WHL 95 | 96 | # install PyYAML==5.1.2 to avoid conflict with latest awscli 97 | # # python-dateutil==2.8.0 to satisfy botocore associated with latest awscli 98 | RUN ${PIP} install --no-cache-dir -U \ 99 | numpy==1.16.5 \ 100 | scipy==1.2.2 \ 101 | scikit-learn==0.20.4 \ 102 | pandas==0.24.2 \ 103 | Pillow==6.2.1 \ 104 | h5py==2.10.0 \ 105 | keras_applications==1.0.8 \ 106 | keras_preprocessing==1.1.0 \ 107 | requests==2.22.0 \ 108 | keras==2.3.1 \ 109 | python-dateutil==2.8.0 \ 110 | PyYAML==5.1.2 \ 111 | awscli==1.16.303 \ 112 | mpi4py==3.0.3 \ 113 | "cryptography>=2.3" \ 114 | "sagemaker-tensorflow>=2.0,<2.1" \ 115 | # Let's install TensorFlow separately in the end to avoid 116 | # the library version to be overwritten 117 | # ${PIP} install --no-cache-dir -U ${TF_URL} \ 118 | && ${PIP} install --no-cache-dir -U \ 119 | /tmp/$TENSORFLOW_WHL \ 120 | && rm -f /tmp/$TENSORFLOW_WHL \ 121 | && ${PIP} install --no-cache-dir -U \ 122 | $FRAMEWORK_SUPPORT_INSTALLABLE \ 123 | && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE \ 124 | && ${PIP} install --no-cache-dir -U \ 125 | horovod==0.18.2 126 | 127 | COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py 128 | COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py 129 | 130 | RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \ 131 | && chmod +x /usr/local/bin/deep_learning_container.py 132 | 133 | RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-2.0/license.txt -o /license.txt 134 | 135 | ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] 136 | CMD ["bin/bash"] 137 | -------------------------------------------------------------------------------- /docker/2.0.0/py3/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | # prevent stopping by user interaction 6 | ENV DEBIAN_FRONTEND noninteractive 7 | ENV DEBCONF_NONINTERACTIVE_SEEN true 8 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 9 | 10 | # Set environment variables for MKL 11 | # For more about MKL with TensorFlow see: 12 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 13 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 14 | ENV KMP_BLOCKTIME=1 15 | ENV KMP_SETTINGS=0 16 | 17 | ENV PYTHONDONTWRITEBYTECODE=1 18 | ENV PYTHONUNBUFFERED=1 19 | ENV PYTHONIOENCODING=UTF-8 20 | ENV LANG=C.UTF-8 21 | ENV LC_ALL=C.UTF-8 22 | 23 | ARG PYTHON=python3 24 | ARG PYTHON_PIP=python3-pip 25 | ARG PIP=pip3 26 | 27 | # Use TENSORFLOW_WHL instead of TF_URL before releasing 28 | # ARG TF_URL="https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.14/AmazonLinux/cpu/final/tensorflow-1.14.0-cp36-cp36m-linux_x86_64.whl" 29 | 30 | ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_training.tar.gz 31 | ARG TENSORFLOW_WHL=tensorflow-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl 32 | ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/2.0/AmazonLinux/cpu/final/$TENSORFLOW_WHL 33 | 34 | RUN apt-get update && apt-get install -y --no-install-recommends \ 35 | python3-dev \ 36 | python3-pip \ 37 | python3-setuptools \ 38 | software-properties-common \ 39 | build-essential \ 40 | openssh-client \ 41 | openssh-server \ 42 | 
ca-certificates \ 43 | curl \ 44 | git \ 45 | wget \ 46 | vim \ 47 | zlib1g-dev \ 48 | && rm -rf /var/lib/apt/lists/* 49 | 50 | # Install Open MPI 51 | RUN mkdir /tmp/openmpi && \ 52 | cd /tmp/openmpi && \ 53 | curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 54 | && tar zxf openmpi-4.0.1.tar.gz \ 55 | && cd openmpi-4.0.1 \ 56 | && ./configure --enable-orterun-prefix-by-default \ 57 | && make -j $(nproc) all \ 58 | && make install \ 59 | && ldconfig \ 60 | && rm -rf /tmp/openmpi 61 | 62 | # Create a wrapper for OpenMPI to allow running as root by default 63 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 64 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 65 | && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ 66 | && chmod a+x /usr/local/bin/mpirun 67 | 68 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 69 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 70 | 71 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 72 | ENV PATH /usr/local/openmpi/bin/:$PATH 73 | 74 | # SSH login fix. Otherwise user is kicked off after login 75 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 76 | 77 | # Create SSH key. 78 | RUN mkdir -p /root/.ssh/ \ 79 | && mkdir -p /var/run/sshd \ 80 | && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ 81 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ 82 | && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 83 | 84 | WORKDIR / 85 | 86 | COPY $FRAMEWORK_SUPPORT_INSTALLABLE . 87 | 88 | RUN ${PIP} --no-cache-dir install --upgrade \ 89 | pip \ 90 | setuptools 91 | 92 | # Some TF tools expect a "python" binary 93 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python \ 94 | && ln -s $(which ${PIP}) /usr/bin/pip 95 | 96 | # Setup TF Wheel 97 | RUN wget $TF_URL -O /tmp/$TENSORFLOW_WHL 98 | 99 | # install PyYAML==5.1.2 to avoid conflict with latest awscli 100 | # # python-dateutil==2.8.0 to satisfy botocore associated with latest awscli 101 | RUN ${PIP} install --no-cache-dir -U \ 102 | numpy==1.17.4 \ 103 | scipy==1.2.2 \ 104 | scikit-learn==0.22 \ 105 | pandas==0.25.3 \ 106 | Pillow==6.2.1 \ 107 | h5py==2.10.0 \ 108 | keras_applications==1.0.8 \ 109 | keras_preprocessing==1.1.0 \ 110 | keras==2.3.1 \ 111 | python-dateutil==2.8.0 \ 112 | PyYAML==5.1.2 \ 113 | requests==2.22.0 \ 114 | awscli==1.16.303 \ 115 | mpi4py==3.0.3 \ 116 | "sagemaker-tensorflow>=2.0,<2.1" \ 117 | # Let's install TensorFlow separately in the end to avoid 118 | # the library version to be overwritten 119 | # ${PIP} install --no-cache-dir -U ${TF_URL} \ 120 | && ${PIP} install --no-cache-dir -U \ 121 | /tmp/$TENSORFLOW_WHL \ 122 | horovod==0.18.2 \ 123 | && rm -f /tmp/$TENSORFLOW_WHL \ 124 | && ${PIP} install --no-cache-dir -U \ 125 | $FRAMEWORK_SUPPORT_INSTALLABLE \ 126 | && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE 127 | 128 | COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py 129 | COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py 130 | 131 | RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \ 132 | && chmod +x /usr/local/bin/deep_learning_container.py 133 | 134 | RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-2.0/license.txt -o /license.txt 135 | 136 | ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] 137 | CMD ["bin/bash"] 138 | 
-------------------------------------------------------------------------------- /docker/1.14.0/py2/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | RUN apt-get update && apt-get install -y --no-install-recommends \ 6 | software-properties-common \ 7 | build-essential \ 8 | openssh-client \ 9 | openssh-server \ 10 | ca-certificates \ 11 | curl \ 12 | git \ 13 | wget \ 14 | vim \ 15 | gcc-4.9 \ 16 | g++-4.9 \ 17 | gcc-4.9-base \ 18 | zlib1g-dev \ 19 | && rm -rf /var/lib/apt/lists/* 20 | 21 | # Install Open MPI 22 | RUN mkdir /tmp/openmpi && \ 23 | cd /tmp/openmpi && \ 24 | curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz && \ 25 | tar zxf openmpi-4.0.1.tar.gz && \ 26 | cd openmpi-4.0.1 && \ 27 | ./configure --enable-orterun-prefix-by-default && \ 28 | make -j $(nproc) all && \ 29 | make install && \ 30 | ldconfig && \ 31 | rm -rf /tmp/openmpi 32 | 33 | # Create a wrapper for OpenMPI to allow running as root by default 34 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \ 35 | echo '#!/bin/bash' > /usr/local/bin/mpirun && \ 36 | echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \ 37 | chmod a+x /usr/local/bin/mpirun 38 | 39 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \ 40 | echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 41 | 42 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 43 | 44 | ENV PATH /usr/local/openmpi/bin/:$PATH 45 | 46 | # SSH login fix. Otherwise user is kicked off after login 47 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 48 | 49 | # Create SSH key. 50 | RUN mkdir -p /root/.ssh/ && \ 51 | mkdir -p /var/run/sshd && \ 52 | ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \ 53 | cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \ 54 | printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 55 | 56 | # Set environment variables for MKL 57 | # For more about MKL with TensorFlow see: 58 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 59 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 KMP_BLOCKTIME=1 KMP_SETTINGS=0 60 | 61 | WORKDIR / 62 | 63 | ARG PYTHON=python 64 | ARG PYTHON_PIP=python-pip 65 | ARG PIP=pip 66 | 67 | RUN apt-get update && apt-get install -y \ 68 | ${PYTHON} \ 69 | ${PYTHON_PIP} 70 | 71 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 72 | 73 | ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz 74 | ARG sagemaker_tensorflow_extensions=sagemaker_tensorflow-1.14.0.1.0.0-cp27-cp27mu-manylinux1_x86_64.whl 75 | COPY $framework_support_installable . 76 | COPY $sagemaker_tensorflow_extensions . 77 | ARG TF_URL="https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.14/AmazonLinux/cpu/final/tensorflow-1.14.0-cp27-cp27mu-linux_x86_64.whl" 78 | 79 | # Pin GCC to 4.9 (priority 200) to compile correctly against TensorFlow, PyTorch, and MXNet with horovod 80 | # Backup existing GCC installation as priority 100, so that it can be recovered later. 
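# (update-alternatives selects the highest-priority entry, so the 4.9 toolchain registered at 200 below wins while pinned, and removing those entries near the end of the build falls back to the stock toolchain at 100.)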
81 | RUN update-alternatives --install /usr/bin/gcc gcc $(readlink -f $(which gcc)) 100 && \ 82 | update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc $(readlink -f $(which gcc)) 100 && \ 83 | update-alternatives --install /usr/bin/g++ g++ $(readlink -f $(which g++)) 100 && \ 84 | update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ $(readlink -f $(which g++)) 100 85 | RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 200 && \ 86 | update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 200 && \ 87 | update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 200 && \ 88 | update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 200 89 | 90 | RUN ${PIP} --no-cache-dir install --upgrade pip setuptools 91 | 92 | # Some TF tools expect a "python" binary 93 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 94 | 95 | RUN ${PIP} install --no-cache-dir -U \ 96 | numpy==1.16.4 \ 97 | scipy==1.2.2 \ 98 | scikit-learn==0.20.3 \ 99 | pandas==0.24.2 \ 100 | Pillow==6.1.0 \ 101 | h5py==2.9.0 \ 102 | keras_applications==1.0.8 \ 103 | keras_preprocessing==1.1.0 \ 104 | requests==2.22.0 \ 105 | keras==2.2.4 \ 106 | awscli==1.16.196 \ 107 | mpi4py==3.0.2 \ 108 | $sagemaker_tensorflow_extensions \ 109 | # Let's install TensorFlow separately in the end to avoid 110 | # the library version to be overwritten 111 | && ${PIP} install --force-reinstall --no-cache-dir -U ${TF_URL} \ 112 | && ${PIP} install --no-cache-dir -U $framework_support_installable && \ 113 | rm -f $framework_support_installable \ 114 | && ${PIP} install --no-cache-dir -U horovod==0.16.4 \ 115 | && ${PIP} uninstall -y --no-cache-dir \ 116 | markdown 117 | 118 | # Remove GCC pinning 119 | RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \ 120 | update-alternatives --remove x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 && \ 121 | update-alternatives --remove g++ /usr/bin/g++-4.9 && \ 122 | update-alternatives --remove x86_64-linux-gnu-g++ /usr/bin/g++-4.9 123 | 124 | 125 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 126 | 127 | CMD ["bin/bash"] 128 | -------------------------------------------------------------------------------- /docker/1.14.0/py3/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | RUN apt-get update && apt-get install -y --no-install-recommends \ 6 | software-properties-common \ 7 | build-essential \ 8 | openssh-client \ 9 | openssh-server \ 10 | ca-certificates \ 11 | curl \ 12 | git \ 13 | wget \ 14 | vim \ 15 | gcc-4.9 \ 16 | g++-4.9 \ 17 | gcc-4.9-base \ 18 | zlib1g-dev \ 19 | && rm -rf /var/lib/apt/lists/* 20 | 21 | # Install Open MPI 22 | RUN mkdir /tmp/openmpi && \ 23 | cd /tmp/openmpi && \ 24 | curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz && \ 25 | tar zxf openmpi-4.0.1.tar.gz && \ 26 | cd openmpi-4.0.1 && \ 27 | ./configure --enable-orterun-prefix-by-default && \ 28 | make -j $(nproc) all && \ 29 | make install && \ 30 | ldconfig && \ 31 | rm -rf /tmp/openmpi 32 | 33 | # Create a wrapper for OpenMPI to allow running as root by default 34 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \ 35 | echo '#!/bin/bash' > /usr/local/bin/mpirun && \ 36 | echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \ 37 | chmod a+x /usr/local/bin/mpirun 38 | 39 
| RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \ 40 | echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 41 | 42 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 43 | 44 | ENV PATH /usr/local/openmpi/bin/:$PATH 45 | 46 | # SSH login fix. Otherwise user is kicked off after login 47 | RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 48 | 49 | # Create SSH key. 50 | RUN mkdir -p /root/.ssh/ && \ 51 | mkdir -p /var/run/sshd && \ 52 | ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \ 53 | cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \ 54 | printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 55 | 56 | # Set environment variables for MKL 57 | # For more about MKL with TensorFlow see: 58 | # https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn 59 | ENV KMP_AFFINITY=granularity=fine,compact,1,0 KMP_BLOCKTIME=1 KMP_SETTINGS=0 60 | 61 | WORKDIR / 62 | 63 | ARG PYTHON=python3 64 | ARG PYTHON_PIP=python3-pip 65 | ARG PIP=pip3 66 | ARG PYTHON_VERSION=3.6.6 67 | 68 | RUN wget https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz && \ 69 | tar -xvf Python-$PYTHON_VERSION.tgz && cd Python-$PYTHON_VERSION && \ 70 | ./configure && make && make install && \ 71 | apt-get update && apt-get install -y --no-install-recommends libreadline-gplv2-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev && \ 72 | make && make install && rm -rf ../Python-$PYTHON_VERSION* && \ 73 | ln -s /usr/local/bin/pip3 /usr/bin/pip 74 | 75 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 76 | 77 | ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz 78 | COPY $framework_support_installable . 79 | ARG TF_URL="https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.14/AmazonLinux/cpu/final/tensorflow-1.14.0-cp36-cp36m-linux_x86_64.whl" 80 | 81 | # Pin GCC to 4.9 (priority 200) to compile correctly against TensorFlow, PyTorch, and MXNet with horovod 82 | # Backup existing GCC installation as priority 100, so that it can be recovered later. 
83 | RUN update-alternatives --install /usr/bin/gcc gcc $(readlink -f $(which gcc)) 100 && \ 84 | update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc $(readlink -f $(which gcc)) 100 && \ 85 | update-alternatives --install /usr/bin/g++ g++ $(readlink -f $(which g++)) 100 && \ 86 | update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ $(readlink -f $(which g++)) 100 87 | RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 200 && \ 88 | update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 200 && \ 89 | update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 200 && \ 90 | update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 200 91 | 92 | RUN ${PIP} --no-cache-dir install --upgrade pip setuptools 93 | 94 | # Some TF tools expect a "python" binary 95 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 96 | 97 | RUN ${PIP} install --no-cache-dir -U \ 98 | numpy==1.16.4 \ 99 | scipy==1.2.2 \ 100 | scikit-learn==0.20.3 \ 101 | pandas==0.24.2 \ 102 | Pillow==6.1.0 \ 103 | h5py==2.9.0 \ 104 | keras_applications==1.0.8 \ 105 | keras_preprocessing==1.1.0 \ 106 | keras==2.2.4 \ 107 | requests==2.22.0 \ 108 | awscli==1.16.196 \ 109 | mpi4py==3.0.2 \ 110 | "sagemaker-tensorflow>=1.14,<1.15" && \ 111 | # Let's install TensorFlow separately in the end to avoid 112 | # the library version to be overwritten 113 | ${PIP} install --force-reinstall --no-cache-dir -U \ 114 | ${TF_URL} \ 115 | horovod==0.16.4 && \ 116 | ${PIP} install --no-cache-dir -U $framework_support_installable && \ 117 | rm -f $framework_support_installable && \ 118 | ${PIP} uninstall -y --no-cache-dir \ 119 | markdown 120 | 121 | # Remove GCC pinning 122 | RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \ 123 | update-alternatives --remove x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 && \ 124 | update-alternatives --remove g++ /usr/bin/g++-4.9 && \ 125 | update-alternatives --remove x86_64-linux-gnu-g++ /usr/bin/g++-4.9 126 | 127 | 128 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 129 | 130 | CMD ["bin/bash"] 131 | -------------------------------------------------------------------------------- /docker/1.12.0/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:9.0-base-ubuntu16.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | ENV NCCL_VERSION=2.3.5-2+cuda9.0 6 | ENV CUDNN_VERSION=7.3.1.20-1+cuda9.0 7 | ENV TF_TENSORRT_VERSION=4.1.2 8 | 9 | RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 10 | software-properties-common && \ 11 | add-apt-repository ppa:deadsnakes/ppa -y && \ 12 | rm -rf /var/lib/apt/lists/* 13 | 14 | RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 15 | ca-certificates \ 16 | cuda-command-line-tools-9-0 \ 17 | cuda-cublas-dev-9-0 \ 18 | cuda-cudart-dev-9-0 \ 19 | cuda-cufft-dev-9-0 \ 20 | cuda-curand-dev-9-0 \ 21 | cuda-cusolver-dev-9-0 \ 22 | cuda-cusparse-dev-9-0 \ 23 | curl \ 24 | libcudnn7=${CUDNN_VERSION} \ 25 | libnccl2=${NCCL_VERSION} \ 26 | libnccl-dev=${NCCL_VERSION} \ 27 | libgomp1 \ 28 | wget \ 29 | openssh-client \ 30 | openssh-server \ 31 | build-essential && \ 32 | # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 33 | # adds a new list which contains libnvinfer library, so it needs another 34 | # 'apt-get update' to retrieve that list 
before it can actually install the 35 | # library. 36 | # We don't install libnvinfer-dev since we don't need to build against TensorRT, 37 | # and libnvinfer4 doesn't contain libnvinfer.a static library. 38 | apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 39 | nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \ 40 | apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 41 | libnvinfer4=${TF_TENSORRT_VERSION}-1+cuda9.0 && \ 42 | rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* && \ 43 | rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* && \ 44 | rm /usr/lib/x86_64-linux-gnu/libnvparsers* && \ 45 | rm -rf /var/lib/apt/lists/* 46 | 47 | ########################################################################### 48 | # Horovod & its dependencies 49 | ########################################################################### 50 | 51 | # Install Open MPI 52 | RUN mkdir /tmp/openmpi && \ 53 | cd /tmp/openmpi && \ 54 | curl -fSsL -O https://www.open-mpi.org/software/ompi/v3.1/downloads/openmpi-3.1.2.tar.gz && \ 55 | tar zxf openmpi-3.1.2.tar.gz && \ 56 | cd openmpi-3.1.2 && \ 57 | ./configure --enable-orterun-prefix-by-default && \ 58 | make -j $(nproc) all && \ 59 | make install && \ 60 | ldconfig && \ 61 | rm -rf /tmp/openmpi 62 | 63 | ARG py_version 64 | 65 | RUN if [ $py_version -eq 3 ]; then PYTHON_VERSION=python3.6; else PYTHON_VERSION=python2.7; fi && \ 66 | apt-get update && apt-get install -y --no-install-recommends $PYTHON_VERSION-dev --allow-unauthenticated && \ 67 | ln -s -f /usr/bin/$PYTHON_VERSION /usr/bin/python && \ 68 | rm -rf /var/lib/apt/lists/* 69 | 70 | # Create a wrapper for OpenMPI to allow running as root by default 71 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \ 72 | echo '#!/bin/bash' > /usr/local/bin/mpirun && \ 73 | echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \ 74 | chmod a+x /usr/local/bin/mpirun 75 | 76 | # Configure OpenMPI to run good defaults: 77 | # --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 78 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \ 79 | echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 80 | 81 | # Set default NCCL parameters 82 | RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf 83 | 84 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 85 | ENV PATH /usr/local/openmpi/bin/:$PATH 86 | ENV PATH=/usr/local/nvidia/bin:$PATH 87 | 88 | # SSH login fix. Otherwise user is kicked off after login 89 | RUN mkdir -p /var/run/sshd && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 90 | 91 | # Create SSH key. 
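# Horovod launches remote ranks through Open MPI over SSH, so each container trusts its own public key and disables strict host-key checking.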
92 | RUN mkdir -p /root/.ssh/ && \ 93 | ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \ 94 | cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \ 95 | printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 96 | 97 | ########################################################################### 98 | # Python won’t try to write .pyc or .pyo files on the import of source modules 99 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 100 | 101 | RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \ 102 | python get-pip.py --disable-pip-version-check --no-cache-dir "pip==18.1" && \ 103 | rm get-pip.py 104 | 105 | WORKDIR / 106 | 107 | ARG framework_installable 108 | ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz 109 | 110 | COPY $framework_installable tensorflow-1.12.0-py2.py3-none-any.whl 111 | COPY $framework_support_installable . 112 | 113 | RUN pip install --no-cache-dir -U \ 114 | keras==2.2.4 \ 115 | mpi4py==3.0.1 \ 116 | $framework_support_installable \ 117 | "sagemaker-tensorflow>=1.12,<1.13" \ 118 | # Let's install TensorFlow separately in the end to avoid 119 | # the library version to be overwritten 120 | && pip install --force-reinstall --no-cache-dir -U tensorflow-1.12.0-py2.py3-none-any.whl \ 121 | \ 122 | && rm -f tensorflow-1.12.0-py2.py3-none-any.whl \ 123 | && rm -f $framework_support_installable \ 124 | && pip uninstall -y --no-cache-dir \ 125 | markdown \ 126 | tensorboard 127 | 128 | # Install Horovod, temporarily using CUDA stubs 129 | RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \ 130 | HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod && \ 131 | ldconfig 132 | 133 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 134 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ===================================== 2 | SageMaker TensorFlow Training Toolkit 3 | ===================================== 4 | 5 | The SageMaker TensorFlow Training Toolkit is an open source library for making the 6 | TensorFlow framework run on `Amazon SageMaker `__. 7 | 8 | This repository also contains Dockerfiles which install this library, TensorFlow, and dependencies 9 | for building SageMaker TensorFlow images. 10 | 11 | For information on running TensorFlow jobs on SageMaker: 12 | 13 | - `SageMaker Python SDK documentation `__ 14 | - `SageMaker Notebook Examples `__ 15 | 16 | Table of Contents 17 | ----------------- 18 | 19 | #. `Getting Started <#getting-started>`__ 20 | #. `Building your Image <#building-your-image>`__ 21 | #. `Running the tests <#running-the-tests>`__ 22 | 23 | Getting Started 24 | --------------- 25 | 26 | Prerequisites 27 | ~~~~~~~~~~~~~ 28 | 29 | Make sure you have installed all of the following prerequisites on your 30 | development machine: 31 | 32 | - `Docker `__ 33 | 34 | For Testing on GPU 35 | ^^^^^^^^^^^^^^^^^^ 36 | 37 | - `Nvidia-Docker `__ 38 | 39 | Recommended 40 | ^^^^^^^^^^^ 41 | 42 | - A Python environment management tool. (e.g. 43 | `PyEnv `__, 44 | `VirtualEnv `__) 45 | 46 | Building your Image 47 | ------------------- 48 | 49 | `Amazon SageMaker `__ 50 | utilizes Docker containers to run all training jobs & inference endpoints. 51 | 52 | The Docker images are built from the Dockerfiles specified in 53 | `docker/ `__. 
54 | 55 | The Dockerfiles are grouped based on TensorFlow version and separated 56 | based on Python version and processor type. 57 |
58 | The Dockerfiles for TensorFlow 2.0+ are available in the 59 | `tf-2 `__ branch. 60 |
61 | To build the images, first copy the files under 62 | `docker/build_artifacts/ `__ 63 | to the folder containing the Dockerfile you wish to build. 64 |
65 | :: 66 | 67 | # Example for building a TF 2.1 image with Python 3 68 | cp docker/build_artifacts/* docker/2.1.0/py3/. 69 |
70 | After that, go to the directory containing the Dockerfile you wish to build, 71 | and run ``docker build`` to build the image. 72 |
73 | :: 74 | 75 | # Example for building a TF 2.1 image for CPU with Python 3 76 | cd docker/2.1.0/py3 77 | docker build -t tensorflow-training:2.1.0-cpu-py3 -f Dockerfile.cpu . 78 |
79 | Don't forget the period at the end of the ``docker build`` command! 80 |
81 | Running the tests 82 | ----------------- 83 |
84 | Running the tests requires installation of the SageMaker TensorFlow Training Toolkit code and its test 85 | dependencies. 86 |
87 | :: 88 | 89 | git clone https://github.com/aws/sagemaker-tensorflow-container.git 90 | cd sagemaker-tensorflow-container 91 | pip install -e .[test] 92 |
93 | Tests are defined in 94 | `test/ `__ 95 | and include unit, integration and functional tests. 96 |
97 | Unit Tests 98 | ~~~~~~~~~~ 99 |
100 | If you want to run unit tests, then use: 101 |
102 | :: 103 | 104 | # All test instructions should be run from the top level directory 105 | pytest test/unit 106 |
107 | Integration Tests 108 | ~~~~~~~~~~~~~~~~~ 109 |
110 | Running integration tests requires `Docker `__ and `AWS 111 | credentials `__, 112 | as the integration tests make calls to a couple of AWS services. The integration and functional 113 | tests require configurations specified within their respective 114 | `conftest.py `__. Make sure to update the account ID and region at a minimum. 115 |
116 | Integration tests on GPU require `Nvidia-Docker `__. 117 |
118 | Before running integration tests: 119 | 120 | #. Build your Docker image. 121 | #. Pass in the correct pytest arguments to run tests against your Docker image. 122 |
123 | If you want to run local integration tests, then use: 124 |
125 | :: 126 | 127 | # Required arguments for integration tests are found in test/integ/conftest.py 128 | pytest test/integration --docker-base-name <image_name> \ 129 | --tag <image_tag> \ 130 | --framework-version <tensorflow_version> \ 131 | --processor <cpu_or_gpu> 132 |
133 | :: 134 | 135 | # Example 136 | pytest test/integration --docker-base-name preprod-tensorflow \ 137 | --tag 1.0 \ 138 | --framework-version 1.4.1 \ 139 | --processor cpu 140 |
141 | Functional Tests 142 | ~~~~~~~~~~~~~~~~ 143 |
144 | Functional tests have been removed from the current branch; see the older branch `r1.0 `__ for them. 145 |
146 | Contributing 147 | ------------ 148 |
149 | Please read 150 | `CONTRIBUTING.md `__ 151 | for details on our code of conduct, and the process for submitting pull 152 | requests to us. 153 |
154 | License 155 | ------- 156 |
157 | SageMaker TensorFlow Containers is licensed under the Apache 2.0 License. It is copyright 2018 158 | Amazon.com, Inc. or its affiliates. All Rights Reserved. The license is available at: 159 | http://aws.amazon.com/apache2.0/ 160 | -------------------------------------------------------------------------------- /test/integration/sagemaker/test_mnist.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017-2020 Amazon.com, Inc. or its affiliates.
All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | 15 | import os 16 | 17 | import boto3 18 | import pytest 19 | from sagemaker.tensorflow import TensorFlow 20 | from sagemaker.tuner import HyperparameterTuner, IntegerParameter 21 | from sagemaker.utils import unique_name_from_base 22 | from six.moves.urllib.parse import urlparse 23 | 24 | from timeout import timeout 25 | 26 | 27 | @pytest.mark.deploy_test 28 | def test_mnist(sagemaker_session, image_uri, instance_type, framework_version): 29 | resource_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources") 30 | script = os.path.join(resource_path, "mnist", "mnist.py") 31 | estimator = TensorFlow( 32 | entry_point=script, 33 | role="SageMakerRole", 34 | instance_type=instance_type, 35 | instance_count=1, 36 | sagemaker_session=sagemaker_session, 37 | image_uri=image_uri, 38 | framework_version=framework_version, 39 | ) 40 | inputs = estimator.sagemaker_session.upload_data( 41 | path=os.path.join(resource_path, "mnist", "data"), key_prefix="scriptmode/mnist" 42 | ) 43 | estimator.fit(inputs, job_name=unique_name_from_base("test-sagemaker-mnist")) 44 | _assert_s3_file_exists(sagemaker_session.boto_region_name, estimator.model_data) 45 | 46 | 47 | def test_distributed_mnist_no_ps(sagemaker_session, image_uri, instance_type, framework_version): 48 | resource_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources") 49 | script = os.path.join(resource_path, "mnist", "mnist.py") 50 | estimator = TensorFlow( 51 | entry_point=script, 52 | role="SageMakerRole", 53 | instance_count=2, 54 | instance_type=instance_type, 55 | sagemaker_session=sagemaker_session, 56 | image_uri=image_uri, 57 | framework_version=framework_version, 58 | ) 59 | inputs = estimator.sagemaker_session.upload_data( 60 | path=os.path.join(resource_path, "mnist", "data"), key_prefix="scriptmode/mnist" 61 | ) 62 | estimator.fit(inputs, job_name=unique_name_from_base("test-tf-sm-distributed-mnist")) 63 | _assert_s3_file_exists(sagemaker_session.boto_region_name, estimator.model_data) 64 | 65 | 66 | def test_distributed_mnist_ps(sagemaker_session, image_uri, instance_type, framework_version): 67 | resource_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources") 68 | script = os.path.join(resource_path, "mnist", "mnist_custom.py") 69 | estimator = TensorFlow( 70 | entry_point=script, 71 | role="SageMakerRole", 72 | hyperparameters={"sagemaker_parameter_server_enabled": True}, 73 | instance_count=2, 74 | instance_type=instance_type, 75 | sagemaker_session=sagemaker_session, 76 | image_uri=image_uri, 77 | framework_version=framework_version, 78 | ) 79 | inputs = estimator.sagemaker_session.upload_data( 80 | path=os.path.join(resource_path, "mnist", "data-distributed"), 81 | key_prefix="scriptmode/mnist-distributed", 82 | ) 83 | estimator.fit(inputs, job_name=unique_name_from_base("test-tf-sm-distributed-mnist")) 84 | _assert_checkpoint_exists_v2(sagemaker_session.boto_region_name, 
estimator.model_dir, 10) 85 | 86 | 87 | def test_tuning(sagemaker_session, image_uri, instance_type, framework_version): 88 | resource_path = os.path.join(os.path.dirname(__file__), "..", "..", "resources") 89 | script = os.path.join(resource_path, "mnist", "mnist.py") 90 | 91 | estimator = TensorFlow( 92 | entry_point=script, 93 | role="SageMakerRole", 94 | instance_type=instance_type, 95 | instance_count=1, 96 | sagemaker_session=sagemaker_session, 97 | image_uri=image_uri, 98 | framework_version=framework_version, 99 | script_mode=True, 100 | ) 101 | 102 | hyperparameter_ranges = {"epochs": IntegerParameter(1, 2)} 103 | objective_metric_name = "accuracy" 104 | metric_definitions = [{"Name": objective_metric_name, "Regex": "accuracy = ([0-9\\.]+)"}] 105 | 106 | tuner = HyperparameterTuner( 107 | estimator, 108 | objective_metric_name, 109 | hyperparameter_ranges, 110 | metric_definitions, 111 | max_jobs=2, 112 | max_parallel_jobs=2, 113 | ) 114 | 115 | with timeout(minutes=20): 116 | inputs = estimator.sagemaker_session.upload_data( 117 | path=os.path.join(resource_path, "mnist", "data"), key_prefix="scriptmode/mnist" 118 | ) 119 | 120 | tuning_job_name = unique_name_from_base("test-tf-sm-tuning", max_length=32) 121 | tuner.fit(inputs, job_name=tuning_job_name) 122 | tuner.wait() 123 | 124 | 125 | def _assert_checkpoint_exists_v2(region, model_dir, checkpoint_number): 126 | """ 127 | Checking for v2 style checkpoints i.e. checkpoint and .index files 128 | """ 129 | _assert_s3_file_exists(region, os.path.join(model_dir, 'checkpoint')) 130 | _assert_s3_file_exists(region, 131 | os.path.join(model_dir, 'model.ckpt-{}.index'.format(checkpoint_number))) 132 | 133 | 134 | def _assert_checkpoint_exists(region, model_dir, checkpoint_number): 135 | _assert_s3_file_exists(region, os.path.join(model_dir, "graph.pbtxt")) 136 | _assert_s3_file_exists( 137 | region, os.path.join(model_dir, "model.ckpt-{}.index".format(checkpoint_number)) 138 | ) 139 | _assert_s3_file_exists( 140 | region, os.path.join(model_dir, "model.ckpt-{}.meta".format(checkpoint_number)) 141 | ) 142 | 143 | 144 | def _assert_s3_file_exists(region, s3_url): 145 | parsed_url = urlparse(s3_url) 146 | s3 = boto3.resource("s3", region_name=region) 147 | s3.Object(parsed_url.netloc, parsed_url.path.lstrip("/")).load() 148 | -------------------------------------------------------------------------------- /test/integration/sagemaker/recordio_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | from __future__ import absolute_import 14 | 15 | import argparse 16 | from random import randint 17 | import struct 18 | import sys 19 | 20 | import numpy as np 21 | import tensorflow as tf 22 | 23 | # Utility functions for generating a recordio encoded file of labeled numpy data 24 | # for testing. Each file contains one or more records. Each record is a TensorFlow 25 | # protobuf Example object. 
Each object contains an integer label and a numpy array
26 | # encoded as a byte list.
27 | 
28 | # This file can be used in script mode to generate a single file or be used
29 | # as a module to generate files via build_record_file.
30 | 
31 | _kmagic = 0xCED7230A
32 | 
33 | padding = {}
34 | for amount in range(4):
35 |     if sys.version_info >= (3,):
36 |         padding[amount] = bytes([0x00 for _ in range(amount)])
37 |     else:
38 |         padding[amount] = bytearray([0x00 for _ in range(amount)])
39 | 
40 | 
41 | def write_recordio(f, data, header_flag=0):
42 |     """Writes a single data point as a RecordIO record to the given file."""
43 |     length = len(data)
44 |     f.write(struct.pack("I", _kmagic))
45 |     header = (header_flag << 29) | length
46 |     f.write(struct.pack("I", header))
47 |     pad = (((length + 3) >> 2) << 2) - length
48 |     f.write(data)
49 |     f.write(padding[pad])
50 | 
51 | 
52 | def write_recordio_multipart(f, data):
53 |     """Writes a single data point into three multipart records."""
54 |     length = len(data)
55 |     stride = int(length / 3)
56 | 
57 |     data_start = data[0:stride]
58 |     data_middle = data[stride : 2 * stride]
59 |     data_end = data[2 * stride :]
60 | 
61 |     write_recordio(f, data_start, 1)
62 |     write_recordio(f, data_middle, 2)
63 |     write_recordio(f, data_end, 3)
64 | 
65 | 
66 | def string_feature(value):
67 |     return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.tostring()]))
68 | 
69 | 
70 | def label_feature(value):
71 |     return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
72 | 
73 | 
74 | def write_numpy_array(f, feature_name, label, arr, multipart=False):
75 |     feature = {"labels": label_feature(label), feature_name: string_feature(arr)}
76 |     example = tf.train.Example(features=tf.train.Features(feature=feature))
77 |     if multipart:
78 |         write_recordio_multipart(f, example.SerializeToString())
79 |     else:
80 |         write_recordio(f, example.SerializeToString())
81 | 
82 | 
83 | def build_record_file(
84 |     filename, num_records, dimension, classes=2, data_feature_name="data", multipart=False
85 | ):
86 |     """Builds a recordio encoded file of TF protobuf Example objects. Each object
87 |     is a labeled numpy array. Each example has two fields - a single int64 'label'
88 |     field and a single bytes list field, containing a serialized numpy array.
89 | 
90 |     Each generated numpy array is a multidimensional normal with
91 |     the specified dimension. The normal distribution is class-specific: each class
92 |     has a different mean for the distribution, so it should be possible to learn
93 |     a multiclass classifier on this data. Class means are deterministic - so multiple
94 |     calls to this function with the same number of classes will produce samples drawn
95 |     from the same distribution for each class.
96 | 
97 |     Args:
98 |         filename - the file to write to
99 |         num_records - how many labeled numpy arrays to generate
100 |         dimension - the size of each numpy array
101 |         classes - the cardinality of labels
102 |         data_feature_name - the name to give the numpy array in the Example object
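        multipart - whether to split each example across three multipart records
        (via write_recordio_multipart above) instead of writing a single record

    For reference, restating write_recordio above rather than assuming anything new:
    each record on disk is a 4-byte magic number, a 4-byte header equal to
    (flag << 29) | length, the payload bytes, and zero padding up to the next
    4-byte boundary.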
103 | """ 104 | with open(filename, "wb") as f: 105 | for i in range(num_records): 106 | cur_class = i % classes 107 | loc = int(cur_class - (classes / 2)) 108 | write_numpy_array( 109 | f, 110 | data_feature_name, 111 | cur_class, 112 | np.random.normal(loc=loc, size=(dimension,)), 113 | multipart, 114 | ) 115 | 116 | 117 | def build_single_record_file(filename, dimension, classes=2, data_feature_name="data"): 118 | cur_class = randint(0, classes - 1) 119 | loc = int(cur_class - (classes / 2)) 120 | 121 | arr = np.random.normal(loc=loc, size=(dimension,)) 122 | feature = {"labels": label_feature(cur_class), data_feature_name: string_feature(arr)} 123 | example = tf.train.Example(features=tf.train.Features(feature=feature)) 124 | with open(filename, "wb") as f: 125 | f.write(example.SerializeToString()) 126 | 127 | 128 | def validate_record_file(filename, dimension): 129 | data = open(filename, "rb").read() 130 | magic_number, length = struct.unpack("II", data[0:8]) 131 | encoded = data[8 : 8 + length] 132 | 133 | features = { 134 | "data": tf.io.FixedLenFeature([], tf.string), 135 | "labels": tf.io.FixedLenFeature([], tf.int64), 136 | } 137 | parsed = tf.io.parse_single_example(encoded, features) 138 | array = tf.io.decode_raw(parsed["data"], tf.float64) 139 | 140 | assert array.shape[0] == dimension 141 | 142 | 143 | if __name__ == "__main__": 144 | parser = argparse.ArgumentParser(description="Generate synthetic multi-class training data") 145 | parser.add_argument("--dimension", default=65536, type=int) 146 | parser.add_argument("--classes", default=2, type=int) 147 | parser.add_argument("--num-records", default=4, type=int) 148 | parser.add_argument("--data-feature-name", default="data") 149 | parser.add_argument("filename", type=str) 150 | args = parser.parse_args() 151 | build_record_file( 152 | args.filename, args.num_records, args.dimension, args.classes, args.data_feature_name 153 | ) 154 | validate_record_file(args.filename, args.dimension) 155 | -------------------------------------------------------------------------------- /docker/2.0.1/py2/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.0-base-ubuntu18.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | # prevent stopping by user interaction 6 | ENV DEBIAN_FRONTEND noninteractive 7 | ENV DEBCONF_NONINTERACTIVE_SEEN true 8 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 9 | 10 | ENV PYTHONDONTWRITEBYTECODE=1 11 | ENV PYTHONUNBUFFERED=1 12 | ENV PYTHONIOENCODING=UTF-8 13 | ENV LANG=C.UTF-8 14 | ENV LC_ALL=C.UTF-8 15 | 16 | ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_training*.tar.gz 17 | ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/2.0.1/AmazonLinux/gpu/final/tensorflow_gpu-2.0.1-cp27-cp27mu-manylinux2010_x86_64.whl 18 | 19 | ARG PYTHON=python 20 | ARG PYTHON_PIP=python-pip 21 | ARG PIP=pip 22 | 23 | RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 24 | ca-certificates \ 25 | cuda-command-line-tools-10-0 \ 26 | cuda-cublas-dev-10-0 \ 27 | cuda-cudart-dev-10-0 \ 28 | cuda-cufft-dev-10-0 \ 29 | cuda-curand-dev-10-0 \ 30 | cuda-cusolver-dev-10-0 \ 31 | cuda-cusparse-dev-10-0 \ 32 | curl \ 33 | libcudnn7=7.5.1.10-1+cuda10.0 \ 34 | # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it 35 | libnccl2=2.4.7-1+cuda10.0 \ 36 | libgomp1 \ 37 | libnccl-dev=2.4.7-1+cuda10.0 \ 38 | libfreetype6-dev \ 39 | libhdf5-serial-dev \ 40 | libpng-dev \ 41 | 
libzmq3-dev \ 42 | git \ 43 | wget \ 44 | vim \ 45 | build-essential \ 46 | openssh-client \ 47 | openssh-server \ 48 | zlib1g-dev \ 49 | # Install dependent library for OpenCV 50 | libgtk2.0-dev \ 51 | # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 52 | # adds a new list which contains libnvinfer library, so it needs another 53 | # 'apt-get update' to retrieve that list before it can actually install the 54 | # library. 55 | # We don't install libnvinfer-dev since we don't need to build against TensorRT, 56 | # and libnvinfer4 doesn't contain libnvinfer.a static library. 57 | && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 58 | nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \ 59 | && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 60 | libnvinfer5=5.0.2-1+cuda10.0 \ 61 | && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \ 62 | && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \ 63 | && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \ 64 | && rm -rf /var/lib/apt/lists/* \ 65 | && mkdir -p /var/run/sshd 66 | 67 | # Install Open MPI 68 | RUN mkdir /tmp/openmpi \ 69 | && cd /tmp/openmpi \ 70 | && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 71 | && tar zxf openmpi-4.0.1.tar.gz \ 72 | && cd openmpi-4.0.1 \ 73 | && ./configure --enable-orterun-prefix-by-default \ 74 | && make -j $(nproc) all \ 75 | && make install \ 76 | && ldconfig \ 77 | && rm -rf /tmp/openmpi 78 | 79 | RUN apt-get update && apt-get install -y \ 80 | ${PYTHON} \ 81 | ${PYTHON_PIP} 82 | 83 | # Create a wrapper for OpenMPI to allow running as root by default 84 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 85 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 86 | && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ 87 | && chmod a+x /usr/local/bin/mpirun 88 | 89 | # Configure OpenMPI to run good defaults: 90 | # --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 91 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 92 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 93 | 94 | # Set default NCCL parameters 95 | RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf 96 | 97 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 98 | ENV PATH /usr/local/openmpi/bin/:$PATH 99 | ENV PATH=/usr/local/nvidia/bin:$PATH 100 | 101 | # SSH login fix. Otherwise user is kicked off after login 102 | RUN mkdir -p /var/run/sshd \ 103 | && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 104 | 105 | # Create SSH key. 106 | RUN mkdir -p /root/.ssh/ \ 107 | && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ 108 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ 109 | && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 110 | 111 | WORKDIR / 112 | 113 | RUN ${PIP} --no-cache-dir install --upgrade \ 114 | pip \ 115 | setuptools 116 | 117 | # Some TF tools expect a "python" binary 118 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 119 | 120 | COPY $FRAMEWORK_SUPPORT_INSTALLABLE . 
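# A hypothetical build invocation for this image, for illustration only. It
# assumes the sagemaker_tensorflow_training sdist has already been copied next
# to this Dockerfile (see the README); the image tag and tarball name below are
# placeholders, not values defined in this repository:
#   docker build -t tensorflow-training:2.0.1-gpu-py2 \
#     --build-arg FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_training-20.0.0.tar.gz \
#     -f Dockerfile.gpu .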
121 | 122 | # install PyYAML==5.1.2 to avoid conflict with latest awscli 123 | # # python-dateutil==2.8.0 to satisfy botocore associated with latest awscli 124 | RUN ${PIP} install --no-cache-dir -U \ 125 | numpy==1.16.5 \ 126 | scipy==1.2.2 \ 127 | scikit-learn==0.20.4 \ 128 | pandas==0.24.2 \ 129 | Pillow==6.2.1 \ 130 | h5py==2.10.0 \ 131 | keras_applications==1.0.8 \ 132 | keras_preprocessing==1.1.0 \ 133 | requests==2.22.0 \ 134 | keras==2.3.1 \ 135 | python-dateutil==2.8.0 \ 136 | PyYAML==5.1.2 \ 137 | awscli \ 138 | mpi4py==3.0.3 \ 139 | opencv-python==4.2.0.32 \ 140 | "cryptography>=2.3" \ 141 | "sagemaker-tensorflow>=2.0,<2.1" \ 142 | # Let's install TensorFlow separately in the end to avoid 143 | # the library version to be overwritten 144 | && ${PIP} install --no-cache-dir -U \ 145 | ${TF_URL} \ 146 | && ${PIP} install --no-cache-dir -U \ 147 | $FRAMEWORK_SUPPORT_INSTALLABLE \ 148 | && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE 149 | 150 | # Install Horovod, temporarily using CUDA stubs 151 | RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \ 152 | && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 ${PIP} install --no-cache-dir horovod==0.18.2 \ 153 | && ldconfig 154 | 155 | # Allow OpenSSH to talk to containers without asking for confirmation 156 | RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ 157 | && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ 158 | && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config 159 | 160 | ADD https://raw.githubusercontent.com/aws/aws-deep-learning-containers-utils/master/deep_learning_container.py /usr/local/bin/deep_learning_container.py 161 | 162 | RUN chmod +x /usr/local/bin/deep_learning_container.py 163 | 164 | RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-2.0.1/license.txt -o /license.txt 165 | 166 | CMD ["bin/bash"] 167 | -------------------------------------------------------------------------------- /docker/2.1.0/py2/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.1-base-ubuntu18.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | # prevent stopping by user interaction 6 | ENV DEBIAN_FRONTEND noninteractive 7 | ENV DEBCONF_NONINTERACTIVE_SEEN true 8 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 9 | 10 | ENV PYTHONDONTWRITEBYTECODE=1 11 | ENV PYTHONUNBUFFERED=1 12 | ENV PYTHONIOENCODING=UTF-8 13 | ENV LANG=C.UTF-8 14 | ENV LC_ALL=C.UTF-8 15 | 16 | ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/2.1/AmazonLinux/gpu/final/tensorflow_gpu-2.1.0-cp27-cp27mu-manylinux2010_x86_64.whl 17 | 18 | ARG PYTHON=python 19 | ARG PYTHON_PIP=python-pip 20 | ARG PIP=pip 21 | 22 | RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 23 | ca-certificates \ 24 | cuda-command-line-tools-10-1 \ 25 | cuda-cudart-dev-10-1 \ 26 | cuda-cufft-dev-10-1 \ 27 | cuda-curand-dev-10-1 \ 28 | cuda-cusolver-dev-10-1 \ 29 | cuda-cusparse-dev-10-1 \ 30 | curl \ 31 | libcudnn7=7.6.2.24-1+cuda10.1 \ 32 | # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it 33 | libnccl2=2.4.7-1+cuda10.1 \ 34 | libgomp1 \ 35 | libnccl-dev=2.4.7-1+cuda10.1 \ 36 | libfreetype6-dev \ 37 | libhdf5-serial-dev \ 38 | libpng-dev \ 39 | libzmq3-dev \ 40 | git \ 41 | wget \ 42 | vim \ 43 | build-essential \ 44 | openssh-client \ 45 | openssh-server \ 46 | zlib1g-dev \ 47 | # Install dependent library for OpenCV 48 | libgtk2.0-dev \ 49 | #cuda-cublas-dev not 
available with 10-1, install libcublas instead 50 | #it will downgrade the cublas from 10-2 to 10-1 51 | #adding an extra flag --allow-downgrades for it 52 | && apt-get update \ 53 | && apt-get install -y --no-install-recommends --allow-unauthenticated --allow-downgrades \ 54 | libcublas10=10.1.0.105-1 \ 55 | libcublas-dev=10.1.0.105-1 \ 56 | # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 57 | # adds a new list which contains libnvinfer library, so it needs another 58 | # 'apt-get update' to retrieve that list before it can actually install the 59 | # library. 60 | # We don't install libnvinfer-dev since we don't need to build against TensorRT, 61 | # and libnvinfer4 doesn't contain libnvinfer.a static library. 62 | # nvinfer-runtime-trt-repo doesn't have a 1804-cuda10.1 version yet. see: 63 | # https://developer.download.nvidia.cn/compute/machine-learning/repos/ubuntu1804/x86_64/ 64 | && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 65 | nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \ 66 | && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 67 | libnvinfer6=6.0.1-1+cuda10.1 \ 68 | && rm -rf /var/lib/apt/lists/* \ 69 | && mkdir -p /var/run/sshd 70 | 71 | # Install Open MPI 72 | RUN mkdir /tmp/openmpi \ 73 | && cd /tmp/openmpi \ 74 | && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 75 | && tar zxf openmpi-4.0.1.tar.gz \ 76 | && cd openmpi-4.0.1 \ 77 | && ./configure --enable-orterun-prefix-by-default \ 78 | && make -j $(nproc) all \ 79 | && make install \ 80 | && ldconfig \ 81 | && rm -rf /tmp/openmpi 82 | 83 | RUN apt-get update && apt-get install -y \ 84 | ${PYTHON} \ 85 | ${PYTHON_PIP} 86 | 87 | # Create a wrapper for OpenMPI to allow running as root by default 88 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 89 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 90 | && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ 91 | && chmod a+x /usr/local/bin/mpirun 92 | 93 | # Configure OpenMPI to run good defaults: 94 | # --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 95 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 96 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 97 | 98 | # Set default NCCL parameters 99 | RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf 100 | 101 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 102 | ENV PATH /usr/local/openmpi/bin/:$PATH 103 | ENV PATH=/usr/local/nvidia/bin:$PATH 104 | 105 | # SSH login fix. Otherwise user is kicked off after login 106 | RUN mkdir -p /var/run/sshd \ 107 | && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 108 | 109 | # Create SSH key. 
110 | RUN mkdir -p /root/.ssh/ \ 111 | && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ 112 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ 113 | && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 114 | 115 | WORKDIR / 116 | 117 | RUN ${PIP} --no-cache-dir install --upgrade \ 118 | pip \ 119 | setuptools 120 | 121 | # Some TF tools expect a "python" binary 122 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 123 | 124 | # install PyYAML==5.1.2 to avoid conflict with latest awscli 125 | # # python-dateutil==2.8.0 to satisfy botocore associated with latest awscli 126 | RUN ${PIP} install --no-cache-dir -U \ 127 | numpy==1.16.6 \ 128 | scipy==1.2.2 \ 129 | scikit-learn==0.20.4 \ 130 | pandas==0.24.2 \ 131 | Pillow==6.2.2 \ 132 | h5py==2.10.0 \ 133 | keras_applications==1.0.8 \ 134 | keras_preprocessing==1.1.0 \ 135 | keras==2.3.1 \ 136 | python-dateutil==2.8.1 \ 137 | pyYAML==5.3.1 \ 138 | requests==2.22.0 \ 139 | awscli \ 140 | mpi4py==3.0.3 \ 141 | opencv-python==4.2.0.32 \ 142 | "cryptography>=2.3" \ 143 | "sagemaker-tensorflow>=2.1,<2.2" \ 144 | "sagemaker-tensorflow-training>2,<4" \ 145 | # Let's install TensorFlow separately in the end to avoid 146 | # the library version to be overwritten 147 | && ${PIP} install --no-cache-dir -U \ 148 | ${TF_URL} 149 | 150 | # Install Horovod, temporarily using CUDA stubs 151 | RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \ 152 | && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 ${PIP} install --no-cache-dir horovod==0.18.2 \ 153 | && ldconfig 154 | 155 | # Allow OpenSSH to talk to containers without asking for confirmation 156 | RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ 157 | && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ 158 | && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config 159 | 160 | ADD https://raw.githubusercontent.com/aws/aws-deep-learning-containers-utils/master/deep_learning_container.py /usr/local/bin/deep_learning_container.py 161 | 162 | RUN chmod +x /usr/local/bin/deep_learning_container.py 163 | 164 | RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-2.1/license.txt -o /license.txt 165 | 166 | CMD ["bin/bash"] 167 | -------------------------------------------------------------------------------- /docker/2.0.0/py2/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.0-base-ubuntu18.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | # prevent stopping by user interaction 6 | ENV DEBIAN_FRONTEND noninteractive 7 | ENV DEBCONF_NONINTERACTIVE_SEEN true 8 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 9 | 10 | ENV PYTHONDONTWRITEBYTECODE=1 11 | ENV PYTHONUNBUFFERED=1 12 | ENV PYTHONIOENCODING=UTF-8 13 | ENV LANG=C.UTF-8 14 | ENV LC_ALL=C.UTF-8 15 | 16 | ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_training.tar.gz 17 | ARG TENSORFLOW_WHL=tensorflow_gpu-2.0.0-cp27-cp27mu-manylinux2010_x86_64.whl 18 | ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/2.0/AmazonLinux/gpu/final/$TENSORFLOW_WHL 19 | 20 | ARG PYTHON=python 21 | ARG PYTHON_PIP=python-pip 22 | ARG PIP=pip 23 | 24 | RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 25 | ca-certificates \ 26 | cuda-command-line-tools-10-0 \ 27 | cuda-cublas-dev-10-0 \ 28 | cuda-cudart-dev-10-0 \ 29 | cuda-cufft-dev-10-0 \ 30 | cuda-curand-dev-10-0 \ 31 | cuda-cusolver-dev-10-0 \ 32 | cuda-cusparse-dev-10-0 \ 33 | 
curl \ 34 | libcudnn7=7.5.1.10-1+cuda10.0 \ 35 | # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it 36 | libnccl2=2.4.7-1+cuda10.0 \ 37 | libgomp1 \ 38 | libnccl-dev=2.4.7-1+cuda10.0 \ 39 | libfreetype6-dev \ 40 | libhdf5-serial-dev \ 41 | libpng-dev \ 42 | libzmq3-dev \ 43 | git \ 44 | wget \ 45 | vim \ 46 | build-essential \ 47 | openssh-client \ 48 | openssh-server \ 49 | zlib1g-dev \ 50 | # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 51 | # adds a new list which contains libnvinfer library, so it needs another 52 | # 'apt-get update' to retrieve that list before it can actually install the 53 | # library. 54 | # We don't install libnvinfer-dev since we don't need to build against TensorRT, 55 | # and libnvinfer4 doesn't contain libnvinfer.a static library. 56 | && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 57 | nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \ 58 | && apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 59 | libnvinfer5=5.0.2-1+cuda10.0 \ 60 | && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \ 61 | && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \ 62 | && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \ 63 | && rm -rf /var/lib/apt/lists/* \ 64 | && mkdir -p /var/run/sshd 65 | 66 | # Install Open MPI 67 | RUN mkdir /tmp/openmpi \ 68 | && cd /tmp/openmpi \ 69 | && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 70 | && tar zxf openmpi-4.0.1.tar.gz \ 71 | && cd openmpi-4.0.1 \ 72 | && ./configure --enable-orterun-prefix-by-default \ 73 | && make -j $(nproc) all \ 74 | && make install \ 75 | && ldconfig \ 76 | && rm -rf /tmp/openmpi 77 | 78 | RUN apt-get update && apt-get install -y \ 79 | ${PYTHON} \ 80 | ${PYTHON_PIP} 81 | 82 | # Create a wrapper for OpenMPI to allow running as root by default 83 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 84 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 85 | && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ 86 | && chmod a+x /usr/local/bin/mpirun 87 | 88 | # Configure OpenMPI to run good defaults: 89 | # --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 90 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 91 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 92 | 93 | # Set default NCCL parameters 94 | RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf 95 | 96 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 97 | ENV PATH /usr/local/openmpi/bin/:$PATH 98 | ENV PATH=/usr/local/nvidia/bin:$PATH 99 | 100 | # SSH login fix. Otherwise user is kicked off after login 101 | RUN mkdir -p /var/run/sshd \ 102 | && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 103 | 104 | # Create SSH key. 105 | RUN mkdir -p /root/.ssh/ \ 106 | && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ 107 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ 108 | && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 109 | 110 | WORKDIR / 111 | 112 | RUN ${PIP} --no-cache-dir install --upgrade \ 113 | pip \ 114 | setuptools 115 | 116 | # Some TF tools expect a "python" binary 117 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 118 | 119 | COPY $FRAMEWORK_SUPPORT_INSTALLABLE . 
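# Note: TENSORFLOW_WHL and TF_URL above are build args, so the wheel fetched
# below can be swapped without editing this file. A hypothetical override, for
# illustration only (the URL is a placeholder for any compatible cp27
# manylinux2010 wheel):
#   docker build --build-arg TF_URL=https://example.com/tensorflow_gpu-2.0.0-cp27-cp27mu-manylinux2010_x86_64.whl ...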
120 | 121 | # Setup TF Wheel 122 | RUN wget $TF_URL -O /tmp/$TENSORFLOW_WHL 123 | 124 | # install PyYAML==5.1.2 to avoid conflict with latest awscli 125 | # # python-dateutil==2.8.0 to satisfy botocore associated with latest awscli 126 | RUN ${PIP} install --no-cache-dir -U \ 127 | numpy==1.16.5 \ 128 | scipy==1.2.2 \ 129 | scikit-learn==0.20.4 \ 130 | pandas==0.24.2 \ 131 | Pillow==6.2.1 \ 132 | h5py==2.10.0 \ 133 | keras_applications==1.0.8 \ 134 | keras_preprocessing==1.1.0 \ 135 | requests==2.22.0 \ 136 | keras==2.3.1 \ 137 | python-dateutil==2.8.0 \ 138 | PyYAML==5.1.2 \ 139 | awscli==1.16.303 \ 140 | mpi4py==3.0.3 \ 141 | "cryptography>=2.3" \ 142 | "sagemaker-tensorflow>=2.0,<2.1" \ 143 | # Let's install TensorFlow separately in the end to avoid 144 | # the library version to be overwritten 145 | # ${PIP} install --no-cache-dir -U ${TF_URL} \ 146 | && ${PIP} install --no-cache-dir -U \ 147 | /tmp/$TENSORFLOW_WHL \ 148 | && rm -f /tmp/$TENSORFLOW_WHL \ 149 | && ${PIP} install --no-cache-dir -U \ 150 | $FRAMEWORK_SUPPORT_INSTALLABLE \ 151 | && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE 152 | 153 | # Install Horovod, temporarily using CUDA stubs 154 | RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \ 155 | && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 ${PIP} install --no-cache-dir horovod==0.18.2 \ 156 | && ldconfig 157 | 158 | # Allow OpenSSH to talk to containers without asking for confirmation 159 | RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ 160 | && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ 161 | && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config 162 | 163 | COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py 164 | COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py 165 | 166 | RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \ 167 | && chmod +x /usr/local/bin/deep_learning_container.py 168 | 169 | RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow-2.0/license.txt -o /license.txt 170 | 171 | ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] 172 | CMD ["bin/bash"] 173 | -------------------------------------------------------------------------------- /docker/1.13.1/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.0-base-ubuntu16.04 2 | 3 | LABEL maintainer="Amazon AI" 4 | 5 | RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 6 | ca-certificates \ 7 | cuda-command-line-tools-10-0 \ 8 | cuda-cublas-dev-10-0 \ 9 | cuda-cudart-dev-10-0 \ 10 | cuda-cufft-dev-10-0 \ 11 | cuda-curand-dev-10-0 \ 12 | cuda-cusolver-dev-10-0 \ 13 | cuda-cusparse-dev-10-0 \ 14 | curl \ 15 | libcudnn7=7.5.1.10-1+cuda10.0 \ 16 | # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it 17 | libnccl2=2.4.7-1+cuda10.0 \ 18 | libgomp1 \ 19 | libnccl-dev=2.4.7-1+cuda10.0 \ 20 | libfreetype6-dev \ 21 | libhdf5-serial-dev \ 22 | libpng12-dev \ 23 | libzmq3-dev \ 24 | git \ 25 | wget \ 26 | vim \ 27 | build-essential \ 28 | openssh-client \ 29 | openssh-server \ 30 | zlib1g-dev && \ 31 | # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 32 | # adds a new list which contains libnvinfer library, so it needs another 33 | # 'apt-get update' to retrieve that list before it can actually install the 34 | # library. 
35 | # We don't install libnvinfer-dev since we don't need to build against TensorRT, 36 | # and libnvinfer4 doesn't contain libnvinfer.a static library. 37 | apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 38 | nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 && \ 39 | apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \ 40 | libnvinfer5=5.0.2-1+cuda10.0 && \ 41 | rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* && \ 42 | rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* && \ 43 | rm /usr/lib/x86_64-linux-gnu/libnvparsers* && \ 44 | rm -rf /var/lib/apt/lists/* && \ 45 | mkdir -p /var/run/sshd 46 | 47 | ########################################################################### 48 | # Horovod & its dependencies 49 | ########################################################################### 50 | 51 | # Install Open MPI 52 | RUN mkdir /tmp/openmpi && \ 53 | cd /tmp/openmpi && \ 54 | curl -fSsL -O https://www.open-mpi.org/software/ompi/v3.1/downloads/openmpi-3.1.2.tar.gz && \ 55 | tar zxf openmpi-3.1.2.tar.gz && \ 56 | cd openmpi-3.1.2 && \ 57 | ./configure --enable-orterun-prefix-by-default && \ 58 | make -j $(nproc) all && \ 59 | make install && \ 60 | ldconfig && \ 61 | rm -rf /tmp/openmpi 62 | 63 | ARG PYTHON=python3 64 | ARG PYTHON_PIP=python3-pip 65 | ARG PIP=pip3 66 | ARG PYTHON_VERSION=3.6.6 67 | 68 | RUN wget https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz && \ 69 | tar -xvf Python-$PYTHON_VERSION.tgz && cd Python-$PYTHON_VERSION && \ 70 | ./configure && make && make install && \ 71 | apt-get update && apt-get install -y --no-install-recommends libreadline-gplv2-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev && \ 72 | make && make install && rm -rf ../Python-$PYTHON_VERSION* && \ 73 | ln -s /usr/local/bin/pip3 /usr/bin/pip 74 | 75 | # Create a wrapper for OpenMPI to allow running as root by default 76 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \ 77 | echo '#!/bin/bash' > /usr/local/bin/mpirun && \ 78 | echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \ 79 | chmod a+x /usr/local/bin/mpirun 80 | 81 | # Configure OpenMPI to run good defaults: 82 | # --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 83 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \ 84 | echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 85 | 86 | # Set default NCCL parameters 87 | RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf 88 | 89 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 90 | ENV PATH /usr/local/openmpi/bin/:$PATH 91 | ENV PATH=/usr/local/nvidia/bin:$PATH 92 | 93 | # SSH login fix. Otherwise user is kicked off after login 94 | RUN mkdir -p /var/run/sshd && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 95 | 96 | # Create SSH key. 
97 | RUN mkdir -p /root/.ssh/ && \ 98 | ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \ 99 | cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \ 100 | printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 101 | 102 | ########################################################################### 103 | # Python won’t try to write .pyc or .pyo files on the import of source modules 104 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8 105 | 106 | WORKDIR / 107 | 108 | ARG TF_URL="https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.13/AmazonLinux/gpu/latest-patch-latest-patch/tensorflow-1.13.1-cp36-cp36m-linux_x86_64.whl" 109 | 110 | RUN ${PIP} --no-cache-dir install --upgrade pip setuptools 111 | 112 | # Some TF tools expect a "python" binary 113 | RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 114 | 115 | ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz 116 | COPY $framework_support_installable . 117 | 118 | RUN ${PIP} install --no-cache-dir -U \ 119 | numpy==1.16.2 \ 120 | scipy==1.2.1 \ 121 | scikit-learn==0.20.3 \ 122 | pandas==0.24.2 \ 123 | Pillow==5.4.1 \ 124 | h5py==2.9.0 \ 125 | keras_applications==1.0.7 \ 126 | keras_preprocessing==1.0.9 \ 127 | requests==2.21.0 \ 128 | keras==2.2.4 \ 129 | awscli==1.16.130 \ 130 | mpi4py==3.0.1 \ 131 | "sagemaker-tensorflow>=1.13,<1.14" \ 132 | # Let's install TensorFlow separately in the end to avoid 133 | # the library version to be overwritten 134 | && ${PIP} install --force-reinstall --no-cache-dir -U ${TF_URL} \ 135 | && ${PIP} install --no-cache-dir -U $framework_support_installable && \ 136 | rm -f $framework_support_installable \ 137 | && ${PIP} uninstall -y --no-cache-dir \ 138 | markdown \ 139 | tensorboard 140 | 141 | # Install Horovod, temporarily using CUDA stubs 142 | RUN ldconfig /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs && \ 143 | HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 ${PIP} install --no-cache-dir horovod==0.16.4 && \ 144 | ldconfig 145 | 146 | # Allow OpenSSH to talk to containers without asking for confirmation 147 | RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \ 148 | echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \ 149 | mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config 150 | 151 | ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main 152 | 153 | CMD ["bin/bash"] 154 | -------------------------------------------------------------------------------- /test/resources/mnist/mnist_custom.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
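# A hypothetical local invocation of this script, for illustration only. The
# SM_* environment variables read by parse_args below are normally injected by
# the SageMaker training platform, and the paths here are placeholders:
#   SM_CHANNEL_TRAINING=/tmp/mnist SM_HOSTS='["algo-1"]' SM_CURRENT_HOST=algo-1 \
#   python mnist_custom.py --model_dir /tmp/checkpoints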
13 | import argparse
14 | import numpy as np
15 | import os
16 | import json
17 | import tensorflow as tf
18 | import tensorflow_io as tfio  # noqa: F401 -- not referenced below; presumably imported to verify the package is installed
19 | from tensorflow.keras.layers import Conv2D, BatchNormalization, Dense, Flatten
20 | 
21 | """
22 | This script uses custom training loops to train an MNIST model and saves the checkpoints
23 | using a checkpoint manager.
24 | """
25 | 
26 | # define a model
27 | class LeNet(tf.keras.Model):
28 |     def __init__(self):
29 |         super(LeNet, self).__init__()
30 |         self.conv1 = Conv2D(
31 |             filters=16, kernel_size=3, padding='valid',
32 |             strides=(2, 2), input_shape=(None, 28, 28, 1),
33 |             data_format='channels_last', trainable=True,
34 |         )
35 | 
36 |         self.bn1 = BatchNormalization()
37 |         self.conv2 = Conv2D(
38 |             filters=16, kernel_size=3, strides=(2, 2),
39 |             data_format='channels_last', padding='valid',
40 |             trainable=True
41 |         )
42 |         self.bn2 = BatchNormalization()
43 |         self.flatten = Flatten()
44 |         self.fc = Dense(10, trainable=True)
45 | 
46 |     def call(self, x):
47 |         x = self.conv1(x)
48 |         x = self.bn1(x)
49 |         x = tf.nn.relu(x)
50 |         x = self.conv2(x)
51 |         x = self.bn2(x)
52 |         x = tf.nn.relu(x)
53 |         x = self.flatten(x)
54 |         x = self.fc(x)
55 |         return x
56 | 
57 | 
58 | @tf.function
59 | def train_step(x, y, net, optimizer, loss_summary, accuracy_summary):
60 |     """
61 |     x: input batch
62 |     y: true labels
63 |     net: model object
64 |     optimizer: optimizer
65 |     loss_summary: metric object that accumulates the loss
66 |     accuracy_summary: metric object that accumulates the accuracy
67 |     """
68 |     with tf.GradientTape() as tape:
69 |         z = net(x)
70 |         loss = tf.keras.losses.sparse_categorical_crossentropy(
71 |             y_true=y, y_pred=z, from_logits=True, axis=-1
72 |         )
73 |         loss = tf.reduce_mean(loss)
74 |     grads = tape.gradient(loss, net.trainable_variables)
75 |     optimizer.apply_gradients(zip(grads, net.trainable_variables))
76 | 
77 |     # instrument loss
78 |     loss_summary(loss)
79 | 
80 |     # instrument accuracy
81 |     accuracy_summary(y, z)
82 |     return
83 | 
84 | 
85 | @tf.function
86 | def eval_step(x, y, net, loss_summary, accuracy_summary):
87 |     # training=False is only needed if there are layers with different
88 |     # behavior during training versus inference (e.g. Dropout).
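    # BatchNormalization is one such layer, and this model uses it; calling
    # net(x, training=False) here would make the inference behavior explicit,
    # though Keras typically resolves the default to inference mode when a
    # model is called outside of fit().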
89 |     z = net(x)
90 | 
91 |     loss = tf.keras.losses.sparse_categorical_crossentropy(
92 |         y_true=y, y_pred=z, from_logits=True, axis=-1
93 |     )
94 |     loss = tf.reduce_mean(loss)
95 | 
96 |     loss_summary(loss)
97 |     accuracy_summary(y, z)
98 |     return
99 | 
100 | 
101 | def load_data(data_dir):
102 |     """Load the training and eval datasets.
103 |     """
104 |     x, y = np.load(os.path.join(data_dir, 'train_data.npy')), \
105 |         np.load(os.path.join(data_dir, 'train_labels.npy'))
106 | 
107 |     vx, vy = np.load(os.path.join(data_dir, 'eval_data.npy')), \
108 |         np.load(os.path.join(data_dir, 'eval_labels.npy'))
109 | 
110 |     print('==== train tensor shape ====')
111 |     print(x.shape, y.shape)
112 | 
113 |     print('==== eval tensor shape ====')
114 |     print(vx.shape, vy.shape)
115 |     # x.shape = (1000, 784), y.shape = (1000, )
116 | 
117 |     x, y = x.astype(np.float32), y.astype(np.int64)  # np.int64 instead of the deprecated np.int alias
118 |     vx, vy = vx.astype(np.float32), vy.astype(np.int64)
119 |     x /= 255.0
120 |     vx /= 255.0
121 | 
122 |     dtrain = tf.data.Dataset.from_tensor_slices((x, y))
123 |     dtrain = dtrain.map(lambda x, y: (tf.reshape(x, (28, 28, 1)), y))
124 |     dtrain = dtrain.shuffle(10000).batch(512)
125 | 
126 |     deval = tf.data.Dataset.from_tensor_slices((vx, vy))
127 |     deval = deval.map(lambda x, y: (tf.reshape(x, (28, 28, 1)), y))
128 |     deval = deval.batch(10)
129 |     return dtrain, deval
130 | 
131 | 
132 | def parse_args():
133 |     parser = argparse.ArgumentParser()
134 |     parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAINING'])
135 |     parser.add_argument('--model_dir', type=str)
136 |     parser.add_argument('--max-steps', type=int, default=200)
137 |     parser.add_argument('--save-checkpoint-steps', type=int, default=200)
138 |     parser.add_argument('--throttle-secs', type=int, default=60)
139 |     parser.add_argument('--hosts', type=list, default=json.loads(os.environ['SM_HOSTS']))  # NOTE: type=list would split a string into characters; in practice the default parsed from SM_HOSTS is used
140 |     parser.add_argument('--current-host', type=str, default=os.environ['SM_CURRENT_HOST'])
141 |     parser.add_argument('--batch-size', type=int, default=100)
142 |     parser.add_argument('--export-model-during-training', type=bool, default=False)  # NOTE: argparse's type=bool treats any non-empty string as True; this flag is not used below
143 |     return parser.parse_args()
144 | 
145 | 
146 | def main(args):
147 |     net = LeNet()
148 |     net.build(input_shape=(None, 28, 28, 1))
149 | 
150 |     optimizer = tf.keras.optimizers.Adam()
151 | 
152 |     train_loss = tf.keras.metrics.Mean(name='train_loss')
153 |     train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
154 | 
155 |     test_loss = tf.keras.metrics.Mean(name='test_loss')
156 |     test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')
157 | 
158 |     ckpt = tf.train.Checkpoint(optimizer=optimizer, model=net)
159 |     ckpt_manager = tf.train.CheckpointManager(
160 |         ckpt, args.model_dir, max_to_keep=5, checkpoint_name='model.ckpt'
161 |     )
162 | 
163 |     dtrain, deval = load_data(args.train)
164 |     num_epochs = 10
165 |     for i in range(num_epochs):
166 |         for x, y in dtrain:
167 |             train_step(x, y, net, optimizer, train_loss, train_accuracy)
168 | 
169 |         for x, y in deval:
170 |             eval_step(x, y, net, test_loss, test_accuracy)
171 | 
172 |         print(
173 |             f"Epoch {i+1}",
174 |             f"Train Loss: {train_loss.result()}",
175 |             f"Train Accuracy: {train_accuracy.result()}",
176 |             f"Test Loss: {test_loss.result()}",
177 |             f"Test Accuracy: {test_accuracy.result()}"
178 |         )
179 | 
180 |         if args.current_host == args.hosts[0]:  # only the first host writes checkpoints
181 |             ckpt_manager.save()
182 | 
183 | if __name__ == '__main__':
184 |     main(parse_args())
185 | -------------------------------------------------------------------------------- /docker/1.15.0/py2/Dockerfile.gpu: 
-------------------------------------------------------------------------------- 1 | # Nvidia does not publish a TensorRT Runtime library for Ubuntu 18.04 with Cuda 10.1 support, so we stick with cuda 10.0. 2 | # https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/ 3 | FROM nvidia/cuda:10.0-base-ubuntu18.04 4 | 5 | LABEL maintainer="Amazon AI" 6 | 7 | # Prevent docker build get stopped by requesting user interaction 8 | ENV DEBIAN_FRONTEND=noninteractive 9 | ENV DEBCONF_NONINTERACTIVE_SEEN=true 10 | # Python won’t try to write .pyc or .pyo files on the import of source modules 11 | ENV PYTHONDONTWRITEBYTECODE=1 12 | ENV PYTHONUNBUFFERED=1 13 | # See http://bugs.python.org/issue19846 14 | ENV PYTHONIOENCODING=UTF-8 15 | ENV LANG=C.UTF-8 16 | ENV LC_ALL=C.UTF-8 17 | # Specify the location of module that contains the training logic for SageMaker 18 | # https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html 19 | ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main 20 | 21 | # Define framework-related package sources 22 | ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_training*.tar.gz 23 | ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15/AmazonLinux/gpu/final/tensorflow_gpu-1.15.0-cp27-cp27mu-manylinux2010_x86_64.whl 24 | 25 | RUN apt-get update \ 26 | && apt-get install -y --no-install-recommends --allow-unauthenticated \ 27 | ca-certificates \ 28 | cuda-command-line-tools-10-0 \ 29 | cuda-cublas-dev-10-0 \ 30 | cuda-cudart-dev-10-0 \ 31 | cuda-cufft-dev-10-0 \ 32 | cuda-curand-dev-10-0 \ 33 | cuda-cusolver-dev-10-0 \ 34 | cuda-cusparse-dev-10-0 \ 35 | curl \ 36 | libcudnn7=7.5.1.10-1+cuda10.0 \ 37 | # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it 38 | libnccl2=2.4.7-1+cuda10.0 \ 39 | libgomp1 \ 40 | libnccl-dev=2.4.7-1+cuda10.0 \ 41 | libfreetype6-dev \ 42 | libhdf5-serial-dev \ 43 | libpng-dev \ 44 | libzmq3-dev \ 45 | git \ 46 | wget \ 47 | vim \ 48 | build-essential \ 49 | openssh-client \ 50 | openssh-server \ 51 | zlib1g-dev \ 52 | # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 53 | # adds a new list which contains libnvinfer library, so it needs another 54 | # 'apt-get update' to retrieve that list before it can actually install the library. 55 | # We don't install libnvinfer-dev since we don't need to build against TensorRT, 56 | # and libnvinfer4 doesn't contain libnvinfer.a static library. 
57 | && apt-get update \ 58 | && apt-get install -y --no-install-recommends --allow-unauthenticated \ 59 | nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \ 60 | && apt-get update \ 61 | && apt-get install -y --no-install-recommends --allow-unauthenticated \ 62 | libnvinfer5=5.0.2-1+cuda10.0 \ 63 | && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \ 64 | && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \ 65 | && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \ 66 | && rm -rf /var/lib/apt/lists/* \ 67 | && mkdir -p /var/run/sshd 68 | 69 | # Install Open MPI 70 | RUN mkdir /tmp/openmpi \ 71 | && cd /tmp/openmpi \ 72 | && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \ 73 | && tar zxf openmpi-4.0.1.tar.gz \ 74 | && cd openmpi-4.0.1 \ 75 | && ./configure --enable-orterun-prefix-by-default \ 76 | && make -j $(nproc) all \ 77 | && make install \ 78 | && ldconfig \ 79 | && rm -rf /tmp/openmpi 80 | 81 | RUN apt-get update \ 82 | && apt-get install -y \ 83 | python \ 84 | python-pip 85 | 86 | # Create a wrapper for OpenMPI to allow running as root by default 87 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \ 88 | && echo '#!/bin/bash' > /usr/local/bin/mpirun \ 89 | && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \ 90 | && chmod a+x /usr/local/bin/mpirun 91 | 92 | # Configure OpenMPI to run good defaults: 93 | # --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 94 | RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \ 95 | && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf 96 | 97 | # Set default NCCL parameters 98 | RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf 99 | 100 | ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 101 | ENV PATH /usr/local/openmpi/bin/:$PATH 102 | ENV PATH=/usr/local/nvidia/bin:$PATH 103 | 104 | # SSH login fix. Otherwise user is kicked off after login 105 | RUN mkdir -p /var/run/sshd \ 106 | && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 107 | 108 | # Create SSH key. 109 | RUN mkdir -p /root/.ssh/ \ 110 | && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ 111 | && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ 112 | && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 113 | 114 | WORKDIR / 115 | 116 | RUN pip --no-cache-dir install --upgrade \ 117 | pip \ 118 | setuptools 119 | 120 | # Some TF tools expect a "python" binary 121 | RUN ln -s $(which python) /usr/local/bin/python 122 | 123 | COPY $FRAMEWORK_SUPPORT_INSTALLABLE . 
124 | 125 | RUN pip install --no-cache-dir -U \ 126 | numpy==1.16.5 \ 127 | scipy==1.2.2 \ 128 | scikit-learn==0.20.3 \ 129 | pandas==0.24.2 \ 130 | Pillow==6.2.1 \ 131 | h5py==2.9.0 \ 132 | keras_applications==1.0.8 \ 133 | keras_preprocessing==1.1.0 \ 134 | requests==2.22.0 \ 135 | keras==2.3.1 \ 136 | mpi4py==3.0.2 \ 137 | "cryptography>=2.3" \ 138 | "sagemaker-tensorflow>=1.15,<1.16" \ 139 | # Let's install TensorFlow separately in the end to avoid the library version to be overwritten 140 | && pip install --force-reinstall --no-cache-dir -U \ 141 | ${TF_URL} \ 142 | && pip install --no-cache-dir -U \ 143 | $FRAMEWORK_SUPPORT_INSTALLABLE \ 144 | awscli==1.17.7 \ 145 | && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE 146 | 147 | # Install Horovod, temporarily using CUDA stubs 148 | RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \ 149 | && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir \ 150 | horovod==0.18.2 \ 151 | && ldconfig 152 | 153 | # Allow OpenSSH to talk to containers without asking for confirmation 154 | RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ 155 | && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ 156 | && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config 157 | 158 | COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py 159 | COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py 160 | 161 | RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \ 162 | && chmod +x /usr/local/bin/deep_learning_container.py 163 | 164 | RUN curl https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt 165 | 166 | ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] 167 | CMD ["bin/bash"] 168 | --------------------------------------------------------------------------------