├── infra ├── eks │ ├── maskrcnn │ │ ├── overlays │ │ │ ├── run1.yaml │ │ │ ├── run2.yaml │ │ │ ├── run3.yaml │ │ │ ├── run4.yaml │ │ │ ├── run5.yaml │ │ │ ├── run6.yaml │ │ │ ├── run7.yaml │ │ │ ├── run8.yaml │ │ │ ├── run9.yaml │ │ │ ├── run10.yaml │ │ │ ├── 13rings.yaml │ │ │ ├── 8x4.yaml │ │ │ ├── 16x4.yaml │ │ │ ├── larc.yaml │ │ │ ├── 24epoch.yaml │ │ │ ├── predefined_padding.yaml │ │ │ ├── 64x4.yaml │ │ │ ├── syncbn.yaml │ │ │ ├── determinism.yaml │ │ │ ├── bbrw.yaml │ │ │ └── 32x4.yaml │ │ ├── requirements.yaml │ │ ├── charts │ │ │ └── mpi-operator │ │ │ │ ├── Chart.yaml │ │ │ │ ├── values.yaml │ │ │ │ └── templates │ │ │ │ └── mpi-operator.yaml │ │ ├── Chart.yaml │ │ ├── values.yaml │ │ └── templates │ │ │ └── maskrcnn.yaml │ ├── eksctl │ │ ├── p3 │ │ │ ├── kubeconfig │ │ │ ├── delete.sh │ │ │ ├── additional_nodegroup.yaml │ │ │ ├── additional_nodegroup_non_gpu.yaml │ │ │ ├── create.sh │ │ │ └── config.yaml │ │ └── p3dn │ │ │ ├── kubeconfig │ │ │ ├── delete.sh │ │ │ ├── additional_nodegroup_non_gpu.yaml │ │ │ ├── create.sh │ │ │ └── config.yaml │ ├── helm │ │ ├── mpijob │ │ │ ├── Chart.yaml │ │ │ └── templates │ │ │ │ └── mpijob.yaml │ │ └── tiller-rbac-config.yaml │ ├── ssh.sh │ ├── tboard.sh │ ├── fsx │ │ ├── p3 │ │ │ ├── pvc-fsx.yaml │ │ │ ├── pv-fsx.yaml │ │ │ ├── attach-pvc.yaml │ │ │ ├── attach-pvc-2.yaml │ │ │ └── stage-data.yaml │ │ └── p3dn │ │ │ ├── pvc-fsx.yaml │ │ │ ├── pv-fsx.yaml │ │ │ ├── attach-pvc.yaml │ │ │ ├── attach-pvc-2.yaml │ │ │ └── stage-data.yaml │ ├── TOOLS.md │ ├── tensorboard │ │ └── tensorboard.yaml │ ├── YAML_OVERLAY.md │ └── yaml_overlay ├── ami │ ├── reinstall_tensorpack.sh │ ├── tensorboard │ │ ├── tensorboard.sh │ │ └── ssh_tensorboard.sh │ ├── export_cmd │ ├── install_libs.sh │ ├── download_data.sh │ ├── README.md │ ├── train_efa.sh │ └── no_batch_train_1node_16xl_convergence.sh ├── docker │ ├── sleep.sh │ ├── run.sh │ ├── run_multinode.sh │ ├── build.sh │ ├── ssh_and_build.sh │ ├── docker.md │ ├── train.sh │ ├── README.md │ └── train_multinode.sh └── sm │ ├── Dockerfile_sm │ ├── build_push_submit.sh │ ├── run.sh │ ├── README.md │ ├── launch_sm_job.py │ ├── Dockerfile_base │ └── build_and_push.sh ├── .dockerignore ├── NOTICE ├── COCO_image_aspect_ratio_histogram.png ├── MaskRCNN ├── utils │ ├── __init__.py │ ├── README.md │ ├── randomnness.py │ ├── mixed_precision.py │ ├── box_ops.py │ ├── generate_anchors.py │ └── np_box_ops.py ├── viz.py └── NOTES.md ├── tensorpack ├── contrib │ └── __init__.py ├── train │ ├── utility.py │ └── __init__.py ├── callbacks │ ├── stats.py │ ├── __init__.py │ ├── hooks.py │ ├── concurrency.py │ ├── group.py │ └── misc.py ├── utils │ ├── naming.py │ ├── compatible_serialize.py │ ├── __init__.py │ ├── debug.py │ ├── palette.py │ ├── gpu.py │ ├── serialize.py │ ├── timer.py │ ├── rect.py │ └── fs.py ├── models │ ├── common.py │ ├── shapes.py │ ├── utils.py │ ├── _test.py │ ├── __init__.py │ ├── nonlin.py │ ├── shape_utils.py │ ├── fc.py │ └── layer_norm.py ├── dataflow │ ├── dftools.py │ ├── imgaug │ │ ├── _test.py │ │ ├── __init__.py │ │ ├── convert.py │ │ ├── noise.py │ │ ├── external.py │ │ └── paste.py │ ├── dataset │ │ ├── __init__.py │ │ ├── svhn.py │ │ └── bsds500.py │ └── __init__.py ├── __init__.py ├── input_source │ └── __init__.py ├── predict │ ├── __init__.py │ └── feedfree.py ├── graph_builder │ ├── __init__.py │ └── predict.py ├── tfutils │ ├── __init__.py │ ├── distributed.py │ ├── symbolic_functions.py │ ├── dependency.py │ ├── sesscreate.py │ ├── model_utils.py │ └── varreplace.py └── libinfo.py ├── 
update_git.sh ├── CODE_OF_CONDUCT.md ├── setup.cfg ├── patch ├── tensorflow_Conv2DTranspose.diff ├── roi_align.diff └── README.md ├── RESULTS.md ├── .gitignore ├── Dockerfile ├── setup.py ├── README.md └── CONTRIBUTING.md /infra/eks/maskrcnn/overlays/run1.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -run1 4 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/run2.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -run2 4 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/run3.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -run3 4 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/run4.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -run4 4 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/run5.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -run5 4 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/run6.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -run6 4 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/run7.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -run7 4 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/run8.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -run8 4 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/run9.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -run9 4 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/run10.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -run10 4 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | .ignored/ 3 | .git/ 4 | 5 | tensorflow-1.13.0-cp36-cp36m-linux_x86_64.whl -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | tensorflow-mask-rcnn 2 | Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
3 | -------------------------------------------------------------------------------- /infra/eks/eksctl/p3/kubeconfig: -------------------------------------------------------------------------------- 1 | export KUBECONFIG=/Users/ubuntu/.kube/eksctl/clusters/tensorpack-mask-rcnn-p3 2 | -------------------------------------------------------------------------------- /infra/eks/eksctl/p3dn/kubeconfig: -------------------------------------------------------------------------------- 1 | export KUBECONFIG=/Users/ubuntu/.kube/eksctl/clusters/tensorpack-mask-rcnn-p3dn 2 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/requirements.yaml: -------------------------------------------------------------------------------- 1 | dependencies: 2 | - name: mpi-operator 3 | version: 1.0.0 4 | repository: ./charts -------------------------------------------------------------------------------- /COCO_image_aspect_ratio_histogram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/armandmcqueen/tensorpack-mask-rcnn/HEAD/COCO_image_aspect_ratio_histogram.png -------------------------------------------------------------------------------- /MaskRCNN/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/13rings.yaml: -------------------------------------------------------------------------------- 1 | set: 2 | maskrcnn: 3 | nccl_min_rings: 13 4 | 5 | append: 6 | global: 7 | name: -13rings 8 | 9 | -------------------------------------------------------------------------------- /tensorpack/contrib/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/8x4.yaml: -------------------------------------------------------------------------------- 1 | set: 2 | maskrcnn: 3 | gpus: 8 4 | batch_size_per_gpu: 4 5 | 6 | append: 7 | global: 8 | name: -8x4 9 | 10 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/16x4.yaml: -------------------------------------------------------------------------------- 1 | set: 2 | maskrcnn: 3 | gpus: 16 4 | batch_size_per_gpu: 4 5 | 6 | append: 7 | global: 8 | name: -16x4 9 | 10 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/larc.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -larc 4 | 5 | set: 6 | maskrcnn: 7 | image: armandmcqueen/tensorpack-mask-rcnn:dev-larc 8 | 9 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/charts/mpi-operator/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm chart for kubeflow mpi-operator 4 | name: mpi-operator 5 | version: 1.0.0 -------------------------------------------------------------------------------- /infra/eks/helm/mpijob/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm chart for kubeflow mpijob customer resource definition 4 | name: mpijob 5 | version: 1.0.0 -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/24epoch.yaml: -------------------------------------------------------------------------------- 1 | set: 2 | maskrcnn: 3 | lr_epoch_schedule: "[(16, 0.1), (20, 0.01), (24, None)]" 4 | 5 | append: 6 | global: 7 | name: -24e 8 | 9 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/predefined_padding.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -predefpad 4 | 5 | set: 6 | maskrcnn: 7 | predefined_padding: 'True' 8 | 9 | 10 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/64x4.yaml: -------------------------------------------------------------------------------- 1 | set: 2 | maskrcnn: 3 | gpus: 64 4 | batch_size_per_gpu: 4 5 | gradient_clip: 1.5 6 | 7 | append: 8 | global: 9 | name: -64x4 10 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/syncbn.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -syncbn 4 | 5 | set: 6 | maskrcnn: 7 | experiment_group: syncbn 8 | backbone_norm: SyncBN 9 | 10 | 11 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/determinism.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -determinism 4 | 5 | set: 6 | maskrcnn: 7 | image: armandmcqueen/tensorpack-mask-rcnn:dev-determinism_armand 8 | 9 | -------------------------------------------------------------------------------- /infra/eks/ssh.sh: 
-------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | kubectl exec attach-pvc-2 -it -- /bin/bash -------------------------------------------------------------------------------- /update_git.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | git commit -am "Quick update" 6 | git push -------------------------------------------------------------------------------- /infra/eks/tboard.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | kubectl port-forward tensorboard 6006:6006 -------------------------------------------------------------------------------- /infra/eks/eksctl/p3/delete.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | eksctl delete cluster -f p3_config.yaml -------------------------------------------------------------------------------- /infra/eks/maskrcnn/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm chart for running Mask RCNN (optimized) using kubeflow mpi-operator and mpi-job 4 | name: maskrcnn-optimized 5 | version: 1.0.0 -------------------------------------------------------------------------------- /infra/eks/eksctl/p3dn/delete.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | eksctl delete cluster -f p3dn_config.yaml -------------------------------------------------------------------------------- /infra/ami/reinstall_tensorpack.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | pip uninstall -y tensorpack 6 | pip install -e ./ -------------------------------------------------------------------------------- /infra/eks/maskrcnn/charts/mpi-operator/values.yaml: -------------------------------------------------------------------------------- 1 | mpioperator: 2 | name: mpi-op 3 | image: mpioperator/mpi-operator:0.1.0 4 | deliveryimage: mpioperator/kubectl-delivery:latest 5 | pullpolicy: Always 6 | gpuspernode: 8 7 | -------------------------------------------------------------------------------- /infra/docker/sleep.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | PORT_ID=${1:-1234} 5 | /usr/sbin/sshd -p $PORT_ID; sleep infinity 6 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/bbrw.yaml: -------------------------------------------------------------------------------- 1 | set: 2 | maskrcnn: 3 | bbox_reg_weights: '[20., 20., 10., 10.]' 4 | 5 | 6 | append: 7 | global: 8 | name: -bbrw 9 | maskrcnn: 10 | experiment_group: _regweights 11 | 12 | -------------------------------------------------------------------------------- /infra/ami/tensorboard/tensorboard.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | tensorboard --logdir=live:~/logs/train_log,old:~/old_logs -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/32x4.yaml: -------------------------------------------------------------------------------- 1 | set: 2 | maskrcnn: 3 | gpus: 32 4 | batch_size_per_gpu: 4 5 | gradient_clip: 1.5 # set it to zero to disable gradient clipping 6 | 7 | append: 8 | global: 9 | name: -32x4 10 | -------------------------------------------------------------------------------- /infra/eks/fsx/p3/pvc-fsx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: tensorpack-fsx 5 | spec: 6 | accessModes: 7 | - ReadWriteMany 8 | storageClassName: fsx-sc 9 | resources: 10 | requests: 11 | storage: 100Gi -------------------------------------------------------------------------------- /infra/eks/fsx/p3dn/pvc-fsx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: tensorpack-fsx 5 | spec: 6 | accessModes: 7 | - ReadWriteMany 8 | storageClassName: fsx-sc 9 | resources: 10 | requests: 11 | storage: 100Gi -------------------------------------------------------------------------------- /infra/ami/export_cmd: -------------------------------------------------------------------------------- 1 | export LD_LIBRARY_PATH=/usr/local/cuda-10.0/lib64:/usr/local/cuda-10.0/extras/CUPTI/lib64:/usr/local/cuda-10.0/lib:/usr/lib64/openmpi/lib/:/usr/local/lib:/usr/lib:/usr/local/mpi/lib:/lib/:/usr/lib64/openmpi/lib/:/usr/local/lib:/usr/lib:/usr/local/mpi/lib:/lib/: -------------------------------------------------------------------------------- /infra/sm/Dockerfile_sm: -------------------------------------------------------------------------------- 1 | FROM fewu/sagemaker-mask-rcnn:lateset 2 | 3 | # Copies the training code inside the container 4 | COPY run_mpi.py /opt/ml/code/run_mpi.py 5 | COPY run.sh /opt/ml/code/run.sh 6 | 7 | # Defines train.py as script entry point 8 | ENV SAGEMAKER_PROGRAM run_mpi.py 9 | -------------------------------------------------------------------------------- /infra/sm/build_push_submit.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | set -e 4 | imagename=$1 5 | sagemaker_iam_role=$2 6 | ./build_and_push.sh $imagename 7 | python3 launch_sm_job.py $imagename $sagemaker_iam_role 8 | -------------------------------------------------------------------------------- /infra/ami/tensorboard/ssh_tensorboard.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | # Will be able to access tensorboard at localhost:6007 6 | 7 | echo "ssh -L 127.0.0.1:6007:127.0.0.1:6006 ubuntu@" 8 | 9 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /tensorpack/train/utility.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: utility.py 5 | 6 | # for backwards-compatibility 7 | from ..graph_builder.utils import LeastLoadedDeviceSetter, OverrideToLocalVariable, override_to_local_variable # noqa 8 | -------------------------------------------------------------------------------- /tensorpack/callbacks/stats.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: stats.py 5 | 6 | from .graph import DumpParamAsImage # noqa 7 | # for compatibility only 8 | from .misc import InjectShell, SendStat # noqa 9 | 10 | __all__ = [] 11 | -------------------------------------------------------------------------------- /tensorpack/utils/naming.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: naming.py 5 | 6 | 7 | GLOBAL_STEP_INCR_OP_NAME = 'global_step_incr' 8 | 9 | # extra variables to summarize during training in a moving-average way 10 | MOVING_SUMMARY_OPS_KEY = 'MOVING_SUMMARY_OPS' 11 | -------------------------------------------------------------------------------- /infra/docker/run.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | BRANCH_NAME=${1:-"master"} 6 | 7 | echo "Running docker image tensorpack-mask-rcnn:dev-${BRANCH_NAME}" 8 | echo "" 9 | 10 | 11 | 12 | nvidia-docker run -it -v ~/data:/data -v ~/logs:/logs tensorpack-mask-rcnn:dev-${BRANCH_NAME} -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | author = TensorPack contributors 3 | author-email = ppwwyyxxc@gmail.com 4 | url = https://github.com/tensorpack/tensorpack 5 | keywords = tensorflow, deep learning, neural network 6 | license = Apache 7 | 8 | [options] 9 | zip_safe = False # dataset and __init__ use file 10 | # will call find_packages() 11 | packages = find: 12 | 13 | [wheel] 14 | universal = 1 15 | -------------------------------------------------------------------------------- /infra/ami/install_libs.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | pip install --upgrade pip 6 | pip install ujson 7 | pip install opencv-python 8 | pip install pycocotools 9 | pip install --ignore-installed numpy==1.14.5 10 | pip install tqdm 11 | pip install msgpack_numpy 12 | pip install tabulate 13 | -------------------------------------------------------------------------------- /tensorpack/models/common.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: common.py 5 | 6 | from .registry import layer_register # noqa 7 | from .tflayer import rename_tflayer_get_variable 8 | from .utils import VariableHolder # noqa 9 | 10 | __all__ = ['layer_register', 'VariableHolder', 'rename_tflayer_get_variable'] 11 | -------------------------------------------------------------------------------- /infra/docker/run_multinode.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | BRANCH_NAME=${1:-"master"} 6 | 7 | echo "Running docker image tensorpack-mask-rcnn:dev-${BRANCH_NAME}" 8 | echo "" 9 | 10 | 11 | 12 | nvidia-docker run -it --network=host -v /mnt/share/ssh:/root/.ssh -v ~/data:/data -v ~/logs:/logs tensorpack-mask-rcnn:dev-${BRANCH_NAME} 13 | -------------------------------------------------------------------------------- /infra/eks/helm/tiller-rbac-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: tiller 5 | namespace: kube-system 6 | --- 7 | apiVersion: rbac.authorization.k8s.io/v1 8 | kind: ClusterRoleBinding 9 | metadata: 10 | name: tiller 11 | roleRef: 12 | apiGroup: rbac.authorization.k8s.io 13 | kind: ClusterRole 14 | name: cluster-admin 15 | subjects: 16 | - kind: ServiceAccount 17 | name: tiller 18 | namespace: kube-system -------------------------------------------------------------------------------- /infra/ami/download_data.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | DATA_DIR=/home/ubuntu/data 6 | 7 | mkdir -p $DATA_DIR 8 | aws s3 cp s3://armand-ajay-workshop/mask-rcnn/sagemaker/input/train $DATA_DIR --recursive 9 | 10 | wget -O $DATA_DIR/pretrained-models/ImageNet-R50-AlignPadding.npz http://models.tensorpack.com/FasterRCNN/ImageNet-R50-AlignPadding.npz 11 | -------------------------------------------------------------------------------- /infra/eks/fsx/p3/pv-fsx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: tensorpack-fsx 5 | spec: 6 | capacity: 7 | storage: 7Pi 8 | volumeMode: Filesystem 9 | accessModes: 10 | - ReadWriteMany 11 | persistentVolumeReclaimPolicy: Retain 12 | storageClassName: fsx-sc 13 | csi: 14 | driver: fsx.csi.aws.com 15 | volumeHandle: fs-03f556d03c3c590a2 16 | volumeAttributes: 17 | dnsname: fs-03f556d03c3c590a2.fsx.us-east-1.amazonaws.com -------------------------------------------------------------------------------- /infra/eks/fsx/p3dn/pv-fsx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: tensorpack-fsx 5 | spec: 6 | capacity: 7 | storage: 7Pi 8 | volumeMode: Filesystem 9 | accessModes: 10 | - ReadWriteMany 11 | persistentVolumeReclaimPolicy: Retain 12 | storageClassName: fsx-sc 13 | csi: 14 | driver: fsx.csi.aws.com 15 | volumeHandle: fs-04d78cb1f96eb771e 16 | volumeAttributes: 17 | dnsname: fs-04d78cb1f96eb771e.fsx.us-east-1.amazonaws.com -------------------------------------------------------------------------------- /MaskRCNN/utils/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Some third-party helper functions 3 | 4 | + generate_anchors.py: copied from [py-faster-rcnn](https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/rpn/generate_anchors.py). 5 | + box_ops.py: modified from [TF object detection API](https://github.com/tensorflow/models/blob/master/research/object_detection/core/box_list_ops.py). 6 | + np_box_ops.py: copied from [TF object detection API](https://github.com/tensorflow/models/blob/master/research/object_detection/utils/np_box_ops.py). 7 | 8 | -------------------------------------------------------------------------------- /infra/docker/build.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | BRANCH_NAME=${1:-"master"} 6 | 7 | # The BRANCH_NAME refers to the git pull that happens inside of the Dockerfile 8 | echo "Building docker image tensorpack-mask-rcnn:dev-${BRANCH_NAME}" 9 | echo "" 10 | 11 | 12 | 13 | docker build -t tensorpack-mask-rcnn:dev-${BRANCH_NAME} ../.. 
--build-arg CACHEBUST=$(date +%s) --build-arg BRANCH_NAME=${BRANCH_NAME} -------------------------------------------------------------------------------- /infra/eks/fsx/p3/attach-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: attach-pvc 5 | spec: 6 | containers: 7 | - name: attach-pvc 8 | image: ubuntu:latest 9 | command: ["/bin/bash"] 10 | securityContext: 11 | privileged: true 12 | args: ["-c", "while true; do echo $(date -u) >> /data/out.txt; sleep 3600; done"] 13 | volumeMounts: 14 | - name: fsx 15 | mountPath: /fsx 16 | volumes: 17 | - name: fsx 18 | persistentVolumeClaim: 19 | claimName: tensorpack-fsx -------------------------------------------------------------------------------- /infra/eks/eksctl/p3/additional_nodegroup.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: eksctl.io/v1alpha5 3 | kind: ClusterConfig 4 | 5 | metadata: 6 | name: tensorpack-mask-rcnn-p3 7 | region: us-east-1 8 | 9 | nodeGroups: 10 | - name: ng-p3-1c 11 | instanceType: p3.16xlarge 12 | availabilityZones: ["us-east-1c"] 13 | desiredCapacity: 1 14 | iam: 15 | withAddonPolicies: 16 | imageBuilder: true 17 | ebs: true 18 | fsx: true 19 | efs: true 20 | ssh: 21 | allow: true 22 | publicKeyName: 'maskrcnn' 23 | -------------------------------------------------------------------------------- /infra/eks/eksctl/p3/additional_nodegroup_non_gpu.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: eksctl.io/v1alpha5 3 | kind: ClusterConfig 4 | 5 | metadata: 6 | name: tensorpack-mask-rcnn-p3 7 | region: us-east-1 8 | 9 | nodeGroups: 10 | - name: ng-c5-1b 11 | instanceType: c5.4xlarge 12 | availabilityZones: ["us-east-1b"] 13 | desiredCapacity: 1 14 | iam: 15 | withAddonPolicies: 16 | imageBuilder: true 17 | ebs: true 18 | fsx: true 19 | efs: true 20 | ssh: 21 | allow: true 22 | publicKeyName: 'maskrcnn' 23 | -------------------------------------------------------------------------------- /infra/eks/eksctl/p3dn/additional_nodegroup_non_gpu.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: eksctl.io/v1alpha5 3 | kind: ClusterConfig 4 | 5 | metadata: 6 | name: tensorpack-mask-rcnn-p3dn 7 | region: us-east-1 8 | 9 | nodeGroups: 10 | - name: ng-c5-1f 11 | instanceType: c5.4xlarge 12 | availabilityZones: ["us-east-1f"] 13 | desiredCapacity: 1 14 | iam: 15 | withAddonPolicies: 16 | imageBuilder: true 17 | ebs: true 18 | fsx: true 19 | efs: true 20 | ssh: 21 | allow: true 22 | publicKeyName: 'maskrcnn' 23 | -------------------------------------------------------------------------------- /infra/eks/fsx/p3/attach-pvc-2.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: attach-pvc-2 5 | spec: 6 | containers: 7 | - name: attach-pvc 8 | image: armandmcqueen/tensorpack-mask-rcnn:master-latest 9 | command: ["/bin/bash"] 10 | securityContext: 11 | privileged: true 12 | args: ["-c", "while true; do echo $(date -u) >> /data/out.txt; sleep 3600; done"] 13 | volumeMounts: 14 | - name: fsx 15 | mountPath: /fsx 16 | volumes: 17 | - name: fsx 18 | persistentVolumeClaim: 19 | claimName: tensorpack-fsx 20 | -------------------------------------------------------------------------------- /infra/eks/fsx/p3dn/attach-pvc.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: attach-pvc 5 | spec: 6 | containers: 7 | - name: attach-pvc 8 | image: ubuntu:latest 9 | command: ["/bin/bash"] 10 | securityContext: 11 | privileged: true 12 | args: ["-c", "while true; do echo $(date -u) >> /data/out.txt; sleep 3600; done"] 13 | volumeMounts: 14 | - name: fsx # efs or fsx 15 | mountPath: /fsx # /efs or /fsx 16 | volumes: 17 | - name: fsx # efs or fsx 18 | persistentVolumeClaim: 19 | claimName: tensorpack-fsx -------------------------------------------------------------------------------- /MaskRCNN/utils/randomnness.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | class SeedGenerator: 4 | def __init__(self, seed): 5 | self.seed = seed 6 | self.counters = dict() 7 | 8 | def next(self, key='default'): 9 | if self.seed == None: 10 | return None 11 | 12 | if key not in self.counters: 13 | self.counters[key] = self.seed 14 | return self.counters[key] 15 | else: 16 | self.counters[key] += 1 17 | return self.counters[key] -------------------------------------------------------------------------------- /infra/eks/fsx/p3dn/attach-pvc-2.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: attach-pvc-2 5 | spec: 6 | containers: 7 | - name: attach-pvc 8 | image: armandmcqueen/tensorpack-mask-rcnn:master-latest 9 | command: ["/bin/bash"] 10 | securityContext: 11 | privileged: true 12 | args: ["-c", "while true; do echo $(date -u) >> /data/out.txt; sleep 3600; done"] 13 | volumeMounts: 14 | - name: fsx # efs or fsx 15 | mountPath: /fsx # /efs or /fsx 16 | volumes: 17 | - name: fsx # efs or fsx 18 | persistentVolumeClaim: 19 | claimName: tensorpack-fsx 20 | -------------------------------------------------------------------------------- /infra/eks/TOOLS.md: -------------------------------------------------------------------------------- 1 | # Tools 2 | 3 | # Invoke Tasks 4 | 5 | We use [invoke](https://www.pyinvoke.org/) to automate tasks around EKS training. 
6 | 7 | invoke tasks are essentially python functions that can be launched with `inv task_name args` or `invoke task_name args` 8 | 9 | ## repeat 10 | 11 | Repeatedly run a string as command, replacing the substring '|N|' (can be changed) with the iteration number 12 | 13 | ``` 14 | $ inv repeat 'echo |N|' 15 | 1 16 | 2 17 | 3 18 | 4 19 | 5 20 | $ inv repeat 'echo [I]' --repeat=2 --verbose --sub='[I]' 21 | [cmd = echo 1] 22 | 1 23 | [cmd = echo 2] 24 | 2 25 | ``` 26 | 27 | ## 28 | -------------------------------------------------------------------------------- /patch/tensorflow_Conv2DTranspose.diff: -------------------------------------------------------------------------------- 1 | diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py 2 | index fc2e8aa..e3081c0 100644 3 | --- a/tensorflow/python/keras/backend.py 4 | +++ b/tensorflow/python/keras/backend.py 5 | @@ -594,7 +594,8 @@ def _has_nchw_support(): 6 | bool: if the current scope device placement would support nchw 7 | """ 8 | explicitly_on_cpu = _is_current_explicit_device('CPU') 9 | - gpus_available = bool(_get_available_gpus()) 10 | + #gpus_available = bool(_get_available_gpus()) 11 | + gpus_available = True 12 | return not explicitly_on_cpu and gpus_available -------------------------------------------------------------------------------- /infra/eks/eksctl/p3/create.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | # Need to make sure you are on the latest version of eksctl for fsx support. Tested on eksctl v0.1.32 6 | 7 | eksctl create cluster -f config.yaml --auto-kubeconfig 8 | 9 | export KUBECONFIG=/Users/ubuntu/.kube/eksctl/clusters/tensorpack-mask-rcnn-p3 10 | # aws eks --region $AWS_REGION update-kubeconfig --name $EKS_CLUSTER 11 | 12 | kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/1.0.0-beta/nvidia-device-plugin.yml 13 | 14 | 15 | # eksctl scale nodegroup --cluster=tensorpack-mask-rcnn --nodes=12 --name=ng-1 16 | -------------------------------------------------------------------------------- /infra/eks/eksctl/p3dn/create.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | # Need to make sure you are on the latest version of eksctl for fsx support. 
Tested on eksctl v0.1.32 6 | 7 | eksctl create cluster -f config.yaml --auto-kubeconfig 8 | 9 | export KUBECONFIG=/Users/ubuntu/.kube/eksctl/clusters/tensorpack-mask-rcnn-p3dn 10 | # aws eks --region $AWS_REGION update-kubeconfig --name $EKS_CLUSTER 11 | 12 | kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/1.0.0-beta/nvidia-device-plugin.yml 13 | 14 | 15 | # eksctl scale nodegroup --cluster=tensorpack-mask-rcnn --nodes=12 --name=ng-1 16 | -------------------------------------------------------------------------------- /infra/eks/tensorboard/tensorboard.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: tensorboard 5 | spec: 6 | containers: 7 | - name: tensorboard 8 | image: armandmcqueen/tensorpack-mask-rcnn:dev-master 9 | command: ["tensorboard"] 10 | args: ["--logdir=/fsx"] 11 | securityContext: 12 | privileged: true 13 | volumeMounts: 14 | - name: fsx # efs or fsx 15 | mountPath: /fsx # /efs or /fsx 16 | ports: 17 | - containerPort: 6006 18 | # https://kubernetes.io/docs/tasks/access-application-cluster/port-forward-access-application-cluster/ 19 | # 20 | 21 | volumes: 22 | - name: fsx # efs or fsx 23 | persistentVolumeClaim: 24 | claimName: tensorpack-fsx 25 | -------------------------------------------------------------------------------- /tensorpack/utils/compatible_serialize.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | import os 4 | 5 | from .serialize import dumps_msgpack, dumps_pyarrow, loads_msgpack, loads_pyarrow 6 | 7 | """ 8 | Serialization that has compatibility guarantee (therefore is safe to store to disk). 9 | """ 10 | 11 | __all__ = ['loads', 'dumps'] 12 | 13 | 14 | # pyarrow has no compatibility guarantee 15 | # use msgpack for persistent serialization, unless explicitly set from envvar 16 | if os.environ.get('TENSORPACK_COMPATIBLE_SERIALIZE', 'msgpack') == 'msgpack': 17 | loads = loads_msgpack 18 | dumps = dumps_msgpack 19 | else: 20 | loads = loads_pyarrow 21 | dumps = dumps_pyarrow 22 | -------------------------------------------------------------------------------- /tensorpack/dataflow/dftools.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: dftools.py 5 | 6 | 7 | from ..utils.develop import deprecated 8 | from .remote import dump_dataflow_to_process_queue 9 | from .serialize import LMDBSerializer, TFRecordSerializer 10 | 11 | __all__ = ['dump_dataflow_to_process_queue', 12 | 'dump_dataflow_to_lmdb', 'dump_dataflow_to_tfrecord'] 13 | 14 | 15 | @deprecated("Use LMDBSerializer.save instead!", "2019-01-31") 16 | def dump_dataflow_to_lmdb(df, lmdb_path, write_frequency=5000): 17 | LMDBSerializer.save(df, lmdb_path, write_frequency) 18 | 19 | 20 | @deprecated("Use TFRecordSerializer.save instead!", "2019-01-31") 21 | def dump_dataflow_to_tfrecord(df, path): 22 | TFRecordSerializer.save(df, path) 23 | -------------------------------------------------------------------------------- /tensorpack/models/shapes.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: shapes.py 5 | 6 | 7 | import tensorflow as tf 8 | 9 | from .common import layer_register 10 | 11 | __all__ = ['ConcatWith'] 12 | 13 | 14 | @layer_register(use_scope=None) 15 | def ConcatWith(x, tensor, dim): 16 | """ 17 | A wrapper around ``tf.concat`` to cooperate with :class:`LinearWrap`. 18 | 19 | Args: 20 | x (tf.Tensor): input 21 | tensor (list[tf.Tensor]): a tensor or list of tensors to concatenate with x. 22 | x will be at the beginning 23 | dim (int): the dimension along which to concatenate 24 | 25 | Returns: 26 | tf.Tensor: ``tf.concat([x] + tensor, dim)`` 27 | """ 28 | if type(tensor) != list: 29 | tensor = [tensor] 30 | return tf.concat([x] + tensor, dim) 31 | -------------------------------------------------------------------------------- /infra/docker/ssh_and_build.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | MASTER_HOST=${1:-"127.0.0.1"} 6 | HOSTS=${2:-"hosts"} 7 | BRANCH_NAME=${3:-"master"} 8 | 9 | 10 | ssh-keygen -q -t rsa -N '' -f ~/.ssh/id_rsa 11 | hosts=`cat $HOSTS` 12 | for host in $hosts; do 13 | scp ~/.ssh/id_rsa.pub $host:~/.ssh/ 14 | ssh $host "cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys" 15 | ssh $host "printf 'Host *\n StrictHostKeyChecking no\n' >> ~/.ssh/config" 16 | ssh $host "chmod 400 ~/.ssh/config" 17 | ssh $host "sudo mkdir -p /mnt/share/ssh" 18 | ssh $host "sudo cp -r ~/.ssh/* /mnt/share/ssh" 19 | if [ $host != $MASTER_HOST ]; then 20 | ssh $host "git clone https://github.com/armandmcqueen/tensorpack-mask-rcnn.git -b ${BRANCH_NAME}" 21 | fi 22 | ssh $host "cd ~/tensorpack-mask-rcnn/infra/docker; ./build.sh" 23 | done 24 | -------------------------------------------------------------------------------- /patch/roi_align.diff: -------------------------------------------------------------------------------- 1 | diff --git a/tensorflow/core/kernels/roi_align_op.cu.cc b/tensorflow/core/kernels/roi_align_op.cu.cc 2 | index 886f4bc81f..0a801dceb8 100644 3 | --- a/tensorflow/core/kernels/roi_align_op.cu.cc 4 | +++ b/tensorflow/core/kernels/roi_align_op.cu.cc 5 | @@ -1298,8 +1298,8 @@ __global__ void WriteUprightBoxesOutput(const CudaLaunchConfig nboxes, 6 | d_image_out_rois[base_idx + 0] = image_index; 7 | d_image_out_rois[base_idx + 1] = box.x; 8 | d_image_out_rois[base_idx + 2] = box.y; 9 | - d_image_out_rois[base_idx + 3] = box.z; 10 | - d_image_out_rois[base_idx + 4] = box.w; 11 | + d_image_out_rois[base_idx + 3] = box.z + 1.0f; 12 | + d_image_out_rois[base_idx + 4] = box.w + 1.0f; 13 | } 14 | } 15 | 16 | @@ -2395,4 +2395,4 @@ REGISTER_KERNEL_BUILDER( 17 | tensorflow::sami::BoxIntersectionOverUnion); 18 | 19 | } // namespace tensorflow 20 | -#endif 21 | \ No newline at end of file 22 | +#endif 23 | -------------------------------------------------------------------------------- /infra/ami/README.md: -------------------------------------------------------------------------------- 1 | # README 2 | ## Upgrading protoc to 3.6.1 for Horovod install 3 | 4 | Required on DLAMI 21.2 5 | 6 | ``` 7 | pip uninstall -y protobuf 8 | 9 | rm /home/ubuntu/anaconda3/envs/tensorflow_p36_13rc1/bin/protoc 10 | rm -r /home/ubuntu/anaconda3/envs/tensorflow_p36_13rc1/include/google/protobuf 11 | rm /home/ubuntu/anaconda3/envs/tensorflow_p36_13rc1/lib/python3.6/site-packages/protobuf-3.6.0-py3.6-nspkg.pth 12 | 
rm /home/ubuntu/anaconda3/bin//protoc 13 | 14 | wget https://github.com/google/protobuf/releases/download/v3.6.1/protoc-3.6.1-linux-x86_64.zip 15 | mkdir -p /home/ubuntu/protoc 16 | mv protoc-3.6.1-linux-x86_64.zip /home/ubuntu/protoc/protoc-3.6.1-linux-x86_64.zip 17 | unzip /home/ubuntu/protoc/protoc-3.6.1-linux-x86_64.zip -d protoc 18 | sudo mv /home/ubuntu/protoc/bin/protoc /home/ubuntu/anaconda3/envs/tensorflow_p36_13rc1/bin/protoc 19 | sudo mv /home/ubuntu/protoc/include/* /home/ubuntu/anaconda3/envs/tensorflow_p36_13rc1/include 20 | pip install protobuf==3.6.1 -------------------------------------------------------------------------------- /infra/sm/run.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | cd /opt/ml/code/tensorpack-mask-rcnn 5 | BATCH_SIZE_PER_GPU=4 6 | THROUGHPUT_LOG_FREQ=2000 7 | echo "Launch training job...." 8 | /usr/local/bin/python3 MaskRCNN/train.py \ 9 | --logdir /logs/train_log \ 10 | --fp16 \ 11 | --throughput_log_freq ${THROUGHPUT_LOG_FREQ} \ 12 | --config \ 13 | MODE_MASK=True \ 14 | MODE_FPN=True \ 15 | DATA.BASEDIR=/opt/ml/code/data \ 16 | DATA.TRAIN='["train2017"]' \ 17 | DATA.VAL='("val2017",)' \ 18 | TRAIN.BATCH_SIZE_PER_GPU=${BATCH_SIZE_PER_GPU} \ 19 | TRAIN.LR_EPOCH_SCHEDULE='[(8, 0.1), (10, 0.01), (12, None)]' \ 20 | TRAIN.EVAL_PERIOD=12 \ 21 | BACKBONE.WEIGHTS=/opt/ml/code/data/pretrained-models/ImageNet-R50-AlignPadding.npz \ 22 | RPN.TOPK_PER_IMAGE=True \ 23 | PREPROC.PREDEFINED_PADDING=True \ 24 | TRAIN.GRADIENT_CLIP=0 \ 25 | BACKBONE.NORM=FreezeBN \ 26 | TRAINER=horovod 27 | -------------------------------------------------------------------------------- /tensorpack/dataflow/imgaug/_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: _test.py 5 | 6 | 7 | import sys 8 | import cv2 9 | 10 | from . 
import AugmentorList 11 | from .crop import * 12 | from .deform import * 13 | from .imgproc import * 14 | from .noise import SaltPepperNoise 15 | from .noname import * 16 | 17 | anchors = [(0.2, 0.2), (0.7, 0.2), (0.8, 0.8), (0.5, 0.5), (0.2, 0.5)] 18 | augmentors = AugmentorList([ 19 | Contrast((0.8, 1.2)), 20 | Flip(horiz=True), 21 | GaussianDeform(anchors, (360, 480), 0.2, randrange=20), 22 | # RandomCropRandomShape(0.3), 23 | SaltPepperNoise() 24 | ]) 25 | 26 | img = cv2.imread(sys.argv[1]) 27 | newimg, prms = augmentors._augment_return_params(img) 28 | cv2.imshow(" ", newimg.astype('uint8')) 29 | cv2.waitKey() 30 | 31 | newimg = augmentors._augment(img, prms) 32 | cv2.imshow(" ", newimg.astype('uint8')) 33 | cv2.waitKey() 34 | -------------------------------------------------------------------------------- /infra/eks/eksctl/p3/config.yaml: -------------------------------------------------------------------------------- 1 | # An example of ClusterConfig object with access to CSI drivers: 2 | --- 3 | apiVersion: eksctl.io/v1alpha5 4 | kind: ClusterConfig 5 | 6 | metadata: 7 | name: tensorpack-mask-rcnn-p3 8 | region: us-east-1 9 | 10 | 11 | vpc: 12 | id: "vpc-f6570b8d" # (optional, must match VPC ID used for each subnet below) 13 | subnets: 14 | # must provide 'private' and/or 'public' subnets by availability zone as shown 15 | public: 16 | us-east-1b: 17 | id: "subnet-58b35b04" 18 | 19 | us-east-1c: 20 | id: "subnet-b440b9d3" 21 | 22 | us-east-1f: 23 | id: "subnet-21ac2f2e" 24 | 25 | nodeGroups: 26 | - name: ng-p3-1f 27 | instanceType: p3.16xlarge 28 | availabilityZones: ["us-east-1f"] 29 | desiredCapacity: 1 30 | iam: 31 | withAddonPolicies: 32 | imageBuilder: true 33 | ebs: true 34 | fsx: true 35 | efs: true 36 | ssh: 37 | allow: true 38 | publicKeyName: 'maskrcnn' 39 | -------------------------------------------------------------------------------- /tensorpack/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: __init__.py 5 | 6 | """ 7 | Common utils. 8 | These utils should be irrelevant to tensorflow. 9 | """ 10 | 11 | # https://github.com/celery/kombu/blob/7d13f9b95d0b50c94393b962e6def928511bfda6/kombu/__init__.py#L34-L36 12 | STATICA_HACK = True 13 | globals()['kcah_acitats'[::-1].upper()] = False 14 | if STATICA_HACK: 15 | from .utils import * 16 | 17 | 18 | __all__ = [] 19 | 20 | 21 | def _global_import(name): 22 | p = __import__(name, globals(), None, level=1) 23 | lst = p.__all__ if '__all__' in dir(p) else dir(p) 24 | for k in lst: 25 | if not k.startswith('__'): 26 | globals()[k] = p.__dict__[k] 27 | __all__.append(k) 28 | 29 | 30 | _global_import('utils') 31 | 32 | # Import no other submodules. they are supposed to be explicitly imported by users. 33 | __all__.extend(['logger']) 34 | -------------------------------------------------------------------------------- /tensorpack/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: __init__.py 5 | 6 | 7 | from tensorpack.libinfo import __version__, __git_version__, _HAS_TF 8 | 9 | from tensorpack.utils import * 10 | from tensorpack.dataflow import * 11 | 12 | # dataflow can be used alone without installing tensorflow 13 | # TODO maybe separate dataflow to a new project if it's good enough 14 | 15 | # https://github.com/celery/kombu/blob/7d13f9b95d0b50c94393b962e6def928511bfda6/kombu/__init__.py#L34-L36 16 | STATICA_HACK = True 17 | globals()['kcah_acitats'[::-1].upper()] = _HAS_TF 18 | if STATICA_HACK: 19 | from tensorpack.models import * 20 | 21 | from tensorpack.callbacks import * 22 | from tensorpack.tfutils import * 23 | 24 | from tensorpack.train import * 25 | from tensorpack.graph_builder import InputDesc, ModelDesc, ModelDescBase 26 | from tensorpack.input_source import * 27 | from tensorpack.predict import * 28 | -------------------------------------------------------------------------------- /infra/eks/eksctl/p3dn/config.yaml: -------------------------------------------------------------------------------- 1 | # An example of ClusterConfig object with access to CSI drivers: 2 | --- 3 | apiVersion: eksctl.io/v1alpha5 4 | kind: ClusterConfig 5 | 6 | metadata: 7 | name: tensorpack-mask-rcnn-p3dn 8 | region: us-east-1 9 | 10 | 11 | vpc: 12 | id: "vpc-f6570b8d" # (optional, must match VPC ID used for each subnet below) 13 | subnets: 14 | # must provide 'private' and/or 'public' subnets by availability zone as shown 15 | public: 16 | us-east-1b: 17 | id: "subnet-58b35b04" 18 | 19 | us-east-1c: 20 | id: "subnet-b440b9d3" 21 | 22 | us-east-1f: 23 | id: "subnet-21ac2f2e" 24 | 25 | nodeGroups: 26 | - name: ng-p3dn-1c 27 | instanceType: p3dn.24xlarge 28 | availabilityZones: ["us-east-1c"] 29 | desiredCapacity: 1 30 | iam: 31 | withAddonPolicies: 32 | imageBuilder: true 33 | ebs: true 34 | fsx: true 35 | efs: true 36 | ssh: 37 | allow: true 38 | publicKeyName: 'maskrcnn' 39 | 40 | 41 | 42 | # Never eksctl version require: 43 | # 44 | #ssh: 45 | # allow: true 46 | # publicKeyName: 'us-east-1-benchmark-tf' 47 | -------------------------------------------------------------------------------- /tensorpack/models/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: utils.py 5 | 6 | import six 7 | 8 | 9 | class VariableHolder(object): 10 | """ A proxy to access variables defined in a layer. 
""" 11 | def __init__(self, **kwargs): 12 | """ 13 | Args: 14 | kwargs: {name:variable} 15 | """ 16 | self._vars = {} 17 | for k, v in six.iteritems(kwargs): 18 | self._add_variable(k, v) 19 | 20 | def _add_variable(self, name, var): 21 | assert name not in self._vars 22 | self._vars[name] = var 23 | 24 | def __setattr__(self, name, var): 25 | if not name.startswith('_'): 26 | self._add_variable(name, var) 27 | else: 28 | # private attributes 29 | super(VariableHolder, self).__setattr__(name, var) 30 | 31 | def __getattr__(self, name): 32 | return self._vars[name] 33 | 34 | def all(self): 35 | """ 36 | Returns: 37 | list of all variables 38 | """ 39 | return list(six.itervalues(self._vars)) 40 | -------------------------------------------------------------------------------- /infra/eks/fsx/p3/stage-data.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: stage-data 6 | data: 7 | stage-data.sh: | 8 | aws s3 cp --recursive s3://$S3_BUCKET/$S3_PREFIX $STAGE_DIR 9 | --- 10 | apiVersion: v1 11 | kind: Pod 12 | metadata: 13 | name: stage-data 14 | spec: 15 | restartPolicy: Never 16 | volumes: 17 | - name: fsx # efs, or fsx 18 | persistentVolumeClaim: 19 | claimName: tensorpack-fsx 20 | - name: config 21 | configMap: 22 | defaultMode: 420 23 | items: 24 | - key: stage-data.sh 25 | mode: 365 26 | path: stage-data.sh 27 | name: stage-data 28 | containers: 29 | - name: data 30 | env: 31 | - name: S3_BUCKET 32 | value: armand-ajay-workshop 33 | - name: S3_PREFIX 34 | value: mask-rcnn/sagemaker/input/train 35 | - name: STAGE_DIR 36 | value: /fsx 37 | command: 38 | - sh 39 | - /etc/config/stage-data.sh 40 | image: armandmcqueen/tensorpack-mask-rcnn:master-latest 41 | imagePullPolicy: IfNotPresent 42 | volumeMounts: 43 | - mountPath: /etc/config 44 | name: config 45 | - mountPath: /fsx # /efs or /fsx 46 | name: fsx # efs, or fsx -------------------------------------------------------------------------------- /infra/eks/fsx/p3dn/stage-data.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: stage-data 6 | data: 7 | stage-data.sh: | 8 | aws s3 cp --recursive s3://$S3_BUCKET/$S3_PREFIX $STAGE_DIR 9 | --- 10 | apiVersion: v1 11 | kind: Pod 12 | metadata: 13 | name: stage-data 14 | spec: 15 | restartPolicy: Never 16 | volumes: 17 | - name: fsx # efs, or fsx 18 | persistentVolumeClaim: 19 | claimName: tensorpack-fsx 20 | - name: config 21 | configMap: 22 | defaultMode: 420 23 | items: 24 | - key: stage-data.sh 25 | mode: 365 26 | path: stage-data.sh 27 | name: stage-data 28 | containers: 29 | - name: data 30 | env: 31 | - name: S3_BUCKET 32 | value: armand-ajay-workshop 33 | - name: S3_PREFIX 34 | value: mask-rcnn/sagemaker/input/train 35 | - name: STAGE_DIR 36 | value: /fsx 37 | command: 38 | - sh 39 | - /etc/config/stage-data.sh 40 | image: armandmcqueen/tensorpack-mask-rcnn:master-latest 41 | imagePullPolicy: IfNotPresent 42 | volumeMounts: 43 | - mountPath: /etc/config 44 | name: config 45 | - mountPath: /fsx # /efs or /fsx 46 | name: fsx # efs, or fsx -------------------------------------------------------------------------------- /infra/docker/docker.md: -------------------------------------------------------------------------------- 1 | # TODO 2 | 3 | Add image to dockerhub and add scripts to push to ECR. 
4 | 5 | https://github.com/ajayvohra2005/tf-tp-hvd-eks/blob/master/container/build_tools/build_and_push.sh 6 | 7 | # Using Docker 8 | 9 | The EC2 instance must have the training data available at ~/data. 10 | 11 | ### Build container 12 | ``` 13 | cd docker 14 | ./build.sh 15 | ``` 16 | 17 | ### Run container interactively 18 | ``` 19 | ./run.sh 20 | ``` 21 | 22 | 23 | ### Run training job inside container 24 | 25 | ``` 26 | cd tensorpack-mask-rcnn 27 | infra/docker/train.sh 8 1 250 28 | ``` 29 | 30 | 31 | This is 8 GPUs, 1 img per GPU, summary writer logs every 250 steps. 32 | 33 | Logs will be exposed to the EC2 instance at ~/logs. 34 | 35 | ### Attaching/Detaching from docker container 36 | `ctrl + p + q` will detach 37 | `docker ps` will give info on the running docker containers, including a convenient name. 38 | `docker attach $CONTAINER_NAME` will reattach to the running docker container. 39 | 40 | ## Notes 41 | 42 | The current Dockerfile uses the wheel built for p3.16xl. The wheel built for p3dn.24xl might have a performance improvement, but it does not run on 16xl due to different available instruction sets. 43 | -------------------------------------------------------------------------------- /tensorpack/models/_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: _test.py 5 | 6 | 7 | import logging 8 | import unittest 9 | import tensorflow as tf 10 | 11 | 12 | class TestModel(unittest.TestCase): 13 | 14 | def run_variable(self, var): 15 | sess = tf.Session() 16 | sess.run(tf.global_variables_initializer()) 17 | if isinstance(var, list): 18 | return sess.run(var) 19 | else: 20 | return sess.run([var])[0] 21 | 22 | def make_variable(self, *args): 23 | if len(args) > 1: 24 | return [tf.Variable(k) for k in args] 25 | else: 26 | return tf.Variable(args[0]) 27 | 28 | 29 | def run_test_case(case): 30 | suite = unittest.TestLoader().loadTestsFromTestCase(case) 31 | unittest.TextTestRunner(verbosity=2).run(suite) 32 | 33 | 34 | if __name__ == '__main__': 35 | import tensorpack 36 | from tensorpack.utils import logger 37 | from . import * # noqa 38 | logger.setLevel(logging.CRITICAL) 39 | subs = tensorpack.models._test.TestModel.__subclasses__() 40 | for cls in subs: 41 | run_test_case(cls) 42 | -------------------------------------------------------------------------------- /tensorpack/input_source/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: __init__.py 5 | 6 | # https://github.com/celery/kombu/blob/7d13f9b95d0b50c94393b962e6def928511bfda6/kombu/__init__.py#L34-L36 7 | STATICA_HACK = True 8 | globals()['kcah_acitats'[::-1].upper()] = False 9 | if STATICA_HACK: 10 | from .input_source_base import * 11 | from .input_source import * 12 | 13 | from pkgutil import iter_modules 14 | import os 15 | import os.path 16 | 17 | __all__ = [] 18 | 19 | 20 | def global_import(name): 21 | p = __import__(name, globals(), locals(), level=1) 22 | lst = p.__all__ if '__all__' in dir(p) else [] 23 | del globals()[name] 24 | for k in lst: 25 | if not k.startswith('__'): 26 | globals()[k] = p.__dict__[k] 27 | __all__.append(k) 28 | 29 | 30 | _CURR_DIR = os.path.dirname(__file__) 31 | _SKIP = [] 32 | for _, module_name, _ in iter_modules( 33 | [_CURR_DIR]): 34 | srcpath = os.path.join(_CURR_DIR, module_name + '.py') 35 | if not os.path.isfile(srcpath): 36 | continue 37 | if module_name.startswith('_'): 38 | continue 39 | if module_name not in _SKIP: 40 | global_import(module_name) 41 | -------------------------------------------------------------------------------- /RESULTS.md: -------------------------------------------------------------------------------- 1 | # Results 2 | 3 | Detailed results coming soon! 4 | 5 | ## Advanced configurations 6 | 7 | There are a few advanced configurations that you should be aware of for optimal performance. 8 | 9 | ### p3dn 10 | 11 | When using p3dn, you will want to use 13 NCCL rings. With p3.16xl, 8 NCCL rings is a good choice. 12 | 13 | ### Prioritizing bounding box accuracy 14 | 15 | You can use a improved bounding box regression weight (`cfg.FRCNN.BBOX_REG_WEIGHTS`) to get better bounding box mAP. If you use `[20, 20, 10, 10]` instead of `[10., 10., 5., 5.]` you will see a solid improvement in bbox mAP (for 12 epochs, 8x4 training, from 37.3 to 398) with a slight decrease in segmentation accuracy (34.3 to 34.2). As you increase the total batch size, the bbox improvement decreases and the segm penalty increases. 16 | 17 | ### SyncBN 18 | 19 | You can use SyncBN to train with very large batch sizes without getting NaN losses. However, currently the accuracy is generally lower than when using FreezeBN and the throughput is significantly worse. 20 | 21 | ### Large batch size 22 | 23 | When training in the 32x4 configuration, you will get NaN ~5% of the time if you do not use gradient clipping. To enable gradient clipping, you need to add `TRAIN.GRADIENT_CLIP=1.5` to the config. This has a minor throughput impact, but eliminates NaN runs. -------------------------------------------------------------------------------- /tensorpack/predict/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: __init__.py 5 | 6 | # https://github.com/celery/kombu/blob/7d13f9b95d0b50c94393b962e6def928511bfda6/kombu/__init__.py#L34-L36 7 | STATICA_HACK = True 8 | globals()['kcah_acitats'[::-1].upper()] = False 9 | if STATICA_HACK: 10 | from .base import * 11 | from .concurrency import * 12 | from .config import * 13 | from .dataset import * 14 | from .multigpu import * 15 | 16 | 17 | from pkgutil import iter_modules 18 | import os 19 | import os.path 20 | 21 | __all__ = [] 22 | 23 | 24 | def global_import(name): 25 | p = __import__(name, globals(), locals(), level=1) 26 | lst = p.__all__ if '__all__' in dir(p) else dir(p) 27 | if lst: 28 | del globals()[name] 29 | for k in lst: 30 | globals()[k] = p.__dict__[k] 31 | __all__.append(k) 32 | 33 | 34 | _CURR_DIR = os.path.dirname(__file__) 35 | for _, module_name, _ in iter_modules( 36 | [_CURR_DIR]): 37 | srcpath = os.path.join(_CURR_DIR, module_name + '.py') 38 | if not os.path.isfile(srcpath): 39 | continue 40 | if module_name.startswith('_'): 41 | continue 42 | global_import(module_name) 43 | -------------------------------------------------------------------------------- /tensorpack/utils/debug.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: debug.py 5 | 6 | 7 | import sys 8 | 9 | 10 | def enable_call_trace(): 11 | """ Enable trace for calls to any function. """ 12 | def tracer(frame, event, arg): 13 | if event == 'call': 14 | co = frame.f_code 15 | func_name = co.co_name 16 | if func_name == 'write' or func_name == 'print': 17 | # ignore write() calls from print statements 18 | return 19 | func_line_no = frame.f_lineno 20 | func_filename = co.co_filename 21 | caller = frame.f_back 22 | if caller: 23 | caller_line_no = caller.f_lineno 24 | caller_filename = caller.f_code.co_filename 25 | print('Call to `%s` on line %s:%s from %s:%s' % 26 | (func_name, func_filename, func_line_no, 27 | caller_filename, caller_line_no)) 28 | return 29 | sys.settrace(tracer) 30 | 31 | 32 | if __name__ == '__main__': 33 | enable_call_trace() 34 | 35 | def b(a): 36 | print(2) 37 | 38 | def a(): 39 | print(1) 40 | b(1) 41 | 42 | a() 43 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/values.yaml: -------------------------------------------------------------------------------- 1 | global: 2 | namespace: default 3 | name: maskrcnn 4 | maskrcnn: 5 | experiment_group: default # For organizing result dirs. 
6 | gpus: 8 7 | batch_size_per_gpu: 1 8 | image: armandmcqueen/tensorpack-mask-rcnn:master-latest # image URL from ECR or DockerHub 9 | train_script: /tensorpack-mask-rcnn/MaskRCNN/train.py 10 | fp_16: 1 # TODO: Setting this to 0 does not disable FP16, it just disables loss scaling 11 | base_lr: 0.00125 12 | warmup_lr: 0.000416667 13 | shared_fs: fsx 14 | data_fs: fsx 15 | shared_pvc: tensorpack-fsx 16 | data_dir: '' 17 | working_dir: /tensorpack-mask-rcnn 18 | images_per_epoch: 120000 19 | lr_epoch_schedule: "[(8, 0.1), (10, 0.01), (12, None)]" 20 | eval_period_in_epochs: 24 21 | data_train: "[\"train2017\"]" 22 | data_val: "(\"val2017\")" 23 | mode_fpn: 'True' 24 | mode_mask: 'True' 25 | backbone_norm: FreezeBN 26 | backbone_weights: pretrained-models/ImageNet-R50-AlignPadding.npz 27 | predefined_padding: 'True' 28 | topk_per_image: 'True' 29 | image_pull_policy: Always 30 | horovod_cycle_time: "0.5" 31 | horovod_fusion_threshold: "67108864" 32 | nccl_socket_ifname: ^lo,docker0 33 | nccl_min_rings: 8 34 | nccl_debug: INFO 35 | bbox_reg_weights: '[10., 10., 5., 5.]' 36 | result_score_thresh: 0.05 37 | gpus_per_node: 8 38 | -------------------------------------------------------------------------------- /tensorpack/dataflow/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: __init__.py 5 | 6 | # https://github.com/celery/kombu/blob/7d13f9b95d0b50c94393b962e6def928511bfda6/kombu/__init__.py#L34-L36 7 | STATICA_HACK = True 8 | globals()['kcah_acitats'[::-1].upper()] = False 9 | if STATICA_HACK: 10 | from .bsds500 import * 11 | from .cifar import * 12 | from .ilsvrc import * 13 | from .mnist import * 14 | from .svhn import * 15 | 16 | from pkgutil import iter_modules 17 | import os 18 | import os.path 19 | 20 | __all__ = [] 21 | 22 | 23 | def global_import(name): 24 | p = __import__(name, globals(), locals(), level=1) 25 | lst = p.__all__ if '__all__' in dir(p) else dir(p) 26 | if lst: 27 | del globals()[name] 28 | for k in lst: 29 | if not k.startswith('__'): 30 | globals()[k] = p.__dict__[k] 31 | __all__.append(k) 32 | 33 | 34 | _CURR_DIR = os.path.dirname(__file__) 35 | for _, module_name, _ in iter_modules( 36 | [_CURR_DIR]): 37 | srcpath = os.path.join(_CURR_DIR, module_name + '.py') 38 | if not os.path.isfile(srcpath): 39 | continue 40 | if not module_name.startswith('_'): 41 | global_import(module_name) 42 | -------------------------------------------------------------------------------- /tensorpack/train/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: __init__.py 5 | # flake8: noqa 6 | 7 | # https://github.com/celery/kombu/blob/7d13f9b95d0b50c94393b962e6def928511bfda6/kombu/__init__.py#L34-L36 8 | STATICA_HACK = True 9 | globals()['kcah_acitats'[::-1].upper()] = False 10 | if STATICA_HACK: 11 | from .base import * 12 | from .config import * 13 | from .interface import * 14 | from .tower import * 15 | from .trainers import * 16 | 17 | 18 | from pkgutil import iter_modules 19 | import os 20 | import os.path 21 | 22 | __all__ = [] 23 | 24 | 25 | def global_import(name): 26 | p = __import__(name, globals(), locals(), level=1) 27 | lst = p.__all__ if '__all__' in dir(p) else [] 28 | if lst: 29 | del globals()[name] 30 | for k in lst: 31 | globals()[k] = p.__dict__[k] 32 | __all__.append(k) 33 | 34 | 35 | _CURR_DIR = os.path.dirname(__file__) 36 | _SKIP = ['utility'] 37 | for _, module_name, _ in iter_modules( 38 | [_CURR_DIR]): 39 | srcpath = os.path.join(_CURR_DIR, module_name + '.py') 40 | if not os.path.isfile(srcpath): 41 | continue 42 | if module_name.startswith('_'): 43 | continue 44 | if module_name not in _SKIP: 45 | global_import(module_name) 46 | -------------------------------------------------------------------------------- /tensorpack/graph_builder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: __init__.py 5 | 6 | # https://github.com/celery/kombu/blob/7d13f9b95d0b50c94393b962e6def928511bfda6/kombu/__init__.py#L34-L36 7 | STATICA_HACK = True 8 | globals()['kcah_acitats'[::-1].upper()] = False 9 | if STATICA_HACK: 10 | from .model_desc import * 11 | from .training import * 12 | from .distributed import * 13 | from .predict import * 14 | from .utils import * 15 | 16 | from pkgutil import iter_modules 17 | import os 18 | import os.path 19 | 20 | __all__ = [] 21 | 22 | def global_import(name): 23 | p = __import__(name, globals(), locals(), level=1) 24 | lst = p.__all__ if '__all__' in dir(p) else [] 25 | del globals()[name] 26 | for k in lst: 27 | if not k.startswith('__'): 28 | globals()[k] = p.__dict__[k] 29 | __all__.append(k) 30 | 31 | 32 | _CURR_DIR = os.path.dirname(__file__) 33 | _SKIP = ['distributed'] 34 | for _, module_name, _ in iter_modules( 35 | [_CURR_DIR]): 36 | srcpath = os.path.join(_CURR_DIR, module_name + '.py') 37 | if not os.path.isfile(srcpath): 38 | continue 39 | if module_name.startswith('_'): 40 | continue 41 | if module_name not in _SKIP: 42 | global_import(module_name) 43 | -------------------------------------------------------------------------------- /infra/eks/helm/mpijob/templates/mpijob.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apiextensions.k8s.io/v1beta1 3 | kind: CustomResourceDefinition 4 | metadata: 5 | name: mpijobs.kubeflow.org 6 | spec: 7 | group: kubeflow.org 8 | names: 9 | kind: MPIJob 10 | plural: mpijobs 11 | shortNames: 12 | - mj 13 | - mpij 14 | singular: mpijob 15 | scope: Namespaced 16 | validation: 17 | openAPIV3Schema: 18 | properties: 19 | spec: 20 | description: Either `gpus` or `replicas` should be specified, but not both 21 | oneOf: 22 | - properties: 23 | gpus: 24 | description: Valid values are 1, 2, 4, or any multiple of 8 25 | oneOf: 26 | - enum: 27 | - 1 28 | - 2 29 | - 4 30 | type: integer 31 | - minimum: 8 32 | 
multipleOf: 8 33 | type: integer 34 | title: Total number of GPUs 35 | required: 36 | - gpus 37 | - properties: 38 | replicas: 39 | description: The GPU resource limit should be specified for each replica 40 | minimum: 1 41 | title: Total number of replicas 42 | type: integer 43 | required: 44 | - replicas 45 | title: The MPIJob spec 46 | version: v1alpha1 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # specific stuff 2 | venv/ 3 | test.py 4 | tensorflow-1.13.0-cp36-cp36m-linux_x86_64.whl 5 | .ignored/ 6 | 7 | 8 | # EKS configs 9 | infra/eks/maskrcnn/values/ 10 | 11 | 12 | # tensorpack-specific stuff 13 | train_log 14 | train_log_* 15 | logs 16 | *.npy 17 | *.npz 18 | *.caffemodel 19 | *.tfmodel 20 | *.meta 21 | *.log* 22 | *.bin 23 | *.png 24 | *.jpg 25 | checkpoint 26 | *.json 27 | *.prototxt 28 | *.txt 29 | *.tgz 30 | *.gz 31 | 32 | 33 | 34 | 35 | 36 | # Byte-compiled / optimized / DLL files 37 | __pycache__/ 38 | *.py[cod] 39 | 40 | # C extensions 41 | *.so 42 | 43 | # Distribution / packaging 44 | .Python 45 | env/ 46 | build/ 47 | develop-eggs/ 48 | dist/ 49 | downloads/ 50 | eggs/ 51 | .eggs/ 52 | lib/ 53 | lib64/ 54 | parts/ 55 | sdist/ 56 | var/ 57 | *.egg-info/ 58 | .installed.cfg 59 | *.egg 60 | 61 | # PyInstaller 62 | # Usually these files are written by a python script from a template 63 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 64 | *.manifest 65 | *.spec 66 | 67 | # Installer logs 68 | pip-log.txt 69 | pip-delete-this-directory.txt 70 | 71 | # Unit test / coverage reports 72 | htmlcov/ 73 | .tox/ 74 | .coverage 75 | .coverage.* 76 | .cache 77 | nosetests.xml 78 | coverage.xml 79 | *,cover 80 | 81 | # Translations 82 | *.mo 83 | *.pot 84 | 85 | # Django stuff: 86 | *.log 87 | 88 | # Sphinx documentation 89 | docs/_build/ 90 | 91 | # PyBuilder 92 | target/ 93 | *.dat 94 | 95 | .idea/ 96 | -------------------------------------------------------------------------------- /infra/sm/README.md: -------------------------------------------------------------------------------- 1 | # Train with Sagemaker 2 | 3 | ## To launch training 4 | 5 | - (1) Set up your Sagemaker role according to https://medium.com/ml-bytes/how-to-a-create-a-sagemaker-execution-role-539866910bda and record it as `$YOUR_SM_ROLE` 6 | - Make sure you have full access to S3 7 | - (2) Modify `launch_sm_job.py`, pick your sagemaker_iam_role, instance type, number of instances, GPUs per instance and other Sagemaker specifications. 8 | - (3) Modify `run.sh`, pick your batch_size, training epochs and other training parameters. 9 | - (4) Create a repo in ECR named `$YOUR_JOB_NAME` 10 | - (5) Launch your training job by running `./build_push_submit.sh $YOUR_JOB_NAME $YOUR_SM_ROLE` 11 | - If you already have your image in ECR and just want to launch the job, you can run `python3 launch_sm_job.py $YOUR_JOB_NAME $YOUR_SM_ROLE` 12 | 13 | ## What happens inside?
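At a high level, the launch steps above boil down to two commands (a sketch only; the image/job name and role ARN below are placeholders):

```
# Build the image, push it to ECR and submit the SageMaker job in one go
./build_push_submit.sh my-maskrcnn-image arn:aws:iam::123456789012:role/MySageMakerRole

# Or, if the image is already in ECR, just submit the job
python3 launch_sm_job.py my-maskrcnn-image arn:aws:iam::123456789012:role/MySageMakerRole
```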
14 | 15 | ### (1) Build image and push it to ECR 16 | - The `Dockerfile_base` is similar to the Dockerfile used for EKS and EC2; use it to build the base image 17 | - The `Dockerfile_sm` is specific to SageMaker; every time `run_mpi.py` or `run.sh` changes, the image needs to be rebuilt 18 | - `build_and_push.sh` will build the image and push it to ECR 19 | ### (2) Launch SageMaker estimator job 20 | - `launch_sm_job.py` will launch the estimator, which essentially starts the instances in containers using the Docker image we built before. Once an instance starts, it launches `run_mpi.py` 21 | - `run_mpi.py` builds the MPI command for multi-node, multi-GPU training. It runs `run.sh`, which launches the training job. 22 | -------------------------------------------------------------------------------- /tensorpack/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: __init__.py 5 | 6 | # https://github.com/celery/kombu/blob/7d13f9b95d0b50c94393b962e6def928511bfda6/kombu/__init__.py#L34-L36 7 | STATICA_HACK = True 8 | globals()['kcah_acitats'[::-1].upper()] = False 9 | if STATICA_HACK: 10 | from .batch_norm import * 11 | from .common import * 12 | from .conv2d import * 13 | from .fc import * 14 | from .layer_norm import * 15 | from .linearwrap import * 16 | from .nonlin import * 17 | from .pool import * 18 | from .regularize import * 19 | 20 | 21 | from pkgutil import iter_modules 22 | import os 23 | import os.path 24 | # this line is necessary for _TFModuleFunc to work 25 | import tensorflow as tf # noqa: F401 26 | 27 | __all__ = [] 28 | 29 | 30 | def _global_import(name): 31 | p = __import__(name, globals(), locals(), level=1) 32 | lst = p.__all__ if '__all__' in dir(p) else dir(p) 33 | del globals()[name] 34 | for k in lst: 35 | if not k.startswith('__'): 36 | globals()[k] = p.__dict__[k] 37 | __all__.append(k) 38 | 39 | 40 | _CURR_DIR = os.path.dirname(__file__) 41 | _SKIP = ['utils', 'registry', 'tflayer'] 42 | for _, module_name, _ in iter_modules( 43 | [_CURR_DIR]): 44 | srcpath = os.path.join(_CURR_DIR, module_name + '.py') 45 | if not os.path.isfile(srcpath): 46 | continue 47 | if module_name.startswith('_'): 48 | continue 49 | if module_name not in _SKIP: 50 | _global_import(module_name) 51 | -------------------------------------------------------------------------------- /infra/docker/train.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | NUM_GPU=${1:-1} 5 | BATCH_SIZE_PER_GPU=${2:-1} 6 | THROUGHPUT_LOG_FREQ=${3:-2000} 7 | 8 | 9 | echo "" 10 | echo "NUM_GPU: ${NUM_GPU}" 11 | echo "BATCH_SIZE_PER_GPU: ${BATCH_SIZE_PER_GPU}" 12 | echo "THROUGHPUT_LOG_FREQ: ${THROUGHPUT_LOG_FREQ}" 13 | echo "" 14 | 15 | 16 | 17 | /usr/local/bin/mpirun -np ${NUM_GPU} \ 18 | --H localhost:${NUM_GPU} \ 19 | --mca plm_rsh_no_tree_spawn 1 -bind-to none -map-by slot -mca pml ob1 -mca btl ^openib \ 20 | -mca btl_tcp_if_exclude lo,docker0 \ 21 | -mca btl_vader_single_copy_mechanism none \ 22 | -x LD_LIBRARY_PATH \ 23 | -x PATH \ 24 | -x NCCL_SOCKET_IFNAME=^docker0,lo \ 25 | -x NCCL_MIN_NRINGS=8 \ 26 | -x NCCL_DEBUG=INFO \ 27 | -x TENSORPACK_FP16=1 \ 28 | -x HOROVOD_CYCLE_TIME=0.5 \ 29 | -x HOROVOD_FUSION_THRESHOLD=67108864 \ 30 | --output-filename /logs/mpirun_logs \ 31 | /usr/local/bin/python3 /tensorpack-mask-rcnn/MaskRCNN/train.py \ 32 | --logdir /logs/train_log \ 33 | --fp16 \ 34 | --throughput_log_freq ${THROUGHPUT_LOG_FREQ} \ 35 | --config \ 36 | MODE_MASK=True \ 37 | MODE_FPN=True \ 38 | DATA.BASEDIR=/data \ 39 | DATA.TRAIN='["train2017"]' \ 40 | DATA.VAL='("val2017",)' \ 41 | TRAIN.BATCH_SIZE_PER_GPU=${BATCH_SIZE_PER_GPU} \ 42 | TRAIN.LR_EPOCH_SCHEDULE='[(8, 0.1), (10, 0.01), (12, None)]' \ 43 | TRAIN.EVAL_PERIOD=12 \ 44 | RPN.TOPK_PER_IMAGE=True \ 45 | PREPROC.PREDEFINED_PADDING=True \ 46 | BACKBONE.WEIGHTS=/data/pretrained-models/ImageNet-R50-AlignPadding.npz \ 47 | BACKBONE.NORM=FreezeBN \ 48 | TRAINER=horovod 49 | #For 32x4 50 | #TRAIN.GRADIENT_CLIP=1.5 51 | -------------------------------------------------------------------------------- /tensorpack/callbacks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: __init__.py 5 | 6 | 7 | # https://github.com/celery/kombu/blob/7d13f9b95d0b50c94393b962e6def928511bfda6/kombu/__init__.py#L34-L36 8 | STATICA_HACK = True 9 | globals()['kcah_acitats'[::-1].upper()] = False 10 | if STATICA_HACK: 11 | from .base import * 12 | from .concurrency import * 13 | from .graph import * 14 | from .group import * 15 | from .hooks import * 16 | from .inference import * 17 | from .inference_runner import * 18 | from .monitor import * 19 | from .param import * 20 | from .prof import * 21 | from .saver import * 22 | from .misc import * 23 | from .steps import * 24 | from .summary import * 25 | from .trigger import * 26 | 27 | 28 | from pkgutil import iter_modules 29 | import os 30 | 31 | 32 | __all__ = [] 33 | 34 | 35 | def _global_import(name): 36 | p = __import__(name, globals(), locals(), level=1) 37 | lst = p.__all__ if '__all__' in dir(p) else dir(p) 38 | if lst: 39 | del globals()[name] 40 | for k in lst: 41 | if not k.startswith('__'): 42 | globals()[k] = p.__dict__[k] 43 | __all__.append(k) 44 | 45 | 46 | _CURR_DIR = os.path.dirname(__file__) 47 | for _, module_name, _ in iter_modules( 48 | [_CURR_DIR]): 49 | srcpath = os.path.join(_CURR_DIR, module_name + '.py') 50 | if not os.path.isfile(srcpath): 51 | continue 52 | if not module_name.startswith('_'): 53 | _global_import(module_name) 54 | -------------------------------------------------------------------------------- /tensorpack/tfutils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. 
All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: __init__.py 5 | 6 | 7 | from .tower import get_current_tower_context, TowerContext 8 | 9 | # https://github.com/celery/kombu/blob/7d13f9b95d0b50c94393b962e6def928511bfda6/kombu/__init__.py#L34-L36 10 | STATICA_HACK = True 11 | globals()['kcah_acitats'[::-1].upper()] = False 12 | if STATICA_HACK: 13 | from .common import * 14 | from .sessinit import * 15 | from .argscope import * 16 | 17 | 18 | # don't want to include everything from .tower 19 | __all__ = ['get_current_tower_context', 'TowerContext'] 20 | 21 | 22 | def _global_import(name): 23 | p = __import__(name, globals(), None, level=1) 24 | lst = p.__all__ if '__all__' in dir(p) else dir(p) 25 | for k in lst: 26 | if not k.startswith('__'): 27 | globals()[k] = p.__dict__[k] 28 | __all__.append(k) 29 | 30 | 31 | _TO_IMPORT = frozenset([ 32 | 'common', 33 | 'sessinit', 34 | 'argscope', 35 | ]) 36 | 37 | for module_name in _TO_IMPORT: 38 | _global_import(module_name) 39 | 40 | """ 41 | TODO remove this line in the future. 42 | Better to keep submodule names (sesscreate, varmanip, etc) out of __all__, 43 | so that these names will be invisible under `tensorpack.` namespace. 44 | 45 | To use these utilities, users are expected to import them explicitly, e.g.: 46 | 47 | import tensorpack.tfutils.symbolic_functions as symbf 48 | """ 49 | __all__.extend(['sessinit', 'summary', 'optimizer', 50 | 'sesscreate', 'gradproc', 'varreplace', 'symbolic_functions', 51 | 'distributed', 'tower']) 52 | -------------------------------------------------------------------------------- /infra/docker/README.md: -------------------------------------------------------------------------------- 1 | # To train with docker 2 | 3 | ## To run on single-node 4 | Refer to [Run with docker](https://github.com/armandmcqueen/tensorpack-mask-rcnn/blob/master/infra/docker/docker.md#using-docker "Run with docker") 5 | 6 | ## To run on multi-node 7 | Make sure you have your data ready as in [Run with docker](https://github.com/armandmcqueen/tensorpack-mask-rcnn/blob/master/infra/docker/docker.md#using-docker "Run with docker"). 8 | ### SSH settings and build container 9 | - SSH into your master node and clone the repo with `git clone https://github.com/armandmcqueen/tensorpack-mask-rcnn.git` 10 | - run `cd ~/tensorpack-mask-rcnn/infra/docker/` 11 | - create your hosts file without slots 12 | - run `./ssh_and_build.sh $YOUR_MASTER_IP $YOUR_HOST_FILE`; this will enable passwordless SSH and build the container on each of the nodes 13 | ### Run container 14 | On each of the instances: 15 | - run `cd ~/tensorpack-mask-rcnn/infra/docker/` 16 | - run `./run_multinode.sh` to start the container 17 | 18 | ### Launch training 19 | Inside the container: 20 | - On each host *apart from the primary*, run the following in the container you started: 21 | - run `cd tensorpack-mask-rcnn/infra/docker/` 22 | - run `./sleep.sh` 23 | This will make those containers listen for SSH connections on port 1234. 24 | - On the primary host, `cd tensorpack-mask-rcnn/infra/docker` and create your hosts file, which contains the IPs of all your nodes (including the primary host). The format should look like: 25 | ``` 26 | 127.0.0.1 slots=8 27 | 127.0.0.2 slots=8 28 | 127.0.0.3 slots=8 29 | 127.0.0.4 slots=8 30 | ``` 31 | This is 4 nodes, 8 GPUs per node.
32 | Launch training with running `./train_multinode.sh 32 4` for 32 GPUs and 4 images per GPU 33 | -------------------------------------------------------------------------------- /infra/docker/train_multinode.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | NUM_GPU=${1:-1} 5 | BATCH_SIZE_PER_GPU=${2:-1} 6 | PORT_ID=${3:-1234} 7 | THROUGHPUT_LOG_FREQ=${4:-2000} 8 | 9 | 10 | 11 | echo "" 12 | echo "NUM_GPU: ${NUM_GPU}" 13 | echo "BATCH_SIZE_PER_GPU: ${BATCH_SIZE_PER_GPU}" 14 | echo "THROUGHPUT_LOG_FREQ: ${THROUGHPUT_LOG_FREQ}" 15 | echo "" 16 | 17 | 18 | 19 | /usr/local/bin/mpirun -np ${NUM_GPU} \ 20 | --hostfile hosts \ 21 | --mca plm_rsh_no_tree_spawn 1 -bind-to none -map-by slot -mca pml ob1 -mca btl ^openib \ 22 | -mca btl_tcp_if_exclude lo,docker0 \ 23 | -mca btl_vader_single_copy_mechanism none \ 24 | -mca plm_rsh_args "-p ${PORT_ID}" \ 25 | -x LD_LIBRARY_PATH \ 26 | -x PATH \ 27 | -x NCCL_SOCKET_IFNAME=^docker0,lo \ 28 | -x NCCL_MIN_NRINGS=8 \ 29 | -x NCCL_DEBUG=INFO \ 30 | -x TENSORPACK_FP16=1 \ 31 | -x HOROVOD_CYCLE_TIME=0.5 \ 32 | -x HOROVOD_FUSION_THRESHOLD=67108864 \ 33 | --output-filename /logs/mpirun_logs \ 34 | /usr/local/bin/python3 /tensorpack-mask-rcnn/MaskRCNN/train.py \ 35 | --logdir /logs/train_log \ 36 | --fp16 \ 37 | --throughput_log_freq ${THROUGHPUT_LOG_FREQ} \ 38 | --config \ 39 | MODE_MASK=True \ 40 | MODE_FPN=True \ 41 | DATA.BASEDIR=/data \ 42 | DATA.TRAIN='["train2017"]' \ 43 | DATA.VAL='("val2017",)' \ 44 | TRAIN.BATCH_SIZE_PER_GPU=${BATCH_SIZE_PER_GPU} \ 45 | TRAIN.LR_EPOCH_SCHEDULE='[(8, 0.1), (10, 0.01), (12, None)]' \ 46 | TRAIN.EVAL_PERIOD=12 \ 47 | RPN.TOPK_PER_IMAGE=True \ 48 | PREPROC.PREDEFINED_PADDING=True \ 49 | BACKBONE.WEIGHTS=/data/pretrained-models/ImageNet-R50-AlignPadding.npz \ 50 | BACKBONE.NORM=FreezeBN \ 51 | TRAINER=horovod 52 | #For 32x4 53 | #TRAIN.GRADIENT_CLIP=1.5 54 | -------------------------------------------------------------------------------- /infra/sm/launch_sm_job.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | from sagemaker import get_execution_role 4 | import sagemaker as sage 5 | from sagemaker.estimator import Estimator 6 | import datetime 7 | import subprocess 8 | import sys 9 | 10 | def get_str(cmd): 11 | content = subprocess.check_output(cmd, shell=True) 12 | return str(content)[2:-3] 13 | 14 | account = get_str("echo $(aws sts get-caller-identity --query Account --output text)") 15 | region = get_str("echo $(aws configure get region)") 16 | image = str(sys.argv[1]) 17 | sess = sage.Session() 18 | image_name=f"{account}.dkr.ecr.{region}.amazonaws.com/{image}" 19 | sagemaker_iam_role = str(sys.argv[2]) #get_execution_role() 20 | num_gpus = 8 21 | num_nodes = 4 22 | instance_type = 'ml.p3.16xlarge' 23 | custom_mpi_cmds = [] 24 | 25 | job_name = "maskrcnn-{}x{}-{}".format(num_nodes, num_gpus, image) 26 | 27 | output_path = 's3://mrcnn-sagemaker/sagemaker_training_release' 28 | 29 | hyperparams = {"sagemaker_use_mpi": "True", 30 | "sagemaker_process_slots_per_host": num_gpus, 31 | "num_gpus":num_gpus, 32 | "num_nodes": num_nodes, 33 | "custom_mpi_cmds": custom_mpi_cmds} 34 | 35 | estimator = Estimator(image_name, role=sagemaker_iam_role, output_path=output_path, 36 | train_instance_count=num_nodes, 37 | train_instance_type=instance_type, 38 | sagemaker_session=sess, 39 | train_volume_size=200, 40 | base_job_name=job_name, 41 | subnets=['subnet-21ac2f2e'], 42 | hyperparameters=hyperparams) 43 | 44 | estimator.fit(wait=False) 45 | -------------------------------------------------------------------------------- /tensorpack/dataflow/imgaug/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: __init__.py 5 | 6 | # https://github.com/celery/kombu/blob/7d13f9b95d0b50c94393b962e6def928511bfda6/kombu/__init__.py#L34-L36 7 | STATICA_HACK = True 8 | globals()['kcah_acitats'[::-1].upper()] = False 9 | if STATICA_HACK: 10 | from .base import * 11 | from .convert import * 12 | from .crop import * 13 | from .deform import * 14 | from .geometry import * 15 | from .imgproc import * 16 | from .meta import * 17 | from .misc import * 18 | from .noise import * 19 | from .paste import * 20 | from .transform import * 21 | from .external import * 22 | 23 | 24 | import os 25 | from pkgutil import iter_modules 26 | 27 | __all__ = [] 28 | 29 | 30 | def global_import(name): 31 | p = __import__(name, globals(), locals(), level=1) 32 | lst = p.__all__ if '__all__' in dir(p) else dir(p) 33 | if lst: 34 | del globals()[name] 35 | for k in lst: 36 | if not k.startswith('__'): 37 | globals()[k] = p.__dict__[k] 38 | __all__.append(k) 39 | 40 | 41 | try: 42 | import cv2 # noqa 43 | except ImportError: 44 | from ...utils import logger 45 | logger.warn("Cannot import 'cv2', therefore image augmentation is not available.") 46 | else: 47 | _CURR_DIR = os.path.dirname(__file__) 48 | for _, module_name, _ in iter_modules( 49 | [os.path.dirname(__file__)]): 50 | srcpath = os.path.join(_CURR_DIR, module_name + '.py') 51 | if not os.path.isfile(srcpath): 52 | continue 53 | if not module_name.startswith('_'): 54 | global_import(module_name) 55 | -------------------------------------------------------------------------------- /infra/ami/train_efa.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. 
All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | NUM_GPU=${1:-1} 5 | BATCH_SIZE_PER_GPU=${2:-1} 6 | THROUGHPUT_LOG_FREQ=${3:-2000} 7 | 8 | 9 | echo "" 10 | echo "NUM_GPU: ${NUM_GPU}" 11 | echo "BATCH_SIZE_PER_GPU: ${BATCH_SIZE_PER_GPU}" 12 | echo "THROUGHPUT_LOG_FREQ: ${THROUGHPUT_LOG_FREQ}" 13 | echo "" 14 | 15 | 16 | 17 | mpirun -np ${NUM_GPU} \ 18 | --hostfile hosts \ 19 | --mca plm_rsh_no_tree_spawn 1 -bind-to none -map-by slot -mca pml ob1 \ 20 | -mca btl_vader_single_copy_mechanism none \ 21 | --mca btl tcp,self \ 22 | --mca btl_tcp_if_exclude lo,docker0 \ 23 | -x FI_PROVIDER="efa" \ 24 | -x FI_OFI_RXR_RX_COPY_UNEXP=1 \ 25 | -x FI_OFI_RXR_RX_COPY_OOO=1 \ 26 | -x FI_EFA_MR_CACHE_ENABLE=1 \ 27 | -x FI_OFI_RXR_INLINE_MR_ENABLE=1 \ 28 | -x NCCL_TREE_THRESHOLD=4294967296 \ 29 | -x LD_LIBRARY_PATH \ 30 | -x PATH \ 31 | -x NCCL_SOCKET_IFNAME=^docker0,lo \ 32 | -x NCCL_MIN_NRINGS=13 \ 33 | -x NCCL_DEBUG=INFO \ 34 | -x TENSORPACK_FP16=1 \ 35 | -x HOROVOD_CYCLE_TIME=0.5 \ 36 | -x HOROVOD_FUSION_THRESHOLD=67108864 \ 37 | python3 /home/ec2-user/tensorpack-mask-rcnn/MaskRCNN/train.py \ 38 | --fp16 \ 39 | --throughput_log_freq ${THROUGHPUT_LOG_FREQ} \ 40 | --config \ 41 | MODE_MASK=True \ 42 | MODE_FPN=True \ 43 | DATA.BASEDIR=/home/ec2-user/data \ 44 | DATA.TRAIN='["train2017"]' \ 45 | DATA.VAL='("val2017",)' \ 46 | TRAIN.BATCH_SIZE_PER_GPU=${BATCH_SIZE_PER_GPU} \ 47 | TRAIN.LR_EPOCH_SCHEDULE='[(8, 0.1), (10, 0.01), (12, None)]' \ 48 | TRAIN.EVAL_PERIOD=12 \ 49 | RPN.TOPK_PER_IMAGE=True \ 50 | PREPROC.PREDEFINED_PADDING=True \ 51 | BACKBONE.WEIGHTS=/home/ec2-user/data/pretrained-models/ImageNet-R50-AlignPadding.npz \ 52 | BACKBONE.NORM=FreezeBN \ 53 | TRAINER=horovod 54 | #For 32x4 55 | #TRAIN.GRADIENT_CLIP=1.5 56 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # DockerHub unaltered mirror of AWS Deep Learning Container 2 | FROM armandmcqueen/tensorflow-training:1.13-horovod-gpu-py36-cu100-ubuntu16.04 3 | 4 | RUN apt-get install less 5 | 6 | # Need to reinstall some libraries the DL container provides due to custom Tensorflow binary 7 | RUN pip uninstall -y tensorflow tensorboard tensorflow-estimator keras h5py horovod numpy 8 | 9 | # Download and install custom Tensorflow binary 10 | RUN wget https://github.com/armandmcqueen/tensorpack-mask-rcnn/releases/download/v0.0.0-WIP/tensorflow-1.13.0-cp36-cp36m-linux_x86_64.whl && \ 11 | pip install tensorflow-1.13.0-cp36-cp36m-linux_x86_64.whl && \ 12 | pip install tensorflow-estimator==1.13.0 && \ 13 | rm tensorflow-1.13.0-cp36-cp36m-linux_x86_64.whl 14 | 15 | RUN pip install keras h5py 16 | 17 | # Install Horovod, temporarily using CUDA stubs 18 | RUN ldconfig /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs && \ 19 | HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod==0.15.2 && \ 20 | ldconfig 21 | 22 | 23 | # Install OpenSSH for MPI to communicate between containers 24 | RUN mkdir -p /root/.ssh/ && \ 25 | ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \ 26 | cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \ 27 | printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 28 | 29 | 30 | RUN pip install Cython 31 | RUN pip install ujson opencv-python pycocotools matplotlib 32 | RUN pip install --ignore-installed numpy==1.16.2 33 | 34 | 35 | # TODO: Do I really need this now that we are using the DL container? 
36 | ARG CACHEBUST=1 37 | ARG BRANCH_NAME 38 | 39 | RUN git clone https://github.com/armandmcqueen/tensorpack-mask-rcnn -b $BRANCH_NAME 40 | 41 | RUN chmod -R +w /tensorpack-mask-rcnn 42 | RUN pip install --ignore-installed -e /tensorpack-mask-rcnn/ 43 | -------------------------------------------------------------------------------- /tensorpack/tfutils/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: distributed.py 5 | 6 | 7 | import tensorflow as tf 8 | 9 | 10 | def get_distributed_session_creator(server): 11 | """ 12 | Args: 13 | server (tf.train.Server): 14 | 15 | Returns: 16 | tf.train.SessionCreator 17 | """ 18 | 19 | server_def = server.server_def 20 | is_chief = (server_def.job_name == 'worker') and (server_def.task_index == 0) 21 | 22 | init_op = tf.global_variables_initializer() 23 | local_init_op = tf.local_variables_initializer() 24 | ready_op = tf.report_uninitialized_variables() 25 | ready_for_local_init_op = tf.report_uninitialized_variables(tf.global_variables()) 26 | sm = tf.train.SessionManager( 27 | local_init_op=local_init_op, 28 | ready_op=ready_op, 29 | ready_for_local_init_op=ready_for_local_init_op, 30 | graph=tf.get_default_graph()) 31 | 32 | # to debug wrong variable collection 33 | # from pprint import pprint 34 | # print("GLOBAL:") 35 | # pprint([(k.name, k.device) for k in tf.global_variables()]) 36 | # print("LOCAL:") 37 | # pprint([(k.name, k.device) for k in tf.local_variables()]) 38 | 39 | class _Creator(tf.train.SessionCreator): 40 | def create_session(self): 41 | if is_chief: 42 | return sm.prepare_session(master=server.target, init_op=init_op) 43 | else: 44 | tf.logging.set_verbosity(tf.logging.INFO) # print message about uninitialized vars 45 | ret = sm.wait_for_session(master=server.target) 46 | tf.logging.set_verbosity(tf.logging.WARN) 47 | return ret 48 | 49 | return _Creator() 50 | -------------------------------------------------------------------------------- /patch/README.md: -------------------------------------------------------------------------------- 1 | # Building the Wheel 2 | 3 | Use the codebase here to build a TF wheel: https://github.com/samikama/tensorflow/commits/GenerateProposalsOp 4 | 5 | Apply the diff patches above. 6 | 7 | 8 | 9 | ## Building Tensorflow 10 | 11 | Requires custom Tensorflow for GPU optimized ops. Build steps were run on the AWS DLAMI 21.2. 
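Fetching the fork and applying the patches might look like the sketch below (untested; it assumes `GenerateProposalsOp` is the branch name and that this repository is checked out next to the TensorFlow fork):

```
# Sketch only: clone the fork at the GenerateProposalsOp branch and apply this repo's diffs
git clone -b GenerateProposalsOp https://github.com/samikama/tensorflow.git
cd tensorflow
git apply ../tensorpack-mask-rcnn/patch/*.diff
```

The `./configure` and `bazel build` steps below are then run from inside that `tensorflow` directory.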
12 | 13 | ``` 14 | source activate tensorflow_p36 15 | pip uninstall -y tensorflow horovod 16 | 17 | ############################################################################################################ 18 | # Upgrade Bazel 19 | ############################################################################################################ 20 | rm /home/ubuntu/anaconda3/envs/tensorflow_p36/bin/bazel 21 | wget https://github.com/bazelbuild/bazel/releases/download/0.19.2/bazel-0.19.2-installer-linux-x86_64.sh 22 | chmod +x bazel-0.19.2-installer-linux-x86_64.sh 23 | ./bazel-0.19.2-installer-linux-x86_64.sh --user 24 | 25 | 26 | ############################################################################################################ 27 | # Build TF 1.13 with CUDA 10 28 | ############################################################################################################ 29 | 30 | ./configure 31 | 32 | # XLA JIT: N 33 | # CUDA: Y 34 | # CUDA/CUDNN/NCCL dir: /usr/local/cuda-10.0 35 | # CUDNN: 7.4.1 36 | # NCCL: 2.3.7 37 | 38 | 39 | ############################################################################################################ 40 | # Create pip wheel 41 | ############################################################################################################ 42 | 43 | bazel build --config=opt --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" --config=cuda //tensorflow/tools/pip_package:build_pip_package 44 | ./bazel-bin/tensorflow/tools/pip_package/build_pip_package ./tensorflow_pkg 45 | ``` 46 | 47 | 48 | -------------------------------------------------------------------------------- /tensorpack/graph_builder/predict.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: predict.py 5 | 6 | import tensorflow as tf 7 | 8 | from ..tfutils.tower import PredictTowerContext 9 | from ..utils import logger 10 | from ..utils.develop import deprecated 11 | from .training import GraphBuilder 12 | 13 | __all__ = ['SimplePredictBuilder'] 14 | 15 | 16 | class SimplePredictBuilder(GraphBuilder): 17 | """ 18 | Single-tower predictor. 19 | """ 20 | @deprecated("Please use TowerContext to build it by yourself!", "2018-12-31") 21 | def __init__(self, ns_name='', vs_name='', device=0): 22 | """ 23 | Args: 24 | ns_name (str): 25 | vs_name (str): 26 | device (int): 27 | """ 28 | self._ns_name = ns_name 29 | self._vs_name = vs_name 30 | 31 | device = '/gpu:{}'.format(device) if device >= 0 else '/cpu:0' 32 | self._device = device 33 | 34 | def build(self, input, tower_fn): 35 | """ 36 | Args: 37 | input (InputSource): must have been setup 38 | tower_fn ( [tf.Tensors] ->): callable that takes input tensors. 39 | 40 | Returns: 41 | The return value of tower_fn called under the proper context. 42 | """ 43 | assert input.setup_done() 44 | logger.info("Building predictor tower '{}' on device {} ...".format( 45 | self._ns_name, self._device)) 46 | 47 | with tf.device(self._device), \ 48 | PredictTowerContext( 49 | self._ns_name, vs_name=self._vs_name): 50 | inputs = input.get_input_tensors() 51 | assert isinstance(inputs, (list, tuple)), inputs 52 | return tower_fn(*inputs) 53 | -------------------------------------------------------------------------------- /tensorpack/callbacks/hooks.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. 
or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: hooks.py 5 | 6 | 7 | """ Compatible layers between tf.train.SessionRunHook and Callback""" 8 | 9 | import tensorflow as tf 10 | 11 | from ..tfutils.common import tfv1 12 | from .base import Callback 13 | 14 | __all__ = ['CallbackToHook', 'HookToCallback'] 15 | 16 | 17 | class CallbackToHook(tfv1.train.SessionRunHook): 18 | """ This is only for internal implementation of 19 | before_run/after_run callbacks. 20 | You shouldn't need to use this. 21 | """ 22 | 23 | def __init__(self, cb): 24 | self._cb = cb 25 | 26 | def before_run(self, ctx): 27 | return self._cb.before_run(ctx) 28 | 29 | def after_run(self, ctx, vals): 30 | self._cb.after_run(ctx, vals) 31 | 32 | 33 | class HookToCallback(Callback): 34 | """ 35 | Make a ``tf.train.SessionRunHook`` into a callback. 36 | Note that when `SessionRunHook.after_create_session` is called, the `coord` argument will be None. 37 | """ 38 | 39 | _chief_only = False 40 | 41 | def __init__(self, hook): 42 | """ 43 | Args: 44 | hook (tf.train.SessionRunHook): 45 | """ 46 | self._hook = hook 47 | 48 | def _setup_graph(self): 49 | with tf.name_scope(None): # jump out of the name scope 50 | self._hook.begin() 51 | 52 | def _before_train(self): 53 | sess = tf.get_default_session() 54 | # coord is set to None when converting 55 | self._hook.after_create_session(sess, None) 56 | 57 | def _before_run(self, ctx): 58 | return self._hook.before_run(ctx) 59 | 60 | def _after_run(self, ctx, run_values): 61 | self._hook.after_run(ctx, run_values) 62 | 63 | def _after_train(self): 64 | self._hook.end(self.trainer.sess) 65 | -------------------------------------------------------------------------------- /infra/sm/Dockerfile_base: -------------------------------------------------------------------------------- 1 | # DockerHub unaltered mirror of AWS Deep Learning Container 2 | FROM 578276202366.dkr.ecr.us-east-1.amazonaws.com/dlami 3 | 4 | RUN apt-get install less 5 | 6 | # Need to reinstall some libraries the DL container provides due to custom Tensorflow binary 7 | RUN pip uninstall -y tensorflow tensorboard tensorflow-estimator keras h5py horovod numpy 8 | 9 | # Download and install custom Tensorflow binary 10 | RUN wget https://github.com/armandmcqueen/tensorpack-mask-rcnn/releases/download/v0.0.0-WIP/tensorflow-1.13.0-cp36-cp36m-linux_x86_64.whl && \ 11 | pip install tensorflow-1.13.0-cp36-cp36m-linux_x86_64.whl && \ 12 | pip install tensorflow-estimator==1.13.0 && \ 13 | rm tensorflow-1.13.0-cp36-cp36m-linux_x86_64.whl 14 | 15 | RUN pip install keras h5py 16 | 17 | # Install Horovod, temporarily using CUDA stubs 18 | RUN ldconfig /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs && \ 19 | HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod==0.15.2 && \ 20 | ldconfig 21 | 22 | 23 | # Install OpenSSH for MPI to communicate between containers 24 | RUN apt-get install -y --no-install-recommends openssh-client openssh-server 25 | RUN mkdir -p /var/run/sshd && \ 26 | sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 27 | 28 | RUN mkdir -p /root/.ssh/ && \ 29 | ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \ 30 | cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \ 31 | printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 32 | 33 | 34 | RUN pip install Cython 35 | RUN pip install ujson opencv-python pycocotools matplotlib 36 | RUN pip install 
--ignore-installed numpy==1.16.2 37 | 38 | 39 | # TODO: Do I really need this now that we are using the DL container? 40 | ARG CACHEBUST=1 41 | ARG BRANCH_NAME 42 | 43 | RUN pip install mpi4py 44 | 45 | # For Sagemaker 46 | RUN pip install sagemaker-containers 47 | -------------------------------------------------------------------------------- /tensorpack/dataflow/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: __init__.py 5 | 6 | # https://github.com/celery/kombu/blob/7d13f9b95d0b50c94393b962e6def928511bfda6/kombu/__init__.py#L34-L36 7 | STATICA_HACK = True 8 | globals()['kcah_acitats'[::-1].upper()] = False 9 | if STATICA_HACK: 10 | from .base import * 11 | from .common import * 12 | from .format import * 13 | from .image import * 14 | from .parallel_map import * 15 | from .parallel import * 16 | from .raw import * 17 | from .remote import * 18 | from . import imgaug 19 | from . import dataset 20 | from . import dftools 21 | 22 | 23 | from pkgutil import iter_modules 24 | import os 25 | import os.path 26 | from ..utils.develop import LazyLoader 27 | 28 | __all__ = [] 29 | 30 | 31 | def _global_import(name): 32 | p = __import__(name, globals(), locals(), level=1) 33 | lst = p.__all__ if '__all__' in dir(p) else dir(p) 34 | if lst: 35 | del globals()[name] 36 | for k in lst: 37 | if not k.startswith('__'): 38 | globals()[k] = p.__dict__[k] 39 | __all__.append(k) 40 | 41 | 42 | __SKIP = set(['dftools', 'dataset', 'imgaug']) 43 | _CURR_DIR = os.path.dirname(__file__) 44 | for _, module_name, __ in iter_modules( 45 | [os.path.dirname(__file__)]): 46 | srcpath = os.path.join(_CURR_DIR, module_name + '.py') 47 | if not os.path.isfile(srcpath): 48 | continue 49 | if not module_name.startswith('_') and \ 50 | module_name not in __SKIP: 51 | _global_import(module_name) 52 | 53 | 54 | globals()['dataset'] = LazyLoader('dataset', globals(), 'tensorpack.dataflow.dataset') 55 | globals()['imgaug'] = LazyLoader('imgaug', globals(), 'tensorpack.dataflow.imgaug') 56 | 57 | del LazyLoader 58 | 59 | __all__.extend(['imgaug', 'dftools', 'dataset']) 60 | -------------------------------------------------------------------------------- /tensorpack/dataflow/imgaug/convert.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: convert.py 5 | 6 | import numpy as np 7 | import cv2 8 | 9 | from .base import ImageAugmentor 10 | from .meta import MapImage 11 | 12 | __all__ = ['ColorSpace', 'Grayscale', 'ToUint8', 'ToFloat32'] 13 | 14 | 15 | class ColorSpace(ImageAugmentor): 16 | """ Convert into another color space. """ 17 | 18 | def __init__(self, mode, keepdims=True): 19 | """ 20 | Args: 21 | mode: OpenCV color space conversion code (e.g., `cv2.COLOR_BGR2HSV`) 22 | keepdims (bool): keep the dimension of image unchanged if OpenCV 23 | changes it. 24 | """ 25 | self._init(locals()) 26 | 27 | def _augment(self, img, _): 28 | transf = cv2.cvtColor(img, self.mode) 29 | if self.keepdims: 30 | if len(transf.shape) is not len(img.shape): 31 | transf = transf[..., None] 32 | return transf 33 | 34 | 35 | class Grayscale(ColorSpace): 36 | """ Convert image to grayscale. 
""" 37 | 38 | def __init__(self, keepdims=True, rgb=False): 39 | """ 40 | Args: 41 | keepdims (bool): return image of shape [H, W, 1] instead of [H, W] 42 | rgb (bool): interpret input as RGB instead of the default BGR 43 | """ 44 | mode = cv2.COLOR_RGB2GRAY if rgb else cv2.COLOR_BGR2GRAY 45 | super(Grayscale, self).__init__(mode, keepdims) 46 | 47 | 48 | class ToUint8(MapImage): 49 | """ Convert image to uint8. Useful to reduce communication overhead. """ 50 | def __init__(self): 51 | super(ToUint8, self).__init__(lambda x: np.clip(x, 0, 255).astype(np.uint8), lambda x: x) 52 | 53 | 54 | class ToFloat32(MapImage): 55 | """ Convert image to float32, may increase quality of the augmentor. """ 56 | def __init__(self): 57 | super(ToFloat32, self).__init__(lambda x: x.astype(np.float32), lambda x: x) 58 | -------------------------------------------------------------------------------- /MaskRCNN/utils/mixed_precision.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | import tensorflow as tf 4 | from contextlib import suppress 5 | 6 | def float32_variable_storage_getter(getter, name, shape=None, dtype=None, 7 | initializer=None, regularizer=None, 8 | trainable=True, 9 | *args, **kwargs): 10 | """Custom variable getter that forces trainable variables to be stored in 11 | float32 precision and then casts them to the training precision. 12 | """ 13 | norm = "norm" in name.lower() or "bn" in name.lower() 14 | storage_dtype = tf.float32 if trainable else dtype 15 | variable = getter(name, shape, dtype=storage_dtype, 16 | initializer=initializer, 17 | regularizer=regularizer if not norm else None, 18 | trainable=trainable, 19 | *args, **kwargs) 20 | 21 | # print(name, "trainable={} dtype={} storage_dtype={} id={} reuse={}".format(trainable, dtype, storage_dtype, id(variable), kwargs['reuse'])) 22 | 23 | if norm: 24 | return variable 25 | 26 | if trainable and dtype != tf.float32: 27 | # print(name, "fp16_cast") 28 | cast_name = name + '/fp16_cast' 29 | try: 30 | cast_variable = tf.get_default_graph().get_tensor_by_name( 31 | cast_name + ':0' 32 | ) 33 | except KeyError: 34 | cast_variable = tf.cast(variable, dtype, name=cast_name) 35 | cast_variable._ref = variable._ref 36 | variable = cast_variable 37 | return variable 38 | 39 | 40 | def mixed_precision_scope(mixed=True, *args, **kwargs): 41 | if not mixed: 42 | return suppress() 43 | 44 | return tf.variable_scope(name_or_scope=tf.get_variable_scope(), 45 | custom_getter=float32_variable_storage_getter, 46 | reuse=tf.AUTO_REUSE, *args, **kwargs) 47 | 48 | -------------------------------------------------------------------------------- /infra/sm/build_and_push.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | # This script shows how to build the Docker image and push it to ECR to be ready for use 6 | # by SageMaker. 7 | 8 | # The argument to this script is the image name. This will be used as the image on the local 9 | # machine and combined with the account and region to form the repository name for ECR. 
10 | image=$1 11 | if [ "$image" == "" ] 12 | then 13 | echo "Usage: $0 " 14 | exit 1 15 | fi 16 | 17 | export AWS_ACCESS_KEY_ID=$(aws --profile default configure get aws_access_key_id) 18 | export AWS_SECRET_ACCESS_KEY=$(aws --profile default configure get aws_secret_access_key) 19 | 20 | 21 | # Get the account number associated with the current IAM credentials 22 | account=$(aws sts get-caller-identity --query Account --output text) 23 | 24 | if [ $? -ne 0 ] 25 | then 26 | exit 255 27 | fi 28 | 29 | 30 | # Get the region defined in the current configuration (default to us-west-2 if none defined) 31 | region=$(aws configure get region) 32 | #region=${region:-us-east-1} 33 | 34 | 35 | fullname="${account}.dkr.ecr.${region}.amazonaws.com/${image}:latest" 36 | # If the repository doesn't exist in ECR, create it. 37 | 38 | aws ecr describe-repositories --repository-names "${image}" > /dev/null 2>&1 39 | 40 | if [ $? -ne 0 ] 41 | then 42 | aws ecr create-repository --repository-name "${image}" > /dev/null 43 | fi 44 | 45 | # Get the login command from ECR and execute it directly 46 | $(aws ecr get-login --region ${region} --no-include-email) 47 | 48 | # Build the docker image locally with the image name and then push it to ECR 49 | # with the full name. 50 | echo "Building docker image tensorpack-mask-rcnn" 51 | echo "" 52 | 53 | docker build -t ${image} -f Dockerfile_sm . --build-arg CACHEBUST=$(date +%s) \ 54 | --build-arg AWS_ACCESS_KEY_ID \ 55 | --build-arg AWS_SECRET_ACCESS_KEY \ 56 | 57 | if [ $? -ne 0 ] 58 | then 59 | echo "Local build failed. Not pushing." 60 | exit 1 61 | fi 62 | 63 | docker tag ${image} ${fullname} 64 | 65 | docker push ${fullname} 66 | -------------------------------------------------------------------------------- /tensorpack/utils/palette.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: palette.py 5 | 6 | import numpy as np 7 | 8 | __all__ = ['PALETTE_RGB'] 9 | 10 | # copied from https://stackoverflow.com/questions/2328339/how-to-generate-n-different-colors-for-any-natural-number-n 11 | PALETTE_HEX = [ 12 | "#000000", "#FFFF00", "#1CE6FF", "#FF34FF", "#FF4A46", "#008941", "#006FA6", "#A30059", 13 | "#FFDBE5", "#7A4900", "#0000A6", "#63FFAC", "#B79762", "#004D43", "#8FB0FF", "#997D87", 14 | "#5A0007", "#809693", "#FEFFE6", "#1B4400", "#4FC601", "#3B5DFF", "#4A3B53", "#FF2F80", 15 | "#61615A", "#BA0900", "#6B7900", "#00C2A0", "#FFAA92", "#FF90C9", "#B903AA", "#D16100", 16 | "#DDEFFF", "#000035", "#7B4F4B", "#A1C299", "#300018", "#0AA6D8", "#013349", "#00846F", 17 | "#372101", "#FFB500", "#C2FFED", "#A079BF", "#CC0744", "#C0B9B2", "#C2FF99", "#001E09", 18 | "#00489C", "#6F0062", "#0CBD66", "#EEC3FF", "#456D75", "#B77B68", "#7A87A1", "#788D66", 19 | "#885578", "#FAD09F", "#FF8A9A", "#D157A0", "#BEC459", "#456648", "#0086ED", "#886F4C", 20 | "#34362D", "#B4A8BD", "#00A6AA", "#452C2C", "#636375", "#A3C8C9", "#FF913F", "#938A81", 21 | "#575329", "#00FECF", "#B05B6F", "#8CD0FF", "#3B9700", "#04F757", "#C8A1A1", "#1E6E00", 22 | "#7900D7", "#A77500", "#6367A9", "#A05837", "#6B002C", "#772600", "#D790FF", "#9B9700", 23 | "#549E79", "#FFF69F", "#201625", "#72418F", "#BC23FF", "#99ADC0", "#3A2465", "#922329", 24 | "#5B4534", "#FDE8DC", "#404E55", "#0089A3", "#CB7E98", "#A4E804", "#324E72", "#6A3A4C", 25 | "#83AB58", "#001C1E", "#D1F7CE", "#004B28", "#C8D0F6", "#A3A489", "#806C66", "#222800", 26 | "#BF5650", "#E83000", "#66796D", "#DA007C", "#FF1A59", "#8ADBB4", "#1E0200", "#5B4E51", 27 | "#C895C5", "#320033", "#FF6832", "#66E1D3", "#CFCDAC", "#D0AC94", 28 | "#7ED379", "#012C58"] 29 | 30 | 31 | def _parse_hex_color(s): 32 | r = int(s[1:3], 16) 33 | g = int(s[3:5], 16) 34 | b = int(s[5:7], 16) 35 | return (r, g, b) 36 | 37 | 38 | PALETTE_RGB = np.asarray( 39 | list(map(_parse_hex_color, PALETTE_HEX)), 40 | dtype='int32') 41 | -------------------------------------------------------------------------------- /tensorpack/models/nonlin.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: nonlin.py 5 | 6 | 7 | import tensorflow as tf 8 | 9 | from .batch_norm import BatchNorm 10 | from .common import VariableHolder, layer_register 11 | 12 | __all__ = ['Maxout', 'PReLU', 'BNReLU'] 13 | 14 | 15 | @layer_register(use_scope=None) 16 | def Maxout(x, num_unit): 17 | """ 18 | Maxout as in the paper `Maxout Networks `_. 19 | 20 | Args: 21 | x (tf.Tensor): a NHWC or NC tensor. Channel has to be known. 22 | num_unit (int): a int. Must be divisible by C. 23 | 24 | Returns: 25 | tf.Tensor: of shape NHW(C/num_unit) named ``output``. 
26 | """ 27 | input_shape = x.get_shape().as_list() 28 | ndim = len(input_shape) 29 | assert ndim == 4 or ndim == 2 30 | ch = input_shape[-1] 31 | assert ch is not None and ch % num_unit == 0 32 | if ndim == 4: 33 | x = tf.reshape(x, [-1, input_shape[1], input_shape[2], ch / num_unit, num_unit]) 34 | else: 35 | x = tf.reshape(x, [-1, ch / num_unit, num_unit]) 36 | return tf.reduce_max(x, ndim, name='output') 37 | 38 | 39 | @layer_register() 40 | def PReLU(x, init=0.001, name='output'): 41 | """ 42 | Parameterized ReLU as in the paper `Delving Deep into Rectifiers: Surpassing 43 | Human-Level Performance on ImageNet Classification 44 | `_. 45 | 46 | Args: 47 | x (tf.Tensor): input 48 | init (float): initial value for the learnable slope. 49 | name (str): name of the output. 50 | 51 | Variable Names: 52 | 53 | * ``alpha``: learnable slope. 54 | """ 55 | init = tf.constant_initializer(init) 56 | alpha = tf.get_variable('alpha', [], initializer=init) 57 | x = ((1 + alpha) * x + (1 - alpha) * tf.abs(x)) 58 | ret = tf.multiply(x, 0.5, name=name) 59 | 60 | ret.variables = VariableHolder(alpha=alpha) 61 | return ret 62 | 63 | 64 | @layer_register(use_scope=None) 65 | def BNReLU(x, name=None): 66 | """ 67 | A shorthand of BatchNormalization + ReLU. 68 | """ 69 | x = BatchNorm('bn', x) 70 | x = tf.nn.relu(x, name=name) 71 | return x 72 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | import platform 4 | from os import path 5 | import setuptools 6 | from setuptools import setup 7 | 8 | version = int(setuptools.__version__.split('.')[0]) 9 | assert version > 30, "Tensorpack installation requires setuptools > 30" 10 | 11 | this_directory = path.abspath(path.dirname(__file__)) 12 | 13 | # setup metainfo 14 | libinfo_py = path.join(this_directory, 'tensorpack', 'libinfo.py') 15 | libinfo_content = open(libinfo_py, "r").readlines() 16 | version_line = [l.strip() for l in libinfo_content if l.startswith('__version__')][0] 17 | exec(version_line) # produce __version__ 18 | 19 | with open(path.join(this_directory, 'README.md'), 'rb') as f: 20 | long_description = f.read().decode('utf-8') 21 | 22 | 23 | def add_git_version(): 24 | 25 | def get_git_version(): 26 | from subprocess import check_output 27 | try: 28 | return check_output("git describe --tags --long --dirty".split()).decode('utf-8').strip() 29 | except Exception: 30 | return __version__ # noqa 31 | 32 | newlibinfo_content = [l for l in libinfo_content if not l.startswith('__git_version__')] 33 | newlibinfo_content.append('__git_version__ = "{}"'.format(get_git_version())) 34 | with open(libinfo_py, "w") as f: 35 | f.write("".join(newlibinfo_content)) 36 | 37 | 38 | add_git_version() 39 | 40 | 41 | setup( 42 | name='tensorpack', 43 | version=__version__, # noqa 44 | description='Neural Network Toolbox on TensorFlow', 45 | long_description=long_description, 46 | long_description_content_type='text/markdown', 47 | install_requires=[ 48 | "numpy>=1.14", 49 | "six", 50 | "termcolor>=1.1", 51 | "tabulate>=0.7.7", 52 | "tqdm>4.11.1", 53 | "msgpack>=0.5.2", 54 | "msgpack-numpy>=0.4.4.2", 55 | "pyzmq>=16", 56 | "subprocess32; python_version < '3.0'", 57 | "functools32; python_version < '3.0'", 58 | ], 59 | tests_require=['flake8', 'scikit-image'], 60 | extras_require={ 61 | 'all': ['pillow', 'scipy', 'h5py', 
'lmdb>=0.92', 'matplotlib', 'scikit-learn'] +
62 |                (['python-prctl'] if platform.system() == 'Linux' else []),
63 |         'all: python_version < "3.0"': ['tornado'],
64 |     },
65 | )
66 | 
-------------------------------------------------------------------------------- /infra/eks/YAML_OVERLAY.md: --------------------------------------------------------------------------------
1 | # Overyaml
2 | 
3 | Take a base yaml file, apply a series of changes (overlays), and print out the new yaml.
4 | 
5 | e.g. take the base maskrcnn params and change them to run 5 experiments of 24 epochs with predefined_padding=True and a 32x4 GPU configuration, without helm naming conflicts. Then run 5 more experiments with a 32x2 GPU configuration.
6 | 
7 | * Be able to make changes to the base yaml and have it impact all other configurations.
8 | * Add a new experiment without having an exploding number of yaml files to maintain and update.
9 | 
10 | ## CLI Syntax
11 | 
12 | `./yaml_overlay $BASE $OVERLAY1 $OVERLAY2 $OVERLAY3 ...`
13 | 
14 | Takes a base yaml and applies overlays sequentially. At the end, prints the new yaml to stdout. Overlay names should be the path to the overlay file minus '.yaml'.
15 | 
16 | `./yaml_overlay maskrcnn/values.yaml maskrcnn/overlays/24epoch maskrcnn/overlays/32x4`
17 | 
18 | ## Overlay folder
19 | 
20 | You can keep all your overlays in a single folder and then pass in an `overlay_dir` either through the `--overlay_dir` flag or through the `OVERLAY_DIR` environment variable.
21 | 
22 | ```
23 | export OVERLAY_DIR=maskrcnn/overlays
24 | ./yaml_overlay maskrcnn/values.yaml 24epoch 32x4
25 | ```
26 | 
27 | ## Overlay syntax
28 | 
29 | An overlay is a yaml file containing two sets of changes - changes where you want to `set` a new value for a field and changes where you want to `append` a postfix to the existing value.
30 | 
31 | ```
32 | set:
33 |   someScope:
34 |     someField: "new_value"
35 | append:
36 |   someScope:
37 |     someOtherField: "_new_postfix"
38 | ```
39 | 
40 | Both `set` and `append` are optional.
41 | 
42 | Changes are represented as a copy of the original object with unchanged fields omitted and each changed field holding the new value or the postfix as the field's value. See the example below.
43 | 
44 | 
45 | ## Example
46 | 
47 | **base.yaml**
48 | 
49 | ```
50 | someScope:
51 |   someField: 1
52 |   someOtherField: "my_name"
53 | ```
54 | 
55 | **overlay.yaml**
56 | 
57 | ```
58 | set:
59 |   someScope:
60 |     someField: "new_value"
61 | append:
62 |   someScope:
63 |     someOtherField: "_new_postfix"
64 | ```
65 | 
66 | 
67 | 
68 | ### `$ ./yaml_overlay base.yaml overlay > output.yaml`
69 | 
70 | 
71 | **output.yaml**
72 | ```
73 | someScope:
74 |   someField: "new_value"
75 |   someOtherField: "my_name_new_postfix"
76 | ```
77 | 
-------------------------------------------------------------------------------- /tensorpack/utils/gpu.py: --------------------------------------------------------------------------------
1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 | # -*- coding: utf-8 -*-
4 | # File: gpu.py
5 | 
6 | 
7 | import os
8 | 
9 | from . import logger
10 | from .concurrency import subproc_call
11 | from .nvml import NVMLContext
12 | from .utils import change_env
13 | 
14 | __all__ = ['change_gpu', 'get_nr_gpu', 'get_num_gpu']
15 | 
16 | 
17 | def change_gpu(val):
18 |     """
19 |     Args:
20 |         val: an integer, the index of the GPU or -1 to disable GPU.
21 | 
22 |     Returns:
23 |         a context where ``CUDA_VISIBLE_DEVICES=val``.
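
    Example (a minimal usage sketch; ``build_graph_on_gpu_1`` is a hypothetical
    function standing in for whatever work should only see that device):

    .. code-block:: python

        with change_gpu(1):
            # inside this block CUDA_VISIBLE_DEVICES=1; the old value is restored on exit
            build_graph_on_gpu_1()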
24 | """ 25 | val = str(val) 26 | if val == '-1': 27 | val = '' 28 | return change_env('CUDA_VISIBLE_DEVICES', val) 29 | 30 | 31 | def get_num_gpu(): 32 | """ 33 | Returns: 34 | int: #available GPUs in CUDA_VISIBLE_DEVICES, or in the system. 35 | """ 36 | 37 | def warn_return(ret, message): 38 | try: 39 | import tensorflow as tf 40 | except ImportError: 41 | return ret 42 | 43 | built_with_cuda = tf.test.is_built_with_cuda() 44 | if not built_with_cuda and ret > 0: 45 | logger.warn(message + "But TensorFlow was not built with CUDA support!") 46 | return ret 47 | 48 | env = os.environ.get('CUDA_VISIBLE_DEVICES', None) 49 | if env is not None: 50 | return warn_return(len(env.split(',')), "Found non-empty CUDA_VISIBLE_DEVICES. ") 51 | output, code = subproc_call("nvidia-smi -L", timeout=5) 52 | if code == 0: 53 | output = output.decode('utf-8') 54 | return warn_return(len(output.strip().split('\n')), "Found nvidia-smi. ") 55 | try: 56 | # Use NVML to query device properties 57 | with NVMLContext() as ctx: 58 | return warn_return(ctx.num_devices(), "NVML found nvidia devices. ") 59 | except Exception: 60 | # Fallback 61 | # Note this will initialize all GPUs and therefore has side effect 62 | # https://github.com/tensorflow/tensorflow/issues/8136 63 | logger.info("Loading local devices by TensorFlow ...") 64 | from tensorflow.python.client import device_lib 65 | local_device_protos = device_lib.list_local_devices() 66 | return len([x.name for x in local_device_protos if x.device_type == 'GPU']) 67 | 68 | 69 | get_nr_gpu = get_num_gpu 70 | -------------------------------------------------------------------------------- /tensorpack/callbacks/concurrency.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: concurrency.py 5 | 6 | 7 | import multiprocessing as mp 8 | 9 | from ..utils import logger 10 | from ..utils.concurrency import StoppableThread, start_proc_mask_signal 11 | from .base import Callback 12 | 13 | __all__ = ['StartProcOrThread'] 14 | 15 | 16 | class StartProcOrThread(Callback): 17 | """ 18 | Start some threads or processes before training. 19 | """ 20 | 21 | _chief_only = False 22 | 23 | def __init__(self, startable, stop_at_last=True): 24 | """ 25 | Args: 26 | startable (list): list of processes or threads which have ``start()`` method. 27 | Can also be a single instance of process of thread. 28 | stop_at_last (bool): whether to stop the processes or threads 29 | after training. It will use :meth:`Process.terminate()` or 30 | :meth:`StoppableThread.stop()`, but will do nothing on normal 31 | `threading.Thread` or other startable objects. 
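
        Example (illustrative sketch; ``MySender`` is a hypothetical
        ``multiprocessing.Process`` subclass, and the callback list is whatever
        you already pass to your trainer):

        .. code-block:: python

            callbacks = [
                # start the process before training, terminate it after training
                StartProcOrThread(MySender(), stop_at_last=True),
            ]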
32 | """ 33 | if not isinstance(startable, list): 34 | startable = [startable] 35 | self._procs_threads = startable 36 | self._stop_at_last = stop_at_last 37 | 38 | def _before_train(self): 39 | logger.info("Starting " + 40 | ', '.join([k.name for k in self._procs_threads]) + ' ...') 41 | # avoid sigint get handled by other processes 42 | start_proc_mask_signal(self._procs_threads) 43 | 44 | def _after_train(self): 45 | if not self._stop_at_last: 46 | return 47 | for k in self._procs_threads: 48 | if not k.is_alive(): 49 | continue 50 | if isinstance(k, mp.Process): 51 | logger.info("Stopping {} ...".format(k.name)) 52 | k.terminate() 53 | k.join(5.0) 54 | if k.is_alive(): 55 | logger.error("Cannot join process {}.".format(k.name)) 56 | elif isinstance(k, StoppableThread): 57 | logger.info("Stopping {} ...".format(k.name)) 58 | k.stop() 59 | k.join(5.0) 60 | if k.is_alive(): 61 | logger.error("Cannot join thread {}.".format(k.name)) 62 | -------------------------------------------------------------------------------- /tensorpack/utils/serialize.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: serialize.py 5 | 6 | import os 7 | import sys 8 | 9 | import msgpack 10 | import msgpack_numpy 11 | 12 | from . import logger 13 | from .develop import create_dummy_func 14 | 15 | msgpack_numpy.patch() 16 | assert msgpack.version >= (0, 5, 2) 17 | 18 | __all__ = ['loads', 'dumps'] 19 | 20 | 21 | MAX_MSGPACK_LEN = 1000000000 22 | 23 | 24 | def dumps_msgpack(obj): 25 | """ 26 | Serialize an object. 27 | 28 | Returns: 29 | Implementation-dependent bytes-like object. 30 | """ 31 | return msgpack.dumps(obj, use_bin_type=True) 32 | 33 | 34 | def loads_msgpack(buf): 35 | """ 36 | Args: 37 | buf: the output of `dumps`. 38 | """ 39 | # Since 0.6, the default max size was set to 1MB. 40 | # We change it to approximately 1G. 41 | return msgpack.loads(buf, raw=False, 42 | max_bin_len=MAX_MSGPACK_LEN, 43 | max_array_len=MAX_MSGPACK_LEN, 44 | max_map_len=MAX_MSGPACK_LEN, 45 | max_str_len=MAX_MSGPACK_LEN) 46 | 47 | 48 | def dumps_pyarrow(obj): 49 | """ 50 | Serialize an object. 51 | 52 | Returns: 53 | Implementation-dependent bytes-like object. 54 | May not be compatible across different versions of pyarrow. 55 | """ 56 | return pa.serialize(obj).to_buffer() 57 | 58 | 59 | def loads_pyarrow(buf): 60 | """ 61 | Args: 62 | buf: the output of `dumps`. 63 | """ 64 | return pa.deserialize(buf) 65 | 66 | 67 | # import pyarrow has a lot of side effect: 68 | # https://github.com/apache/arrow/pull/2329 69 | # https://groups.google.com/a/tensorflow.org/forum/#!topic/developers/TMqRaT-H2bI 70 | # So we use msgpack as default. 71 | if os.environ.get('TENSORPACK_SERIALIZE', 'msgpack') == 'pyarrow': 72 | try: 73 | import pyarrow as pa 74 | except ImportError: 75 | loads_pyarrow = create_dummy_func('loads_pyarrow', ['pyarrow']) # noqa 76 | dumps_pyarrow = create_dummy_func('dumps_pyarrow', ['pyarrow']) # noqa 77 | 78 | if 'horovod' in sys.modules: 79 | logger.warn("Horovod and pyarrow may have symbol conflicts. 
" 80 | "Uninstall pyarrow and use msgpack instead.") 81 | loads = loads_pyarrow 82 | dumps = dumps_pyarrow 83 | else: 84 | loads = loads_msgpack 85 | dumps = dumps_msgpack 86 | -------------------------------------------------------------------------------- /tensorpack/tfutils/symbolic_functions.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: symbolic_functions.py 5 | 6 | 7 | import tensorflow as tf 8 | 9 | from ..utils.develop import deprecated 10 | 11 | __all__ = ['print_stat', 'rms'] 12 | 13 | 14 | def print_stat(x, message=None): 15 | """ A simple print Op that might be easier to use than :meth:`tf.Print`. 16 | Use it like: ``x = print_stat(x, message='This is x')``. 17 | """ 18 | if message is None: 19 | message = x.op.name 20 | lst = [tf.shape(x), tf.reduce_mean(x)] 21 | if x.dtype.is_floating: 22 | lst.append(rms(x)) 23 | return tf.Print(x, lst + [x], summarize=20, 24 | message=message, name='print_' + x.op.name) 25 | 26 | 27 | # for internal use only 28 | def rms(x, name=None): 29 | """ 30 | Returns: 31 | root mean square of tensor x. 32 | """ 33 | if name is None: 34 | name = x.op.name + '/rms' 35 | with tf.name_scope(None): # name already contains the scope 36 | return tf.sqrt(tf.reduce_mean(tf.square(x)), name=name) 37 | return tf.sqrt(tf.reduce_mean(tf.square(x)), name=name) 38 | 39 | 40 | # don't hurt to leave it here 41 | @deprecated("Please implement it by yourself.", "2018-04-28") 42 | def psnr(prediction, ground_truth, maxp=None, name='psnr'): 43 | """`Peek Signal to Noise Ratio `_. 44 | 45 | .. math:: 46 | 47 | PSNR = 20 \cdot \log_{10}(MAX_p) - 10 \cdot \log_{10}(MSE) 48 | 49 | Args: 50 | prediction: a :class:`tf.Tensor` representing the prediction signal. 51 | ground_truth: another :class:`tf.Tensor` with the same shape. 52 | maxp: maximum possible pixel value of the image (255 in in 8bit images) 53 | 54 | Returns: 55 | A scalar tensor representing the PSNR 56 | """ 57 | 58 | maxp = float(maxp) 59 | 60 | def log10(x): 61 | with tf.name_scope("log10"): 62 | numerator = tf.log(x) 63 | denominator = tf.log(tf.constant(10, dtype=numerator.dtype)) 64 | return numerator / denominator 65 | 66 | mse = tf.reduce_mean(tf.square(prediction - ground_truth)) 67 | if maxp is None: 68 | psnr = tf.multiply(log10(mse), -10., name=name) 69 | else: 70 | psnr = tf.multiply(log10(mse), -10.) 71 | psnr = tf.add(tf.multiply(20., log10(maxp)), psnr, name=name) 72 | 73 | return psnr 74 | -------------------------------------------------------------------------------- /tensorpack/models/shape_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: shape_utils.py 5 | 6 | import tensorflow as tf 7 | 8 | __all__ = [] 9 | 10 | 11 | class StaticDynamicAxis(object): 12 | def __init__(self, static, dynamic): 13 | self.static = static 14 | self.dynamic = dynamic 15 | 16 | def apply(self, f): 17 | try: 18 | st = f(self.static) 19 | return StaticDynamicAxis(st, st) 20 | except TypeError: 21 | return StaticDynamicAxis(None, f(self.dynamic)) 22 | 23 | def __str__(self): 24 | return "S={}, D={}".format(str(self.static), str(self.dynamic)) 25 | 26 | 27 | def DynamicLazyAxis(shape, idx): 28 | return lambda: shape[idx] 29 | 30 | 31 | def StaticLazyAxis(dim): 32 | return lambda: dim 33 | 34 | 35 | class StaticDynamicShape(object): 36 | def __init__(self, tensor): 37 | assert isinstance(tensor, tf.Tensor), tensor 38 | ndims = tensor.shape.ndims 39 | self.static = tensor.shape.as_list() 40 | if tensor.shape.is_fully_defined(): 41 | self.dynamic = self.static[:] 42 | else: 43 | dynamic = tf.shape(tensor) 44 | self.dynamic = [DynamicLazyAxis(dynamic, k) for k in range(ndims)] 45 | 46 | for k in range(ndims): 47 | if self.static[k] is not None: 48 | self.dynamic[k] = StaticLazyAxis(self.static[k]) 49 | 50 | def apply(self, axis, f): 51 | if self.static[axis] is not None: 52 | try: 53 | st = f(self.static[axis]) 54 | self.static[axis] = st 55 | self.dynamic[axis] = StaticLazyAxis(st) 56 | return 57 | except TypeError: 58 | pass 59 | self.static[axis] = None 60 | dyn = self.dynamic[axis] 61 | self.dynamic[axis] = lambda: f(dyn()) 62 | 63 | def get_static(self): 64 | return self.static 65 | 66 | @property 67 | def ndims(self): 68 | return len(self.static) 69 | 70 | def get_dynamic(self, axis=None): 71 | if axis is None: 72 | return [self.dynamic[k]() for k in range(self.ndims)] 73 | return self.dynamic[axis]() 74 | 75 | 76 | if __name__ == '__main__': 77 | x = tf.placeholder(tf.float32, shape=[None, 3, None, 10]) 78 | shape = StaticDynamicShape(x) 79 | shape.apply(1, lambda x: x * 3) 80 | shape.apply(2, lambda x: x + 5) 81 | print(shape.get_static()) 82 | print(shape.get_dynamic()) 83 | -------------------------------------------------------------------------------- /tensorpack/tfutils/dependency.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import tensorflow as tf 5 | from tensorflow.contrib.graph_editor import get_backward_walk_ops 6 | 7 | from ..utils.argtools import graph_memoized 8 | 9 | """ 10 | Utils about parsing dependencies in the graph. 11 | """ 12 | 13 | __all__ = [ 14 | 'dependency_of_targets', 'dependency_of_fetches' 15 | ] 16 | 17 | 18 | @graph_memoized 19 | def dependency_of_targets(targets, op): 20 | """ 21 | Check that op is in the subgraph induced by the dependencies of targets. 22 | The result is memoized. 23 | 24 | This is useful if some SessionRunHooks should be run only together with certain ops. 25 | 26 | Args: 27 | targets: a tuple of ops or tensors. The targets to find dependencies of. 28 | op (tf.Operation or tf.Tensor): 29 | 30 | Returns: 31 | bool 32 | """ 33 | # TODO tensorarray? sparsetensor? 
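    # Illustrative (hypothetical) call, not executed here:
    #     dependency_of_targets((loss_tensor,), some_assign_op)
    # returns True iff computing `loss_tensor` requires running `some_assign_op`.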
34 | if isinstance(op, tf.Tensor): 35 | op = op.op 36 | assert isinstance(op, tf.Operation), op 37 | 38 | # alternative implementation can use graph_util.extract_sub_graph 39 | dependent_ops = get_backward_walk_ops(targets, control_inputs=True) 40 | return op in dependent_ops 41 | 42 | 43 | def dependency_of_fetches(fetches, op): 44 | """ 45 | Check that op is in the subgraph induced by the dependencies of fetches. 46 | fetches may have more general structure. 47 | 48 | Args: 49 | fetches: An argument to `sess.run`. Nested structure will affect performance. 50 | op (tf.Operation or tf.Tensor): 51 | 52 | Returns: 53 | bool 54 | """ 55 | try: 56 | from tensorflow.python.client.session import _FetchHandler as FetchHandler 57 | # use the graph of the op, so that this function can be called without being under a default graph 58 | handler = FetchHandler(op.graph, fetches, {}) 59 | targets = tuple(handler.fetches() + handler.targets()) 60 | except ImportError: 61 | if isinstance(fetches, list): 62 | targets = tuple(fetches) 63 | elif isinstance(fetches, dict): 64 | raise ValueError("Don't know how to parse dictionary to fetch list! " 65 | "This is a bug of tensorpack.") 66 | else: 67 | targets = (fetches, ) 68 | return dependency_of_targets(targets, op) 69 | 70 | 71 | if __name__ == '__main__': 72 | a = tf.random_normal(shape=[3, 3]) 73 | b = tf.random_normal(shape=[3, 3]) 74 | print(dependency_of_fetches(a, a)) 75 | print(dependency_of_fetches([a, b], a)) 76 | -------------------------------------------------------------------------------- /tensorpack/dataflow/imgaug/noise.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: noise.py 5 | 6 | 7 | import numpy as np 8 | import cv2 9 | 10 | from .base import ImageAugmentor 11 | 12 | __all__ = ['JpegNoise', 'GaussianNoise', 'SaltPepperNoise'] 13 | 14 | 15 | class JpegNoise(ImageAugmentor): 16 | """ Random JPEG noise. """ 17 | 18 | def __init__(self, quality_range=(40, 100)): 19 | """ 20 | Args: 21 | quality_range (tuple): range to sample JPEG quality 22 | """ 23 | super(JpegNoise, self).__init__() 24 | self._init(locals()) 25 | 26 | def _get_augment_params(self, img): 27 | return self.rng.randint(*self.quality_range) 28 | 29 | def _augment(self, img, q): 30 | enc = cv2.imencode('.jpg', img, [cv2.IMWRITE_JPEG_QUALITY, q])[1] 31 | return cv2.imdecode(enc, 1).astype(img.dtype) 32 | 33 | 34 | class GaussianNoise(ImageAugmentor): 35 | """ 36 | Add random Gaussian noise N(0, sigma^2) of the same shape to img. 37 | """ 38 | def __init__(self, sigma=1, clip=True): 39 | """ 40 | Args: 41 | sigma (float): stddev of the Gaussian distribution. 42 | clip (bool): clip the result to [0,255] in the end. 43 | """ 44 | super(GaussianNoise, self).__init__() 45 | self._init(locals()) 46 | 47 | def _get_augment_params(self, img): 48 | return self.rng.randn(*img.shape) 49 | 50 | def _augment(self, img, noise): 51 | old_dtype = img.dtype 52 | ret = img + noise * self.sigma 53 | if self.clip or old_dtype == np.uint8: 54 | ret = np.clip(ret, 0, 255) 55 | return ret.astype(old_dtype) 56 | 57 | 58 | class SaltPepperNoise(ImageAugmentor): 59 | """ Salt and pepper noise. 60 | Randomly set some elements in image to 0 or 255, regardless of its channels. 
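
    Example (a sketch of typical use; assumes ``df`` is an existing image dataflow
    and that ``AugmentImageComponent`` is imported from the dataflow package):

    .. code-block:: python

        # corrupt roughly 5% of pixels to white and 5% to black
        noise = SaltPepperNoise(white_prob=0.05, black_prob=0.05)
        df = AugmentImageComponent(df, [noise])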
61 | """ 62 | 63 | def __init__(self, white_prob=0.05, black_prob=0.05): 64 | """ 65 | Args: 66 | white_prob (float), black_prob (float): probabilities setting an element to 255 or 0. 67 | """ 68 | assert white_prob + black_prob <= 1, "Sum of probabilities cannot be greater than 1" 69 | super(SaltPepperNoise, self).__init__() 70 | self._init(locals()) 71 | 72 | def _get_augment_params(self, img): 73 | return self.rng.uniform(low=0, high=1, size=img.shape) 74 | 75 | def _augment(self, img, param): 76 | img[param > (1 - self.white_prob)] = 255 77 | img[param < self.black_prob] = 0 78 | return img 79 | -------------------------------------------------------------------------------- /tensorpack/models/fc.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: fc.py 5 | 6 | 7 | import numpy as np 8 | import tensorflow as tf 9 | 10 | from ..tfutils.common import get_tf_version_tuple 11 | from .common import VariableHolder, layer_register 12 | from .tflayer import convert_to_tflayer_args, rename_get_variable 13 | 14 | __all__ = ['FullyConnected'] 15 | 16 | 17 | def batch_flatten(x): 18 | """ 19 | Flatten the tensor except the first dimension. 20 | """ 21 | shape = x.get_shape().as_list()[1:] 22 | if None not in shape: 23 | return tf.reshape(x, [-1, int(np.prod(shape))]) 24 | return tf.reshape(x, tf.stack([tf.shape(x)[0], -1])) 25 | 26 | 27 | @layer_register(log_shape=True) 28 | @convert_to_tflayer_args( 29 | args_names=['units'], 30 | name_mapping={'out_dim': 'units'}) 31 | def FullyConnected( 32 | inputs, 33 | units, 34 | activation=None, 35 | use_bias=True, 36 | kernel_initializer=None, 37 | bias_initializer=tf.zeros_initializer(), 38 | kernel_regularizer=None, 39 | bias_regularizer=None, 40 | activity_regularizer=None): 41 | """ 42 | A wrapper around `tf.layers.Dense`. 43 | One difference to maintain backward-compatibility: 44 | Default weight initializer is variance_scaling_initializer(2.0). 45 | 46 | Variable Names: 47 | 48 | * ``W``: weights of shape [in_dim, out_dim] 49 | * ``b``: bias 50 | """ 51 | if kernel_initializer is None: 52 | if get_tf_version_tuple() <= (1, 12): 53 | kernel_initializer = tf.contrib.layers.variance_scaling_initializer(2.0) 54 | else: 55 | kernel_initializer = tf.keras.initializers.VarianceScaling(2.0, distribution='untruncated_normal') 56 | 57 | inputs = batch_flatten(inputs) 58 | with rename_get_variable({'kernel': 'W', 'bias': 'b'}): 59 | layer = tf.layers.Dense( 60 | units=units, 61 | activation=activation, 62 | use_bias=use_bias, 63 | kernel_initializer=kernel_initializer, 64 | bias_initializer=bias_initializer, 65 | kernel_regularizer=kernel_regularizer, 66 | bias_regularizer=bias_regularizer, 67 | activity_regularizer=activity_regularizer, 68 | _reuse=tf.get_variable_scope().reuse) 69 | ret = layer.apply(inputs, scope=tf.get_variable_scope()) 70 | ret = tf.identity(ret, name='output') 71 | 72 | ret.variables = VariableHolder(W=layer.kernel) 73 | if use_bias: 74 | ret.variables.b = layer.bias 75 | return ret 76 | -------------------------------------------------------------------------------- /tensorpack/libinfo.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import os 5 | 6 | # issue#7378 may happen with custom opencv. It doesn't hurt to disable opencl 7 | os.environ['OPENCV_OPENCL_RUNTIME'] = 'disabled' # https://github.com/opencv/opencv/pull/10155 8 | try: 9 | # issue#1924 may happen on old systems 10 | import cv2 # noqa 11 | # cv2.setNumThreads(0) 12 | if int(cv2.__version__.split('.')[0]) == 3: 13 | cv2.ocl.setUseOpenCL(False) 14 | # check if cv is built with cuda or openmp 15 | info = cv2.getBuildInformation().split('\n') 16 | for line in info: 17 | splits = line.split() 18 | if not len(splits): 19 | continue 20 | answer = splits[-1].lower() 21 | if answer in ['yes', 'no']: 22 | if 'cuda' in line.lower() and answer == 'yes': 23 | # issue#1197 24 | print("OpenCV is built with CUDA support. " 25 | "This may cause slow initialization or sometimes segfault with TensorFlow.") 26 | if answer == 'openmp': 27 | print("OpenCV is built with OpenMP support. This usually results in poor performance. For details, see " 28 | "https://github.com/tensorpack/benchmarks/blob/master/ImageNet/benchmark-opencv-resize.py") 29 | except (ImportError, TypeError): 30 | pass 31 | 32 | os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' # issue#9339 33 | os.environ['TF_AUTOTUNE_THRESHOLD'] = '2' # use more warm-up 34 | 35 | # Since 1.3, this is not needed 36 | os.environ['TF_AVGPOOL_USE_CUDNN'] = '1' # issue#8566 37 | 38 | # TF1.5 features 39 | os.environ['TF_SYNC_ON_FINISH'] = '0' # will become default 40 | os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private' 41 | os.environ['TF_GPU_THREAD_COUNT'] = '2' 42 | 43 | # Available in TF1.6+ & cudnn7. Haven't seen different performance on R50. 44 | # NOTE we disable it because: 45 | # this mode may use scaled atomic integer reduction that may cause a numerical 46 | # overflow for certain input data range. 47 | os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '0' 48 | 49 | # Available since 1.12. issue#15874 50 | os.environ['TF_ENABLE_WHILE_V2'] = '1' 51 | os.environ['TF_ENABLE_COND_V2'] = '1' 52 | 53 | try: 54 | import tensorflow as tf # noqa 55 | _version = tf.__version__.split('.') 56 | assert int(_version[0]) >= 1 and int(_version[1]) >= 3, "TF>=1.3 is required!" 57 | _HAS_TF = True 58 | except ImportError: 59 | print("Failed to import tensorflow.") 60 | _HAS_TF = False 61 | 62 | 63 | # These lines will be programatically read/write by setup.py 64 | # Don't touch them. 65 | __version__ = '0.9.0.1' 66 | __git_version__ = __version__ 67 | -------------------------------------------------------------------------------- /infra/ami/no_batch_train_1node_16xl_convergence.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | # Set timestamp and logging directory, begin writing to it. 
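# (Illustrative note) The lines below produce a directory such as
# /home/ubuntu/logs/train_log_20190704_153012 and tee all of this script's
# output into nohup.out inside it.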
6 | TS=`date +'%Y%m%d_%H%M%S'` 7 | LOG_DIR=/home/ubuntu/logs/train_log_${TS} 8 | mkdir -p ${LOG_DIR} 9 | exec &> >(tee ${LOG_DIR}/nohup.out) 10 | 11 | # Print evaluated script commands 12 | set -x 13 | 14 | # Set VENV 15 | VENV=${CONDA_DEFAULT_ENV} 16 | 17 | # Write current branch and commit hash to log directory 18 | git branch | grep \* | awk '{print $2}' > ${LOG_DIR}/git_info 19 | git log | head -1 >> ${LOG_DIR}/git_info 20 | git diff >> ${LOG_DIR}/git_info 21 | 22 | # Copy this script into logging directory 23 | cp `basename $0` ${LOG_DIR} 24 | 25 | # Record environment variables 26 | env > ${LOG_DIR}/env.txt 27 | 28 | # Record python libaries 29 | pip freeze > ${LOG_DIR}/requirements.txt 30 | 31 | # Record tensorflow shared object linkages (CUDA version?) 32 | ldd /home/ubuntu/anaconda3/envs/${VENV}/lib/python3.6/site-packages/tensorflow/libtensorflow_framework.so > ${LOG_DIR}/tf_so_links.txt 33 | 34 | # Execute training job 35 | # HOROVOD_TIMELINE=${LOG_DIR}/htimeline.json \ 36 | #HOROVOD_AUTOTUNE=1 \ 37 | #HOROVOD_AUTOTUNE_LOG=${LOG_DIR}/hvd_autotune.log \ 38 | HOROVOD_CYCLE_TIME=0.5 \ 39 | HOROVOD_FUSION_THRESHOLD=67108864 \ 40 | HOROVOD_LOG_LEVEL=INFO \ 41 | TENSORPACK_FP16=1 \ 42 | /home/ubuntu/anaconda3/envs/${VENV}/bin/mpirun -np 8 -H localhost:8 \ 43 | --mca plm_rsh_no_tree_spawn 1 -bind-to none -map-by slot -mca pml ob1 -mca btl ^openib \ 44 | -mca btl_tcp_if_exclude lo,docker0 \ 45 | -mca btl_vader_single_copy_mechanism none \ 46 | -x NCCL_SOCKET_IFNAME=^docker0,lo \ 47 | -x NCCL_MIN_NRINGS=8 -x NCCL_DEBUG=INFO \ 48 | -x HOROVOD_CYCLE_TIME \ 49 | -x HOROVOD_FUSION_THRESHOLD \ 50 | -x TENSORPACK_FP16 \ 51 | -x LD_LIBRARY_PATH -x PATH \ 52 | --output-filename ${LOG_DIR}/mpirun_logs \ 53 | /home/ubuntu/anaconda3/envs/${VENV}/bin/python3 /home/ubuntu/tensorpack-mask-rcnn/MaskRCNN/train.py \ 54 | --logdir ${LOG_DIR} \ 55 | --fp16 \ 56 | --throughput_log_freq 2000 \ 57 | --config MODE_MASK=True \ 58 | MODE_FPN=True \ 59 | DATA.BASEDIR=/home/ubuntu/data \ 60 | DATA.TRAIN='["train2017"]' \ 61 | DATA.VAL='("val2017",)' \ 62 | TRAIN.BATCH_SIZE_PER_GPU=1 \ 63 | TRAIN.LR_EPOCH_SCHEDULE='[(8, 0.1), (10, 0.01), (12, None)]' \ 64 | BACKBONE.WEIGHTS=/home/ubuntu/data/pretrained-models/ImageNet-R50-AlignPadding.npz \ 65 | BACKBONE.NORM=FreezeBN \ 66 | TRAIN.EVAL_PERIOD=12 \ 67 | TRAINER=horovod 68 | 69 | #For 32x4 70 | #TRAIN.GRADIENT_CLIP=1.5 71 | 72 | #-x HOROVOD_AUTOTUNE \ 73 | #-x HOROVOD_AUTOTUNE_LOG \ 74 | #-x HOROVOD_LOG_LEVEL=INFO \ 75 | #-x HOROVOD_CYCLE_TIME -x HOROVOD_FUSION_THRESHOLD \ 76 | #TRAIN.EVAL_PERIOD=1 \ 77 | #TRAIN.STEPS_PER_EPOCH=15000 \ 78 | #TRAIN.LR_SCHEDULE='[120000, 160000, 180000]' \ 79 | -------------------------------------------------------------------------------- /tensorpack/predict/feedfree.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env python 4 | 5 | from tensorflow.python.training.monitored_session import _HookedSession as HookedSession 6 | 7 | from ..callbacks import Callbacks 8 | from ..tfutils.tower import PredictTowerContext 9 | from .base import PredictorBase 10 | 11 | __all__ = ['FeedfreePredictor'] 12 | 13 | 14 | class FeedfreePredictor(PredictorBase): 15 | """ 16 | Create a predictor that takes inputs from an :class:`InputSource`, instead of from feeds. 
17 | An instance `pred` of :class:`FeedfreePredictor` can be called only by `pred()`, which returns 18 | a list of output values as defined in config.output_names. 19 | """ 20 | 21 | def __init__(self, config, input_source): 22 | """ 23 | Args: 24 | config (PredictConfig): the config to use. 25 | input_source (InputSource): the feedfree InputSource to use. 26 | Must match the inputs_desc in config. 27 | """ 28 | self._config = config 29 | self._input_source = input_source 30 | assert config.return_input is False, \ 31 | "return_input is not supported in FeedfreePredictor! " \ 32 | "If you need to fetch inputs, add the names to the output_names!" 33 | 34 | self._hooks = [] 35 | self.graph = config._maybe_create_graph() 36 | with self.graph.as_default(): 37 | self._input_callbacks = Callbacks( 38 | self._input_source.setup(config.inputs_desc)) 39 | with PredictTowerContext(''): 40 | self._input_tensors = self._input_source.get_input_tensors() 41 | config.tower_func(*self._input_tensors) 42 | self._tower_handle = config.tower_func.towers[-1] 43 | 44 | self._output_tensors = self._tower_handle.get_tensors(config.output_names) 45 | 46 | self._input_callbacks.setup_graph(None) 47 | 48 | for h in self._input_callbacks.get_hooks(): 49 | self._register_hook(h) 50 | self._initialize_session() 51 | 52 | def _register_hook(self, hook): 53 | """ 54 | Args: 55 | hook (tf.train.SessionRunHook): 56 | """ 57 | self._hooks.append(hook) 58 | 59 | def _initialize_session(self): 60 | # init the session 61 | self._config.session_init._setup_graph() 62 | self._sess = self._config.session_creator.create_session() 63 | self._config.session_init._run_init(self._sess) 64 | 65 | with self._sess.as_default(): 66 | self._input_callbacks.before_train() 67 | self._hooked_sess = HookedSession(self._sess, self._hooks) 68 | 69 | def __call__(self): 70 | return self._hooked_sess.run(self._output_tensors) 71 | 72 | def _do_call(self): 73 | raise NotImplementedError("You're calling the wrong function!") 74 | -------------------------------------------------------------------------------- /tensorpack/dataflow/dataset/svhn.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: svhn.py 5 | 6 | 7 | import numpy as np 8 | import os 9 | 10 | from ...utils import logger 11 | from ...utils.fs import download, get_dataset_path 12 | from ..base import RNGDataFlow 13 | 14 | __all__ = ['SVHNDigit'] 15 | 16 | SVHN_URL = "http://ufldl.stanford.edu/housenumbers/" 17 | 18 | 19 | class SVHNDigit(RNGDataFlow): 20 | """ 21 | `SVHN `_ Cropped Digit Dataset. 22 | Produces [img, label], img of 32x32x3 in range [0,255], label of 0-9 23 | """ 24 | _Cache = {} 25 | 26 | def __init__(self, name, data_dir=None, shuffle=True): 27 | """ 28 | Args: 29 | name (str): 'train', 'test', or 'extra'. 30 | data_dir (str): a directory containing the original {train,test,extra}_32x32.mat. 31 | shuffle (bool): shuffle the dataset. 
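
        Example (sketch; if the .mat files are missing they are downloaded to the
        default dataset directory first):

        .. code-block:: python

            ds = SVHNDigit('train', shuffle=False)
            ds.reset_state()
            for img, label in ds:
                # img is a 32x32x3 array in [0, 255], label is an int in 0-9
                break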
32 | """ 33 | self.shuffle = shuffle 34 | 35 | if name in SVHNDigit._Cache: 36 | self.X, self.Y = SVHNDigit._Cache[name] 37 | return 38 | if data_dir is None: 39 | data_dir = get_dataset_path('svhn_data') 40 | assert name in ['train', 'test', 'extra'], name 41 | filename = os.path.join(data_dir, name + '_32x32.mat') 42 | if not os.path.isfile(filename): 43 | url = SVHN_URL + os.path.basename(filename) 44 | logger.info("File {} not found!".format(filename)) 45 | logger.info("Downloading from {} ...".format(url)) 46 | download(url, os.path.dirname(filename)) 47 | logger.info("Loading {} ...".format(filename)) 48 | data = scipy.io.loadmat(filename) 49 | self.X = data['X'].transpose(3, 0, 1, 2) 50 | self.Y = data['y'].reshape((-1)) 51 | self.Y[self.Y == 10] = 0 52 | SVHNDigit._Cache[name] = (self.X, self.Y) 53 | 54 | def __len__(self): 55 | return self.X.shape[0] 56 | 57 | def __iter__(self): 58 | n = self.X.shape[0] 59 | idxs = np.arange(n) 60 | if self.shuffle: 61 | self.rng.shuffle(idxs) 62 | for k in idxs: 63 | # since svhn is quite small, just do it for safety 64 | yield [self.X[k], self.Y[k]] 65 | 66 | @staticmethod 67 | def get_per_pixel_mean(): 68 | """ 69 | Returns: 70 | a 32x32x3 image 71 | """ 72 | a = SVHNDigit('train') 73 | b = SVHNDigit('test') 74 | c = SVHNDigit('extra') 75 | return np.concatenate((a.X, b.X, c.X)).mean(axis=0) 76 | 77 | 78 | try: 79 | import scipy.io 80 | except ImportError: 81 | from ...utils.develop import create_dummy_class 82 | SVHNDigit = create_dummy_class('SVHNDigit', 'scipy.io') # noqa 83 | 84 | if __name__ == '__main__': 85 | a = SVHNDigit('train') 86 | b = SVHNDigit.get_per_pixel_mean() 87 | -------------------------------------------------------------------------------- /tensorpack/tfutils/sesscreate.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: sesscreate.py 5 | 6 | 7 | import tensorflow as tf 8 | 9 | from ..tfutils.common import tfv1 10 | from ..utils import logger 11 | from .common import get_default_sess_config 12 | 13 | __all__ = ['NewSessionCreator', 'ReuseSessionCreator', 'SessionCreatorAdapter'] 14 | 15 | """ 16 | A SessionCreator should: 17 | create the session 18 | initialize all variables 19 | return a session that is ready to use 20 | not finalize the graph 21 | """ 22 | 23 | 24 | class NewSessionCreator(tfv1.train.SessionCreator): 25 | def __init__(self, target='', config=None): 26 | """ 27 | Args: 28 | target, config: same as :meth:`Session.__init__()`. 29 | config: a :class:`tf.ConfigProto` instance, defaults to :func:`tfutils.get_default_sess_config()` 30 | """ 31 | self.target = target 32 | 33 | if config is None: 34 | # distributed trainer doesn't support user-provided config 35 | # we set this attribute so that they can check 36 | self.user_provided_config = False 37 | config = get_default_sess_config() 38 | else: 39 | self.user_provided_config = True 40 | logger.warn( 41 | "User-provided custom session config may not work due to TF \ 42 | bugs. 
See https://github.com/tensorpack/tensorpack/issues/497 for workarounds.") 43 | self.config = config 44 | 45 | def create_session(self): 46 | sess = tf.Session(target=self.target, config=self.config) 47 | sess.run(tf.global_variables_initializer()) 48 | sess.run(tf.local_variables_initializer()) 49 | sess.run(tf.tables_initializer()) 50 | return sess 51 | 52 | 53 | class ReuseSessionCreator(tfv1.train.SessionCreator): 54 | """ 55 | Returns an existing session. 56 | """ 57 | def __init__(self, sess): 58 | """ 59 | Args: 60 | sess (tf.Session): the session to reuse 61 | """ 62 | self.sess = sess 63 | 64 | def create_session(self): 65 | return self.sess 66 | 67 | 68 | class SessionCreatorAdapter(tfv1.train.SessionCreator): 69 | """ 70 | Apply a function on the output of a SessionCreator. Can be used to create a debug session. 71 | """ 72 | def __init__(self, session_creator, func): 73 | """ 74 | Args: 75 | session_creator (tf.train.SessionCreator): a session creator 76 | func (tf.Session -> tf.Session): takes a session created by 77 | ``session_creator``, and return a new session to be returned by ``self.create_session`` 78 | """ 79 | self._creator = session_creator 80 | self._func = func 81 | 82 | def create_session(self): 83 | sess = self._creator.create_session() 84 | return self._func(sess) 85 | -------------------------------------------------------------------------------- /tensorpack/dataflow/imgaug/external.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env python 4 | 5 | import numpy as np 6 | 7 | from .base import ImageAugmentor 8 | 9 | __all__ = ['IAAugmentor', 'Albumentations'] 10 | 11 | 12 | class IAAugmentor(ImageAugmentor): 13 | """ 14 | Wrap an augmentor form the IAA library: https://github.com/aleju/imgaug. 15 | Both images and coordinates are supported. 16 | 17 | Note: 18 | 1. It's NOT RECOMMENDED 19 | to use coordinates because the IAA library does not handle coordinates accurately. 20 | 21 | 2. Only uint8 images are supported by the IAA library. 22 | 23 | 3. The IAA library can only produces images of the same shape. 24 | 25 | Example: 26 | 27 | .. code-block:: python 28 | 29 | from tensorpack import imgaug # this is not the aleju/imgaug library 30 | from imgaug import augmentors as iaa # this is the aleju/imgaug library 31 | myaug = imgaug.IAAugmentor( 32 | iaa.Sequential([ 33 | iaa.Sharpen(alpha=(0, 1), lightness=(0.75, 1.5)), 34 | iaa.Fliplr(0.5), 35 | iaa.Crop(px=(0, 100)), 36 | ]) 37 | """ 38 | 39 | def __init__(self, augmentor): 40 | """ 41 | Args: 42 | augmentor (iaa.Augmenter): 43 | """ 44 | super(IAAugmentor, self).__init__() 45 | self._aug = augmentor 46 | 47 | def _get_augment_params(self, img): 48 | return (self._aug.to_deterministic(), img.shape) 49 | 50 | def _augment(self, img, param): 51 | aug, _ = param 52 | return aug.augment_image(img) 53 | 54 | def _augment_coords(self, coords, param): 55 | import imgaug as IA 56 | aug, shape = param 57 | points = [IA.Keypoint(x=x, y=y) for x, y in coords] 58 | points = IA.KeypointsOnImage(points, shape=shape) 59 | augmented = aug.augment_keypoints([points])[0].keypoints 60 | return np.asarray([[p.x, p.y] for p in augmented]) 61 | 62 | 63 | class Albumentations(ImageAugmentor): 64 | """ 65 | Wrap an augmentor form the albumentations library: https://github.com/albu/albumentations. 66 | Coordinate augmentation is not supported by the library. 
67 | 68 | Example: 69 | 70 | .. code-block:: python 71 | 72 | from tensorpack import imgaug 73 | import albumentations as AB 74 | myaug = imgaug.Albumentations(AB.RandomRotate90(p=1)) 75 | """ 76 | def __init__(self, augmentor): 77 | """ 78 | Args: 79 | augmentor (albumentations.BasicTransform): 80 | """ 81 | super(Albumentations, self).__init__() 82 | self._aug = augmentor 83 | 84 | def _get_augment_params(self, img): 85 | return self._aug.get_params() 86 | 87 | def _augment(self, img, param): 88 | return self._aug.apply(img, **param) 89 | 90 | def _augment_coords(self, coords, param): 91 | raise NotImplementedError() 92 | -------------------------------------------------------------------------------- /tensorpack/tfutils/model_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: model_utils.py 5 | # Author: tensorpack contributors 6 | 7 | import tensorflow as tf 8 | from tabulate import tabulate 9 | from termcolor import colored 10 | 11 | from ..utils import logger 12 | 13 | __all__ = [] 14 | 15 | 16 | # TODO should also describe model_variables 17 | def describe_trainable_vars(): 18 | """ 19 | Print a description of the current model parameters. 20 | Skip variables starting with "tower", as they are just duplicates built by data-parallel logic. 21 | """ 22 | train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) 23 | if len(train_vars) == 0: 24 | logger.warn("No trainable variables in the graph!") 25 | return 26 | total = 0 27 | total_bytes = 0 28 | data = [] 29 | for v in train_vars: 30 | if v.name.startswith('tower'): 31 | continue 32 | shape = v.get_shape() 33 | ele = shape.num_elements() 34 | if ele is None: 35 | logger.warn("Shape of variable {} is not fully defined but {}.".format(v.name, shape)) 36 | ele = 0 37 | try: 38 | shape = shape.as_list() 39 | except ValueError: 40 | shape = '' 41 | 42 | total += ele 43 | total_bytes += ele * v.dtype.size 44 | data.append([v.name, shape, ele, v.device, v.dtype.base_dtype.name]) 45 | headers = ['name', 'shape', 'dim', 'device', 'dtype'] 46 | 47 | dtypes = set([x[4] for x in data]) 48 | if len(dtypes) == 1: 49 | for x in data: 50 | del x[4] 51 | del headers[4] 52 | 53 | devices = set([x[3] for x in data]) 54 | if len(devices) == 1: 55 | # don't log the device if all vars on the same device 56 | for x in data: 57 | del x[3] 58 | del headers[3] 59 | 60 | table = tabulate(data, headers=headers) 61 | 62 | size_mb = total_bytes / 1024.0**2 63 | summary_msg = colored( 64 | "\nTotal #vars={}, #params={}, size={:.02f}MB".format( 65 | len(data), total, size_mb), 'cyan') 66 | logger.info(colored("Trainable Variables: \n", 'cyan') + table + summary_msg) 67 | 68 | 69 | def get_shape_str(tensors): 70 | """ 71 | Internally used by layer registry, to print shapes of inputs/outputs of layers. 
72 | 
73 |     Args:
74 |         tensors (list or tf.Tensor): a tensor or a list of tensors
75 |     Returns:
76 |         str: a string to describe the shape
77 |     """
78 |     if isinstance(tensors, (list, tuple)):
79 |         for v in tensors:
80 |             assert isinstance(v, (tf.Tensor, tf.Variable)), "Not a tensor: {}".format(type(v))
81 |         shape_str = ",".join(
82 |             map(lambda x: str(x.get_shape().as_list()), tensors))
83 |     else:
84 |         assert isinstance(tensors, (tf.Tensor, tf.Variable)), "Not a tensor: {}".format(type(tensors))
85 |         shape_str = str(tensors.get_shape().as_list())
86 |     return shape_str
87 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Mask RCNN
2 | 
3 | ## NOTE: This repository is archived. This project will continue to be worked on here - https://github.com/aws-samples/mask-rcnn-tensorflow
4 | 
5 | Performance focused implementation of Mask RCNN based on the [Tensorpack implementation](https://github.com/tensorpack/tensorpack/tree/master/examples/FasterRCNN).
6 | The original paper: [Mask R-CNN](https://arxiv.org/abs/1703.06870)
7 | ### Overview
8 | 
9 | This implementation of Mask RCNN is focused on increasing training throughput without sacrificing any accuracy. We do this by training with a batch size > 1 per GPU, using FP16 and two custom TF ops.
10 | 
11 | ### Status
12 | 
13 | Training on N GPUs (V100s in our experiments) with a per-GPU batch size of M is referred to as NxM training.
14 | 
15 | Training converges to target accuracy for configurations from 8x1 up to 32x4 training. Training throughput is substantially improved over the original Tensorpack code.
16 | 
17 | A pre-built Docker image is available on DockerHub under `armandmcqueen/tensorpack-mask-rcnn:master-latest`. It is automatically built on each commit to master.
18 | 
19 | ### Notes
20 | 
21 | - Running this codebase requires a custom TF binary - available under GitHub releases (custom ops and a fix for a bug introduced in TF 1.13)
22 | - We give some details on the codebase and optimizations in `CODEBASE.md`
23 | 
24 | ### To launch training
25 | - Data preprocessing
26 |   - Follow the [data preprocessing steps](https://github.com/tensorpack/tensorpack/tree/master/examples/FasterRCNN)
27 |   - If you want to use EKS or SageMaker, you need to create your own S3 bucket which contains the data, and change the S3 bucket name in the following files:
28 |     - EKS: [P3 config](https://github.com/armandmcqueen/tensorpack-mask-rcnn/blob/master/infra/eks/fsx/p3/stage-data.yaml), [P3dn config](https://github.com/armandmcqueen/tensorpack-mask-rcnn/blob/master/infra/eks/fsx/p3dn/stage-data.yaml)
29 |     - SageMaker: [S3 download](https://github.com/armandmcqueen/tensorpack-mask-rcnn/blob/master/infra/sm/run_mpi.py#L122)
30 | - Using a container is recommended for training
31 |   - To train with Docker, refer to [Docker](https://github.com/armandmcqueen/tensorpack-mask-rcnn/tree/master/infra/docker)
32 |   - To train with Amazon EKS, refer to [EKS](https://github.com/armandmcqueen/tensorpack-mask-rcnn/tree/master/infra/eks)
33 |   - To train with Amazon SageMaker, refer to [SageMaker](https://github.com/armandmcqueen/tensorpack-mask-rcnn/tree/master/infra/sm)
34 | 
35 | ### Training results
36 | The following results were obtained on P3dn.24xl instances using EKS.
37 | 12 epochs training: 38 | 39 | | Num_GPUs x Images_Per_GPU | Training time | Box mAP | Mask mAP | 40 | | ------------- | ------------- | ------------- | ------------- | 41 | | 8x4 | 5.09h | 37.47% | 34.45% | 42 | | 16x4 | 3.11h | 37.41% | 34.47% | 43 | | 32x4 | 1.94h | 37.20% | 34.25% | 44 | 45 | 24 epochs training: 46 | 47 | | Num_GPUs x Images_Per_GPU | Training time | Box mAP | Mask mAP | 48 | | ------------- | ------------- | ------------- | ------------- | 49 | | 8x4 | 9.78h | 38.25% | 35.08% | 50 | | 16x4 | 5.60h | 38.44% | 35.18% | 51 | | 32x4 | 3.33h | 38.33% | 35.12% | 52 | 53 | ### Tensorpack fork point 54 | 55 | Forked from the excellent Tensorpack repo at commit a9dce5b220dca34b15122a9329ba9ff055e8edc6 56 | -------------------------------------------------------------------------------- /tensorpack/callbacks/group.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: group.py 5 | 6 | 7 | import traceback 8 | from contextlib import contextmanager 9 | from time import time as timer 10 | import six 11 | import tensorflow as tf 12 | 13 | from ..utils import logger 14 | from ..utils.utils import humanize_time_delta 15 | from .base import Callback 16 | from .hooks import CallbackToHook 17 | 18 | if six.PY3: 19 | from time import perf_counter as timer # noqa 20 | 21 | __all__ = ['Callbacks'] 22 | 23 | 24 | class CallbackTimeLogger(object): 25 | def __init__(self): 26 | self.times = [] 27 | self.tot = 0 28 | 29 | def add(self, name, time): 30 | self.tot += time 31 | self.times.append((name, time)) 32 | 33 | @contextmanager 34 | def timed_callback(self, name): 35 | s = timer() 36 | yield 37 | self.add(name, timer() - s) 38 | 39 | def log(self): 40 | 41 | """ log the time of some heavy callbacks """ 42 | if self.tot < 3: 43 | return 44 | msgs = [] 45 | for name, t in self.times: 46 | if t / self.tot > 0.3 and t > 1: 47 | msgs.append(name + ": " + humanize_time_delta(t)) 48 | logger.info( 49 | "Callbacks took {:.3f} sec in total. {}".format( 50 | self.tot, '; '.join(msgs))) 51 | 52 | 53 | class Callbacks(Callback): 54 | """ 55 | A container to hold all callbacks, and trigger them iteratively. 56 | Note that it does nothing to before_run/after_run. 57 | """ 58 | 59 | def __init__(self, cbs): 60 | """ 61 | Args: 62 | cbs(list): a list of :class:`Callback` instances. 
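
        Example (sketch only; in normal use the trainer assembles this container
        from the callbacks you pass to it, so you rarely construct it yourself --
        ``my_proc`` and ``some_other_callback`` are placeholders):

        .. code-block:: python

            cbs = Callbacks([StartProcOrThread(my_proc), some_other_callback])
            cbs.setup_graph(trainer)   # afterwards the training loop triggers them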
63 | """ 64 | # check type 65 | for cb in cbs: 66 | assert isinstance(cb, Callback), cb.__class__ 67 | self.cbs = cbs 68 | 69 | def _setup_graph(self): 70 | with tf.name_scope(None): # clear the name scope 71 | for cb in self.cbs: 72 | cb.setup_graph(self.trainer) 73 | 74 | def _before_train(self): 75 | for cb in self.cbs: 76 | cb.before_train() 77 | 78 | def _after_train(self): 79 | for cb in self.cbs: 80 | # make sure callbacks are properly finalized 81 | try: 82 | cb.after_train() 83 | except Exception: 84 | traceback.print_exc() 85 | 86 | def get_hooks(self): 87 | return [CallbackToHook(cb) for cb in self.cbs] 88 | 89 | def trigger_step(self): 90 | for cb in self.cbs: 91 | cb.trigger_step() 92 | 93 | def _trigger_epoch(self): 94 | tm = CallbackTimeLogger() 95 | 96 | for cb in self.cbs: 97 | display_name = str(cb) 98 | with tm.timed_callback(display_name): 99 | cb.trigger_epoch() 100 | tm.log() 101 | 102 | def _before_epoch(self): 103 | for cb in self.cbs: 104 | cb.before_epoch() 105 | 106 | def _after_epoch(self): 107 | for cb in self.cbs: 108 | cb.after_epoch() 109 | -------------------------------------------------------------------------------- /tensorpack/utils/timer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: timer.py 5 | 6 | 7 | import atexit 8 | from collections import defaultdict 9 | from contextlib import contextmanager 10 | from time import time as timer 11 | import six 12 | 13 | from . import logger 14 | from .stats import StatCounter 15 | 16 | if six.PY3: 17 | from time import perf_counter as timer # noqa 18 | 19 | 20 | __all__ = ['total_timer', 'timed_operation', 21 | 'print_total_timer', 'IterSpeedCounter'] 22 | 23 | 24 | @contextmanager 25 | def timed_operation(msg, log_start=False): 26 | """ 27 | Surround a context with a timer. 28 | 29 | Args: 30 | msg(str): the log to print. 31 | log_start(bool): whether to print also at the beginning. 32 | 33 | Example: 34 | .. code-block:: python 35 | 36 | with timed_operation('Good Stuff'): 37 | time.sleep(1) 38 | 39 | Will print: 40 | 41 | .. code-block:: python 42 | 43 | Good stuff finished, time:1sec. 44 | """ 45 | if log_start: 46 | logger.info('Start {} ...'.format(msg)) 47 | start = timer() 48 | yield 49 | logger.info('{} finished, time:{:.4f}sec.'.format( 50 | msg, timer() - start)) 51 | 52 | 53 | _TOTAL_TIMER_DATA = defaultdict(StatCounter) 54 | 55 | 56 | @contextmanager 57 | def total_timer(msg): 58 | """ A context which add the time spent inside to TotalTimer. """ 59 | start = timer() 60 | yield 61 | t = timer() - start 62 | _TOTAL_TIMER_DATA[msg].feed(t) 63 | 64 | 65 | def print_total_timer(): 66 | """ 67 | Print the content of the TotalTimer, if it's not empty. This function will automatically get 68 | called when program exits. 69 | """ 70 | if len(_TOTAL_TIMER_DATA) == 0: 71 | return 72 | for k, v in six.iteritems(_TOTAL_TIMER_DATA): 73 | logger.info("Total Time: {} -> {:.2f} sec, {} times, {:.3g} sec/time".format( 74 | k, v.sum, v.count, v.average)) 75 | 76 | 77 | atexit.register(print_total_timer) 78 | 79 | 80 | class IterSpeedCounter(object): 81 | """ Test how often some code gets reached. 82 | 83 | Example: 84 | Print the speed of the iteration every 100 times. 85 | 86 | .. 
code-block:: python 87 | 88 | speed = IterSpeedCounter(100) 89 | for k in range(1000): 90 | # do something 91 | speed() 92 | """ 93 | 94 | def __init__(self, print_every, name=None): 95 | """ 96 | Args: 97 | print_every(int): interval to print. 98 | name(str): name to used when print. 99 | """ 100 | self.cnt = 0 101 | self.print_every = int(print_every) 102 | self.name = name if name else 'IterSpeed' 103 | 104 | def reset(self): 105 | self.start = timer() 106 | 107 | def __call__(self): 108 | if self.cnt == 0: 109 | self.reset() 110 | self.cnt += 1 111 | if self.cnt % self.print_every != 0: 112 | return 113 | t = timer() - self.start 114 | logger.info("{}: {:.2f} sec, {} times, {:.3g} sec/time".format( 115 | self.name, t, self.cnt, t / self.cnt)) 116 | -------------------------------------------------------------------------------- /MaskRCNN/viz.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: viz.py 5 | 6 | import numpy as np 7 | from six.moves import zip 8 | 9 | from tensorpack.utils import viz 10 | from tensorpack.utils.palette import PALETTE_RGB 11 | 12 | from config import config as cfg 13 | from utils.np_box_ops import iou as np_iou 14 | 15 | 16 | def draw_annotation(img, boxes, klass, is_crowd=None): 17 | """Will not modify img""" 18 | labels = [] 19 | assert len(boxes) == len(klass) 20 | if is_crowd is not None: 21 | assert len(boxes) == len(is_crowd) 22 | for cls, crd in zip(klass, is_crowd): 23 | clsname = cfg.DATA.CLASS_NAMES[cls] 24 | if crd == 1: 25 | clsname += ';Crowd' 26 | labels.append(clsname) 27 | else: 28 | for cls in klass: 29 | labels.append(cfg.DATA.CLASS_NAMES[cls]) 30 | img = viz.draw_boxes(img, boxes, labels) 31 | return img 32 | 33 | 34 | def draw_proposal_recall(img, proposals, proposal_scores, gt_boxes): 35 | """ 36 | Draw top3 proposals for each gt. 37 | Args: 38 | proposals: NPx4 39 | proposal_scores: NP 40 | gt_boxes: NG 41 | """ 42 | box_ious = np_iou(gt_boxes, proposals) # ng x np 43 | box_ious_argsort = np.argsort(-box_ious, axis=1) 44 | good_proposals_ind = box_ious_argsort[:, :3] # for each gt, find 3 best proposals 45 | good_proposals_ind = np.unique(good_proposals_ind.ravel()) 46 | 47 | proposals = proposals[good_proposals_ind, :] 48 | tags = list(map(str, proposal_scores[good_proposals_ind])) 49 | img = viz.draw_boxes(img, proposals, tags) 50 | return img, good_proposals_ind 51 | 52 | 53 | def draw_predictions(img, boxes, scores): 54 | """ 55 | Args: 56 | boxes: kx4 57 | scores: kxC 58 | """ 59 | if len(boxes) == 0: 60 | return img 61 | labels = scores.argmax(axis=1) 62 | scores = scores.max(axis=1) 63 | tags = ["{},{:.2f}".format(cfg.DATA.CLASS_NAMES[lb], score) for lb, score in zip(labels, scores)] 64 | return viz.draw_boxes(img, boxes, tags) 65 | 66 | 67 | def draw_final_outputs(img, results): 68 | """ 69 | Args: 70 | results: [DetectionResult] 71 | """ 72 | if len(results) == 0: 73 | return img 74 | 75 | tags = [] 76 | for r in results: 77 | tags.append( 78 | "{},{:.2f}".format(cfg.DATA.CLASS_NAMES[r.class_id], r.score)) 79 | boxes = np.asarray([r.box for r in results]) 80 | ret = viz.draw_boxes(img, boxes, tags) 81 | 82 | for r in results: 83 | if r.mask is not None: 84 | ret = draw_mask(ret, r.mask) 85 | return ret 86 | 87 | 88 | def draw_mask(im, mask, alpha=0.5, color=None): 89 | """ 90 | Overlay a mask on top of the image. 
91 | 92 | Args: 93 | im: a 3-channel uint8 image in BGR 94 | mask: a binary 1-channel image of the same size 95 | color: if None, will choose automatically 96 | """ 97 | if color is None: 98 | color = PALETTE_RGB[np.random.choice(len(PALETTE_RGB))][::-1] 99 | im = np.where(np.repeat((mask > 0)[:, :, None], 3, axis=2), 100 | im * (1 - alpha) + color * alpha, im) 101 | im = im.astype('uint8') 102 | return im 103 | -------------------------------------------------------------------------------- /MaskRCNN/NOTES.md: -------------------------------------------------------------------------------- 1 | 2 | ### File Structure 3 | This is a minimal implementation that simply contains these files: 4 | + dataset.py: load and evaluate COCO dataset 5 | + data.py: prepare data for training & inference 6 | + common.py: common data preparation utilities 7 | + basemodel.py: implement backbones 8 | + model_box.py: implement box-related symbolic functions 9 | + model_{fpn,rpn,frcnn,mrcnn,cascade}.py: implement FPN,RPN,Fast-/Mask-/Cascade-RCNN models. 10 | + train.py: main entry script 11 | + utils/: third-party helper functions 12 | + eval.py: evaluation utilities 13 | + viz.py: visualization utilities 14 | 15 | ### Implementation Notes 16 | 17 | Data: 18 | 19 | 1. It's easy to train on your own data by changing `dataset.py`. 20 | 21 | + If your data is in COCO format, modify `COCODetection` 22 | to change the class names and the id mapping. 23 | + If your data is not in COCO format, ignore `COCODetection` completely and 24 | rewrite all the methods of 25 | `DetectionDataset` following its documents. 26 | You'll implement the logic to load your dataset and evaluate predictions. 27 | 28 | 2. You can easily add more augmentations such as rotation, but be careful how a box should be 29 | augmented. The code now will always use the minimal axis-aligned bounding box of the 4 corners, 30 | which is probably not the optimal way. 31 | A TODO is to generate bounding box from segmentation, so more augmentations can be naturally supported. 32 | 33 | Model: 34 | 35 | 1. Floating-point boxes are defined like this: 36 | 37 |
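
   (The illustration that belongs here is not included in this text dump. As a rough
   textual stand-in -- a sketch of the convention as we understand it, not an
   authoritative definition -- the idea is that boxes live on a continuous coordinate
   grid in which each pixel is a 1x1 cell:)

   ```python
   # pixel (row=i, col=j) occupies the cell spanning x in [j, j+1), y in [i, i+1)
   box = [0.0, 0.0, 3.0, 3.0]          # exactly covers the top-left 3x3 pixels
   width = box[2] - box[0]             # 3.0 -- note: no "+1" as with integer boxes
   height = box[3] - box[1]            # 3.0
   ```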

38 | 39 | 2. We use ROIAlign, and `tf.image.crop_and_resize` is __NOT__ ROIAlign. 40 | 41 | 3. We currently only support single image per GPU. 42 | 43 | 4. Because of (3), BatchNorm statistics are supposed to be freezed during fine-tuning. 44 | 45 | 5. An alternative to freezing BatchNorm is to sync BatchNorm statistics across 46 | GPUs (the `BACKBONE.NORM=SyncBN` option). This would require [my bugfix](https://github.com/tensorflow/tensorflow/pull/20360) 47 | which is available since TF 1.10. You can manually apply the patch to use it. 48 | For now the total batch size is at most 8, so this option does not improve the model by much. 49 | 50 | 6. Another alternative to BatchNorm is GroupNorm (`BACKBONE.NORM=GN`) which has better performance. 51 | 52 | Speed: 53 | 54 | 1. If CuDNN warmup is on, the training will start very slowly, until about 55 | 10k steps (or more if scale augmentation is used) to reach a maximum speed. 56 | As a result, the ETA is also inaccurate at the beginning. 57 | Warmup is by default on when no scale augmentation is used. 58 | 59 | 1. After warmup, the training speed will slowly decrease due to more accurate proposals. 60 | 61 | 1. The code should have around 70% GPU utilization on V100s, and 85%~90% scaling 62 | efficiency from 1 V100 to 8 V100s. 63 | 64 | 1. This implementation does not contain specialized CUDA ops (e.g. AffineChannel, ROIAlign), 65 | so it can be slightly (~10%) slower than Detectron (Caffe2) and 66 | maskrcnn-benchmark (PyTorch). 67 | 68 | Possible Future Enhancements: 69 | 70 | 1. Define a better interface to load custom dataset. 71 | 72 | 1. Support batch>1 per GPU. 73 | 74 | 1. Use dedicated ops to improve speed. (e.g. a TF implementation of ROIAlign op 75 | can be found in [light-head RCNN](https://github.com/zengarden/light_head_rcnn/tree/master/lib/lib_kernel)) 76 | -------------------------------------------------------------------------------- /MaskRCNN/utils/box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: box_ops.py 5 | 6 | import tensorflow as tf 7 | 8 | from tensorpack.tfutils.scope_utils import under_name_scope 9 | 10 | 11 | """ 12 | This file is modified from 13 | https://github.com/tensorflow/models/blob/master/object_detection/core/box_list_ops.py 14 | """ 15 | 16 | 17 | @under_name_scope() 18 | def area(boxes): 19 | """ 20 | Args: 21 | boxes: nx4 floatbox 22 | 23 | Returns: 24 | n 25 | """ 26 | x_min, y_min, x_max, y_max = tf.split(boxes, 4, axis=1) 27 | return tf.squeeze((y_max - y_min) * (x_max - x_min), [1]) 28 | 29 | 30 | @under_name_scope() 31 | def pairwise_intersection(boxlist1, boxlist2): 32 | """Compute pairwise intersection areas between boxes. 
33 | 34 | Args: 35 | boxlist1: Nx4 floatbox 36 | boxlist2: Mx4 37 | 38 | Returns: 39 | a tensor with shape [N, M] representing pairwise intersections 40 | """ 41 | x_min1, y_min1, x_max1, y_max1 = tf.split(boxlist1, 4, axis=1) 42 | x_min2, y_min2, x_max2, y_max2 = tf.split(boxlist2, 4, axis=1) 43 | all_pairs_min_ymax = tf.minimum(y_max1, tf.transpose(y_max2)) 44 | all_pairs_max_ymin = tf.maximum(y_min1, tf.transpose(y_min2)) 45 | intersect_heights = tf.maximum(0.0, all_pairs_min_ymax - all_pairs_max_ymin) 46 | all_pairs_min_xmax = tf.minimum(x_max1, tf.transpose(x_max2)) 47 | all_pairs_max_xmin = tf.maximum(x_min1, tf.transpose(x_min2)) 48 | intersect_widths = tf.maximum(0.0, all_pairs_min_xmax - all_pairs_max_xmin) 49 | return intersect_heights * intersect_widths 50 | 51 | 52 | @under_name_scope() 53 | def pairwise_iou(boxlist1, boxlist2): 54 | """Computes pairwise intersection-over-union between box collections. 55 | 56 | Args: 57 | boxlist1: Nx4 floatbox 58 | boxlist2: Mx4 59 | 60 | Returns: 61 | a tensor with shape [N, M] representing pairwise iou scores. 62 | """ 63 | intersections = pairwise_intersection(boxlist1, boxlist2) 64 | areas1 = area(boxlist1) 65 | areas2 = area(boxlist2) 66 | unions = ( 67 | tf.expand_dims(areas1, 1) + tf.expand_dims(areas2, 0) - intersections) 68 | return tf.where( 69 | tf.equal(intersections, 0.0), 70 | tf.zeros_like(intersections), tf.truediv(intersections, unions)) 71 | 72 | 73 | 74 | @under_name_scope() 75 | def pairwise_iou_batch(proposal_boxes, gt_boxes, orig_gt_counts, batch_size): 76 | """Computes pairwise intersection-over-union between box collections. 77 | Args: 78 | proposal_boxes: K x 5 (batch_index, x1, y1, x2, y2) 79 | gt_boxes: BS x MaxNumGTs x 4 80 | orig_gt_counts: BS 81 | Returns: 82 | list of length BS, each element is output of pairwise_iou: N x M 83 | (where N is number of boxes for image and M is number of GTs for image) 84 | """ 85 | 86 | prefix = "pairwise_iou_batch" 87 | 88 | # For each image index, extract a ?x4 boxlist and gt_boxlist 89 | 90 | per_images_iou = [] 91 | for batch_idx in range(batch_size): 92 | 93 | box_mask_for_image = tf.equal(proposal_boxes[:, 0], batch_idx) 94 | 95 | single_image_boxes = tf.boolean_mask(proposal_boxes, box_mask_for_image) 96 | single_image_boxes = single_image_boxes[:, 1:] 97 | single_image_gt_boxes = gt_boxes[batch_idx, 0:orig_gt_counts[batch_idx], :] 98 | single_image_iou = pairwise_iou(single_image_boxes, single_image_gt_boxes) 99 | 100 | per_images_iou.append(single_image_iou) 101 | 102 | return per_images_iou 103 | -------------------------------------------------------------------------------- /MaskRCNN/utils/generate_anchors.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | # https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/rpn/generate_anchors.py 4 | 5 | # -------------------------------------------------------- 6 | # Faster R-CNN 7 | # Copyright (c) 2015 Microsoft 8 | # Licensed under The MIT License [see LICENSE for details] 9 | # Written by Ross Girshick and Sean Bell 10 | # -------------------------------------------------------- 11 | 12 | import numpy as np 13 | from six.moves import range 14 | 15 | # Verify that we compute the same anchors as Shaoqing's matlab implementation: 16 | # 17 | # >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat 18 | # >> anchors 19 | # 20 | # anchors = 21 | # 22 | # -83 -39 100 56 23 | # -175 -87 192 104 24 | # -359 -183 376 200 25 | # -55 -55 72 72 26 | # -119 -119 136 136 27 | # -247 -247 264 264 28 | # -35 -79 52 96 29 | # -79 -167 96 184 30 | # -167 -343 184 360 31 | 32 | # array([[ -83., -39., 100., 56.], 33 | # [-175., -87., 192., 104.], 34 | # [-359., -183., 376., 200.], 35 | # [ -55., -55., 72., 72.], 36 | # [-119., -119., 136., 136.], 37 | # [-247., -247., 264., 264.], 38 | # [ -35., -79., 52., 96.], 39 | # [ -79., -167., 96., 184.], 40 | # [-167., -343., 184., 360.]]) 41 | 42 | 43 | def generate_anchors(base_size=16, ratios=[0.5, 1, 2], 44 | scales=2**np.arange(3, 6)): 45 | """ 46 | Generate anchor (reference) windows by enumerating aspect ratios X 47 | scales wrt a reference (0, 0, 15, 15) window. 48 | """ 49 | 50 | base_anchor = np.array([1, 1, base_size, base_size], dtype='float32') - 1 51 | ratio_anchors = _ratio_enum(base_anchor, ratios) 52 | anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) 53 | for i in range(ratio_anchors.shape[0])]) 54 | return anchors 55 | 56 | 57 | def _whctrs(anchor): 58 | """ 59 | Return width, height, x center, and y center for an anchor (window). 60 | """ 61 | 62 | w = anchor[2] - anchor[0] + 1 63 | h = anchor[3] - anchor[1] + 1 64 | x_ctr = anchor[0] + 0.5 * (w - 1) 65 | y_ctr = anchor[1] + 0.5 * (h - 1) 66 | return w, h, x_ctr, y_ctr 67 | 68 | 69 | def _mkanchors(ws, hs, x_ctr, y_ctr): 70 | """ 71 | Given a vector of widths (ws) and heights (hs) around a center 72 | (x_ctr, y_ctr), output a set of anchors (windows). 73 | """ 74 | 75 | ws = ws[:, np.newaxis] 76 | hs = hs[:, np.newaxis] 77 | anchors = np.hstack((x_ctr - 0.5 * (ws - 1), 78 | y_ctr - 0.5 * (hs - 1), 79 | x_ctr + 0.5 * (ws - 1), 80 | y_ctr + 0.5 * (hs - 1))) 81 | return anchors 82 | 83 | 84 | def _ratio_enum(anchor, ratios): 85 | """ 86 | Enumerate a set of anchors for each aspect ratio wrt an anchor. 87 | """ 88 | 89 | w, h, x_ctr, y_ctr = _whctrs(anchor) 90 | size = w * h 91 | size_ratios = size / ratios 92 | ws = np.round(np.sqrt(size_ratios)) 93 | hs = np.round(ws * ratios) 94 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 95 | return anchors 96 | 97 | 98 | def _scale_enum(anchor, scales): 99 | """ 100 | Enumerate a set of anchors for each scale wrt an anchor. 101 | """ 102 | 103 | w, h, x_ctr, y_ctr = _whctrs(anchor) 104 | ws = w * scales 105 | hs = h * scales 106 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 107 | return anchors 108 | -------------------------------------------------------------------------------- /tensorpack/dataflow/imgaug/paste.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: paste.py 5 | 6 | 7 | import numpy as np 8 | from abc import abstractmethod 9 | 10 | from .base import ImageAugmentor 11 | 12 | __all__ = ['CenterPaste', 'BackgroundFiller', 'ConstantBackgroundFiller', 13 | 'RandomPaste'] 14 | 15 | 16 | class BackgroundFiller(object): 17 | """ Base class for all BackgroundFiller""" 18 | 19 | def fill(self, background_shape, img): 20 | """ 21 | Return a proper background image of background_shape, given img. 22 | 23 | Args: 24 | background_shape (tuple): a shape (h, w) 25 | img: an image 26 | Returns: 27 | a background image 28 | """ 29 | background_shape = tuple(background_shape) 30 | return self._fill(background_shape, img) 31 | 32 | @abstractmethod 33 | def _fill(self, background_shape, img): 34 | pass 35 | 36 | 37 | class ConstantBackgroundFiller(BackgroundFiller): 38 | """ Fill the background by a constant """ 39 | 40 | def __init__(self, value): 41 | """ 42 | Args: 43 | value (float): the value to fill the background. 44 | """ 45 | self.value = value 46 | 47 | def _fill(self, background_shape, img): 48 | assert img.ndim in [3, 2] 49 | if img.ndim == 3: 50 | return_shape = background_shape + (img.shape[2],) 51 | else: 52 | return_shape = background_shape 53 | return np.zeros(return_shape, dtype=img.dtype) + self.value 54 | 55 | 56 | class CenterPaste(ImageAugmentor): 57 | """ 58 | Paste the image onto the center of a background canvas. 59 | """ 60 | 61 | def __init__(self, background_shape, background_filler=None): 62 | """ 63 | Args: 64 | background_shape (tuple): shape of the background canvas. 65 | background_filler (BackgroundFiller): How to fill the background. Defaults to zero-filler. 66 | """ 67 | if background_filler is None: 68 | background_filler = ConstantBackgroundFiller(0) 69 | 70 | self._init(locals()) 71 | 72 | def _augment(self, img, _): 73 | img_shape = img.shape[:2] 74 | assert self.background_shape[0] >= img_shape[0] and self.background_shape[1] >= img_shape[1] 75 | 76 | background = self.background_filler.fill( 77 | self.background_shape, img) 78 | y0 = int((self.background_shape[0] - img_shape[0]) * 0.5) 79 | x0 = int((self.background_shape[1] - img_shape[1]) * 0.5) 80 | background[y0:y0 + img_shape[0], x0:x0 + img_shape[1]] = img 81 | return background 82 | 83 | def _augment_coords(self, coords, param): 84 | raise NotImplementedError() 85 | 86 | 87 | class RandomPaste(CenterPaste): 88 | """ 89 | Randomly paste the image onto a background canvas. 90 | """ 91 | 92 | def _get_augment_params(self, img): 93 | img_shape = img.shape[:2] 94 | assert self.background_shape[0] > img_shape[0] and self.background_shape[1] > img_shape[1] 95 | 96 | y0 = self._rand_range(self.background_shape[0] - img_shape[0]) 97 | x0 = self._rand_range(self.background_shape[1] - img_shape[1]) 98 | return int(x0), int(y0) 99 | 100 | def _augment(self, img, loc): 101 | x0, y0 = loc 102 | img_shape = img.shape[:2] 103 | background = self.background_filler.fill( 104 | self.background_shape, img) 105 | background[y0:y0 + img_shape[0], x0:x0 + img_shape[1]] = img 106 | return background 107 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. 
Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check [existing open](https://github.com/armandmcqueen/tensorpack-mask-rcnn/issues), or [recently closed](https://github.com/armandmcqueen/tensorpack-mask-rcnn/issues?q=is%3Aissue+is%3Aclosed), issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/armandmcqueen/tensorpack-mask-rcnn/labels/help%20wanted) issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 
55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](https://github.com/aws-samples/amazon-sagemaker-script-mode/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | 61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 62 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/templates/maskrcnn.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1alpha1 2 | kind: MPIJob 3 | metadata: 4 | name: {{ .Values.global.name }} 5 | namespace: {{ .Values.global.namespace }} 6 | labels: 7 | app.kubernetes.io/name: {{ .Values.global.name }} 8 | app.kubernetes.io/instance: {{ .Release.Name }} 9 | app.kubernetes.io/managed-by: {{ .Release.Service }} 10 | spec: 11 | gpus: {{ .Values.maskrcnn.gpus }} 12 | template: 13 | spec: 14 | restartPolicy: Never 15 | volumes: 16 | - name: {{ .Values.maskrcnn.shared_fs }} 17 | persistentVolumeClaim: 18 | claimName: {{ .Values.maskrcnn.shared_pvc }} 19 | - name: ebs 20 | hostPath: 21 | path: /ebs 22 | type: DirectoryOrCreate 23 | containers: 24 | - name: {{ .Values.global.name }} 25 | env: 26 | - name: HOROVOD_CYCLE_TIME 27 | value: "{{ .Values.maskrcnn.horovod_cycle_time }}" 28 | - name: HOROVOD_FUSION_THRESHOLD 29 | value: "{{ .Values.maskrcnn.horovod_fusion_threshold }}" 30 | - name: NCCL_SOCKET_IFNAME 31 | value: "{{ .Values.maskrcnn.nccl_socket_ifname }}" 32 | - name: NCCL_MIN_NRINGS 33 | value: "{{ .Values.maskrcnn.nccl_min_rings }}" 34 | - name: NCCL_DEBUG 35 | value: "{{ .Values.maskrcnn.nccl_debug }}" 36 | - name: TENSORPACK_FP16 37 | value: "{{ .Values.maskrcnn.fp_16 }}" 38 | command: 39 | - mpirun 40 | workingDir: {{ .Values.maskrcnn.working_dir }} 41 | args: 42 | - --output-filename 43 | - /{{ .Values.maskrcnn.shared_fs }}/logs/{{ .Values.maskrcnn.experiment_group }}/{{ .Release.Name }} 44 | - --allow-run-as-root 45 | - --display-map 46 | - --tag-output 47 | - --timestamp-output 48 | - python3 49 | - {{ .Values.maskrcnn.train_script }} 50 | - --logdir 51 | - /{{ .Values.maskrcnn.shared_fs }}/logs/{{ .Values.maskrcnn.experiment_group }}/{{ .Release.Name }}/train_log/ 52 | - --fp16 53 | - --images_per_epoch 54 | - "{{ .Values.maskrcnn.images_per_epoch }}" 55 | - --config 56 | - MODE_MASK={{ .Values.maskrcnn.mode_mask }} 57 | - MODE_FPN={{ .Values.maskrcnn.mode_fpn }} 58 | - DATA.BASEDIR=/{{ .Values.maskrcnn.data_fs }}/{{ .Values.maskrcnn.data_dir }} 59 | - DATA.TRAIN={{ .Values.maskrcnn.data_train }} 60 | - DATA.VAL={{ .Values.maskrcnn.data_val }} 61 | - TRAIN.GRADIENT_CLIP={{ .Values.maskrcnn.gradient_clip }} 62 | - TRAIN.BATCH_SIZE_PER_GPU={{ .Values.maskrcnn.batch_size_per_gpu }} 63 | - TRAIN.EVAL_PERIOD={{ .Values.maskrcnn.eval_period_in_epochs }} 64 | - TRAIN.BASE_LR={{ .Values.maskrcnn.base_lr }} 65 | - TRAIN.WARMUP_INIT_LR={{ .Values.maskrcnn.warmup_lr }} 66 | - TRAIN.LR_EPOCH_SCHEDULE={{ .Values.maskrcnn.lr_epoch_schedule }} 67 | - RPN.TOPK_PER_IMAGE={{ .Values.maskrcnn.topk_per_image }} 68 | - PREPROC.PREDEFINED_PADDING={{ .Values.maskrcnn.predefined_padding }} 69 | - FRCNN.BBOX_REG_WEIGHTS={{ .Values.maskrcnn.bbox_reg_weights }} 70 | - TEST.RESULT_SCORE_THRESH={{ .Values.maskrcnn.result_score_thresh }} 71 | - BACKBONE.WEIGHTS=/{{ .Values.maskrcnn.data_fs }}/{{ .Values.maskrcnn.backbone_weights }} 72 | - BACKBONE.NORM={{ .Values.maskrcnn.backbone_norm }} 73 | - 
TRAINER=horovod 74 | image: {{ .Values.maskrcnn.image }} 75 | imagePullPolicy: {{ .Values.maskrcnn.image_pull_policy }} 76 | volumeMounts: 77 | - mountPath: /{{ .Values.maskrcnn.shared_fs }} 78 | name: {{ .Values.maskrcnn.shared_fs }} 79 | - mountPath: /ebs 80 | name: ebs 81 | resources: 82 | limits: 83 | nvidia.com/gpu: {{ .Values.maskrcnn.gpus_per_node }} 84 | -------------------------------------------------------------------------------- /tensorpack/utils/rect.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: rect.py 5 | 6 | 7 | import numpy as np 8 | 9 | from .develop import log_deprecated 10 | 11 | __all__ = ['IntBox', 'FloatBox'] 12 | 13 | 14 | class BoxBase(object): 15 | __slots__ = ['x1', 'y1', 'x2', 'y2'] 16 | 17 | def __init__(self, x1, y1, x2, y2): 18 | log_deprecated("IntBox and FloatBox", "Please implement them by your own.", "2019-02-28") 19 | self.x1 = x1 20 | self.y1 = y1 21 | self.x2 = x2 22 | self.y2 = y2 23 | 24 | def copy(self): 25 | new = type(self)() 26 | for i in self.__slots__: 27 | setattr(new, i, getattr(self, i)) 28 | return new 29 | 30 | def __str__(self): 31 | return '{}(x1={}, y1={}, x2={}, y2={})'.format( 32 | type(self).__name__, self.x1, self.y1, self.x2, self.y2) 33 | 34 | __repr__ = __str__ 35 | 36 | def area(self): 37 | return self.w * self.h 38 | 39 | def is_box(self): 40 | return self.w > 0 and self.h > 0 41 | 42 | def to_list(self): 43 | return [self.x1, self.y1, self.x2, self.y2] 44 | 45 | 46 | class IntBox(BoxBase): 47 | def __init__(self, x1, y1, x2, y2): 48 | for k in [x1, y1, x2, y2]: 49 | assert isinstance(k, int) 50 | super(IntBox, self).__init__(x1, y1, x2, y2) 51 | 52 | @property 53 | def w(self): 54 | return self.x2 - self.x1 + 1 55 | 56 | @property 57 | def h(self): 58 | return self.y2 - self.y1 + 1 59 | 60 | def is_valid_box(self, shape): 61 | """ 62 | Check that this rect is a valid bounding box within this shape. 63 | 64 | Args: 65 | shape: int [h, w] or None. 66 | Returns: 67 | bool 68 | """ 69 | if min(self.x1, self.y1) < 0: 70 | return False 71 | if min(self.w, self.h) <= 0: 72 | return False 73 | if self.x2 >= shape[1]: 74 | return False 75 | if self.y2 >= shape[0]: 76 | return False 77 | return True 78 | 79 | def clip_by_shape(self, shape): 80 | """ 81 | Clip xs and ys to be valid coordinates inside shape 82 | 83 | Args: 84 | shape: int [h, w] or None. 
85 | """ 86 | self.x1 = np.clip(self.x1, 0, shape[1] - 1) 87 | self.x2 = np.clip(self.x2, 0, shape[1] - 1) 88 | self.y1 = np.clip(self.y1, 0, shape[0] - 1) 89 | self.y2 = np.clip(self.y2, 0, shape[0] - 1) 90 | 91 | def roi(self, img): 92 | assert self.is_valid_box(img.shape[:2]), "{} vs {}".format(self, img.shape[:2]) 93 | return img[self.y1:self.y2 + 1, self.x1:self.x2 + 1] 94 | 95 | 96 | class FloatBox(BoxBase): 97 | def __init__(self, x1, y1, x2, y2): 98 | for k in [x1, y1, x2, y2]: 99 | assert isinstance(k, float), "type={},value={}".format(type(k), k) 100 | super(FloatBox, self).__init__(x1, y1, x2, y2) 101 | 102 | @property 103 | def w(self): 104 | return self.x2 - self.x1 105 | 106 | @property 107 | def h(self): 108 | return self.y2 - self.y1 109 | 110 | @staticmethod 111 | def from_intbox(intbox): 112 | return FloatBox(intbox.x1, intbox.y1, 113 | intbox.x2 + 1, intbox.y2 + 1) 114 | 115 | def clip_by_shape(self, shape): 116 | self.x1 = np.clip(self.x1, 0, shape[1]) 117 | self.x2 = np.clip(self.x2, 0, shape[1]) 118 | self.y1 = np.clip(self.y1, 0, shape[0]) 119 | self.y2 = np.clip(self.y2, 0, shape[0]) 120 | 121 | 122 | if __name__ == '__main__': 123 | x = IntBox(2, 1, 3, 3) 124 | img = np.random.rand(3, 3) 125 | print(img) 126 | -------------------------------------------------------------------------------- /tensorpack/utils/fs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: fs.py 5 | 6 | 7 | import errno 8 | import os 9 | import tqdm 10 | from six.moves import urllib 11 | 12 | from . import logger 13 | from .utils import execute_only_once 14 | 15 | __all__ = ['mkdir_p', 'download', 'recursive_walk', 'get_dataset_path'] 16 | 17 | 18 | def mkdir_p(dirname): 19 | """ Like "mkdir -p", make a dir recursively, but do nothing if the dir exists 20 | 21 | Args: 22 | dirname(str): 23 | """ 24 | assert dirname is not None 25 | if dirname == '' or os.path.isdir(dirname): 26 | return 27 | try: 28 | os.makedirs(dirname) 29 | except OSError as e: 30 | if e.errno != errno.EEXIST: 31 | raise e 32 | 33 | 34 | def download(url, dir, filename=None, expect_size=None): 35 | """ 36 | Download URL to a directory. 37 | Will figure out the filename automatically from URL, if not given. 38 | """ 39 | mkdir_p(dir) 40 | if filename is None: 41 | filename = url.split('/')[-1] 42 | fpath = os.path.join(dir, filename) 43 | 44 | if os.path.isfile(fpath): 45 | if expect_size is not None and os.stat(fpath).st_size == expect_size: 46 | logger.info("File {} exists! Skip download.".format(filename)) 47 | return fpath 48 | else: 49 | logger.warn("File {} exists. 
Will overwrite with a new download!".format(filename)) 50 | 51 | def hook(t): 52 | last_b = [0] 53 | 54 | def inner(b, bsize, tsize=None): 55 | if tsize is not None: 56 | t.total = tsize 57 | t.update((b - last_b[0]) * bsize) 58 | last_b[0] = b 59 | return inner 60 | try: 61 | with tqdm.tqdm(unit='B', unit_scale=True, miniters=1, desc=filename) as t: 62 | fpath, _ = urllib.request.urlretrieve(url, fpath, reporthook=hook(t)) 63 | statinfo = os.stat(fpath) 64 | size = statinfo.st_size 65 | except IOError: 66 | logger.error("Failed to download {}".format(url)) 67 | raise 68 | assert size > 0, "Downloaded an empty file from {}!".format(url) 69 | 70 | if expect_size is not None and size != expect_size: 71 | logger.error("File downloaded from {} does not match the expected size!".format(url)) 72 | logger.error("You may have downloaded a broken file, or the upstream may have modified the file.") 73 | 74 | # TODO human-readable size 75 | logger.info('Succesfully downloaded ' + filename + ". " + str(size) + ' bytes.') 76 | return fpath 77 | 78 | 79 | def recursive_walk(rootdir): 80 | """ 81 | Yields: 82 | str: All files in rootdir, recursively. 83 | """ 84 | for r, dirs, files in os.walk(rootdir): 85 | for f in files: 86 | yield os.path.join(r, f) 87 | 88 | 89 | def get_dataset_path(*args): 90 | """ 91 | Get the path to some dataset under ``$TENSORPACK_DATASET``. 92 | 93 | Args: 94 | args: strings to be joined to form path. 95 | 96 | Returns: 97 | str: path to the dataset. 98 | """ 99 | d = os.environ.get('TENSORPACK_DATASET', None) 100 | if d is None: 101 | d = os.path.join(os.path.expanduser('~'), 'tensorpack_data') 102 | if execute_only_once(): 103 | logger.warn("Env var $TENSORPACK_DATASET not set, using {} for datasets.".format(d)) 104 | if not os.path.isdir(d): 105 | mkdir_p(d) 106 | logger.info("Created the directory {}.".format(d)) 107 | assert os.path.isdir(d), d 108 | return os.path.join(d, *args) 109 | 110 | 111 | if __name__ == '__main__': 112 | download('http://dl.caffe.berkeleyvision.org/caffe_ilsvrc12.tar.gz', '.') 113 | -------------------------------------------------------------------------------- /MaskRCNN/utils/np_box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | """Operations for [N, 4] numpy arrays representing bounding boxes. 19 | 20 | Example box operations that are supported: 21 | * Areas: compute bounding box areas 22 | * IOU: pairwise intersection-over-union scores 23 | """ 24 | import numpy as np 25 | 26 | 27 | def area(boxes): 28 | """Computes area of boxes. 
29 | 30 | Args: 31 | boxes: Numpy array with shape [N, 4] holding N boxes 32 | 33 | Returns: 34 | a numpy array with shape [N] representing box areas 35 | """ 36 | return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) 37 | 38 | 39 | def intersection(boxes1, boxes2): 40 | """Compute pairwise intersection areas between boxes. 41 | 42 | Args: 43 | boxes1: a numpy array with shape [N, 4] holding N boxes 44 | boxes2: a numpy array with shape [M, 4] holding M boxes 45 | 46 | Returns: 47 | a numpy array with shape [N, M] representing pairwise intersection areas 48 | """ 49 | [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1) 50 | [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1) 51 | 52 | all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2)) 53 | all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2)) 54 | intersect_heights = np.maximum( 55 | np.zeros(all_pairs_max_ymin.shape, dtype='f4'), 56 | all_pairs_min_ymax - all_pairs_max_ymin) 57 | all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2)) 58 | all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2)) 59 | intersect_widths = np.maximum( 60 | np.zeros(all_pairs_max_xmin.shape, dtype='f4'), 61 | all_pairs_min_xmax - all_pairs_max_xmin) 62 | return intersect_heights * intersect_widths 63 | 64 | 65 | def iou(boxes1, boxes2): 66 | """Computes pairwise intersection-over-union between box collections. 67 | 68 | Args: 69 | boxes1: a numpy array with shape [N, 4] holding N boxes. 70 | boxes2: a numpy array with shape [M, 4] holding M boxes. 71 | 72 | Returns: 73 | a numpy array with shape [N, M] representing pairwise iou scores. 74 | """ 75 | intersect = intersection(boxes1, boxes2) 76 | area1 = area(boxes1) 77 | area2 = area(boxes2) 78 | union = np.expand_dims(area1, axis=1) + np.expand_dims( 79 | area2, axis=0) - intersect 80 | return intersect / union 81 | 82 | 83 | def ioa(boxes1, boxes2): 84 | """Computes pairwise intersection-over-area between box collections. 85 | 86 | Intersection-over-area (ioa) between two boxes box1 and box2 is defined as 87 | their intersection area over box2's area. Note that ioa is not symmetric, 88 | that is, IOA(box1, box2) != IOA(box2, box1). 89 | 90 | Args: 91 | boxes1: a numpy array with shape [N, 4] holding N boxes. 92 | boxes2: a numpy array with shape [M, 4] holding M boxes. 93 | 94 | Returns: 95 | a numpy array with shape [N, M] representing pairwise ioa scores. 96 | """ 97 | intersect = intersection(boxes1, boxes2) 98 | inv_areas = np.expand_dims(1.0 / area(boxes2), axis=0) 99 | return intersect * inv_areas 100 | -------------------------------------------------------------------------------- /tensorpack/tfutils/varreplace.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: varreplace.py 5 | # Credit: Qinyao He 6 | 7 | from contextlib import contextmanager 8 | import tensorflow as tf 9 | 10 | from .common import get_tf_version_tuple 11 | 12 | __all__ = ['custom_getter_scope', 'freeze_variables', 'remap_variables'] 13 | 14 | 15 | @contextmanager 16 | def custom_getter_scope(custom_getter): 17 | """ 18 | Args: 19 | custom_getter: the same as in :func:`tf.get_variable` 20 | 21 | Returns: 22 | The current variable scope with a custom_getter.
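    Example (illustrative only; the getter below simply logs every variable it
    returns, and the layer name is hypothetical):

    .. code-block:: python

        def logging_getter(getter, *args, **kwargs):
            v = getter(*args, **kwargs)
            print("created/reused variable:", v.name)
            return v

        with custom_getter_scope(logging_getter):
            x = FullyConnected('fc', x, 1000)   # prints fc/W, fc/b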
23 | """ 24 | scope = tf.get_variable_scope() 25 | if get_tf_version_tuple() >= (1, 5): 26 | with tf.variable_scope( 27 | scope, custom_getter=custom_getter, 28 | auxiliary_name_scope=False): 29 | yield 30 | else: 31 | ns = tf.get_default_graph().get_name_scope() 32 | with tf.variable_scope( 33 | scope, custom_getter=custom_getter): 34 | with tf.name_scope(ns + '/' if ns else ''): 35 | yield 36 | 37 | 38 | def remap_variables(fn): 39 | """ 40 | Use fn to map the output of any variable getter. 41 | 42 | Args: 43 | fn (tf.Variable -> tf.Tensor) 44 | 45 | Returns: 46 | The current variable scope with a custom_getter that maps 47 | all the variables by fn. 48 | 49 | Example: 50 | .. code-block:: python 51 | 52 | with varreplace.remap_variables(lambda var: quantize(var)): 53 | x = FullyConnected('fc', x, 1000) # fc/{W,b} will be quantized 54 | """ 55 | def custom_getter(getter, *args, **kwargs): 56 | v = getter(*args, **kwargs) 57 | return fn(v) 58 | return custom_getter_scope(custom_getter) 59 | 60 | 61 | def freeze_variables(stop_gradient=True, skip_collection=False): 62 | """ 63 | Return a context to freeze variables, 64 | by wrapping ``tf.get_variable`` with a custom getter. 65 | It works by either applying ``tf.stop_gradient`` on the variables, 66 | or by keeping them out of the ``TRAINABLE_VARIABLES`` collection, or 67 | both. 68 | 69 | Example: 70 | .. code-block:: python 71 | 72 | with varreplace.freeze_variable(stop_gradient=False, skip_collection=True): 73 | x = FullyConnected('fc', x, 1000) # fc/* will not be trained 74 | 75 | Args: 76 | stop_gradient (bool): if True, variables returned from `get_variable` 77 | will be wrapped with `tf.stop_gradient` and therefore has no 78 | gradient when used later. 79 | Note that the created variables may still have gradient when accessed 80 | by other approaches (e.g. by name, or by collection). 81 | Also note that this makes `tf.get_variable` returns a Tensor instead of a Variable, 82 | which may break existing code. 83 | Therefore, it's recommended to use the `skip_collection` option instead. 84 | skip_collection (bool): if True, do not add the variable to 85 | ``TRAINABLE_VARIABLES`` collection, but to ``MODEL_VARIABLES`` 86 | collection. As a result they will not be trained by default. 87 | """ 88 | def custom_getter(getter, *args, **kwargs): 89 | trainable = kwargs.get('trainable', True) 90 | name = args[0] if len(args) else kwargs.get('name') 91 | if skip_collection: 92 | kwargs['trainable'] = False 93 | v = getter(*args, **kwargs) 94 | if skip_collection: 95 | tf.add_to_collection(tf.GraphKeys.MODEL_VARIABLES, v) 96 | if trainable and stop_gradient: 97 | v = tf.stop_gradient(v, name='freezed_' + name) 98 | return v 99 | return custom_getter_scope(custom_getter) 100 | -------------------------------------------------------------------------------- /tensorpack/callbacks/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: misc.py 5 | 6 | 7 | import numpy as np 8 | import os 9 | import time 10 | from collections import deque 11 | 12 | from ..utils import logger 13 | from ..utils.utils import humanize_time_delta 14 | from .base import Callback 15 | 16 | __all__ = ['SendStat', 'InjectShell', 'EstimatedTimeLeft'] 17 | 18 | 19 | class SendStat(Callback): 20 | """ An equivalent of :class:`SendMonitorData`, but as a normal callback. 
""" 21 | def __init__(self, command, names): 22 | self.command = command 23 | if not isinstance(names, list): 24 | names = [names] 25 | self.names = names 26 | 27 | def _trigger(self): 28 | M = self.trainer.monitors 29 | v = {k: M.get_latest(k) for k in self.names} 30 | cmd = self.command.format(**v) 31 | ret = os.system(cmd) 32 | if ret != 0: 33 | logger.error("Command {} failed with ret={}!".format(cmd, ret)) 34 | 35 | 36 | class InjectShell(Callback): 37 | """ 38 | Allow users to create a specific file as a signal to pause 39 | and iteratively debug the training. 40 | Once the :meth:`trigger` method is called, it detects whether the file exists, and opens an 41 | IPython/pdb shell if yes. 42 | In the shell, `self` is this callback, `self.trainer` is the trainer, and 43 | from that you can access everything else. 44 | 45 | Example: 46 | 47 | .. code-block:: none 48 | 49 | callbacks=[InjectShell('/path/to/pause-training.tmp'), ...] 50 | 51 | # the following command will pause the training when the epoch finishes: 52 | $ touch /path/to/pause-training.tmp 53 | 54 | """ 55 | 56 | def __init__(self, file='INJECT_SHELL.tmp', shell='ipython'): 57 | """ 58 | Args: 59 | file (str): if this file exists, will open a shell. 60 | shell (str): one of 'ipython', 'pdb' 61 | """ 62 | self._file = file 63 | assert shell in ['ipython', 'pdb'] 64 | self._shell = shell 65 | logger.info("Create a file '{}' to open {} shell.".format(file, shell)) 66 | 67 | def _trigger(self): 68 | if os.path.isfile(self._file): 69 | logger.info("File {} exists, entering shell.".format(self._file)) 70 | self._inject() 71 | 72 | def _inject(self): 73 | trainer = self.trainer # noqa 74 | if self._shell == 'ipython': 75 | import IPython as IP # noqa 76 | IP.embed() 77 | elif self._shell == 'pdb': 78 | import pdb # noqa 79 | pdb.set_trace() 80 | 81 | def _after_train(self): 82 | if os.path.isfile(self._file): 83 | os.unlink(self._file) 84 | 85 | 86 | class EstimatedTimeLeft(Callback): 87 | """ 88 | Estimate the time left until completion of training. 89 | """ 90 | def __init__(self, last_k_epochs=5, median=False): 91 | """ 92 | Args: 93 | last_k_epochs (int): Use the time spent on last k epochs to estimate total time left. 94 | median (bool): Use mean by default. If True, use the median time spent on last k epochs. 95 | """ 96 | self._times = deque(maxlen=last_k_epochs) 97 | self._median = median 98 | 99 | def _before_train(self): 100 | self._max_epoch = self.trainer.max_epoch 101 | self._last_time = time.time() 102 | 103 | def _trigger_epoch(self): 104 | duration = time.time() - self._last_time 105 | self._last_time = time.time() 106 | self._times.append(duration) 107 | 108 | epoch_time = np.median(self._times) if self._median else np.mean(self._times) 109 | time_left = (self._max_epoch - self.epoch_num) * epoch_time 110 | if time_left > 0: 111 | logger.info("Estimated Time Left: " + humanize_time_delta(time_left)) 112 | -------------------------------------------------------------------------------- /tensorpack/models/layer_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: layer_norm.py 5 | 6 | 7 | import tensorflow as tf 8 | 9 | from ..utils.argtools import get_data_format 10 | from .common import VariableHolder, layer_register 11 | 12 | __all__ = ['LayerNorm', 'InstanceNorm'] 13 | 14 | 15 | @layer_register() 16 | def LayerNorm( 17 | x, epsilon=1e-5, 18 | use_bias=True, use_scale=True, 19 | gamma_init=None, data_format='channels_last'): 20 | """ 21 | Layer Normalization layer, as described in the paper: 22 | `Layer Normalization `_. 23 | 24 | Args: 25 | x (tf.Tensor): a 4D or 2D tensor. When 4D, the layout should match data_format. 26 | epsilon (float): epsilon to avoid divide-by-zero. 27 | use_scale, use_bias (bool): whether to use the extra affine transformation or not. 28 | """ 29 | data_format = get_data_format(data_format, tfmode=False) 30 | shape = x.get_shape().as_list() 31 | ndims = len(shape) 32 | assert ndims in [2, 4] 33 | 34 | mean, var = tf.nn.moments(x, list(range(1, len(shape))), keep_dims=True) 35 | 36 | if data_format == 'NCHW': 37 | chan = shape[1] 38 | new_shape = [1, chan, 1, 1] 39 | else: 40 | chan = shape[-1] 41 | new_shape = [1, 1, 1, chan] 42 | if ndims == 2: 43 | new_shape = [1, chan] 44 | 45 | if use_bias: 46 | beta = tf.get_variable('beta', [chan], initializer=tf.constant_initializer()) 47 | beta = tf.reshape(beta, new_shape) 48 | else: 49 | beta = tf.zeros([1] * ndims, name='beta') 50 | if use_scale: 51 | if gamma_init is None: 52 | gamma_init = tf.constant_initializer(1.0) 53 | gamma = tf.get_variable('gamma', [chan], initializer=gamma_init) 54 | gamma = tf.reshape(gamma, new_shape) 55 | else: 56 | gamma = tf.ones([1] * ndims, name='gamma') 57 | 58 | ret = tf.nn.batch_normalization(x, mean, var, beta, gamma, epsilon, name='output') 59 | 60 | vh = ret.variables = VariableHolder() 61 | if use_scale: 62 | vh.gamma = gamma 63 | if use_bias: 64 | vh.beta = beta 65 | return ret 66 | 67 | 68 | @layer_register() 69 | def InstanceNorm(x, epsilon=1e-5, use_affine=True, gamma_init=None, data_format='channels_last'): 70 | """ 71 | Instance Normalization, as in the paper: 72 | `Instance Normalization: The Missing Ingredient for Fast Stylization 73 | `_. 74 | 75 | Args: 76 | x (tf.Tensor): a 4D tensor. 77 | epsilon (float): avoid divide-by-zero 78 | use_affine (bool): whether to apply learnable affine transformation 79 | """ 80 | data_format = get_data_format(data_format, tfmode=False) 81 | shape = x.get_shape().as_list() 82 | assert len(shape) == 4, "Input of InstanceNorm has to be 4D!" 83 | 84 | if data_format == 'NHWC': 85 | axis = [1, 2] 86 | ch = shape[3] 87 | new_shape = [1, 1, 1, ch] 88 | else: 89 | axis = [2, 3] 90 | ch = shape[1] 91 | new_shape = [1, ch, 1, 1] 92 | assert ch is not None, "Input of InstanceNorm require known channel!" 
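    # Statistics are computed per sample and per channel, over the spatial axes only;
    # the input is then normalized as (x - mean) / sqrt(var + epsilon), and the optional
    # learnable per-channel affine transform (gamma, beta) is applied below.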
93 | 94 | mean, var = tf.nn.moments(x, axis, keep_dims=True) 95 | 96 | if not use_affine: 97 | return tf.divide(x - mean, tf.sqrt(var + epsilon), name='output') 98 | 99 | beta = tf.get_variable('beta', [ch], initializer=tf.constant_initializer()) 100 | beta = tf.reshape(beta, new_shape) 101 | if gamma_init is None: 102 | gamma_init = tf.constant_initializer(1.0) 103 | gamma = tf.get_variable('gamma', [ch], initializer=gamma_init) 104 | gamma = tf.reshape(gamma, new_shape) 105 | ret = tf.nn.batch_normalization(x, mean, var, beta, gamma, epsilon, name='output') 106 | 107 | vh = ret.variables = VariableHolder() 108 | if use_affine: 109 | vh.gamma = gamma 110 | vh.beta = beta 111 | return ret 112 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/charts/mpi-operator/templates/mpi-operator.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 5 | labels: 6 | app.kubernetes.io/name: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 7 | app.kubernetes.io/instance: {{ .Release.Name }} 8 | app.kubernetes.io/managed-by: {{ .Release.Service }} 9 | rules: 10 | - apiGroups: 11 | - "" 12 | resources: 13 | - configmaps 14 | - serviceaccounts 15 | verbs: 16 | - create 17 | - list 18 | - watch 19 | - apiGroups: 20 | - "" 21 | resources: 22 | - pods 23 | verbs: 24 | - get 25 | - apiGroups: 26 | - "" 27 | resources: 28 | - pods/exec 29 | verbs: 30 | - create 31 | - apiGroups: 32 | - "" 33 | resources: 34 | - events 35 | verbs: 36 | - create 37 | - patch 38 | - apiGroups: 39 | - rbac.authorization.k8s.io 40 | resources: 41 | - roles 42 | - rolebindings 43 | verbs: 44 | - create 45 | - list 46 | - watch 47 | - apiGroups: 48 | - apps 49 | resources: 50 | - statefulsets 51 | verbs: 52 | - create 53 | - list 54 | - update 55 | - watch 56 | - apiGroups: 57 | - batch 58 | resources: 59 | - jobs 60 | verbs: 61 | - create 62 | - list 63 | - update 64 | - watch 65 | - apiGroups: 66 | - apiextensions.k8s.io 67 | resources: 68 | - customresourcedefinitions 69 | verbs: 70 | - create 71 | - get 72 | - apiGroups: 73 | - kubeflow.org 74 | resources: 75 | - mpijobs 76 | verbs: 77 | - '*' 78 | --- 79 | apiVersion: v1 80 | kind: ServiceAccount 81 | metadata: 82 | name: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 83 | namespace: {{ .Values.global.namespace }} 84 | labels: 85 | app.kubernetes.io/name: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 86 | app.kubernetes.io/instance: {{ .Release.Name }} 87 | app.kubernetes.io/managed-by: {{ .Release.Service }} 88 | --- 89 | apiVersion: rbac.authorization.k8s.io/v1 90 | kind: ClusterRoleBinding 91 | metadata: 92 | name: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 93 | namespace: {{ .Values.global.namespace }} 94 | labels: 95 | app.kubernetes.io/name: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 96 | app.kubernetes.io/instance: {{ .Release.Name }} 97 | app.kubernetes.io/managed-by: {{ .Release.Service }} 98 | roleRef: 99 | apiGroup: rbac.authorization.k8s.io 100 | kind: ClusterRole 101 | name: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 102 | subjects: 103 | - kind: ServiceAccount 104 | name: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 105 | namespace: {{ .Values.global.namespace }} 106 | --- 107 | apiVersion: apps/v1 108 | kind: Deployment 109 | metadata: 110 | name: {{ 
.Values.mpioperator.name }}-{{ .Values.global.name }} 111 | namespace: {{ .Values.global.namespace }} 112 | labels: 113 | app.kubernetes.io/name: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 114 | app.kubernetes.io/instance: {{ .Release.Name }} 115 | app.kubernetes.io/managed-by: {{ .Release.Service }} 116 | spec: 117 | replicas: 1 118 | selector: 119 | matchLabels: 120 | app.kubernetes.io/name: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 121 | template: 122 | metadata: 123 | labels: 124 | app.kubernetes.io/name: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 125 | app.kubernetes.io/instance: {{ .Release.Name }} 126 | app.kubernetes.io/managed-by: {{ .Release.Service }} 127 | spec: 128 | containers: 129 | - args: 130 | - --gpus-per-node 131 | - "{{ .Values.mpioperator.gpuspernode }}" 132 | - --kubectl-delivery-image 133 | - {{ .Values.mpioperator.deliveryimage }} 134 | image: {{ .Values.mpioperator.image }} 135 | imagePullPolicy: {{ .Values.mpioperator.pullpolicy }} 136 | name: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 137 | serviceAccountName: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 138 | --- -------------------------------------------------------------------------------- /tensorpack/dataflow/dataset/bsds500.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: bsds500.py 5 | 6 | 7 | import glob 8 | import numpy as np 9 | import os 10 | 11 | from ...utils.fs import download, get_dataset_path 12 | from ..base import RNGDataFlow 13 | 14 | __all__ = ['BSDS500'] 15 | 16 | 17 | DATA_URL = "http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/BSR/BSR_bsds500.tgz" 18 | DATA_SIZE = 70763455 19 | IMG_W, IMG_H = 481, 321 20 | 21 | 22 | class BSDS500(RNGDataFlow): 23 | """ 24 | `Berkeley Segmentation Data Set and Benchmarks 500 dataset 25 | `_. 26 | 27 | Produce ``(image, label)`` pair, where ``image`` has shape (321, 481, 3(BGR)) and 28 | ranges in [0,255]. 29 | ``Label`` is a floating point image of shape (321, 481) in range [0, 1]. 30 | The value of each pixel is ``number of times it is annotated as edge / total number of annotators for this image``. 31 | """ 32 | 33 | def __init__(self, name, data_dir=None, shuffle=True): 34 | """ 35 | Args: 36 | name (str): 'train', 'test', 'val' 37 | data_dir (str): a directory containing the original 'BSR' directory. 
38 | """ 39 | # check and download data 40 | if data_dir is None: 41 | data_dir = get_dataset_path('bsds500_data') 42 | if not os.path.isdir(os.path.join(data_dir, 'BSR')): 43 | download(DATA_URL, data_dir, expect_size=DATA_SIZE) 44 | filename = DATA_URL.split('/')[-1] 45 | filepath = os.path.join(data_dir, filename) 46 | import tarfile 47 | tarfile.open(filepath, 'r:gz').extractall(data_dir) 48 | self.data_root = os.path.join(data_dir, 'BSR', 'BSDS500', 'data') 49 | assert os.path.isdir(self.data_root) 50 | 51 | self.shuffle = shuffle 52 | assert name in ['train', 'test', 'val'] 53 | self._load(name) 54 | 55 | def _load(self, name): 56 | image_glob = os.path.join(self.data_root, 'images', name, '*.jpg') 57 | image_files = glob.glob(image_glob) 58 | gt_dir = os.path.join(self.data_root, 'groundTruth', name) 59 | self.data = np.zeros((len(image_files), IMG_H, IMG_W, 3), dtype='uint8') 60 | self.label = np.zeros((len(image_files), IMG_H, IMG_W), dtype='float32') 61 | 62 | for idx, f in enumerate(image_files): 63 | im = cv2.imread(f, cv2.IMREAD_COLOR) 64 | assert im is not None 65 | if im.shape[0] > im.shape[1]: 66 | im = np.transpose(im, (1, 0, 2)) 67 | assert im.shape[:2] == (IMG_H, IMG_W), "{} != {}".format(im.shape[:2], (IMG_H, IMG_W)) 68 | 69 | imgid = os.path.basename(f).split('.')[0] 70 | gt_file = os.path.join(gt_dir, imgid) 71 | gt = loadmat(gt_file)['groundTruth'][0] 72 | n_annot = gt.shape[0] 73 | gt = sum(gt[k]['Boundaries'][0][0] for k in range(n_annot)) 74 | gt = gt.astype('float32') 75 | gt *= 1.0 / n_annot 76 | if gt.shape[0] > gt.shape[1]: 77 | gt = gt.transpose() 78 | assert gt.shape == (IMG_H, IMG_W) 79 | 80 | self.data[idx] = im 81 | self.label[idx] = gt 82 | 83 | def __len__(self): 84 | return self.data.shape[0] 85 | 86 | def __iter__(self): 87 | idxs = np.arange(self.data.shape[0]) 88 | if self.shuffle: 89 | self.rng.shuffle(idxs) 90 | for k in idxs: 91 | yield [self.data[k], self.label[k]] 92 | 93 | 94 | try: 95 | from scipy.io import loadmat 96 | import cv2 97 | except ImportError: 98 | from ...utils.develop import create_dummy_class 99 | BSDS500 = create_dummy_class('BSDS500', ['scipy.io', 'cv2']) # noqa 100 | 101 | if __name__ == '__main__': 102 | a = BSDS500('val') 103 | a.reset_state() 104 | for k in a: 105 | cv2.imshow("haha", k[1].astype('uint8') * 255) 106 | cv2.waitKey(1000) 107 | -------------------------------------------------------------------------------- /infra/eks/yaml_overlay: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env python3 4 | 5 | import argparse 6 | import os 7 | import sys 8 | import yaml 9 | import yamlloader 10 | 11 | 12 | 13 | 14 | 15 | def apply_overlay(base, overlay, append=False): 16 | """ 17 | 18 | :param base: Dict of yaml to apply changes to. Gets mutated 19 | :param overlay: Dict of changes. Identical structure to base 20 | :param append: True to append, false to replace values 21 | :return: base dict with changes applied. 
Mutation of base input dict 22 | """ 23 | 24 | 25 | for k1, v1 in overlay.items(): 26 | if not isinstance(v1, dict): 27 | if append: 28 | base[k1] += v1 29 | else: 30 | base[k1] = v1 31 | 32 | 33 | else: 34 | for k2, v2 in v1.items(): 35 | if not isinstance(v2, dict): 36 | if append: 37 | base[k1][k2] += v2 38 | else: 39 | base[k1][k2] = v2 40 | 41 | 42 | else: 43 | for k3, v3 in v2.items(): 44 | if not isinstance(v3, dict): 45 | if append: 46 | base[k1][k2][k3] += v3 47 | else: 48 | base[k1][k2][k3] = v3 49 | 50 | 51 | else: 52 | for k4, v4 in v3.items(): 53 | if not isinstance(v4, dict): 54 | if append: 55 | base[k1][k2][k3][k4] += v4 56 | else: 57 | base[k1][k2][k3][k4] = v4 58 | else: 59 | raise NotImplementedError("Exceeds current yaml max depth") 60 | return base 61 | 62 | 63 | 64 | 65 | 66 | 67 | if __name__ == '__main__': 68 | 69 | parser = argparse.ArgumentParser(description='Create a variant of a yaml by applying overlays which describe changes') 70 | parser.add_argument('base_yaml_path', 71 | help="Yaml to use as base. If '-' is given, will read from stdin instead.") 72 | parser.add_argument('overlays', metavar='N', type=str, nargs='+', 73 | help='Overlays to apply in sequential order') 74 | parser.add_argument('--overlay_dir', type=str, 75 | help='Path to dir containing all overlays. Can be passed in through the OVERLAY_DIR ' 76 | 'environment variable. If both env var and cli arg are present, cli arg wins.') 77 | 78 | args = parser.parse_args() 79 | 80 | overlay_base_dir = None 81 | if "OVERLAY_DIR" in os.environ.keys() and os.environ['OVERLAY_DIR'] is not None: 82 | overlay_base_dir = os.environ['OVERLAY_DIR'] 83 | if args.overlay_dir is not None: 84 | overlay_base_dir = args.overlay_dir 85 | 86 | if args.base_yaml_path == '-': 87 | s = "".join([l for l in sys.stdin]) 88 | values = yaml.load(s, Loader=yamlloader.ordereddict.CLoader) 89 | else: 90 | with open(args.base_yaml_path, 'r') as f: 91 | values = yaml.load(f, Loader=yamlloader.ordereddict.CLoader) 92 | 93 | overlay_dicts = [] 94 | for overlay in args.overlays: 95 | overlay_path = f'{overlay}.yaml' 96 | if overlay_base_dir is not None: 97 | overlay_path = os.path.join(overlay_base_dir, overlay_path) 98 | 99 | with open(overlay_path) as f: 100 | overlay_dicts.append(yaml.load(f, Loader=yamlloader.ordereddict.CLoader)) 101 | 102 | for overlay_dict in overlay_dicts: 103 | if 'append' in overlay_dict.keys(): 104 | values = apply_overlay(values, overlay_dict['append'], append=True) 105 | if 'set' in overlay_dict.keys(): 106 | values = apply_overlay(values, overlay_dict['set'], append=False) 107 | 108 | print(yaml.dump(values, 109 | Dumper=yamlloader.ordereddict.CDumper, 110 | default_flow_style=False)) 111 | --------------------------------------------------------------------------------
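To make the overlay semantics of `apply_overlay` concrete, here is a minimal sketch. It assumes `apply_overlay` from the script above is in scope, and the dictionary contents are made up for illustration: an `append` overlay concatenates onto the existing value, while a `set` overlay replaces it.

```python
# Minimal illustration of the overlay semantics in infra/eks/yaml_overlay.
# Assumes apply_overlay (defined in the script above) is importable or in scope;
# the base/overlay values below are hypothetical.
base = {"global": {"name": "maskrcnn", "namespace": "default"},
        "maskrcnn": {"gpus": 8}}

# 'append' overlays use +=, so strings are concatenated and numbers are added.
base = apply_overlay(base, {"global": {"name": "-experiment1"}}, append=True)

# 'set' overlays replace the existing value outright.
base = apply_overlay(base, {"maskrcnn": {"gpus": 32}}, append=False)

assert base == {"global": {"name": "maskrcnn-experiment1", "namespace": "default"},
                "maskrcnn": {"gpus": 32}}
```

On the command line, the script reads the base yaml (or stdin when `-` is given), resolves each positional overlay name to `<name>.yaml` under `--overlay_dir` (or the `OVERLAY_DIR` environment variable), applies the `append` section of each overlay before its `set` section, and prints the merged yaml to stdout.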