├── infra ├── eks │ ├── maskrcnn │ │ ├── overlays │ │ │ ├── run1.yaml │ │ │ ├── run2.yaml │ │ │ ├── run3.yaml │ │ │ ├── run4.yaml │ │ │ ├── run5.yaml │ │ │ ├── run6.yaml │ │ │ ├── run7.yaml │ │ │ ├── run8.yaml │ │ │ ├── run9.yaml │ │ │ ├── run10.yaml │ │ │ ├── 13rings.yaml │ │ │ ├── 8x4.yaml │ │ │ ├── 16x4.yaml │ │ │ ├── larc.yaml │ │ │ ├── 24epoch.yaml │ │ │ ├── predefined_padding.yaml │ │ │ ├── 64x4.yaml │ │ │ ├── syncbn.yaml │ │ │ ├── determinism.yaml │ │ │ ├── bbrw.yaml │ │ │ └── 32x4.yaml │ │ ├── requirements.yaml │ │ ├── charts │ │ │ └── mpi-operator │ │ │ │ ├── Chart.yaml │ │ │ │ ├── values.yaml │ │ │ │ └── templates │ │ │ │ └── mpi-operator.yaml │ │ ├── Chart.yaml │ │ ├── values.yaml │ │ └── templates │ │ │ └── maskrcnn.yaml │ ├── eksctl │ │ ├── p3 │ │ │ ├── kubeconfig │ │ │ ├── delete.sh │ │ │ ├── additional_nodegroup.yaml │ │ │ ├── additional_nodegroup_non_gpu.yaml │ │ │ ├── create.sh │ │ │ └── config.yaml │ │ └── p3dn │ │ │ ├── kubeconfig │ │ │ ├── delete.sh │ │ │ ├── additional_nodegroup_non_gpu.yaml │ │ │ ├── create.sh │ │ │ └── config.yaml │ ├── helm │ │ ├── mpijob │ │ │ ├── Chart.yaml │ │ │ └── templates │ │ │ │ └── mpijob.yaml │ │ └── tiller-rbac-config.yaml │ ├── ssh.sh │ ├── tboard.sh │ ├── fsx │ │ ├── p3 │ │ │ ├── pvc-fsx.yaml │ │ │ ├── pv-fsx.yaml │ │ │ ├── attach-pvc.yaml │ │ │ ├── attach-pvc-2.yaml │ │ │ └── stage-data.yaml │ │ └── p3dn │ │ │ ├── pvc-fsx.yaml │ │ │ ├── pv-fsx.yaml │ │ │ ├── attach-pvc.yaml │ │ │ ├── attach-pvc-2.yaml │ │ │ └── stage-data.yaml │ ├── TOOLS.md │ ├── tensorboard │ │ └── tensorboard.yaml │ ├── YAML_OVERLAY.md │ └── yaml_overlay ├── ami │ ├── reinstall_tensorpack.sh │ ├── tensorboard │ │ ├── tensorboard.sh │ │ └── ssh_tensorboard.sh │ ├── export_cmd │ ├── install_libs.sh │ ├── download_data.sh │ ├── README.md │ ├── train_efa.sh │ └── no_batch_train_1node_16xl_convergence.sh ├── docker │ ├── sleep.sh │ ├── run.sh │ ├── run_multinode.sh │ ├── build.sh │ ├── ssh_and_build.sh │ ├── docker.md │ ├── train.sh │ ├── README.md │ └── train_multinode.sh └── sm │ ├── Dockerfile_sm │ ├── build_push_submit.sh │ ├── run.sh │ ├── README.md │ ├── launch_sm_job.py │ ├── Dockerfile_base │ └── build_and_push.sh ├── .dockerignore ├── NOTICE ├── COCO_image_aspect_ratio_histogram.png ├── MaskRCNN ├── utils │ ├── __init__.py │ ├── README.md │ ├── randomnness.py │ ├── mixed_precision.py │ ├── box_ops.py │ ├── generate_anchors.py │ └── np_box_ops.py ├── viz.py └── NOTES.md ├── tensorpack ├── contrib │ └── __init__.py ├── train │ ├── utility.py │ └── __init__.py ├── callbacks │ ├── stats.py │ ├── __init__.py │ ├── hooks.py │ ├── concurrency.py │ ├── group.py │ └── misc.py ├── utils │ ├── naming.py │ ├── compatible_serialize.py │ ├── __init__.py │ ├── debug.py │ ├── palette.py │ ├── gpu.py │ ├── serialize.py │ ├── timer.py │ ├── rect.py │ └── fs.py ├── models │ ├── common.py │ ├── shapes.py │ ├── utils.py │ ├── _test.py │ ├── __init__.py │ ├── nonlin.py │ ├── shape_utils.py │ ├── fc.py │ └── layer_norm.py ├── dataflow │ ├── dftools.py │ ├── imgaug │ │ ├── _test.py │ │ ├── __init__.py │ │ ├── convert.py │ │ ├── noise.py │ │ ├── external.py │ │ └── paste.py │ ├── dataset │ │ ├── __init__.py │ │ ├── svhn.py │ │ └── bsds500.py │ └── __init__.py ├── __init__.py ├── input_source │ └── __init__.py ├── predict │ ├── __init__.py │ └── feedfree.py ├── graph_builder │ ├── __init__.py │ └── predict.py ├── tfutils │ ├── __init__.py │ ├── distributed.py │ ├── symbolic_functions.py │ ├── dependency.py │ ├── sesscreate.py │ ├── model_utils.py │ └── varreplace.py └── libinfo.py ├── 
update_git.sh ├── CODE_OF_CONDUCT.md ├── setup.cfg ├── patch ├── tensorflow_Conv2DTranspose.diff ├── roi_align.diff └── README.md ├── RESULTS.md ├── .gitignore ├── Dockerfile ├── setup.py ├── README.md └── CONTRIBUTING.md /infra/eks/maskrcnn/overlays/run1.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -run1 4 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/run2.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -run2 4 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/run3.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -run3 4 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/run4.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -run4 4 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/run5.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -run5 4 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/run6.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -run6 4 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/run7.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -run7 4 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/run8.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -run8 4 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/run9.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -run9 4 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/run10.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -run10 4 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | .ignored/ 3 | .git/ 4 | 5 | tensorflow-1.13.0-cp36-cp36m-linux_x86_64.whl -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | tensorflow-mask-rcnn 2 | Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
3 | -------------------------------------------------------------------------------- /infra/eks/eksctl/p3/kubeconfig: -------------------------------------------------------------------------------- 1 | export KUBECONFIG=/Users/ubuntu/.kube/eksctl/clusters/tensorpack-mask-rcnn-p3 2 | -------------------------------------------------------------------------------- /infra/eks/eksctl/p3dn/kubeconfig: -------------------------------------------------------------------------------- 1 | export KUBECONFIG=/Users/ubuntu/.kube/eksctl/clusters/tensorpack-mask-rcnn-p3dn 2 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/requirements.yaml: -------------------------------------------------------------------------------- 1 | dependencies: 2 | - name: mpi-operator 3 | version: 1.0.0 4 | repository: ./charts -------------------------------------------------------------------------------- /COCO_image_aspect_ratio_histogram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/armandmcqueen/tensorpack-mask-rcnn/HEAD/COCO_image_aspect_ratio_histogram.png -------------------------------------------------------------------------------- /MaskRCNN/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/13rings.yaml: -------------------------------------------------------------------------------- 1 | set: 2 | maskrcnn: 3 | nccl_min_rings: 13 4 | 5 | append: 6 | global: 7 | name: -13rings 8 | 9 | -------------------------------------------------------------------------------- /tensorpack/contrib/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/8x4.yaml: -------------------------------------------------------------------------------- 1 | set: 2 | maskrcnn: 3 | gpus: 8 4 | batch_size_per_gpu: 4 5 | 6 | append: 7 | global: 8 | name: -8x4 9 | 10 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/16x4.yaml: -------------------------------------------------------------------------------- 1 | set: 2 | maskrcnn: 3 | gpus: 16 4 | batch_size_per_gpu: 4 5 | 6 | append: 7 | global: 8 | name: -16x4 9 | 10 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/larc.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -larc 4 | 5 | set: 6 | maskrcnn: 7 | image: armandmcqueen/tensorpack-mask-rcnn:dev-larc 8 | 9 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/charts/mpi-operator/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm chart for kubeflow mpi-operator 4 | name: mpi-operator 5 | version: 1.0.0 -------------------------------------------------------------------------------- /infra/eks/helm/mpijob/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm chart for kubeflow mpijob customer resource definition 4 | name: mpijob 5 | version: 1.0.0 -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/24epoch.yaml: -------------------------------------------------------------------------------- 1 | set: 2 | maskrcnn: 3 | lr_epoch_schedule: "[(16, 0.1), (20, 0.01), (24, None)]" 4 | 5 | append: 6 | global: 7 | name: -24e 8 | 9 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/predefined_padding.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -predefpad 4 | 5 | set: 6 | maskrcnn: 7 | predefined_padding: 'True' 8 | 9 | 10 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/64x4.yaml: -------------------------------------------------------------------------------- 1 | set: 2 | maskrcnn: 3 | gpus: 64 4 | batch_size_per_gpu: 4 5 | gradient_clip: 1.5 6 | 7 | append: 8 | global: 9 | name: -64x4 10 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/syncbn.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -syncbn 4 | 5 | set: 6 | maskrcnn: 7 | experiment_group: syncbn 8 | backbone_norm: SyncBN 9 | 10 | 11 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/determinism.yaml: -------------------------------------------------------------------------------- 1 | append: 2 | global: 3 | name: -determinism 4 | 5 | set: 6 | maskrcnn: 7 | image: armandmcqueen/tensorpack-mask-rcnn:dev-determinism_armand 8 | 9 | -------------------------------------------------------------------------------- /infra/eks/ssh.sh: 
-------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | kubectl exec attach-pvc-2 -it -- /bin/bash -------------------------------------------------------------------------------- /update_git.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | git commit -am "Quick update" 6 | git push -------------------------------------------------------------------------------- /infra/eks/tboard.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | kubectl port-forward tensorboard 6006:6006 -------------------------------------------------------------------------------- /infra/eks/eksctl/p3/delete.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | eksctl delete cluster -f p3_config.yaml -------------------------------------------------------------------------------- /infra/eks/maskrcnn/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: "1.0" 3 | description: A Helm chart for running Mask RCNN (optimized) using kubeflow mpi-operator and mpi-job 4 | name: maskrcnn-optimized 5 | version: 1.0.0 -------------------------------------------------------------------------------- /infra/eks/eksctl/p3dn/delete.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | eksctl delete cluster -f p3dn_config.yaml -------------------------------------------------------------------------------- /infra/ami/reinstall_tensorpack.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | pip uninstall -y tensorpack 6 | pip install -e ./ -------------------------------------------------------------------------------- /infra/eks/maskrcnn/charts/mpi-operator/values.yaml: -------------------------------------------------------------------------------- 1 | mpioperator: 2 | name: mpi-op 3 | image: mpioperator/mpi-operator:0.1.0 4 | deliveryimage: mpioperator/kubectl-delivery:latest 5 | pullpolicy: Always 6 | gpuspernode: 8 7 | -------------------------------------------------------------------------------- /infra/docker/sleep.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | PORT_ID=${1:-1234} 5 | /usr/sbin/sshd -p $PORT_ID; sleep infinity 6 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/bbrw.yaml: -------------------------------------------------------------------------------- 1 | set: 2 | maskrcnn: 3 | bbox_reg_weights: '[20., 20., 10., 10.]' 4 | 5 | 6 | append: 7 | global: 8 | name: -bbrw 9 | maskrcnn: 10 | experiment_group: _regweights 11 | 12 | -------------------------------------------------------------------------------- /infra/ami/tensorboard/tensorboard.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | tensorboard --logdir=live:~/logs/train_log,old:~/old_logs -------------------------------------------------------------------------------- /infra/eks/maskrcnn/overlays/32x4.yaml: -------------------------------------------------------------------------------- 1 | set: 2 | maskrcnn: 3 | gpus: 32 4 | batch_size_per_gpu: 4 5 | gradient_clip: 1.5 # set it to zero to disable gradient clipping 6 | 7 | append: 8 | global: 9 | name: -32x4 10 | -------------------------------------------------------------------------------- /infra/eks/fsx/p3/pvc-fsx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: tensorpack-fsx 5 | spec: 6 | accessModes: 7 | - ReadWriteMany 8 | storageClassName: fsx-sc 9 | resources: 10 | requests: 11 | storage: 100Gi -------------------------------------------------------------------------------- /infra/eks/fsx/p3dn/pvc-fsx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: tensorpack-fsx 5 | spec: 6 | accessModes: 7 | - ReadWriteMany 8 | storageClassName: fsx-sc 9 | resources: 10 | requests: 11 | storage: 100Gi -------------------------------------------------------------------------------- /infra/ami/export_cmd: -------------------------------------------------------------------------------- 1 | export LD_LIBRARY_PATH=/usr/local/cuda-10.0/lib64:/usr/local/cuda-10.0/extras/CUPTI/lib64:/usr/local/cuda-10.0/lib:/usr/lib64/openmpi/lib/:/usr/local/lib:/usr/lib:/usr/local/mpi/lib:/lib/:/usr/lib64/openmpi/lib/:/usr/local/lib:/usr/lib:/usr/local/mpi/lib:/lib/: -------------------------------------------------------------------------------- /infra/sm/Dockerfile_sm: -------------------------------------------------------------------------------- 1 | FROM fewu/sagemaker-mask-rcnn:lateset 2 | 3 | # Copies the training code inside the container 4 | COPY run_mpi.py /opt/ml/code/run_mpi.py 5 | COPY run.sh /opt/ml/code/run.sh 6 | 7 | # Defines train.py as script entry point 8 | ENV SAGEMAKER_PROGRAM run_mpi.py 9 | -------------------------------------------------------------------------------- /infra/sm/build_push_submit.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | set -e 4 | imagename=$1 5 | sagemaker_iam_role=$2 6 | ./build_and_push.sh $imagename 7 | python3 launch_sm_job.py $imagename $sagemaker_iam_role 8 | -------------------------------------------------------------------------------- /infra/ami/tensorboard/ssh_tensorboard.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | # Will be able to access tensorboard at localhost:6007 6 | 7 | echo "ssh -L 127.0.0.1:6007:127.0.0.1:6006 ubuntu@" 8 | 9 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /tensorpack/train/utility.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: utility.py 5 | 6 | # for backwards-compatibility 7 | from ..graph_builder.utils import LeastLoadedDeviceSetter, OverrideToLocalVariable, override_to_local_variable # noqa 8 | -------------------------------------------------------------------------------- /tensorpack/callbacks/stats.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: stats.py 5 | 6 | from .graph import DumpParamAsImage # noqa 7 | # for compatibility only 8 | from .misc import InjectShell, SendStat # noqa 9 | 10 | __all__ = [] 11 | -------------------------------------------------------------------------------- /tensorpack/utils/naming.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: naming.py 5 | 6 | 7 | GLOBAL_STEP_INCR_OP_NAME = 'global_step_incr' 8 | 9 | # extra variables to summarize during training in a moving-average way 10 | MOVING_SUMMARY_OPS_KEY = 'MOVING_SUMMARY_OPS' 11 | -------------------------------------------------------------------------------- /infra/docker/run.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | BRANCH_NAME=${1:-"master"} 6 | 7 | echo "Running docker image tensorpack-mask-rcnn:dev-${BRANCH_NAME}" 8 | echo "" 9 | 10 | 11 | 12 | nvidia-docker run -it -v ~/data:/data -v ~/logs:/logs tensorpack-mask-rcnn:dev-${BRANCH_NAME} -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | author = TensorPack contributors 3 | author-email = ppwwyyxxc@gmail.com 4 | url = https://github.com/tensorpack/tensorpack 5 | keywords = tensorflow, deep learning, neural network 6 | license = Apache 7 | 8 | [options] 9 | zip_safe = False # dataset and __init__ use file 10 | # will call find_packages() 11 | packages = find: 12 | 13 | [wheel] 14 | universal = 1 15 | -------------------------------------------------------------------------------- /infra/ami/install_libs.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | pip install --upgrade pip 6 | pip install ujson 7 | pip install opencv-python 8 | pip install pycocotools 9 | pip install --ignore-installed numpy==1.14.5 10 | pip install tqdm 11 | pip install msgpack_numpy 12 | pip install tabulate 13 | -------------------------------------------------------------------------------- /tensorpack/models/common.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: common.py 5 | 6 | from .registry import layer_register # noqa 7 | from .tflayer import rename_tflayer_get_variable 8 | from .utils import VariableHolder # noqa 9 | 10 | __all__ = ['layer_register', 'VariableHolder', 'rename_tflayer_get_variable'] 11 | -------------------------------------------------------------------------------- /infra/docker/run_multinode.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | BRANCH_NAME=${1:-"master"} 6 | 7 | echo "Running docker image tensorpack-mask-rcnn:dev-${BRANCH_NAME}" 8 | echo "" 9 | 10 | 11 | 12 | nvidia-docker run -it --network=host -v /mnt/share/ssh:/root/.ssh -v ~/data:/data -v ~/logs:/logs tensorpack-mask-rcnn:dev-${BRANCH_NAME} 13 | -------------------------------------------------------------------------------- /infra/eks/helm/tiller-rbac-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: tiller 5 | namespace: kube-system 6 | --- 7 | apiVersion: rbac.authorization.k8s.io/v1 8 | kind: ClusterRoleBinding 9 | metadata: 10 | name: tiller 11 | roleRef: 12 | apiGroup: rbac.authorization.k8s.io 13 | kind: ClusterRole 14 | name: cluster-admin 15 | subjects: 16 | - kind: ServiceAccount 17 | name: tiller 18 | namespace: kube-system -------------------------------------------------------------------------------- /infra/ami/download_data.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | DATA_DIR=/home/ubuntu/data 6 | 7 | mkdir -p $DATA_DIR 8 | aws s3 cp s3://armand-ajay-workshop/mask-rcnn/sagemaker/input/train $DATA_DIR --recursive 9 | 10 | wget -O $DATA_DIR/pretrained-models/ImageNet-R50-AlignPadding.npz http://models.tensorpack.com/FasterRCNN/ImageNet-R50-AlignPadding.npz 11 | -------------------------------------------------------------------------------- /infra/eks/fsx/p3/pv-fsx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: tensorpack-fsx 5 | spec: 6 | capacity: 7 | storage: 7Pi 8 | volumeMode: Filesystem 9 | accessModes: 10 | - ReadWriteMany 11 | persistentVolumeReclaimPolicy: Retain 12 | storageClassName: fsx-sc 13 | csi: 14 | driver: fsx.csi.aws.com 15 | volumeHandle: fs-03f556d03c3c590a2 16 | volumeAttributes: 17 | dnsname: fs-03f556d03c3c590a2.fsx.us-east-1.amazonaws.com -------------------------------------------------------------------------------- /infra/eks/fsx/p3dn/pv-fsx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: tensorpack-fsx 5 | spec: 6 | capacity: 7 | storage: 7Pi 8 | volumeMode: Filesystem 9 | accessModes: 10 | - ReadWriteMany 11 | persistentVolumeReclaimPolicy: Retain 12 | storageClassName: fsx-sc 13 | csi: 14 | driver: fsx.csi.aws.com 15 | volumeHandle: fs-04d78cb1f96eb771e 16 | volumeAttributes: 17 | dnsname: fs-04d78cb1f96eb771e.fsx.us-east-1.amazonaws.com -------------------------------------------------------------------------------- /MaskRCNN/utils/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Some third-party helper functions 3 | 4 | + generate_anchors.py: copied from [py-faster-rcnn](https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/rpn/generate_anchors.py). 5 | + box_ops.py: modified from [TF object detection API](https://github.com/tensorflow/models/blob/master/research/object_detection/core/box_list_ops.py). 6 | + np_box_ops.py: copied from [TF object detection API](https://github.com/tensorflow/models/blob/master/research/object_detection/utils/np_box_ops.py). 7 | 8 | -------------------------------------------------------------------------------- /infra/docker/build.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | BRANCH_NAME=${1:-"master"} 6 | 7 | # The BRANCH_NAME refers to the git pull that happens inside of the Dockerfile 8 | echo "Building docker image tensorpack-mask-rcnn:dev-${BRANCH_NAME}" 9 | echo "" 10 | 11 | 12 | 13 | docker build -t tensorpack-mask-rcnn:dev-${BRANCH_NAME} ../.. 
--build-arg CACHEBUST=$(date +%s) --build-arg BRANCH_NAME=${BRANCH_NAME} -------------------------------------------------------------------------------- /infra/eks/fsx/p3/attach-pvc.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: attach-pvc 5 | spec: 6 | containers: 7 | - name: attach-pvc 8 | image: ubuntu:latest 9 | command: ["/bin/bash"] 10 | securityContext: 11 | privileged: true 12 | args: ["-c", "while true; do echo $(date -u) >> /data/out.txt; sleep 3600; done"] 13 | volumeMounts: 14 | - name: fsx 15 | mountPath: /fsx 16 | volumes: 17 | - name: fsx 18 | persistentVolumeClaim: 19 | claimName: tensorpack-fsx -------------------------------------------------------------------------------- /infra/eks/eksctl/p3/additional_nodegroup.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: eksctl.io/v1alpha5 3 | kind: ClusterConfig 4 | 5 | metadata: 6 | name: tensorpack-mask-rcnn-p3 7 | region: us-east-1 8 | 9 | nodeGroups: 10 | - name: ng-p3-1c 11 | instanceType: p3.16xlarge 12 | availabilityZones: ["us-east-1c"] 13 | desiredCapacity: 1 14 | iam: 15 | withAddonPolicies: 16 | imageBuilder: true 17 | ebs: true 18 | fsx: true 19 | efs: true 20 | ssh: 21 | allow: true 22 | publicKeyName: 'maskrcnn' 23 | -------------------------------------------------------------------------------- /infra/eks/eksctl/p3/additional_nodegroup_non_gpu.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: eksctl.io/v1alpha5 3 | kind: ClusterConfig 4 | 5 | metadata: 6 | name: tensorpack-mask-rcnn-p3 7 | region: us-east-1 8 | 9 | nodeGroups: 10 | - name: ng-c5-1b 11 | instanceType: c5.4xlarge 12 | availabilityZones: ["us-east-1b"] 13 | desiredCapacity: 1 14 | iam: 15 | withAddonPolicies: 16 | imageBuilder: true 17 | ebs: true 18 | fsx: true 19 | efs: true 20 | ssh: 21 | allow: true 22 | publicKeyName: 'maskrcnn' 23 | -------------------------------------------------------------------------------- /infra/eks/eksctl/p3dn/additional_nodegroup_non_gpu.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: eksctl.io/v1alpha5 3 | kind: ClusterConfig 4 | 5 | metadata: 6 | name: tensorpack-mask-rcnn-p3dn 7 | region: us-east-1 8 | 9 | nodeGroups: 10 | - name: ng-c5-1f 11 | instanceType: c5.4xlarge 12 | availabilityZones: ["us-east-1f"] 13 | desiredCapacity: 1 14 | iam: 15 | withAddonPolicies: 16 | imageBuilder: true 17 | ebs: true 18 | fsx: true 19 | efs: true 20 | ssh: 21 | allow: true 22 | publicKeyName: 'maskrcnn' 23 | -------------------------------------------------------------------------------- /infra/eks/fsx/p3/attach-pvc-2.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: attach-pvc-2 5 | spec: 6 | containers: 7 | - name: attach-pvc 8 | image: armandmcqueen/tensorpack-mask-rcnn:master-latest 9 | command: ["/bin/bash"] 10 | securityContext: 11 | privileged: true 12 | args: ["-c", "while true; do echo $(date -u) >> /data/out.txt; sleep 3600; done"] 13 | volumeMounts: 14 | - name: fsx 15 | mountPath: /fsx 16 | volumes: 17 | - name: fsx 18 | persistentVolumeClaim: 19 | claimName: tensorpack-fsx 20 | -------------------------------------------------------------------------------- /infra/eks/fsx/p3dn/attach-pvc.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: attach-pvc 5 | spec: 6 | containers: 7 | - name: attach-pvc 8 | image: ubuntu:latest 9 | command: ["/bin/bash"] 10 | securityContext: 11 | privileged: true 12 | args: ["-c", "while true; do echo $(date -u) >> /data/out.txt; sleep 3600; done"] 13 | volumeMounts: 14 | - name: fsx # efs or fsx 15 | mountPath: /fsx # /efs or /fsx 16 | volumes: 17 | - name: fsx # efs or fsx 18 | persistentVolumeClaim: 19 | claimName: tensorpack-fsx -------------------------------------------------------------------------------- /MaskRCNN/utils/randomnness.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | class SeedGenerator: 4 | def __init__(self, seed): 5 | self.seed = seed 6 | self.counters = dict() 7 | 8 | def next(self, key='default'): 9 | if self.seed == None: 10 | return None 11 | 12 | if key not in self.counters: 13 | self.counters[key] = self.seed 14 | return self.counters[key] 15 | else: 16 | self.counters[key] += 1 17 | return self.counters[key] -------------------------------------------------------------------------------- /infra/eks/fsx/p3dn/attach-pvc-2.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: attach-pvc-2 5 | spec: 6 | containers: 7 | - name: attach-pvc 8 | image: armandmcqueen/tensorpack-mask-rcnn:master-latest 9 | command: ["/bin/bash"] 10 | securityContext: 11 | privileged: true 12 | args: ["-c", "while true; do echo $(date -u) >> /data/out.txt; sleep 3600; done"] 13 | volumeMounts: 14 | - name: fsx # efs or fsx 15 | mountPath: /fsx # /efs or /fsx 16 | volumes: 17 | - name: fsx # efs or fsx 18 | persistentVolumeClaim: 19 | claimName: tensorpack-fsx 20 | -------------------------------------------------------------------------------- /infra/eks/TOOLS.md: -------------------------------------------------------------------------------- 1 | # Tools 2 | 3 | # Invoke Tasks 4 | 5 | We use [invoke](https://www.pyinvoke.org/) to automate tasks around EKS training. 
6 | 7 | invoke tasks are essentially python functions that can be launched with `inv task_name args` or `invoke task_name args` 8 | 9 | ## repeat 10 | 11 | Repeatedly run a string as command, replacing the substring '|N|' (can be changed) with the iteration number 12 | 13 | ``` 14 | $ inv repeat 'echo |N|' 15 | 1 16 | 2 17 | 3 18 | 4 19 | 5 20 | $ inv repeat 'echo [I]' --repeat=2 --verbose --sub='[I]' 21 | [cmd = echo 1] 22 | 1 23 | [cmd = echo 2] 24 | 2 25 | ``` 26 | 27 | ## 28 | -------------------------------------------------------------------------------- /patch/tensorflow_Conv2DTranspose.diff: -------------------------------------------------------------------------------- 1 | diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py 2 | index fc2e8aa..e3081c0 100644 3 | --- a/tensorflow/python/keras/backend.py 4 | +++ b/tensorflow/python/keras/backend.py 5 | @@ -594,7 +594,8 @@ def _has_nchw_support(): 6 | bool: if the current scope device placement would support nchw 7 | """ 8 | explicitly_on_cpu = _is_current_explicit_device('CPU') 9 | - gpus_available = bool(_get_available_gpus()) 10 | + #gpus_available = bool(_get_available_gpus()) 11 | + gpus_available = True 12 | return not explicitly_on_cpu and gpus_available -------------------------------------------------------------------------------- /infra/eks/eksctl/p3/create.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | # Need to make sure you are on the latest version of eksctl for fsx support. Tested on eksctl v0.1.32 6 | 7 | eksctl create cluster -f config.yaml --auto-kubeconfig 8 | 9 | export KUBECONFIG=/Users/ubuntu/.kube/eksctl/clusters/tensorpack-mask-rcnn-p3 10 | # aws eks --region $AWS_REGION update-kubeconfig --name $EKS_CLUSTER 11 | 12 | kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/1.0.0-beta/nvidia-device-plugin.yml 13 | 14 | 15 | # eksctl scale nodegroup --cluster=tensorpack-mask-rcnn --nodes=12 --name=ng-1 16 | -------------------------------------------------------------------------------- /infra/eks/eksctl/p3dn/create.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | # Need to make sure you are on the latest version of eksctl for fsx support. 
Tested on eksctl v0.1.32 6 | 7 | eksctl create cluster -f config.yaml --auto-kubeconfig 8 | 9 | export KUBECONFIG=/Users/ubuntu/.kube/eksctl/clusters/tensorpack-mask-rcnn-p3dn 10 | # aws eks --region $AWS_REGION update-kubeconfig --name $EKS_CLUSTER 11 | 12 | kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/1.0.0-beta/nvidia-device-plugin.yml 13 | 14 | 15 | # eksctl scale nodegroup --cluster=tensorpack-mask-rcnn --nodes=12 --name=ng-1 16 | -------------------------------------------------------------------------------- /infra/eks/tensorboard/tensorboard.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: tensorboard 5 | spec: 6 | containers: 7 | - name: tensorboard 8 | image: armandmcqueen/tensorpack-mask-rcnn:dev-master 9 | command: ["tensorboard"] 10 | args: ["--logdir=/fsx"] 11 | securityContext: 12 | privileged: true 13 | volumeMounts: 14 | - name: fsx # efs or fsx 15 | mountPath: /fsx # /efs or /fsx 16 | ports: 17 | - containerPort: 6006 18 | # https://kubernetes.io/docs/tasks/access-application-cluster/port-forward-access-application-cluster/ 19 | # 20 | 21 | volumes: 22 | - name: fsx # efs or fsx 23 | persistentVolumeClaim: 24 | claimName: tensorpack-fsx 25 | -------------------------------------------------------------------------------- /tensorpack/utils/compatible_serialize.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | import os 4 | 5 | from .serialize import dumps_msgpack, dumps_pyarrow, loads_msgpack, loads_pyarrow 6 | 7 | """ 8 | Serialization that has compatibility guarantee (therefore is safe to store to disk). 9 | """ 10 | 11 | __all__ = ['loads', 'dumps'] 12 | 13 | 14 | # pyarrow has no compatibility guarantee 15 | # use msgpack for persistent serialization, unless explicitly set from envvar 16 | if os.environ.get('TENSORPACK_COMPATIBLE_SERIALIZE', 'msgpack') == 'msgpack': 17 | loads = loads_msgpack 18 | dumps = dumps_msgpack 19 | else: 20 | loads = loads_pyarrow 21 | dumps = dumps_pyarrow 22 | -------------------------------------------------------------------------------- /tensorpack/dataflow/dftools.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: dftools.py 5 | 6 | 7 | from ..utils.develop import deprecated 8 | from .remote import dump_dataflow_to_process_queue 9 | from .serialize import LMDBSerializer, TFRecordSerializer 10 | 11 | __all__ = ['dump_dataflow_to_process_queue', 12 | 'dump_dataflow_to_lmdb', 'dump_dataflow_to_tfrecord'] 13 | 14 | 15 | @deprecated("Use LMDBSerializer.save instead!", "2019-01-31") 16 | def dump_dataflow_to_lmdb(df, lmdb_path, write_frequency=5000): 17 | LMDBSerializer.save(df, lmdb_path, write_frequency) 18 | 19 | 20 | @deprecated("Use TFRecordSerializer.save instead!", "2019-01-31") 21 | def dump_dataflow_to_tfrecord(df, path): 22 | TFRecordSerializer.save(df, path) 23 | -------------------------------------------------------------------------------- /tensorpack/models/shapes.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: shapes.py 5 | 6 | 7 | import tensorflow as tf 8 | 9 | from .common import layer_register 10 | 11 | __all__ = ['ConcatWith'] 12 | 13 | 14 | @layer_register(use_scope=None) 15 | def ConcatWith(x, tensor, dim): 16 | """ 17 | A wrapper around ``tf.concat`` to cooperate with :class:`LinearWrap`. 18 | 19 | Args: 20 | x (tf.Tensor): input 21 | tensor (list[tf.Tensor]): a tensor or list of tensors to concatenate with x. 22 | x will be at the beginning 23 | dim (int): the dimension along which to concatenate 24 | 25 | Returns: 26 | tf.Tensor: ``tf.concat([x] + tensor, dim)`` 27 | """ 28 | if type(tensor) != list: 29 | tensor = [tensor] 30 | return tf.concat([x] + tensor, dim) 31 | -------------------------------------------------------------------------------- /infra/docker/ssh_and_build.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | MASTER_HOST=${1:-"127.0.0.1"} 6 | HOSTS=${2:-"hosts"} 7 | BRANCH_NAME=${3:-"master"} 8 | 9 | 10 | ssh-keygen -q -t rsa -N '' -f ~/.ssh/id_rsa 11 | hosts=`cat $HOSTS` 12 | for host in $hosts; do 13 | scp ~/.ssh/id_rsa.pub $host:~/.ssh/ 14 | ssh $host "cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys" 15 | ssh $host "printf 'Host *\n StrictHostKeyChecking no\n' >> ~/.ssh/config" 16 | ssh $host "chmod 400 ~/.ssh/config" 17 | ssh $host "sudo mkdir -p /mnt/share/ssh" 18 | ssh $host "sudo cp -r ~/.ssh/* /mnt/share/ssh" 19 | if [ $host != $MASTER_HOST ]; then 20 | ssh $host "git clone https://github.com/armandmcqueen/tensorpack-mask-rcnn.git -b ${BRANCH_NAME}" 21 | fi 22 | ssh $host "cd ~/tensorpack-mask-rcnn/infra/docker; ./build.sh" 23 | done 24 | -------------------------------------------------------------------------------- /patch/roi_align.diff: -------------------------------------------------------------------------------- 1 | diff --git a/tensorflow/core/kernels/roi_align_op.cu.cc b/tensorflow/core/kernels/roi_align_op.cu.cc 2 | index 886f4bc81f..0a801dceb8 100644 3 | --- a/tensorflow/core/kernels/roi_align_op.cu.cc 4 | +++ b/tensorflow/core/kernels/roi_align_op.cu.cc 5 | @@ -1298,8 +1298,8 @@ __global__ void WriteUprightBoxesOutput(const CudaLaunchConfig nboxes, 6 | d_image_out_rois[base_idx + 0] = image_index; 7 | d_image_out_rois[base_idx + 1] = box.x; 8 | d_image_out_rois[base_idx + 2] = box.y; 9 | - d_image_out_rois[base_idx + 3] = box.z; 10 | - d_image_out_rois[base_idx + 4] = box.w; 11 | + d_image_out_rois[base_idx + 3] = box.z + 1.0f; 12 | + d_image_out_rois[base_idx + 4] = box.w + 1.0f; 13 | } 14 | } 15 | 16 | @@ -2395,4 +2395,4 @@ REGISTER_KERNEL_BUILDER( 17 | tensorflow::sami::BoxIntersectionOverUnion); 18 | 19 | } // namespace tensorflow 20 | -#endif 21 | \ No newline at end of file 22 | +#endif 23 | -------------------------------------------------------------------------------- /infra/ami/README.md: -------------------------------------------------------------------------------- 1 | # README 2 | ## Upgrading protoc to 3.6.1 for Horovod install 3 | 4 | Required on DLAMI 21.2 5 | 6 | ``` 7 | pip uninstall -y protobuf 8 | 9 | rm /home/ubuntu/anaconda3/envs/tensorflow_p36_13rc1/bin/protoc 10 | rm -r /home/ubuntu/anaconda3/envs/tensorflow_p36_13rc1/include/google/protobuf 11 | rm /home/ubuntu/anaconda3/envs/tensorflow_p36_13rc1/lib/python3.6/site-packages/protobuf-3.6.0-py3.6-nspkg.pth 12 | 
rm /home/ubuntu/anaconda3/bin//protoc 13 | 14 | wget https://github.com/google/protobuf/releases/download/v3.6.1/protoc-3.6.1-linux-x86_64.zip 15 | mkdir -p /home/ubuntu/protoc 16 | mv protoc-3.6.1-linux-x86_64.zip /home/ubuntu/protoc/protoc-3.6.1-linux-x86_64.zip 17 | unzip /home/ubuntu/protoc/protoc-3.6.1-linux-x86_64.zip -d protoc 18 | sudo mv /home/ubuntu/protoc/bin/protoc /home/ubuntu/anaconda3/envs/tensorflow_p36_13rc1/bin/protoc 19 | sudo mv /home/ubuntu/protoc/include/* /home/ubuntu/anaconda3/envs/tensorflow_p36_13rc1/include 20 | pip install protobuf==3.6.1 -------------------------------------------------------------------------------- /infra/sm/run.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | cd /opt/ml/code/tensorpack-mask-rcnn 5 | BATCH_SIZE_PER_GPU=4 6 | THROUGHPUT_LOG_FREQ=2000 7 | echo "Launch training job...." 8 | /usr/local/bin/python3 MaskRCNN/train.py \ 9 | --logdir /logs/train_log \ 10 | --fp16 \ 11 | --throughput_log_freq ${THROUGHPUT_LOG_FREQ} \ 12 | --config \ 13 | MODE_MASK=True \ 14 | MODE_FPN=True \ 15 | DATA.BASEDIR=/opt/ml/code/data \ 16 | DATA.TRAIN='["train2017"]' \ 17 | DATA.VAL='("val2017",)' \ 18 | TRAIN.BATCH_SIZE_PER_GPU=${BATCH_SIZE_PER_GPU} \ 19 | TRAIN.LR_EPOCH_SCHEDULE='[(8, 0.1), (10, 0.01), (12, None)]' \ 20 | TRAIN.EVAL_PERIOD=12 \ 21 | BACKBONE.WEIGHTS=/opt/ml/code/data/pretrained-models/ImageNet-R50-AlignPadding.npz \ 22 | RPN.TOPK_PER_IMAGE=True \ 23 | PREPROC.PREDEFINED_PADDING=True \ 24 | TRAIN.GRADIENT_CLIP=0 \ 25 | BACKBONE.NORM=FreezeBN \ 26 | TRAINER=horovod 27 | -------------------------------------------------------------------------------- /tensorpack/dataflow/imgaug/_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: _test.py 5 | 6 | 7 | import sys 8 | import cv2 9 | 10 | from . 
import AugmentorList 11 | from .crop import * 12 | from .deform import * 13 | from .imgproc import * 14 | from .noise import SaltPepperNoise 15 | from .noname import * 16 | 17 | anchors = [(0.2, 0.2), (0.7, 0.2), (0.8, 0.8), (0.5, 0.5), (0.2, 0.5)] 18 | augmentors = AugmentorList([ 19 | Contrast((0.8, 1.2)), 20 | Flip(horiz=True), 21 | GaussianDeform(anchors, (360, 480), 0.2, randrange=20), 22 | # RandomCropRandomShape(0.3), 23 | SaltPepperNoise() 24 | ]) 25 | 26 | img = cv2.imread(sys.argv[1]) 27 | newimg, prms = augmentors._augment_return_params(img) 28 | cv2.imshow(" ", newimg.astype('uint8')) 29 | cv2.waitKey() 30 | 31 | newimg = augmentors._augment(img, prms) 32 | cv2.imshow(" ", newimg.astype('uint8')) 33 | cv2.waitKey() 34 | -------------------------------------------------------------------------------- /infra/eks/eksctl/p3/config.yaml: -------------------------------------------------------------------------------- 1 | # An example of ClusterConfig object with access to CSI drivers: 2 | --- 3 | apiVersion: eksctl.io/v1alpha5 4 | kind: ClusterConfig 5 | 6 | metadata: 7 | name: tensorpack-mask-rcnn-p3 8 | region: us-east-1 9 | 10 | 11 | vpc: 12 | id: "vpc-f6570b8d" # (optional, must match VPC ID used for each subnet below) 13 | subnets: 14 | # must provide 'private' and/or 'public' subnets by availability zone as shown 15 | public: 16 | us-east-1b: 17 | id: "subnet-58b35b04" 18 | 19 | us-east-1c: 20 | id: "subnet-b440b9d3" 21 | 22 | us-east-1f: 23 | id: "subnet-21ac2f2e" 24 | 25 | nodeGroups: 26 | - name: ng-p3-1f 27 | instanceType: p3.16xlarge 28 | availabilityZones: ["us-east-1f"] 29 | desiredCapacity: 1 30 | iam: 31 | withAddonPolicies: 32 | imageBuilder: true 33 | ebs: true 34 | fsx: true 35 | efs: true 36 | ssh: 37 | allow: true 38 | publicKeyName: 'maskrcnn' 39 | -------------------------------------------------------------------------------- /tensorpack/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: __init__.py 5 | 6 | """ 7 | Common utils. 8 | These utils should be irrelevant to tensorflow. 9 | """ 10 | 11 | # https://github.com/celery/kombu/blob/7d13f9b95d0b50c94393b962e6def928511bfda6/kombu/__init__.py#L34-L36 12 | STATICA_HACK = True 13 | globals()['kcah_acitats'[::-1].upper()] = False 14 | if STATICA_HACK: 15 | from .utils import * 16 | 17 | 18 | __all__ = [] 19 | 20 | 21 | def _global_import(name): 22 | p = __import__(name, globals(), None, level=1) 23 | lst = p.__all__ if '__all__' in dir(p) else dir(p) 24 | for k in lst: 25 | if not k.startswith('__'): 26 | globals()[k] = p.__dict__[k] 27 | __all__.append(k) 28 | 29 | 30 | _global_import('utils') 31 | 32 | # Import no other submodules. they are supposed to be explicitly imported by users. 33 | __all__.extend(['logger']) 34 | -------------------------------------------------------------------------------- /tensorpack/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: __init__.py 5 | 6 | 7 | from tensorpack.libinfo import __version__, __git_version__, _HAS_TF 8 | 9 | from tensorpack.utils import * 10 | from tensorpack.dataflow import * 11 | 12 | # dataflow can be used alone without installing tensorflow 13 | # TODO maybe separate dataflow to a new project if it's good enough 14 | 15 | # https://github.com/celery/kombu/blob/7d13f9b95d0b50c94393b962e6def928511bfda6/kombu/__init__.py#L34-L36 16 | STATICA_HACK = True 17 | globals()['kcah_acitats'[::-1].upper()] = _HAS_TF 18 | if STATICA_HACK: 19 | from tensorpack.models import * 20 | 21 | from tensorpack.callbacks import * 22 | from tensorpack.tfutils import * 23 | 24 | from tensorpack.train import * 25 | from tensorpack.graph_builder import InputDesc, ModelDesc, ModelDescBase 26 | from tensorpack.input_source import * 27 | from tensorpack.predict import * 28 | -------------------------------------------------------------------------------- /infra/eks/eksctl/p3dn/config.yaml: -------------------------------------------------------------------------------- 1 | # An example of ClusterConfig object with access to CSI drivers: 2 | --- 3 | apiVersion: eksctl.io/v1alpha5 4 | kind: ClusterConfig 5 | 6 | metadata: 7 | name: tensorpack-mask-rcnn-p3dn 8 | region: us-east-1 9 | 10 | 11 | vpc: 12 | id: "vpc-f6570b8d" # (optional, must match VPC ID used for each subnet below) 13 | subnets: 14 | # must provide 'private' and/or 'public' subnets by availability zone as shown 15 | public: 16 | us-east-1b: 17 | id: "subnet-58b35b04" 18 | 19 | us-east-1c: 20 | id: "subnet-b440b9d3" 21 | 22 | us-east-1f: 23 | id: "subnet-21ac2f2e" 24 | 25 | nodeGroups: 26 | - name: ng-p3dn-1c 27 | instanceType: p3dn.24xlarge 28 | availabilityZones: ["us-east-1c"] 29 | desiredCapacity: 1 30 | iam: 31 | withAddonPolicies: 32 | imageBuilder: true 33 | ebs: true 34 | fsx: true 35 | efs: true 36 | ssh: 37 | allow: true 38 | publicKeyName: 'maskrcnn' 39 | 40 | 41 | 42 | # Never eksctl version require: 43 | # 44 | #ssh: 45 | # allow: true 46 | # publicKeyName: 'us-east-1-benchmark-tf' 47 | -------------------------------------------------------------------------------- /tensorpack/models/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: utils.py 5 | 6 | import six 7 | 8 | 9 | class VariableHolder(object): 10 | """ A proxy to access variables defined in a layer. 
""" 11 | def __init__(self, **kwargs): 12 | """ 13 | Args: 14 | kwargs: {name:variable} 15 | """ 16 | self._vars = {} 17 | for k, v in six.iteritems(kwargs): 18 | self._add_variable(k, v) 19 | 20 | def _add_variable(self, name, var): 21 | assert name not in self._vars 22 | self._vars[name] = var 23 | 24 | def __setattr__(self, name, var): 25 | if not name.startswith('_'): 26 | self._add_variable(name, var) 27 | else: 28 | # private attributes 29 | super(VariableHolder, self).__setattr__(name, var) 30 | 31 | def __getattr__(self, name): 32 | return self._vars[name] 33 | 34 | def all(self): 35 | """ 36 | Returns: 37 | list of all variables 38 | """ 39 | return list(six.itervalues(self._vars)) 40 | -------------------------------------------------------------------------------- /infra/eks/fsx/p3/stage-data.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: stage-data 6 | data: 7 | stage-data.sh: | 8 | aws s3 cp --recursive s3://$S3_BUCKET/$S3_PREFIX $STAGE_DIR 9 | --- 10 | apiVersion: v1 11 | kind: Pod 12 | metadata: 13 | name: stage-data 14 | spec: 15 | restartPolicy: Never 16 | volumes: 17 | - name: fsx # efs, or fsx 18 | persistentVolumeClaim: 19 | claimName: tensorpack-fsx 20 | - name: config 21 | configMap: 22 | defaultMode: 420 23 | items: 24 | - key: stage-data.sh 25 | mode: 365 26 | path: stage-data.sh 27 | name: stage-data 28 | containers: 29 | - name: data 30 | env: 31 | - name: S3_BUCKET 32 | value: armand-ajay-workshop 33 | - name: S3_PREFIX 34 | value: mask-rcnn/sagemaker/input/train 35 | - name: STAGE_DIR 36 | value: /fsx 37 | command: 38 | - sh 39 | - /etc/config/stage-data.sh 40 | image: armandmcqueen/tensorpack-mask-rcnn:master-latest 41 | imagePullPolicy: IfNotPresent 42 | volumeMounts: 43 | - mountPath: /etc/config 44 | name: config 45 | - mountPath: /fsx # /efs or /fsx 46 | name: fsx # efs, or fsx -------------------------------------------------------------------------------- /infra/eks/fsx/p3dn/stage-data.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: ConfigMap 4 | metadata: 5 | name: stage-data 6 | data: 7 | stage-data.sh: | 8 | aws s3 cp --recursive s3://$S3_BUCKET/$S3_PREFIX $STAGE_DIR 9 | --- 10 | apiVersion: v1 11 | kind: Pod 12 | metadata: 13 | name: stage-data 14 | spec: 15 | restartPolicy: Never 16 | volumes: 17 | - name: fsx # efs, or fsx 18 | persistentVolumeClaim: 19 | claimName: tensorpack-fsx 20 | - name: config 21 | configMap: 22 | defaultMode: 420 23 | items: 24 | - key: stage-data.sh 25 | mode: 365 26 | path: stage-data.sh 27 | name: stage-data 28 | containers: 29 | - name: data 30 | env: 31 | - name: S3_BUCKET 32 | value: armand-ajay-workshop 33 | - name: S3_PREFIX 34 | value: mask-rcnn/sagemaker/input/train 35 | - name: STAGE_DIR 36 | value: /fsx 37 | command: 38 | - sh 39 | - /etc/config/stage-data.sh 40 | image: armandmcqueen/tensorpack-mask-rcnn:master-latest 41 | imagePullPolicy: IfNotPresent 42 | volumeMounts: 43 | - mountPath: /etc/config 44 | name: config 45 | - mountPath: /fsx # /efs or /fsx 46 | name: fsx # efs, or fsx -------------------------------------------------------------------------------- /infra/docker/docker.md: -------------------------------------------------------------------------------- 1 | # TODO 2 | 3 | Add image to dockerhub and add scripts to push to ECR. 
4 | 5 | https://github.com/ajayvohra2005/tf-tp-hvd-eks/blob/master/container/build_tools/build_and_push.sh 6 | 7 | # Using Docker 8 | 9 | The EC2 instance must have the training data available at ~/data. 10 | 11 | ### Build container 12 | ``` 13 | cd docker 14 | ./build.sh 15 | ``` 16 | 17 | ### Run container interactively 18 | ``` 19 | ./run.sh 20 | ``` 21 | 22 | 23 | ### Run training job inside container 24 | 25 | ``` 26 | cd tensorpack-mask-rcnn 27 | infra/docker/train.sh 8 1 250 28 | ``` 29 | 30 | 31 | This is 8 GPUs, 1 img per GPU, summary writer logs every 250 steps. 32 | 33 | Logs will be exposed to the EC2 instance at ~/logs. 34 | 35 | ### Attaching/Detaching from docker container 36 | `ctrl + p + q` will detach 37 | `docker ps` will give info on the running docker containers, including a convenient name. 38 | `docker attach $CONTAINER_NAME` will reattach to the running docker container. 39 | 40 | ## Notes 41 | 42 | The current Dockerfile uses the wheel built for p3.16xl. The wheel built for p3dn.24xl might have a performance improvement, but it does not run on 16xl due to different available instruction sets. 43 | -------------------------------------------------------------------------------- /tensorpack/models/_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: _test.py 5 | 6 | 7 | import logging 8 | import unittest 9 | import tensorflow as tf 10 | 11 | 12 | class TestModel(unittest.TestCase): 13 | 14 | def run_variable(self, var): 15 | sess = tf.Session() 16 | sess.run(tf.global_variables_initializer()) 17 | if isinstance(var, list): 18 | return sess.run(var) 19 | else: 20 | return sess.run([var])[0] 21 | 22 | def make_variable(self, *args): 23 | if len(args) > 1: 24 | return [tf.Variable(k) for k in args] 25 | else: 26 | return tf.Variable(args[0]) 27 | 28 | 29 | def run_test_case(case): 30 | suite = unittest.TestLoader().loadTestsFromTestCase(case) 31 | unittest.TextTestRunner(verbosity=2).run(suite) 32 | 33 | 34 | if __name__ == '__main__': 35 | import tensorpack 36 | from tensorpack.utils import logger 37 | from . import * # noqa 38 | logger.setLevel(logging.CRITICAL) 39 | subs = tensorpack.models._test.TestModel.__subclasses__() 40 | for cls in subs: 41 | run_test_case(cls) 42 | -------------------------------------------------------------------------------- /tensorpack/input_source/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: __init__.py 5 | 6 | # https://github.com/celery/kombu/blob/7d13f9b95d0b50c94393b962e6def928511bfda6/kombu/__init__.py#L34-L36 7 | STATICA_HACK = True 8 | globals()['kcah_acitats'[::-1].upper()] = False 9 | if STATICA_HACK: 10 | from .input_source_base import * 11 | from .input_source import * 12 | 13 | from pkgutil import iter_modules 14 | import os 15 | import os.path 16 | 17 | __all__ = [] 18 | 19 | 20 | def global_import(name): 21 | p = __import__(name, globals(), locals(), level=1) 22 | lst = p.__all__ if '__all__' in dir(p) else [] 23 | del globals()[name] 24 | for k in lst: 25 | if not k.startswith('__'): 26 | globals()[k] = p.__dict__[k] 27 | __all__.append(k) 28 | 29 | 30 | _CURR_DIR = os.path.dirname(__file__) 31 | _SKIP = [] 32 | for _, module_name, _ in iter_modules( 33 | [_CURR_DIR]): 34 | srcpath = os.path.join(_CURR_DIR, module_name + '.py') 35 | if not os.path.isfile(srcpath): 36 | continue 37 | if module_name.startswith('_'): 38 | continue 39 | if module_name not in _SKIP: 40 | global_import(module_name) 41 | -------------------------------------------------------------------------------- /RESULTS.md: -------------------------------------------------------------------------------- 1 | # Results 2 | 3 | Detailed results coming soon! 4 | 5 | ## Advanced configurations 6 | 7 | There are a few advanced configurations that you should be aware of for optimal performance. 8 | 9 | ### p3dn 10 | 11 | When using p3dn, you will want to use 13 NCCL rings. With p3.16xl, 8 NCCL rings is a good choice. 12 | 13 | ### Prioritizing bounding box accuracy 14 | 15 | You can use a improved bounding box regression weight (`cfg.FRCNN.BBOX_REG_WEIGHTS`) to get better bounding box mAP. If you use `[20, 20, 10, 10]` instead of `[10., 10., 5., 5.]` you will see a solid improvement in bbox mAP (for 12 epochs, 8x4 training, from 37.3 to 398) with a slight decrease in segmentation accuracy (34.3 to 34.2). As you increase the total batch size, the bbox improvement decreases and the segm penalty increases. 16 | 17 | ### SyncBN 18 | 19 | You can use SyncBN to train with very large batch sizes without getting NaN losses. However, currently the accuracy is generally lower than when using FreezeBN and the throughput is significantly worse. 20 | 21 | ### Large batch size 22 | 23 | When training in the 32x4 configuration, you will get NaN ~5% of the time if you do not use gradient clipping. To enable gradient clipping, you need to add `TRAIN.GRADIENT_CLIP=1.5` to the config. This has a minor throughput impact, but eliminates NaN runs. -------------------------------------------------------------------------------- /tensorpack/predict/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: __init__.py 5 | 6 | # https://github.com/celery/kombu/blob/7d13f9b95d0b50c94393b962e6def928511bfda6/kombu/__init__.py#L34-L36 7 | STATICA_HACK = True 8 | globals()['kcah_acitats'[::-1].upper()] = False 9 | if STATICA_HACK: 10 | from .base import * 11 | from .concurrency import * 12 | from .config import * 13 | from .dataset import * 14 | from .multigpu import * 15 | 16 | 17 | from pkgutil import iter_modules 18 | import os 19 | import os.path 20 | 21 | __all__ = [] 22 | 23 | 24 | def global_import(name): 25 | p = __import__(name, globals(), locals(), level=1) 26 | lst = p.__all__ if '__all__' in dir(p) else dir(p) 27 | if lst: 28 | del globals()[name] 29 | for k in lst: 30 | globals()[k] = p.__dict__[k] 31 | __all__.append(k) 32 | 33 | 34 | _CURR_DIR = os.path.dirname(__file__) 35 | for _, module_name, _ in iter_modules( 36 | [_CURR_DIR]): 37 | srcpath = os.path.join(_CURR_DIR, module_name + '.py') 38 | if not os.path.isfile(srcpath): 39 | continue 40 | if module_name.startswith('_'): 41 | continue 42 | global_import(module_name) 43 | -------------------------------------------------------------------------------- /tensorpack/utils/debug.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: debug.py 5 | 6 | 7 | import sys 8 | 9 | 10 | def enable_call_trace(): 11 | """ Enable trace for calls to any function. """ 12 | def tracer(frame, event, arg): 13 | if event == 'call': 14 | co = frame.f_code 15 | func_name = co.co_name 16 | if func_name == 'write' or func_name == 'print': 17 | # ignore write() calls from print statements 18 | return 19 | func_line_no = frame.f_lineno 20 | func_filename = co.co_filename 21 | caller = frame.f_back 22 | if caller: 23 | caller_line_no = caller.f_lineno 24 | caller_filename = caller.f_code.co_filename 25 | print('Call to `%s` on line %s:%s from %s:%s' % 26 | (func_name, func_filename, func_line_no, 27 | caller_filename, caller_line_no)) 28 | return 29 | sys.settrace(tracer) 30 | 31 | 32 | if __name__ == '__main__': 33 | enable_call_trace() 34 | 35 | def b(a): 36 | print(2) 37 | 38 | def a(): 39 | print(1) 40 | b(1) 41 | 42 | a() 43 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/values.yaml: -------------------------------------------------------------------------------- 1 | global: 2 | namespace: default 3 | name: maskrcnn 4 | maskrcnn: 5 | experiment_group: default # For organizing result dirs. 
6 | gpus: 8 7 | batch_size_per_gpu: 1 8 | image: armandmcqueen/tensorpack-mask-rcnn:master-latest # image URL from ECR or DockerHub 9 | train_script: /tensorpack-mask-rcnn/MaskRCNN/train.py 10 | fp_16: 1 # TODO: Setting this to 0 does not disable FP16, it just disables loss scaling 11 | base_lr: 0.00125 12 | warmup_lr: 0.000416667 13 | shared_fs: fsx 14 | data_fs: fsx 15 | shared_pvc: tensorpack-fsx 16 | data_dir: '' 17 | working_dir: /tensorpack-mask-rcnn 18 | images_per_epoch: 120000 19 | lr_epoch_schedule: "[(8, 0.1), (10, 0.01), (12, None)]" 20 | eval_period_in_epochs: 24 21 | data_train: "[\"train2017\"]" 22 | data_val: "(\"val2017\")" 23 | mode_fpn: 'True' 24 | mode_mask: 'True' 25 | backbone_norm: FreezeBN 26 | backbone_weights: pretrained-models/ImageNet-R50-AlignPadding.npz 27 | predefined_padding: 'True' 28 | topk_per_image: 'True' 29 | image_pull_policy: Always 30 | horovod_cycle_time: "0.5" 31 | horovod_fusion_threshold: "67108864" 32 | nccl_socket_ifname: ^lo,docker0 33 | nccl_min_rings: 8 34 | nccl_debug: INFO 35 | bbox_reg_weights: '[10., 10., 5., 5.]' 36 | result_score_thresh: 0.05 37 | gpus_per_node: 8 38 | -------------------------------------------------------------------------------- /tensorpack/dataflow/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: __init__.py 5 | 6 | # https://github.com/celery/kombu/blob/7d13f9b95d0b50c94393b962e6def928511bfda6/kombu/__init__.py#L34-L36 7 | STATICA_HACK = True 8 | globals()['kcah_acitats'[::-1].upper()] = False 9 | if STATICA_HACK: 10 | from .bsds500 import * 11 | from .cifar import * 12 | from .ilsvrc import * 13 | from .mnist import * 14 | from .svhn import * 15 | 16 | from pkgutil import iter_modules 17 | import os 18 | import os.path 19 | 20 | __all__ = [] 21 | 22 | 23 | def global_import(name): 24 | p = __import__(name, globals(), locals(), level=1) 25 | lst = p.__all__ if '__all__' in dir(p) else dir(p) 26 | if lst: 27 | del globals()[name] 28 | for k in lst: 29 | if not k.startswith('__'): 30 | globals()[k] = p.__dict__[k] 31 | __all__.append(k) 32 | 33 | 34 | _CURR_DIR = os.path.dirname(__file__) 35 | for _, module_name, _ in iter_modules( 36 | [_CURR_DIR]): 37 | srcpath = os.path.join(_CURR_DIR, module_name + '.py') 38 | if not os.path.isfile(srcpath): 39 | continue 40 | if not module_name.startswith('_'): 41 | global_import(module_name) 42 | -------------------------------------------------------------------------------- /tensorpack/train/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: __init__.py 5 | # flake8: noqa 6 | 7 | # https://github.com/celery/kombu/blob/7d13f9b95d0b50c94393b962e6def928511bfda6/kombu/__init__.py#L34-L36 8 | STATICA_HACK = True 9 | globals()['kcah_acitats'[::-1].upper()] = False 10 | if STATICA_HACK: 11 | from .base import * 12 | from .config import * 13 | from .interface import * 14 | from .tower import * 15 | from .trainers import * 16 | 17 | 18 | from pkgutil import iter_modules 19 | import os 20 | import os.path 21 | 22 | __all__ = [] 23 | 24 | 25 | def global_import(name): 26 | p = __import__(name, globals(), locals(), level=1) 27 | lst = p.__all__ if '__all__' in dir(p) else [] 28 | if lst: 29 | del globals()[name] 30 | for k in lst: 31 | globals()[k] = p.__dict__[k] 32 | __all__.append(k) 33 | 34 | 35 | _CURR_DIR = os.path.dirname(__file__) 36 | _SKIP = ['utility'] 37 | for _, module_name, _ in iter_modules( 38 | [_CURR_DIR]): 39 | srcpath = os.path.join(_CURR_DIR, module_name + '.py') 40 | if not os.path.isfile(srcpath): 41 | continue 42 | if module_name.startswith('_'): 43 | continue 44 | if module_name not in _SKIP: 45 | global_import(module_name) 46 | -------------------------------------------------------------------------------- /tensorpack/graph_builder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: __init__.py 5 | 6 | # https://github.com/celery/kombu/blob/7d13f9b95d0b50c94393b962e6def928511bfda6/kombu/__init__.py#L34-L36 7 | STATICA_HACK = True 8 | globals()['kcah_acitats'[::-1].upper()] = False 9 | if STATICA_HACK: 10 | from .model_desc import * 11 | from .training import * 12 | from .distributed import * 13 | from .predict import * 14 | from .utils import * 15 | 16 | from pkgutil import iter_modules 17 | import os 18 | import os.path 19 | 20 | __all__ = [] 21 | 22 | def global_import(name): 23 | p = __import__(name, globals(), locals(), level=1) 24 | lst = p.__all__ if '__all__' in dir(p) else [] 25 | del globals()[name] 26 | for k in lst: 27 | if not k.startswith('__'): 28 | globals()[k] = p.__dict__[k] 29 | __all__.append(k) 30 | 31 | 32 | _CURR_DIR = os.path.dirname(__file__) 33 | _SKIP = ['distributed'] 34 | for _, module_name, _ in iter_modules( 35 | [_CURR_DIR]): 36 | srcpath = os.path.join(_CURR_DIR, module_name + '.py') 37 | if not os.path.isfile(srcpath): 38 | continue 39 | if module_name.startswith('_'): 40 | continue 41 | if module_name not in _SKIP: 42 | global_import(module_name) 43 | -------------------------------------------------------------------------------- /infra/eks/helm/mpijob/templates/mpijob.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apiextensions.k8s.io/v1beta1 3 | kind: CustomResourceDefinition 4 | metadata: 5 | name: mpijobs.kubeflow.org 6 | spec: 7 | group: kubeflow.org 8 | names: 9 | kind: MPIJob 10 | plural: mpijobs 11 | shortNames: 12 | - mj 13 | - mpij 14 | singular: mpijob 15 | scope: Namespaced 16 | validation: 17 | openAPIV3Schema: 18 | properties: 19 | spec: 20 | description: Either `gpus` or `replicas` should be specified, but not both 21 | oneOf: 22 | - properties: 23 | gpus: 24 | description: Valid values are 1, 2, 4, or any multiple of 8 25 | oneOf: 26 | - enum: 27 | - 1 28 | - 2 29 | - 4 30 | type: integer 31 | - minimum: 8 32 | 
multipleOf: 8 33 | type: integer 34 | title: Total number of GPUs 35 | required: 36 | - gpus 37 | - properties: 38 | replicas: 39 | description: The GPU resource limit should be specified for each replica 40 | minimum: 1 41 | title: Total number of replicas 42 | type: integer 43 | required: 44 | - replicas 45 | title: The MPIJob spec 46 | version: v1alpha1 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # specific stuff 2 | venv/ 3 | test.py 4 | tensorflow-1.13.0-cp36-cp36m-linux_x86_64.whl 5 | .ignored/ 6 | 7 | 8 | # EKS configs 9 | infra/eks/maskrcnn/values/ 10 | 11 | 12 | # tensorpack-specific stuff 13 | train_log 14 | train_log_* 15 | logs 16 | *.npy 17 | *.npz 18 | *.caffemodel 19 | *.tfmodel 20 | *.meta 21 | *.log* 22 | *.bin 23 | *.png 24 | *.jpg 25 | checkpoint 26 | *.json 27 | *.prototxt 28 | *.txt 29 | *.tgz 30 | *.gz 31 | 32 | 33 | 34 | 35 | 36 | # Byte-compiled / optimized / DLL files 37 | __pycache__/ 38 | *.py[cod] 39 | 40 | # C extensions 41 | *.so 42 | 43 | # Distribution / packaging 44 | .Python 45 | env/ 46 | build/ 47 | develop-eggs/ 48 | dist/ 49 | downloads/ 50 | eggs/ 51 | .eggs/ 52 | lib/ 53 | lib64/ 54 | parts/ 55 | sdist/ 56 | var/ 57 | *.egg-info/ 58 | .installed.cfg 59 | *.egg 60 | 61 | # PyInstaller 62 | # Usually these files are written by a python script from a template 63 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 64 | *.manifest 65 | *.spec 66 | 67 | # Installer logs 68 | pip-log.txt 69 | pip-delete-this-directory.txt 70 | 71 | # Unit test / coverage reports 72 | htmlcov/ 73 | .tox/ 74 | .coverage 75 | .coverage.* 76 | .cache 77 | nosetests.xml 78 | coverage.xml 79 | *,cover 80 | 81 | # Translations 82 | *.mo 83 | *.pot 84 | 85 | # Django stuff: 86 | *.log 87 | 88 | # Sphinx documentation 89 | docs/_build/ 90 | 91 | # PyBuilder 92 | target/ 93 | *.dat 94 | 95 | .idea/ 96 | -------------------------------------------------------------------------------- /infra/sm/README.md: -------------------------------------------------------------------------------- 1 | # Train with Sagemaker 2 | 3 | ## To launch training 4 | 5 | - (1) Set up your Sagemaker role according to https://medium.com/ml-bytes/how-to-a-create-a-sagemaker-execution-role-539866910bda and record it as `$YOUR_SM_ROLE` 6 | - Make sure you have full access to S3 7 | - (2) Modify `launch_sm_job.py`, pick your sagemaker_iam_role, instance type, number of instances, GPUs per instance and other Sagemaker specifications. 8 | - (3) Modify `run.sh`, pick your batch_size, training epochs and other training parameters. 9 | - (4) Create a repo in ECR named `$YOUR_JOB_NAME` 10 | - (5) Launch your training job by running `./build_push_submit.sh $YOUR_JOB_NAME $YOUR_SM_ROLE` 11 | - If you already have your image in ECR and just want to launch the job, you can run `python3 launch_sm_job.py $YOUR_JOB_NAME $YOUR_SM_ROLE` 12 | 13 | ## What happens inside?
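At a high level, the launch steps above boil down to two commands (a sketch only; the image/job name and role ARN below are placeholders):

```
# Build the image, push it to ECR and submit the SageMaker job in one go
./build_push_submit.sh my-maskrcnn-image arn:aws:iam::123456789012:role/MySageMakerRole

# Or, if the image is already in ECR, just submit the job
python3 launch_sm_job.py my-maskrcnn-image arn:aws:iam::123456789012:role/MySageMakerRole
```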
14 | 15 | ### (1) Build image and push it to ECR 16 | - The `Dockerfile_base` is similar to the Dockerfile used for EKS and EC2; use it to build the base image 17 | - The `Dockerfile_sm` is specific to SageMaker; every time `run_mpi.py` or `run.sh` changes, the image needs to be rebuilt 18 | - `build_and_push.sh` will build the image and push it to ECR 19 | ### (2) Launch SageMaker estimator job 20 | - `launch_sm_job.py` will launch the estimator, which essentially starts the instances in containers using the Docker image we built before. Once an instance starts, it launches `run_mpi.py` 21 | - `run_mpi.py` builds the MPI command for multi-node, multi-GPU training. It runs `run.sh`, which launches the training job. 22 | -------------------------------------------------------------------------------- /tensorpack/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: __init__.py 5 | 6 | # https://github.com/celery/kombu/blob/7d13f9b95d0b50c94393b962e6def928511bfda6/kombu/__init__.py#L34-L36 7 | STATICA_HACK = True 8 | globals()['kcah_acitats'[::-1].upper()] = False 9 | if STATICA_HACK: 10 | from .batch_norm import * 11 | from .common import * 12 | from .conv2d import * 13 | from .fc import * 14 | from .layer_norm import * 15 | from .linearwrap import * 16 | from .nonlin import * 17 | from .pool import * 18 | from .regularize import * 19 | 20 | 21 | from pkgutil import iter_modules 22 | import os 23 | import os.path 24 | # this line is necessary for _TFModuleFunc to work 25 | import tensorflow as tf # noqa: F401 26 | 27 | __all__ = [] 28 | 29 | 30 | def _global_import(name): 31 | p = __import__(name, globals(), locals(), level=1) 32 | lst = p.__all__ if '__all__' in dir(p) else dir(p) 33 | del globals()[name] 34 | for k in lst: 35 | if not k.startswith('__'): 36 | globals()[k] = p.__dict__[k] 37 | __all__.append(k) 38 | 39 | 40 | _CURR_DIR = os.path.dirname(__file__) 41 | _SKIP = ['utils', 'registry', 'tflayer'] 42 | for _, module_name, _ in iter_modules( 43 | [_CURR_DIR]): 44 | srcpath = os.path.join(_CURR_DIR, module_name + '.py') 45 | if not os.path.isfile(srcpath): 46 | continue 47 | if module_name.startswith('_'): 48 | continue 49 | if module_name not in _SKIP: 50 | _global_import(module_name) 51 | -------------------------------------------------------------------------------- /infra/docker/train.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | NUM_GPU=${1:-1} 5 | BATCH_SIZE_PER_GPU=${2:-1} 6 | THROUGHPUT_LOG_FREQ=${3:-2000} 7 | 8 | 9 | echo "" 10 | echo "NUM_GPU: ${NUM_GPU}" 11 | echo "BATCH_SIZE_PER_GPU: ${BATCH_SIZE_PER_GPU}" 12 | echo "THROUGHPUT_LOG_FREQ: ${THROUGHPUT_LOG_FREQ}" 13 | echo "" 14 | 15 | 16 | 17 | /usr/local/bin/mpirun -np ${NUM_GPU} \ 18 | --H localhost:${NUM_GPU} \ 19 | --mca plm_rsh_no_tree_spawn 1 -bind-to none -map-by slot -mca pml ob1 -mca btl ^openib \ 20 | -mca btl_tcp_if_exclude lo,docker0 \ 21 | -mca btl_vader_single_copy_mechanism none \ 22 | -x LD_LIBRARY_PATH \ 23 | -x PATH \ 24 | -x NCCL_SOCKET_IFNAME=^docker0,lo \ 25 | -x NCCL_MIN_NRINGS=8 \ 26 | -x NCCL_DEBUG=INFO \ 27 | -x TENSORPACK_FP16=1 \ 28 | -x HOROVOD_CYCLE_TIME=0.5 \ 29 | -x HOROVOD_FUSION_THRESHOLD=67108864 \ 30 | --output-filename /logs/mpirun_logs \ 31 | /usr/local/bin/python3 /tensorpack-mask-rcnn/MaskRCNN/train.py \ 32 | --logdir /logs/train_log \ 33 | --fp16 \ 34 | --throughput_log_freq ${THROUGHPUT_LOG_FREQ} \ 35 | --config \ 36 | MODE_MASK=True \ 37 | MODE_FPN=True \ 38 | DATA.BASEDIR=/data \ 39 | DATA.TRAIN='["train2017"]' \ 40 | DATA.VAL='("val2017",)' \ 41 | TRAIN.BATCH_SIZE_PER_GPU=${BATCH_SIZE_PER_GPU} \ 42 | TRAIN.LR_EPOCH_SCHEDULE='[(8, 0.1), (10, 0.01), (12, None)]' \ 43 | TRAIN.EVAL_PERIOD=12 \ 44 | RPN.TOPK_PER_IMAGE=True \ 45 | PREPROC.PREDEFINED_PADDING=True \ 46 | BACKBONE.WEIGHTS=/data/pretrained-models/ImageNet-R50-AlignPadding.npz \ 47 | BACKBONE.NORM=FreezeBN \ 48 | TRAINER=horovod 49 | #For 32x4 50 | #TRAIN.GRADIENT_CLIP=1.5 51 | -------------------------------------------------------------------------------- /tensorpack/callbacks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: __init__.py 5 | 6 | 7 | # https://github.com/celery/kombu/blob/7d13f9b95d0b50c94393b962e6def928511bfda6/kombu/__init__.py#L34-L36 8 | STATICA_HACK = True 9 | globals()['kcah_acitats'[::-1].upper()] = False 10 | if STATICA_HACK: 11 | from .base import * 12 | from .concurrency import * 13 | from .graph import * 14 | from .group import * 15 | from .hooks import * 16 | from .inference import * 17 | from .inference_runner import * 18 | from .monitor import * 19 | from .param import * 20 | from .prof import * 21 | from .saver import * 22 | from .misc import * 23 | from .steps import * 24 | from .summary import * 25 | from .trigger import * 26 | 27 | 28 | from pkgutil import iter_modules 29 | import os 30 | 31 | 32 | __all__ = [] 33 | 34 | 35 | def _global_import(name): 36 | p = __import__(name, globals(), locals(), level=1) 37 | lst = p.__all__ if '__all__' in dir(p) else dir(p) 38 | if lst: 39 | del globals()[name] 40 | for k in lst: 41 | if not k.startswith('__'): 42 | globals()[k] = p.__dict__[k] 43 | __all__.append(k) 44 | 45 | 46 | _CURR_DIR = os.path.dirname(__file__) 47 | for _, module_name, _ in iter_modules( 48 | [_CURR_DIR]): 49 | srcpath = os.path.join(_CURR_DIR, module_name + '.py') 50 | if not os.path.isfile(srcpath): 51 | continue 52 | if not module_name.startswith('_'): 53 | _global_import(module_name) 54 | -------------------------------------------------------------------------------- /tensorpack/tfutils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. 
All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: __init__.py 5 | 6 | 7 | from .tower import get_current_tower_context, TowerContext 8 | 9 | # https://github.com/celery/kombu/blob/7d13f9b95d0b50c94393b962e6def928511bfda6/kombu/__init__.py#L34-L36 10 | STATICA_HACK = True 11 | globals()['kcah_acitats'[::-1].upper()] = False 12 | if STATICA_HACK: 13 | from .common import * 14 | from .sessinit import * 15 | from .argscope import * 16 | 17 | 18 | # don't want to include everything from .tower 19 | __all__ = ['get_current_tower_context', 'TowerContext'] 20 | 21 | 22 | def _global_import(name): 23 | p = __import__(name, globals(), None, level=1) 24 | lst = p.__all__ if '__all__' in dir(p) else dir(p) 25 | for k in lst: 26 | if not k.startswith('__'): 27 | globals()[k] = p.__dict__[k] 28 | __all__.append(k) 29 | 30 | 31 | _TO_IMPORT = frozenset([ 32 | 'common', 33 | 'sessinit', 34 | 'argscope', 35 | ]) 36 | 37 | for module_name in _TO_IMPORT: 38 | _global_import(module_name) 39 | 40 | """ 41 | TODO remove this line in the future. 42 | Better to keep submodule names (sesscreate, varmanip, etc) out of __all__, 43 | so that these names will be invisible under `tensorpack.` namespace. 44 | 45 | To use these utilities, users are expected to import them explicitly, e.g.: 46 | 47 | import tensorpack.tfutils.symbolic_functions as symbf 48 | """ 49 | __all__.extend(['sessinit', 'summary', 'optimizer', 50 | 'sesscreate', 'gradproc', 'varreplace', 'symbolic_functions', 51 | 'distributed', 'tower']) 52 | -------------------------------------------------------------------------------- /infra/docker/README.md: -------------------------------------------------------------------------------- 1 | # To train with docker 2 | 3 | ## To run on single-node 4 | Refer to [Run with docker](https://github.com/armandmcqueen/tensorpack-mask-rcnn/blob/master/infra/docker/docker.md#using-docker "Run with docker") 5 | 6 | ## To run on multi-node 7 | Make sure you have your data ready as in [Run with docker](https://github.com/armandmcqueen/tensorpack-mask-rcnn/blob/master/infra/docker/docker.md#using-docker "Run with docker"). 8 | ### SSH settings and build container 9 | - SSH into your master node and clone the repo with `git clone https://github.com/armandmcqueen/tensorpack-mask-rcnn.git` 10 | - run `cd ~/tensorpack-mask-rcnn/infra/docker/` 11 | - create your hosts file without slots 12 | - run `./ssh_and_build.sh $YOUR_MASTER_IP $YOUR_HOST_FILE`; this will enable passwordless SSH and build the container on each of the nodes 13 | ### Run container 14 | On each of the instances: 15 | - run `cd ~/tensorpack-mask-rcnn/infra/docker/` 16 | - run `./run_multinode.sh` to start the container 17 | 18 | ### Launch training 19 | Inside the container: 20 | - On each host *apart from the primary*, run the following in the container you started: 21 | - run `cd tensorpack-mask-rcnn/infra/docker/` 22 | - run `./sleep.sh` 23 | This will make those containers listen for SSH connections on port 1234. 24 | - On the primary host, `cd tensorpack-mask-rcnn/infra/docker` and create your hosts file, which contains the IPs of all your nodes (including the primary host). The format should look like: 25 | ``` 26 | 127.0.0.1 slots=8 27 | 127.0.0.2 slots=8 28 | 127.0.0.3 slots=8 29 | 127.0.0.4 slots=8 30 | ``` 31 | This is 4 nodes, 8 GPUs per node.
32 | Launch training with running `./train_multinode.sh 32 4` for 32 GPUs and 4 images per GPU 33 | -------------------------------------------------------------------------------- /infra/docker/train_multinode.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | NUM_GPU=${1:-1} 5 | BATCH_SIZE_PER_GPU=${2:-1} 6 | PORT_ID=${3:-1234} 7 | THROUGHPUT_LOG_FREQ=${4:-2000} 8 | 9 | 10 | 11 | echo "" 12 | echo "NUM_GPU: ${NUM_GPU}" 13 | echo "BATCH_SIZE_PER_GPU: ${BATCH_SIZE_PER_GPU}" 14 | echo "THROUGHPUT_LOG_FREQ: ${THROUGHPUT_LOG_FREQ}" 15 | echo "" 16 | 17 | 18 | 19 | /usr/local/bin/mpirun -np ${NUM_GPU} \ 20 | --hostfile hosts \ 21 | --mca plm_rsh_no_tree_spawn 1 -bind-to none -map-by slot -mca pml ob1 -mca btl ^openib \ 22 | -mca btl_tcp_if_exclude lo,docker0 \ 23 | -mca btl_vader_single_copy_mechanism none \ 24 | -mca plm_rsh_args "-p ${PORT_ID}" \ 25 | -x LD_LIBRARY_PATH \ 26 | -x PATH \ 27 | -x NCCL_SOCKET_IFNAME=^docker0,lo \ 28 | -x NCCL_MIN_NRINGS=8 \ 29 | -x NCCL_DEBUG=INFO \ 30 | -x TENSORPACK_FP16=1 \ 31 | -x HOROVOD_CYCLE_TIME=0.5 \ 32 | -x HOROVOD_FUSION_THRESHOLD=67108864 \ 33 | --output-filename /logs/mpirun_logs \ 34 | /usr/local/bin/python3 /tensorpack-mask-rcnn/MaskRCNN/train.py \ 35 | --logdir /logs/train_log \ 36 | --fp16 \ 37 | --throughput_log_freq ${THROUGHPUT_LOG_FREQ} \ 38 | --config \ 39 | MODE_MASK=True \ 40 | MODE_FPN=True \ 41 | DATA.BASEDIR=/data \ 42 | DATA.TRAIN='["train2017"]' \ 43 | DATA.VAL='("val2017",)' \ 44 | TRAIN.BATCH_SIZE_PER_GPU=${BATCH_SIZE_PER_GPU} \ 45 | TRAIN.LR_EPOCH_SCHEDULE='[(8, 0.1), (10, 0.01), (12, None)]' \ 46 | TRAIN.EVAL_PERIOD=12 \ 47 | RPN.TOPK_PER_IMAGE=True \ 48 | PREPROC.PREDEFINED_PADDING=True \ 49 | BACKBONE.WEIGHTS=/data/pretrained-models/ImageNet-R50-AlignPadding.npz \ 50 | BACKBONE.NORM=FreezeBN \ 51 | TRAINER=horovod 52 | #For 32x4 53 | #TRAIN.GRADIENT_CLIP=1.5 54 | -------------------------------------------------------------------------------- /infra/sm/launch_sm_job.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | from sagemaker import get_execution_role 4 | import sagemaker as sage 5 | from sagemaker.estimator import Estimator 6 | import datetime 7 | import subprocess 8 | import sys 9 | 10 | def get_str(cmd): 11 | content = subprocess.check_output(cmd, shell=True) 12 | return str(content)[2:-3] 13 | 14 | account = get_str("echo $(aws sts get-caller-identity --query Account --output text)") 15 | region = get_str("echo $(aws configure get region)") 16 | image = str(sys.argv[1]) 17 | sess = sage.Session() 18 | image_name=f"{account}.dkr.ecr.{region}.amazonaws.com/{image}" 19 | sagemaker_iam_role = str(sys.argv[2]) #get_execution_role() 20 | num_gpus = 8 21 | num_nodes = 4 22 | instance_type = 'ml.p3.16xlarge' 23 | custom_mpi_cmds = [] 24 | 25 | job_name = "maskrcnn-{}x{}-{}".format(num_nodes, num_gpus, image) 26 | 27 | output_path = 's3://mrcnn-sagemaker/sagemaker_training_release' 28 | 29 | hyperparams = {"sagemaker_use_mpi": "True", 30 | "sagemaker_process_slots_per_host": num_gpus, 31 | "num_gpus":num_gpus, 32 | "num_nodes": num_nodes, 33 | "custom_mpi_cmds": custom_mpi_cmds} 34 | 35 | estimator = Estimator(image_name, role=sagemaker_iam_role, output_path=output_path, 36 | train_instance_count=num_nodes, 37 | train_instance_type=instance_type, 38 | sagemaker_session=sess, 39 | train_volume_size=200, 40 | base_job_name=job_name, 41 | subnets=['subnet-21ac2f2e'], 42 | hyperparameters=hyperparams) 43 | 44 | estimator.fit(wait=False) 45 | -------------------------------------------------------------------------------- /tensorpack/dataflow/imgaug/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: __init__.py 5 | 6 | # https://github.com/celery/kombu/blob/7d13f9b95d0b50c94393b962e6def928511bfda6/kombu/__init__.py#L34-L36 7 | STATICA_HACK = True 8 | globals()['kcah_acitats'[::-1].upper()] = False 9 | if STATICA_HACK: 10 | from .base import * 11 | from .convert import * 12 | from .crop import * 13 | from .deform import * 14 | from .geometry import * 15 | from .imgproc import * 16 | from .meta import * 17 | from .misc import * 18 | from .noise import * 19 | from .paste import * 20 | from .transform import * 21 | from .external import * 22 | 23 | 24 | import os 25 | from pkgutil import iter_modules 26 | 27 | __all__ = [] 28 | 29 | 30 | def global_import(name): 31 | p = __import__(name, globals(), locals(), level=1) 32 | lst = p.__all__ if '__all__' in dir(p) else dir(p) 33 | if lst: 34 | del globals()[name] 35 | for k in lst: 36 | if not k.startswith('__'): 37 | globals()[k] = p.__dict__[k] 38 | __all__.append(k) 39 | 40 | 41 | try: 42 | import cv2 # noqa 43 | except ImportError: 44 | from ...utils import logger 45 | logger.warn("Cannot import 'cv2', therefore image augmentation is not available.") 46 | else: 47 | _CURR_DIR = os.path.dirname(__file__) 48 | for _, module_name, _ in iter_modules( 49 | [os.path.dirname(__file__)]): 50 | srcpath = os.path.join(_CURR_DIR, module_name + '.py') 51 | if not os.path.isfile(srcpath): 52 | continue 53 | if not module_name.startswith('_'): 54 | global_import(module_name) 55 | -------------------------------------------------------------------------------- /infra/ami/train_efa.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. 
All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | NUM_GPU=${1:-1} 5 | BATCH_SIZE_PER_GPU=${2:-1} 6 | THROUGHPUT_LOG_FREQ=${3:-2000} 7 | 8 | 9 | echo "" 10 | echo "NUM_GPU: ${NUM_GPU}" 11 | echo "BATCH_SIZE_PER_GPU: ${BATCH_SIZE_PER_GPU}" 12 | echo "THROUGHPUT_LOG_FREQ: ${THROUGHPUT_LOG_FREQ}" 13 | echo "" 14 | 15 | 16 | 17 | mpirun -np ${NUM_GPU} \ 18 | --hostfile hosts \ 19 | --mca plm_rsh_no_tree_spawn 1 -bind-to none -map-by slot -mca pml ob1 \ 20 | -mca btl_vader_single_copy_mechanism none \ 21 | --mca btl tcp,self \ 22 | --mca btl_tcp_if_exclude lo,docker0 \ 23 | -x FI_PROVIDER="efa" \ 24 | -x FI_OFI_RXR_RX_COPY_UNEXP=1 \ 25 | -x FI_OFI_RXR_RX_COPY_OOO=1 \ 26 | -x FI_EFA_MR_CACHE_ENABLE=1 \ 27 | -x FI_OFI_RXR_INLINE_MR_ENABLE=1 \ 28 | -x NCCL_TREE_THRESHOLD=4294967296 \ 29 | -x LD_LIBRARY_PATH \ 30 | -x PATH \ 31 | -x NCCL_SOCKET_IFNAME=^docker0,lo \ 32 | -x NCCL_MIN_NRINGS=13 \ 33 | -x NCCL_DEBUG=INFO \ 34 | -x TENSORPACK_FP16=1 \ 35 | -x HOROVOD_CYCLE_TIME=0.5 \ 36 | -x HOROVOD_FUSION_THRESHOLD=67108864 \ 37 | python3 /home/ec2-user/tensorpack-mask-rcnn/MaskRCNN/train.py \ 38 | --fp16 \ 39 | --throughput_log_freq ${THROUGHPUT_LOG_FREQ} \ 40 | --config \ 41 | MODE_MASK=True \ 42 | MODE_FPN=True \ 43 | DATA.BASEDIR=/home/ec2-user/data \ 44 | DATA.TRAIN='["train2017"]' \ 45 | DATA.VAL='("val2017",)' \ 46 | TRAIN.BATCH_SIZE_PER_GPU=${BATCH_SIZE_PER_GPU} \ 47 | TRAIN.LR_EPOCH_SCHEDULE='[(8, 0.1), (10, 0.01), (12, None)]' \ 48 | TRAIN.EVAL_PERIOD=12 \ 49 | RPN.TOPK_PER_IMAGE=True \ 50 | PREPROC.PREDEFINED_PADDING=True \ 51 | BACKBONE.WEIGHTS=/home/ec2-user/data/pretrained-models/ImageNet-R50-AlignPadding.npz \ 52 | BACKBONE.NORM=FreezeBN \ 53 | TRAINER=horovod 54 | #For 32x4 55 | #TRAIN.GRADIENT_CLIP=1.5 56 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # DockerHub unaltered mirror of AWS Deep Learning Container 2 | FROM armandmcqueen/tensorflow-training:1.13-horovod-gpu-py36-cu100-ubuntu16.04 3 | 4 | RUN apt-get install less 5 | 6 | # Need to reinstall some libraries the DL container provides due to custom Tensorflow binary 7 | RUN pip uninstall -y tensorflow tensorboard tensorflow-estimator keras h5py horovod numpy 8 | 9 | # Download and install custom Tensorflow binary 10 | RUN wget https://github.com/armandmcqueen/tensorpack-mask-rcnn/releases/download/v0.0.0-WIP/tensorflow-1.13.0-cp36-cp36m-linux_x86_64.whl && \ 11 | pip install tensorflow-1.13.0-cp36-cp36m-linux_x86_64.whl && \ 12 | pip install tensorflow-estimator==1.13.0 && \ 13 | rm tensorflow-1.13.0-cp36-cp36m-linux_x86_64.whl 14 | 15 | RUN pip install keras h5py 16 | 17 | # Install Horovod, temporarily using CUDA stubs 18 | RUN ldconfig /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs && \ 19 | HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod==0.15.2 && \ 20 | ldconfig 21 | 22 | 23 | # Install OpenSSH for MPI to communicate between containers 24 | RUN mkdir -p /root/.ssh/ && \ 25 | ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \ 26 | cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \ 27 | printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 28 | 29 | 30 | RUN pip install Cython 31 | RUN pip install ujson opencv-python pycocotools matplotlib 32 | RUN pip install --ignore-installed numpy==1.16.2 33 | 34 | 35 | # TODO: Do I really need this now that we are using the DL container? 
36 | ARG CACHEBUST=1 37 | ARG BRANCH_NAME 38 | 39 | RUN git clone https://github.com/armandmcqueen/tensorpack-mask-rcnn -b $BRANCH_NAME 40 | 41 | RUN chmod -R +w /tensorpack-mask-rcnn 42 | RUN pip install --ignore-installed -e /tensorpack-mask-rcnn/ 43 | -------------------------------------------------------------------------------- /tensorpack/tfutils/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: distributed.py 5 | 6 | 7 | import tensorflow as tf 8 | 9 | 10 | def get_distributed_session_creator(server): 11 | """ 12 | Args: 13 | server (tf.train.Server): 14 | 15 | Returns: 16 | tf.train.SessionCreator 17 | """ 18 | 19 | server_def = server.server_def 20 | is_chief = (server_def.job_name == 'worker') and (server_def.task_index == 0) 21 | 22 | init_op = tf.global_variables_initializer() 23 | local_init_op = tf.local_variables_initializer() 24 | ready_op = tf.report_uninitialized_variables() 25 | ready_for_local_init_op = tf.report_uninitialized_variables(tf.global_variables()) 26 | sm = tf.train.SessionManager( 27 | local_init_op=local_init_op, 28 | ready_op=ready_op, 29 | ready_for_local_init_op=ready_for_local_init_op, 30 | graph=tf.get_default_graph()) 31 | 32 | # to debug wrong variable collection 33 | # from pprint import pprint 34 | # print("GLOBAL:") 35 | # pprint([(k.name, k.device) for k in tf.global_variables()]) 36 | # print("LOCAL:") 37 | # pprint([(k.name, k.device) for k in tf.local_variables()]) 38 | 39 | class _Creator(tf.train.SessionCreator): 40 | def create_session(self): 41 | if is_chief: 42 | return sm.prepare_session(master=server.target, init_op=init_op) 43 | else: 44 | tf.logging.set_verbosity(tf.logging.INFO) # print message about uninitialized vars 45 | ret = sm.wait_for_session(master=server.target) 46 | tf.logging.set_verbosity(tf.logging.WARN) 47 | return ret 48 | 49 | return _Creator() 50 | -------------------------------------------------------------------------------- /patch/README.md: -------------------------------------------------------------------------------- 1 | # Building the Wheel 2 | 3 | Use the codebase here to build a TF wheel: https://github.com/samikama/tensorflow/commits/GenerateProposalsOp 4 | 5 | Apply the diff patches above. 6 | 7 | 8 | 9 | ## Building Tensorflow 10 | 11 | Requires custom Tensorflow for GPU optimized ops. Build steps were run on the AWS DLAMI 21.2. 
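Fetching the fork and applying the patches might look like the sketch below (untested; it assumes `GenerateProposalsOp` is the branch name and that this repository is checked out next to the TensorFlow fork):

```
# Sketch only: clone the fork at the GenerateProposalsOp branch and apply this repo's diffs
git clone -b GenerateProposalsOp https://github.com/samikama/tensorflow.git
cd tensorflow
git apply ../tensorpack-mask-rcnn/patch/*.diff
```

The `./configure` and `bazel build` steps below are then run from inside that `tensorflow` directory.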
12 | 13 | ``` 14 | source activate tensorflow_p36 15 | pip uninstall -y tensorflow horovod 16 | 17 | ############################################################################################################ 18 | # Upgrade Bazel 19 | ############################################################################################################ 20 | rm /home/ubuntu/anaconda3/envs/tensorflow_p36/bin/bazel 21 | wget https://github.com/bazelbuild/bazel/releases/download/0.19.2/bazel-0.19.2-installer-linux-x86_64.sh 22 | chmod +x bazel-0.19.2-installer-linux-x86_64.sh 23 | ./bazel-0.19.2-installer-linux-x86_64.sh --user 24 | 25 | 26 | ############################################################################################################ 27 | # Build TF 1.13 with CUDA 10 28 | ############################################################################################################ 29 | 30 | ./configure 31 | 32 | # XLA JIT: N 33 | # CUDA: Y 34 | # CUDA/CUDNN/NCCL dir: /usr/local/cuda-10.0 35 | # CUDNN: 7.4.1 36 | # NCCL: 2.3.7 37 | 38 | 39 | ############################################################################################################ 40 | # Create pip wheel 41 | ############################################################################################################ 42 | 43 | bazel build --config=opt --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" --config=cuda //tensorflow/tools/pip_package:build_pip_package 44 | ./bazel-bin/tensorflow/tools/pip_package/build_pip_package ./tensorflow_pkg 45 | ``` 46 | 47 | 48 | -------------------------------------------------------------------------------- /tensorpack/graph_builder/predict.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: predict.py 5 | 6 | import tensorflow as tf 7 | 8 | from ..tfutils.tower import PredictTowerContext 9 | from ..utils import logger 10 | from ..utils.develop import deprecated 11 | from .training import GraphBuilder 12 | 13 | __all__ = ['SimplePredictBuilder'] 14 | 15 | 16 | class SimplePredictBuilder(GraphBuilder): 17 | """ 18 | Single-tower predictor. 19 | """ 20 | @deprecated("Please use TowerContext to build it by yourself!", "2018-12-31") 21 | def __init__(self, ns_name='', vs_name='', device=0): 22 | """ 23 | Args: 24 | ns_name (str): 25 | vs_name (str): 26 | device (int): 27 | """ 28 | self._ns_name = ns_name 29 | self._vs_name = vs_name 30 | 31 | device = '/gpu:{}'.format(device) if device >= 0 else '/cpu:0' 32 | self._device = device 33 | 34 | def build(self, input, tower_fn): 35 | """ 36 | Args: 37 | input (InputSource): must have been setup 38 | tower_fn ( [tf.Tensors] ->): callable that takes input tensors. 39 | 40 | Returns: 41 | The return value of tower_fn called under the proper context. 42 | """ 43 | assert input.setup_done() 44 | logger.info("Building predictor tower '{}' on device {} ...".format( 45 | self._ns_name, self._device)) 46 | 47 | with tf.device(self._device), \ 48 | PredictTowerContext( 49 | self._ns_name, vs_name=self._vs_name): 50 | inputs = input.get_input_tensors() 51 | assert isinstance(inputs, (list, tuple)), inputs 52 | return tower_fn(*inputs) 53 | -------------------------------------------------------------------------------- /tensorpack/callbacks/hooks.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. 
or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: hooks.py 5 | 6 | 7 | """ Compatible layers between tf.train.SessionRunHook and Callback""" 8 | 9 | import tensorflow as tf 10 | 11 | from ..tfutils.common import tfv1 12 | from .base import Callback 13 | 14 | __all__ = ['CallbackToHook', 'HookToCallback'] 15 | 16 | 17 | class CallbackToHook(tfv1.train.SessionRunHook): 18 | """ This is only for internal implementation of 19 | before_run/after_run callbacks. 20 | You shouldn't need to use this. 21 | """ 22 | 23 | def __init__(self, cb): 24 | self._cb = cb 25 | 26 | def before_run(self, ctx): 27 | return self._cb.before_run(ctx) 28 | 29 | def after_run(self, ctx, vals): 30 | self._cb.after_run(ctx, vals) 31 | 32 | 33 | class HookToCallback(Callback): 34 | """ 35 | Make a ``tf.train.SessionRunHook`` into a callback. 36 | Note that when `SessionRunHook.after_create_session` is called, the `coord` argument will be None. 37 | """ 38 | 39 | _chief_only = False 40 | 41 | def __init__(self, hook): 42 | """ 43 | Args: 44 | hook (tf.train.SessionRunHook): 45 | """ 46 | self._hook = hook 47 | 48 | def _setup_graph(self): 49 | with tf.name_scope(None): # jump out of the name scope 50 | self._hook.begin() 51 | 52 | def _before_train(self): 53 | sess = tf.get_default_session() 54 | # coord is set to None when converting 55 | self._hook.after_create_session(sess, None) 56 | 57 | def _before_run(self, ctx): 58 | return self._hook.before_run(ctx) 59 | 60 | def _after_run(self, ctx, run_values): 61 | self._hook.after_run(ctx, run_values) 62 | 63 | def _after_train(self): 64 | self._hook.end(self.trainer.sess) 65 | -------------------------------------------------------------------------------- /infra/sm/Dockerfile_base: -------------------------------------------------------------------------------- 1 | # DockerHub unaltered mirror of AWS Deep Learning Container 2 | FROM 578276202366.dkr.ecr.us-east-1.amazonaws.com/dlami 3 | 4 | RUN apt-get install less 5 | 6 | # Need to reinstall some libraries the DL container provides due to custom Tensorflow binary 7 | RUN pip uninstall -y tensorflow tensorboard tensorflow-estimator keras h5py horovod numpy 8 | 9 | # Download and install custom Tensorflow binary 10 | RUN wget https://github.com/armandmcqueen/tensorpack-mask-rcnn/releases/download/v0.0.0-WIP/tensorflow-1.13.0-cp36-cp36m-linux_x86_64.whl && \ 11 | pip install tensorflow-1.13.0-cp36-cp36m-linux_x86_64.whl && \ 12 | pip install tensorflow-estimator==1.13.0 && \ 13 | rm tensorflow-1.13.0-cp36-cp36m-linux_x86_64.whl 14 | 15 | RUN pip install keras h5py 16 | 17 | # Install Horovod, temporarily using CUDA stubs 18 | RUN ldconfig /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs && \ 19 | HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod==0.15.2 && \ 20 | ldconfig 21 | 22 | 23 | # Install OpenSSH for MPI to communicate between containers 24 | RUN apt-get install -y --no-install-recommends openssh-client openssh-server 25 | RUN mkdir -p /var/run/sshd && \ 26 | sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd 27 | 28 | RUN mkdir -p /root/.ssh/ && \ 29 | ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \ 30 | cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \ 31 | printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config 32 | 33 | 34 | RUN pip install Cython 35 | RUN pip install ujson opencv-python pycocotools matplotlib 36 | RUN pip install 
--ignore-installed numpy==1.16.2 37 | 38 | 39 | # TODO: Do I really need this now that we are using the DL container? 40 | ARG CACHEBUST=1 41 | ARG BRANCH_NAME 42 | 43 | RUN pip install mpi4py 44 | 45 | # For Sagemaker 46 | RUN pip install sagemaker-containers 47 | -------------------------------------------------------------------------------- /tensorpack/dataflow/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: __init__.py 5 | 6 | # https://github.com/celery/kombu/blob/7d13f9b95d0b50c94393b962e6def928511bfda6/kombu/__init__.py#L34-L36 7 | STATICA_HACK = True 8 | globals()['kcah_acitats'[::-1].upper()] = False 9 | if STATICA_HACK: 10 | from .base import * 11 | from .common import * 12 | from .format import * 13 | from .image import * 14 | from .parallel_map import * 15 | from .parallel import * 16 | from .raw import * 17 | from .remote import * 18 | from . import imgaug 19 | from . import dataset 20 | from . import dftools 21 | 22 | 23 | from pkgutil import iter_modules 24 | import os 25 | import os.path 26 | from ..utils.develop import LazyLoader 27 | 28 | __all__ = [] 29 | 30 | 31 | def _global_import(name): 32 | p = __import__(name, globals(), locals(), level=1) 33 | lst = p.__all__ if '__all__' in dir(p) else dir(p) 34 | if lst: 35 | del globals()[name] 36 | for k in lst: 37 | if not k.startswith('__'): 38 | globals()[k] = p.__dict__[k] 39 | __all__.append(k) 40 | 41 | 42 | __SKIP = set(['dftools', 'dataset', 'imgaug']) 43 | _CURR_DIR = os.path.dirname(__file__) 44 | for _, module_name, __ in iter_modules( 45 | [os.path.dirname(__file__)]): 46 | srcpath = os.path.join(_CURR_DIR, module_name + '.py') 47 | if not os.path.isfile(srcpath): 48 | continue 49 | if not module_name.startswith('_') and \ 50 | module_name not in __SKIP: 51 | _global_import(module_name) 52 | 53 | 54 | globals()['dataset'] = LazyLoader('dataset', globals(), 'tensorpack.dataflow.dataset') 55 | globals()['imgaug'] = LazyLoader('imgaug', globals(), 'tensorpack.dataflow.imgaug') 56 | 57 | del LazyLoader 58 | 59 | __all__.extend(['imgaug', 'dftools', 'dataset']) 60 | -------------------------------------------------------------------------------- /tensorpack/dataflow/imgaug/convert.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: convert.py 5 | 6 | import numpy as np 7 | import cv2 8 | 9 | from .base import ImageAugmentor 10 | from .meta import MapImage 11 | 12 | __all__ = ['ColorSpace', 'Grayscale', 'ToUint8', 'ToFloat32'] 13 | 14 | 15 | class ColorSpace(ImageAugmentor): 16 | """ Convert into another color space. """ 17 | 18 | def __init__(self, mode, keepdims=True): 19 | """ 20 | Args: 21 | mode: OpenCV color space conversion code (e.g., `cv2.COLOR_BGR2HSV`) 22 | keepdims (bool): keep the dimension of image unchanged if OpenCV 23 | changes it. 24 | """ 25 | self._init(locals()) 26 | 27 | def _augment(self, img, _): 28 | transf = cv2.cvtColor(img, self.mode) 29 | if self.keepdims: 30 | if len(transf.shape) is not len(img.shape): 31 | transf = transf[..., None] 32 | return transf 33 | 34 | 35 | class Grayscale(ColorSpace): 36 | """ Convert image to grayscale. 
""" 37 | 38 | def __init__(self, keepdims=True, rgb=False): 39 | """ 40 | Args: 41 | keepdims (bool): return image of shape [H, W, 1] instead of [H, W] 42 | rgb (bool): interpret input as RGB instead of the default BGR 43 | """ 44 | mode = cv2.COLOR_RGB2GRAY if rgb else cv2.COLOR_BGR2GRAY 45 | super(Grayscale, self).__init__(mode, keepdims) 46 | 47 | 48 | class ToUint8(MapImage): 49 | """ Convert image to uint8. Useful to reduce communication overhead. """ 50 | def __init__(self): 51 | super(ToUint8, self).__init__(lambda x: np.clip(x, 0, 255).astype(np.uint8), lambda x: x) 52 | 53 | 54 | class ToFloat32(MapImage): 55 | """ Convert image to float32, may increase quality of the augmentor. """ 56 | def __init__(self): 57 | super(ToFloat32, self).__init__(lambda x: x.astype(np.float32), lambda x: x) 58 | -------------------------------------------------------------------------------- /MaskRCNN/utils/mixed_precision.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | import tensorflow as tf 4 | from contextlib import suppress 5 | 6 | def float32_variable_storage_getter(getter, name, shape=None, dtype=None, 7 | initializer=None, regularizer=None, 8 | trainable=True, 9 | *args, **kwargs): 10 | """Custom variable getter that forces trainable variables to be stored in 11 | float32 precision and then casts them to the training precision. 12 | """ 13 | norm = "norm" in name.lower() or "bn" in name.lower() 14 | storage_dtype = tf.float32 if trainable else dtype 15 | variable = getter(name, shape, dtype=storage_dtype, 16 | initializer=initializer, 17 | regularizer=regularizer if not norm else None, 18 | trainable=trainable, 19 | *args, **kwargs) 20 | 21 | # print(name, "trainable={} dtype={} storage_dtype={} id={} reuse={}".format(trainable, dtype, storage_dtype, id(variable), kwargs['reuse'])) 22 | 23 | if norm: 24 | return variable 25 | 26 | if trainable and dtype != tf.float32: 27 | # print(name, "fp16_cast") 28 | cast_name = name + '/fp16_cast' 29 | try: 30 | cast_variable = tf.get_default_graph().get_tensor_by_name( 31 | cast_name + ':0' 32 | ) 33 | except KeyError: 34 | cast_variable = tf.cast(variable, dtype, name=cast_name) 35 | cast_variable._ref = variable._ref 36 | variable = cast_variable 37 | return variable 38 | 39 | 40 | def mixed_precision_scope(mixed=True, *args, **kwargs): 41 | if not mixed: 42 | return suppress() 43 | 44 | return tf.variable_scope(name_or_scope=tf.get_variable_scope(), 45 | custom_getter=float32_variable_storage_getter, 46 | reuse=tf.AUTO_REUSE, *args, **kwargs) 47 | 48 | -------------------------------------------------------------------------------- /infra/sm/build_and_push.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | # This script shows how to build the Docker image and push it to ECR to be ready for use 6 | # by SageMaker. 7 | 8 | # The argument to this script is the image name. This will be used as the image on the local 9 | # machine and combined with the account and region to form the repository name for ECR. 
10 | image=$1 11 | if [ "$image" == "" ] 12 | then 13 | echo "Usage: $0 " 14 | exit 1 15 | fi 16 | 17 | export AWS_ACCESS_KEY_ID=$(aws --profile default configure get aws_access_key_id) 18 | export AWS_SECRET_ACCESS_KEY=$(aws --profile default configure get aws_secret_access_key) 19 | 20 | 21 | # Get the account number associated with the current IAM credentials 22 | account=$(aws sts get-caller-identity --query Account --output text) 23 | 24 | if [ $? -ne 0 ] 25 | then 26 | exit 255 27 | fi 28 | 29 | 30 | # Get the region defined in the current configuration (default to us-west-2 if none defined) 31 | region=$(aws configure get region) 32 | #region=${region:-us-east-1} 33 | 34 | 35 | fullname="${account}.dkr.ecr.${region}.amazonaws.com/${image}:latest" 36 | # If the repository doesn't exist in ECR, create it. 37 | 38 | aws ecr describe-repositories --repository-names "${image}" > /dev/null 2>&1 39 | 40 | if [ $? -ne 0 ] 41 | then 42 | aws ecr create-repository --repository-name "${image}" > /dev/null 43 | fi 44 | 45 | # Get the login command from ECR and execute it directly 46 | $(aws ecr get-login --region ${region} --no-include-email) 47 | 48 | # Build the docker image locally with the image name and then push it to ECR 49 | # with the full name. 50 | echo "Building docker image tensorpack-mask-rcnn" 51 | echo "" 52 | 53 | docker build -t ${image} -f Dockerfile_sm . --build-arg CACHEBUST=$(date +%s) \ 54 | --build-arg AWS_ACCESS_KEY_ID \ 55 | --build-arg AWS_SECRET_ACCESS_KEY \ 56 | 57 | if [ $? -ne 0 ] 58 | then 59 | echo "Local build failed. Not pushing." 60 | exit 1 61 | fi 62 | 63 | docker tag ${image} ${fullname} 64 | 65 | docker push ${fullname} 66 | -------------------------------------------------------------------------------- /tensorpack/utils/palette.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: palette.py 5 | 6 | import numpy as np 7 | 8 | __all__ = ['PALETTE_RGB'] 9 | 10 | # copied from https://stackoverflow.com/questions/2328339/how-to-generate-n-different-colors-for-any-natural-number-n 11 | PALETTE_HEX = [ 12 | "#000000", "#FFFF00", "#1CE6FF", "#FF34FF", "#FF4A46", "#008941", "#006FA6", "#A30059", 13 | "#FFDBE5", "#7A4900", "#0000A6", "#63FFAC", "#B79762", "#004D43", "#8FB0FF", "#997D87", 14 | "#5A0007", "#809693", "#FEFFE6", "#1B4400", "#4FC601", "#3B5DFF", "#4A3B53", "#FF2F80", 15 | "#61615A", "#BA0900", "#6B7900", "#00C2A0", "#FFAA92", "#FF90C9", "#B903AA", "#D16100", 16 | "#DDEFFF", "#000035", "#7B4F4B", "#A1C299", "#300018", "#0AA6D8", "#013349", "#00846F", 17 | "#372101", "#FFB500", "#C2FFED", "#A079BF", "#CC0744", "#C0B9B2", "#C2FF99", "#001E09", 18 | "#00489C", "#6F0062", "#0CBD66", "#EEC3FF", "#456D75", "#B77B68", "#7A87A1", "#788D66", 19 | "#885578", "#FAD09F", "#FF8A9A", "#D157A0", "#BEC459", "#456648", "#0086ED", "#886F4C", 20 | "#34362D", "#B4A8BD", "#00A6AA", "#452C2C", "#636375", "#A3C8C9", "#FF913F", "#938A81", 21 | "#575329", "#00FECF", "#B05B6F", "#8CD0FF", "#3B9700", "#04F757", "#C8A1A1", "#1E6E00", 22 | "#7900D7", "#A77500", "#6367A9", "#A05837", "#6B002C", "#772600", "#D790FF", "#9B9700", 23 | "#549E79", "#FFF69F", "#201625", "#72418F", "#BC23FF", "#99ADC0", "#3A2465", "#922329", 24 | "#5B4534", "#FDE8DC", "#404E55", "#0089A3", "#CB7E98", "#A4E804", "#324E72", "#6A3A4C", 25 | "#83AB58", "#001C1E", "#D1F7CE", "#004B28", "#C8D0F6", "#A3A489", "#806C66", "#222800", 26 | "#BF5650", "#E83000", "#66796D", "#DA007C", "#FF1A59", "#8ADBB4", "#1E0200", "#5B4E51", 27 | "#C895C5", "#320033", "#FF6832", "#66E1D3", "#CFCDAC", "#D0AC94", 28 | "#7ED379", "#012C58"] 29 | 30 | 31 | def _parse_hex_color(s): 32 | r = int(s[1:3], 16) 33 | g = int(s[3:5], 16) 34 | b = int(s[5:7], 16) 35 | return (r, g, b) 36 | 37 | 38 | PALETTE_RGB = np.asarray( 39 | list(map(_parse_hex_color, PALETTE_HEX)), 40 | dtype='int32') 41 | -------------------------------------------------------------------------------- /tensorpack/models/nonlin.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: nonlin.py 5 | 6 | 7 | import tensorflow as tf 8 | 9 | from .batch_norm import BatchNorm 10 | from .common import VariableHolder, layer_register 11 | 12 | __all__ = ['Maxout', 'PReLU', 'BNReLU'] 13 | 14 | 15 | @layer_register(use_scope=None) 16 | def Maxout(x, num_unit): 17 | """ 18 | Maxout as in the paper `Maxout Networks `_. 19 | 20 | Args: 21 | x (tf.Tensor): a NHWC or NC tensor. Channel has to be known. 22 | num_unit (int): a int. Must be divisible by C. 23 | 24 | Returns: 25 | tf.Tensor: of shape NHW(C/num_unit) named ``output``. 
26 | """ 27 | input_shape = x.get_shape().as_list() 28 | ndim = len(input_shape) 29 | assert ndim == 4 or ndim == 2 30 | ch = input_shape[-1] 31 | assert ch is not None and ch % num_unit == 0 32 | if ndim == 4: 33 | x = tf.reshape(x, [-1, input_shape[1], input_shape[2], ch / num_unit, num_unit]) 34 | else: 35 | x = tf.reshape(x, [-1, ch / num_unit, num_unit]) 36 | return tf.reduce_max(x, ndim, name='output') 37 | 38 | 39 | @layer_register() 40 | def PReLU(x, init=0.001, name='output'): 41 | """ 42 | Parameterized ReLU as in the paper `Delving Deep into Rectifiers: Surpassing 43 | Human-Level Performance on ImageNet Classification 44 | `_. 45 | 46 | Args: 47 | x (tf.Tensor): input 48 | init (float): initial value for the learnable slope. 49 | name (str): name of the output. 50 | 51 | Variable Names: 52 | 53 | * ``alpha``: learnable slope. 54 | """ 55 | init = tf.constant_initializer(init) 56 | alpha = tf.get_variable('alpha', [], initializer=init) 57 | x = ((1 + alpha) * x + (1 - alpha) * tf.abs(x)) 58 | ret = tf.multiply(x, 0.5, name=name) 59 | 60 | ret.variables = VariableHolder(alpha=alpha) 61 | return ret 62 | 63 | 64 | @layer_register(use_scope=None) 65 | def BNReLU(x, name=None): 66 | """ 67 | A shorthand of BatchNormalization + ReLU. 68 | """ 69 | x = BatchNorm('bn', x) 70 | x = tf.nn.relu(x, name=name) 71 | return x 72 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | import platform 4 | from os import path 5 | import setuptools 6 | from setuptools import setup 7 | 8 | version = int(setuptools.__version__.split('.')[0]) 9 | assert version > 30, "Tensorpack installation requires setuptools > 30" 10 | 11 | this_directory = path.abspath(path.dirname(__file__)) 12 | 13 | # setup metainfo 14 | libinfo_py = path.join(this_directory, 'tensorpack', 'libinfo.py') 15 | libinfo_content = open(libinfo_py, "r").readlines() 16 | version_line = [l.strip() for l in libinfo_content if l.startswith('__version__')][0] 17 | exec(version_line) # produce __version__ 18 | 19 | with open(path.join(this_directory, 'README.md'), 'rb') as f: 20 | long_description = f.read().decode('utf-8') 21 | 22 | 23 | def add_git_version(): 24 | 25 | def get_git_version(): 26 | from subprocess import check_output 27 | try: 28 | return check_output("git describe --tags --long --dirty".split()).decode('utf-8').strip() 29 | except Exception: 30 | return __version__ # noqa 31 | 32 | newlibinfo_content = [l for l in libinfo_content if not l.startswith('__git_version__')] 33 | newlibinfo_content.append('__git_version__ = "{}"'.format(get_git_version())) 34 | with open(libinfo_py, "w") as f: 35 | f.write("".join(newlibinfo_content)) 36 | 37 | 38 | add_git_version() 39 | 40 | 41 | setup( 42 | name='tensorpack', 43 | version=__version__, # noqa 44 | description='Neural Network Toolbox on TensorFlow', 45 | long_description=long_description, 46 | long_description_content_type='text/markdown', 47 | install_requires=[ 48 | "numpy>=1.14", 49 | "six", 50 | "termcolor>=1.1", 51 | "tabulate>=0.7.7", 52 | "tqdm>4.11.1", 53 | "msgpack>=0.5.2", 54 | "msgpack-numpy>=0.4.4.2", 55 | "pyzmq>=16", 56 | "subprocess32; python_version < '3.0'", 57 | "functools32; python_version < '3.0'", 58 | ], 59 | tests_require=['flake8', 'scikit-image'], 60 | extras_require={ 61 | 'all': ['pillow', 'scipy', 'h5py', 
'lmdb>=0.92', 'matplotlib', 'scikit-learn'] +
62 |                (['python-prctl'] if platform.system() == 'Linux' else []),
63 |         'all: python_version < "3.0"': ['tornado'],
64 |     },
65 | )
66 | 
-------------------------------------------------------------------------------- /infra/eks/YAML_OVERLAY.md: --------------------------------------------------------------------------------
1 | # Overyaml
2 | 
3 | Take a base yaml file, apply a series of changes (overlays), and print out the new yaml.
4 | 
5 | e.g. take the base maskrcnn params and change them to run 5 experiments of 24 epochs with predefined_padding=True and a 32x4 GPU configuration, without helm naming conflicts. Then run 5 more experiments with a 32x2 GPU configuration.
6 | 
7 | * Be able to make changes to the base yaml and have it impact all other configurations.
8 | * Add a new experiment without having an exploding number of yaml files to maintain and update.
9 | 
10 | ## CLI Syntax
11 | 
12 | `./yaml_overlay $BASE $OVERLAY1 $OVERLAY2 $OVERLAY3 ...`
13 | 
14 | Takes a base yaml and applies overlays sequentially. At the end, prints the new yaml to stdout. Overlay names should be the path to the overlay file minus '.yaml'.
15 | 
16 | `./yaml_overlay maskrcnn/values.yaml maskrcnn/overlays/24epoch maskrcnn/overlays/32x4`
17 | 
18 | ## Overlay folder
19 | 
20 | You can keep all your overlays in a single folder and then pass in an `overlay_dir` either through the `--overlay_dir` flag or through the `OVERLAY_DIR` environment variable.
21 | 
22 | ```
23 | export OVERLAY_DIR=maskrcnn/overlays
24 | ./yaml_overlay maskrcnn/values.yaml 24epoch 32x4
25 | ```
26 | 
27 | ## Overlay syntax
28 | 
29 | An overlay is a yaml file containing two sets of changes - changes where you want to `set` a new value for a field and changes where you want to `append` a postfix to the existing value.
30 | 
31 | ```
32 | set:
33 |   someScope:
34 |     someField: "new_value"
35 | append:
36 |   someScope:
37 |     someOtherField: "_new_postfix"
38 | ```
39 | 
40 | Both `set` and `append` are optional.
41 | 
42 | Changes are represented as a copy of the original object with unchanged fields omitted and each changed field holding the new value or the postfix as the field's value. See the example below.
43 | 
44 | 
45 | ## Example
46 | 
47 | **base.yaml**
48 | 
49 | ```
50 | someScope:
51 |   someField: 1
52 |   someOtherField: "my_name"
53 | ```
54 | 
55 | **overlay.yaml**
56 | 
57 | ```
58 | set:
59 |   someScope:
60 |     someField: "new_value"
61 | append:
62 |   someScope:
63 |     someOtherField: "_new_postfix"
64 | ```
65 | 
66 | 
67 | 
68 | ### `$ ./yaml_overlay base.yaml overlay > output.yaml`
69 | 
70 | 
71 | **output.yaml**
72 | ```
73 | someScope:
74 |   someField: "new_value"
75 |   someOtherField: "my_name_new_postfix"
76 | ```
77 | 
-------------------------------------------------------------------------------- /tensorpack/utils/gpu.py: --------------------------------------------------------------------------------
1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: Apache-2.0
3 | # -*- coding: utf-8 -*-
4 | # File: gpu.py
5 | 
6 | 
7 | import os
8 | 
9 | from . import logger
10 | from .concurrency import subproc_call
11 | from .nvml import NVMLContext
12 | from .utils import change_env
13 | 
14 | __all__ = ['change_gpu', 'get_nr_gpu', 'get_num_gpu']
15 | 
16 | 
17 | def change_gpu(val):
18 |     """
19 |     Args:
20 |         val: an integer, the index of the GPU or -1 to disable GPU.
21 | 
22 |     Returns:
23 |         a context where ``CUDA_VISIBLE_DEVICES=val``.
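
    Example (a minimal usage sketch; ``build_graph_on_gpu_1`` is a hypothetical
    function standing in for whatever work should only see that device):

    .. code-block:: python

        with change_gpu(1):
            # inside this block CUDA_VISIBLE_DEVICES=1; the old value is restored on exit
            build_graph_on_gpu_1()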
24 | """ 25 | val = str(val) 26 | if val == '-1': 27 | val = '' 28 | return change_env('CUDA_VISIBLE_DEVICES', val) 29 | 30 | 31 | def get_num_gpu(): 32 | """ 33 | Returns: 34 | int: #available GPUs in CUDA_VISIBLE_DEVICES, or in the system. 35 | """ 36 | 37 | def warn_return(ret, message): 38 | try: 39 | import tensorflow as tf 40 | except ImportError: 41 | return ret 42 | 43 | built_with_cuda = tf.test.is_built_with_cuda() 44 | if not built_with_cuda and ret > 0: 45 | logger.warn(message + "But TensorFlow was not built with CUDA support!") 46 | return ret 47 | 48 | env = os.environ.get('CUDA_VISIBLE_DEVICES', None) 49 | if env is not None: 50 | return warn_return(len(env.split(',')), "Found non-empty CUDA_VISIBLE_DEVICES. ") 51 | output, code = subproc_call("nvidia-smi -L", timeout=5) 52 | if code == 0: 53 | output = output.decode('utf-8') 54 | return warn_return(len(output.strip().split('\n')), "Found nvidia-smi. ") 55 | try: 56 | # Use NVML to query device properties 57 | with NVMLContext() as ctx: 58 | return warn_return(ctx.num_devices(), "NVML found nvidia devices. ") 59 | except Exception: 60 | # Fallback 61 | # Note this will initialize all GPUs and therefore has side effect 62 | # https://github.com/tensorflow/tensorflow/issues/8136 63 | logger.info("Loading local devices by TensorFlow ...") 64 | from tensorflow.python.client import device_lib 65 | local_device_protos = device_lib.list_local_devices() 66 | return len([x.name for x in local_device_protos if x.device_type == 'GPU']) 67 | 68 | 69 | get_nr_gpu = get_num_gpu 70 | -------------------------------------------------------------------------------- /tensorpack/callbacks/concurrency.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: concurrency.py 5 | 6 | 7 | import multiprocessing as mp 8 | 9 | from ..utils import logger 10 | from ..utils.concurrency import StoppableThread, start_proc_mask_signal 11 | from .base import Callback 12 | 13 | __all__ = ['StartProcOrThread'] 14 | 15 | 16 | class StartProcOrThread(Callback): 17 | """ 18 | Start some threads or processes before training. 19 | """ 20 | 21 | _chief_only = False 22 | 23 | def __init__(self, startable, stop_at_last=True): 24 | """ 25 | Args: 26 | startable (list): list of processes or threads which have ``start()`` method. 27 | Can also be a single instance of process of thread. 28 | stop_at_last (bool): whether to stop the processes or threads 29 | after training. It will use :meth:`Process.terminate()` or 30 | :meth:`StoppableThread.stop()`, but will do nothing on normal 31 | `threading.Thread` or other startable objects. 
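
        Example (illustrative sketch; ``MySender`` is a hypothetical
        ``multiprocessing.Process`` subclass, and the callback list is whatever
        you already pass to your trainer):

        .. code-block:: python

            callbacks = [
                # start the process before training, terminate it after training
                StartProcOrThread(MySender(), stop_at_last=True),
            ]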
32 | """ 33 | if not isinstance(startable, list): 34 | startable = [startable] 35 | self._procs_threads = startable 36 | self._stop_at_last = stop_at_last 37 | 38 | def _before_train(self): 39 | logger.info("Starting " + 40 | ', '.join([k.name for k in self._procs_threads]) + ' ...') 41 | # avoid sigint get handled by other processes 42 | start_proc_mask_signal(self._procs_threads) 43 | 44 | def _after_train(self): 45 | if not self._stop_at_last: 46 | return 47 | for k in self._procs_threads: 48 | if not k.is_alive(): 49 | continue 50 | if isinstance(k, mp.Process): 51 | logger.info("Stopping {} ...".format(k.name)) 52 | k.terminate() 53 | k.join(5.0) 54 | if k.is_alive(): 55 | logger.error("Cannot join process {}.".format(k.name)) 56 | elif isinstance(k, StoppableThread): 57 | logger.info("Stopping {} ...".format(k.name)) 58 | k.stop() 59 | k.join(5.0) 60 | if k.is_alive(): 61 | logger.error("Cannot join thread {}.".format(k.name)) 62 | -------------------------------------------------------------------------------- /tensorpack/utils/serialize.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: serialize.py 5 | 6 | import os 7 | import sys 8 | 9 | import msgpack 10 | import msgpack_numpy 11 | 12 | from . import logger 13 | from .develop import create_dummy_func 14 | 15 | msgpack_numpy.patch() 16 | assert msgpack.version >= (0, 5, 2) 17 | 18 | __all__ = ['loads', 'dumps'] 19 | 20 | 21 | MAX_MSGPACK_LEN = 1000000000 22 | 23 | 24 | def dumps_msgpack(obj): 25 | """ 26 | Serialize an object. 27 | 28 | Returns: 29 | Implementation-dependent bytes-like object. 30 | """ 31 | return msgpack.dumps(obj, use_bin_type=True) 32 | 33 | 34 | def loads_msgpack(buf): 35 | """ 36 | Args: 37 | buf: the output of `dumps`. 38 | """ 39 | # Since 0.6, the default max size was set to 1MB. 40 | # We change it to approximately 1G. 41 | return msgpack.loads(buf, raw=False, 42 | max_bin_len=MAX_MSGPACK_LEN, 43 | max_array_len=MAX_MSGPACK_LEN, 44 | max_map_len=MAX_MSGPACK_LEN, 45 | max_str_len=MAX_MSGPACK_LEN) 46 | 47 | 48 | def dumps_pyarrow(obj): 49 | """ 50 | Serialize an object. 51 | 52 | Returns: 53 | Implementation-dependent bytes-like object. 54 | May not be compatible across different versions of pyarrow. 55 | """ 56 | return pa.serialize(obj).to_buffer() 57 | 58 | 59 | def loads_pyarrow(buf): 60 | """ 61 | Args: 62 | buf: the output of `dumps`. 63 | """ 64 | return pa.deserialize(buf) 65 | 66 | 67 | # import pyarrow has a lot of side effect: 68 | # https://github.com/apache/arrow/pull/2329 69 | # https://groups.google.com/a/tensorflow.org/forum/#!topic/developers/TMqRaT-H2bI 70 | # So we use msgpack as default. 71 | if os.environ.get('TENSORPACK_SERIALIZE', 'msgpack') == 'pyarrow': 72 | try: 73 | import pyarrow as pa 74 | except ImportError: 75 | loads_pyarrow = create_dummy_func('loads_pyarrow', ['pyarrow']) # noqa 76 | dumps_pyarrow = create_dummy_func('dumps_pyarrow', ['pyarrow']) # noqa 77 | 78 | if 'horovod' in sys.modules: 79 | logger.warn("Horovod and pyarrow may have symbol conflicts. 
" 80 | "Uninstall pyarrow and use msgpack instead.") 81 | loads = loads_pyarrow 82 | dumps = dumps_pyarrow 83 | else: 84 | loads = loads_msgpack 85 | dumps = dumps_msgpack 86 | -------------------------------------------------------------------------------- /tensorpack/tfutils/symbolic_functions.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: symbolic_functions.py 5 | 6 | 7 | import tensorflow as tf 8 | 9 | from ..utils.develop import deprecated 10 | 11 | __all__ = ['print_stat', 'rms'] 12 | 13 | 14 | def print_stat(x, message=None): 15 | """ A simple print Op that might be easier to use than :meth:`tf.Print`. 16 | Use it like: ``x = print_stat(x, message='This is x')``. 17 | """ 18 | if message is None: 19 | message = x.op.name 20 | lst = [tf.shape(x), tf.reduce_mean(x)] 21 | if x.dtype.is_floating: 22 | lst.append(rms(x)) 23 | return tf.Print(x, lst + [x], summarize=20, 24 | message=message, name='print_' + x.op.name) 25 | 26 | 27 | # for internal use only 28 | def rms(x, name=None): 29 | """ 30 | Returns: 31 | root mean square of tensor x. 32 | """ 33 | if name is None: 34 | name = x.op.name + '/rms' 35 | with tf.name_scope(None): # name already contains the scope 36 | return tf.sqrt(tf.reduce_mean(tf.square(x)), name=name) 37 | return tf.sqrt(tf.reduce_mean(tf.square(x)), name=name) 38 | 39 | 40 | # don't hurt to leave it here 41 | @deprecated("Please implement it by yourself.", "2018-04-28") 42 | def psnr(prediction, ground_truth, maxp=None, name='psnr'): 43 | """`Peek Signal to Noise Ratio `_. 44 | 45 | .. math:: 46 | 47 | PSNR = 20 \cdot \log_{10}(MAX_p) - 10 \cdot \log_{10}(MSE) 48 | 49 | Args: 50 | prediction: a :class:`tf.Tensor` representing the prediction signal. 51 | ground_truth: another :class:`tf.Tensor` with the same shape. 52 | maxp: maximum possible pixel value of the image (255 in in 8bit images) 53 | 54 | Returns: 55 | A scalar tensor representing the PSNR 56 | """ 57 | 58 | maxp = float(maxp) 59 | 60 | def log10(x): 61 | with tf.name_scope("log10"): 62 | numerator = tf.log(x) 63 | denominator = tf.log(tf.constant(10, dtype=numerator.dtype)) 64 | return numerator / denominator 65 | 66 | mse = tf.reduce_mean(tf.square(prediction - ground_truth)) 67 | if maxp is None: 68 | psnr = tf.multiply(log10(mse), -10., name=name) 69 | else: 70 | psnr = tf.multiply(log10(mse), -10.) 71 | psnr = tf.add(tf.multiply(20., log10(maxp)), psnr, name=name) 72 | 73 | return psnr 74 | -------------------------------------------------------------------------------- /tensorpack/models/shape_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: shape_utils.py 5 | 6 | import tensorflow as tf 7 | 8 | __all__ = [] 9 | 10 | 11 | class StaticDynamicAxis(object): 12 | def __init__(self, static, dynamic): 13 | self.static = static 14 | self.dynamic = dynamic 15 | 16 | def apply(self, f): 17 | try: 18 | st = f(self.static) 19 | return StaticDynamicAxis(st, st) 20 | except TypeError: 21 | return StaticDynamicAxis(None, f(self.dynamic)) 22 | 23 | def __str__(self): 24 | return "S={}, D={}".format(str(self.static), str(self.dynamic)) 25 | 26 | 27 | def DynamicLazyAxis(shape, idx): 28 | return lambda: shape[idx] 29 | 30 | 31 | def StaticLazyAxis(dim): 32 | return lambda: dim 33 | 34 | 35 | class StaticDynamicShape(object): 36 | def __init__(self, tensor): 37 | assert isinstance(tensor, tf.Tensor), tensor 38 | ndims = tensor.shape.ndims 39 | self.static = tensor.shape.as_list() 40 | if tensor.shape.is_fully_defined(): 41 | self.dynamic = self.static[:] 42 | else: 43 | dynamic = tf.shape(tensor) 44 | self.dynamic = [DynamicLazyAxis(dynamic, k) for k in range(ndims)] 45 | 46 | for k in range(ndims): 47 | if self.static[k] is not None: 48 | self.dynamic[k] = StaticLazyAxis(self.static[k]) 49 | 50 | def apply(self, axis, f): 51 | if self.static[axis] is not None: 52 | try: 53 | st = f(self.static[axis]) 54 | self.static[axis] = st 55 | self.dynamic[axis] = StaticLazyAxis(st) 56 | return 57 | except TypeError: 58 | pass 59 | self.static[axis] = None 60 | dyn = self.dynamic[axis] 61 | self.dynamic[axis] = lambda: f(dyn()) 62 | 63 | def get_static(self): 64 | return self.static 65 | 66 | @property 67 | def ndims(self): 68 | return len(self.static) 69 | 70 | def get_dynamic(self, axis=None): 71 | if axis is None: 72 | return [self.dynamic[k]() for k in range(self.ndims)] 73 | return self.dynamic[axis]() 74 | 75 | 76 | if __name__ == '__main__': 77 | x = tf.placeholder(tf.float32, shape=[None, 3, None, 10]) 78 | shape = StaticDynamicShape(x) 79 | shape.apply(1, lambda x: x * 3) 80 | shape.apply(2, lambda x: x + 5) 81 | print(shape.get_static()) 82 | print(shape.get_dynamic()) 83 | -------------------------------------------------------------------------------- /tensorpack/tfutils/dependency.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import tensorflow as tf 5 | from tensorflow.contrib.graph_editor import get_backward_walk_ops 6 | 7 | from ..utils.argtools import graph_memoized 8 | 9 | """ 10 | Utils about parsing dependencies in the graph. 11 | """ 12 | 13 | __all__ = [ 14 | 'dependency_of_targets', 'dependency_of_fetches' 15 | ] 16 | 17 | 18 | @graph_memoized 19 | def dependency_of_targets(targets, op): 20 | """ 21 | Check that op is in the subgraph induced by the dependencies of targets. 22 | The result is memoized. 23 | 24 | This is useful if some SessionRunHooks should be run only together with certain ops. 25 | 26 | Args: 27 | targets: a tuple of ops or tensors. The targets to find dependencies of. 28 | op (tf.Operation or tf.Tensor): 29 | 30 | Returns: 31 | bool 32 | """ 33 | # TODO tensorarray? sparsetensor? 
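    # Illustrative (hypothetical) call, not executed here:
    #     dependency_of_targets((loss_tensor,), some_assign_op)
    # returns True iff computing `loss_tensor` requires running `some_assign_op`.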
34 | if isinstance(op, tf.Tensor): 35 | op = op.op 36 | assert isinstance(op, tf.Operation), op 37 | 38 | # alternative implementation can use graph_util.extract_sub_graph 39 | dependent_ops = get_backward_walk_ops(targets, control_inputs=True) 40 | return op in dependent_ops 41 | 42 | 43 | def dependency_of_fetches(fetches, op): 44 | """ 45 | Check that op is in the subgraph induced by the dependencies of fetches. 46 | fetches may have more general structure. 47 | 48 | Args: 49 | fetches: An argument to `sess.run`. Nested structure will affect performance. 50 | op (tf.Operation or tf.Tensor): 51 | 52 | Returns: 53 | bool 54 | """ 55 | try: 56 | from tensorflow.python.client.session import _FetchHandler as FetchHandler 57 | # use the graph of the op, so that this function can be called without being under a default graph 58 | handler = FetchHandler(op.graph, fetches, {}) 59 | targets = tuple(handler.fetches() + handler.targets()) 60 | except ImportError: 61 | if isinstance(fetches, list): 62 | targets = tuple(fetches) 63 | elif isinstance(fetches, dict): 64 | raise ValueError("Don't know how to parse dictionary to fetch list! " 65 | "This is a bug of tensorpack.") 66 | else: 67 | targets = (fetches, ) 68 | return dependency_of_targets(targets, op) 69 | 70 | 71 | if __name__ == '__main__': 72 | a = tf.random_normal(shape=[3, 3]) 73 | b = tf.random_normal(shape=[3, 3]) 74 | print(dependency_of_fetches(a, a)) 75 | print(dependency_of_fetches([a, b], a)) 76 | -------------------------------------------------------------------------------- /tensorpack/dataflow/imgaug/noise.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: noise.py 5 | 6 | 7 | import numpy as np 8 | import cv2 9 | 10 | from .base import ImageAugmentor 11 | 12 | __all__ = ['JpegNoise', 'GaussianNoise', 'SaltPepperNoise'] 13 | 14 | 15 | class JpegNoise(ImageAugmentor): 16 | """ Random JPEG noise. """ 17 | 18 | def __init__(self, quality_range=(40, 100)): 19 | """ 20 | Args: 21 | quality_range (tuple): range to sample JPEG quality 22 | """ 23 | super(JpegNoise, self).__init__() 24 | self._init(locals()) 25 | 26 | def _get_augment_params(self, img): 27 | return self.rng.randint(*self.quality_range) 28 | 29 | def _augment(self, img, q): 30 | enc = cv2.imencode('.jpg', img, [cv2.IMWRITE_JPEG_QUALITY, q])[1] 31 | return cv2.imdecode(enc, 1).astype(img.dtype) 32 | 33 | 34 | class GaussianNoise(ImageAugmentor): 35 | """ 36 | Add random Gaussian noise N(0, sigma^2) of the same shape to img. 37 | """ 38 | def __init__(self, sigma=1, clip=True): 39 | """ 40 | Args: 41 | sigma (float): stddev of the Gaussian distribution. 42 | clip (bool): clip the result to [0,255] in the end. 43 | """ 44 | super(GaussianNoise, self).__init__() 45 | self._init(locals()) 46 | 47 | def _get_augment_params(self, img): 48 | return self.rng.randn(*img.shape) 49 | 50 | def _augment(self, img, noise): 51 | old_dtype = img.dtype 52 | ret = img + noise * self.sigma 53 | if self.clip or old_dtype == np.uint8: 54 | ret = np.clip(ret, 0, 255) 55 | return ret.astype(old_dtype) 56 | 57 | 58 | class SaltPepperNoise(ImageAugmentor): 59 | """ Salt and pepper noise. 60 | Randomly set some elements in image to 0 or 255, regardless of its channels. 
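
    Example (a sketch of typical use; assumes ``df`` is an existing image dataflow
    and that ``AugmentImageComponent`` is imported from the dataflow package):

    .. code-block:: python

        # corrupt roughly 5% of pixels to white and 5% to black
        noise = SaltPepperNoise(white_prob=0.05, black_prob=0.05)
        df = AugmentImageComponent(df, [noise])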
61 | """ 62 | 63 | def __init__(self, white_prob=0.05, black_prob=0.05): 64 | """ 65 | Args: 66 | white_prob (float), black_prob (float): probabilities setting an element to 255 or 0. 67 | """ 68 | assert white_prob + black_prob <= 1, "Sum of probabilities cannot be greater than 1" 69 | super(SaltPepperNoise, self).__init__() 70 | self._init(locals()) 71 | 72 | def _get_augment_params(self, img): 73 | return self.rng.uniform(low=0, high=1, size=img.shape) 74 | 75 | def _augment(self, img, param): 76 | img[param > (1 - self.white_prob)] = 255 77 | img[param < self.black_prob] = 0 78 | return img 79 | -------------------------------------------------------------------------------- /tensorpack/models/fc.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: fc.py 5 | 6 | 7 | import numpy as np 8 | import tensorflow as tf 9 | 10 | from ..tfutils.common import get_tf_version_tuple 11 | from .common import VariableHolder, layer_register 12 | from .tflayer import convert_to_tflayer_args, rename_get_variable 13 | 14 | __all__ = ['FullyConnected'] 15 | 16 | 17 | def batch_flatten(x): 18 | """ 19 | Flatten the tensor except the first dimension. 20 | """ 21 | shape = x.get_shape().as_list()[1:] 22 | if None not in shape: 23 | return tf.reshape(x, [-1, int(np.prod(shape))]) 24 | return tf.reshape(x, tf.stack([tf.shape(x)[0], -1])) 25 | 26 | 27 | @layer_register(log_shape=True) 28 | @convert_to_tflayer_args( 29 | args_names=['units'], 30 | name_mapping={'out_dim': 'units'}) 31 | def FullyConnected( 32 | inputs, 33 | units, 34 | activation=None, 35 | use_bias=True, 36 | kernel_initializer=None, 37 | bias_initializer=tf.zeros_initializer(), 38 | kernel_regularizer=None, 39 | bias_regularizer=None, 40 | activity_regularizer=None): 41 | """ 42 | A wrapper around `tf.layers.Dense`. 43 | One difference to maintain backward-compatibility: 44 | Default weight initializer is variance_scaling_initializer(2.0). 45 | 46 | Variable Names: 47 | 48 | * ``W``: weights of shape [in_dim, out_dim] 49 | * ``b``: bias 50 | """ 51 | if kernel_initializer is None: 52 | if get_tf_version_tuple() <= (1, 12): 53 | kernel_initializer = tf.contrib.layers.variance_scaling_initializer(2.0) 54 | else: 55 | kernel_initializer = tf.keras.initializers.VarianceScaling(2.0, distribution='untruncated_normal') 56 | 57 | inputs = batch_flatten(inputs) 58 | with rename_get_variable({'kernel': 'W', 'bias': 'b'}): 59 | layer = tf.layers.Dense( 60 | units=units, 61 | activation=activation, 62 | use_bias=use_bias, 63 | kernel_initializer=kernel_initializer, 64 | bias_initializer=bias_initializer, 65 | kernel_regularizer=kernel_regularizer, 66 | bias_regularizer=bias_regularizer, 67 | activity_regularizer=activity_regularizer, 68 | _reuse=tf.get_variable_scope().reuse) 69 | ret = layer.apply(inputs, scope=tf.get_variable_scope()) 70 | ret = tf.identity(ret, name='output') 71 | 72 | ret.variables = VariableHolder(W=layer.kernel) 73 | if use_bias: 74 | ret.variables.b = layer.bias 75 | return ret 76 | -------------------------------------------------------------------------------- /tensorpack/libinfo.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import os 5 | 6 | # issue#7378 may happen with custom opencv. It doesn't hurt to disable opencl 7 | os.environ['OPENCV_OPENCL_RUNTIME'] = 'disabled' # https://github.com/opencv/opencv/pull/10155 8 | try: 9 | # issue#1924 may happen on old systems 10 | import cv2 # noqa 11 | # cv2.setNumThreads(0) 12 | if int(cv2.__version__.split('.')[0]) == 3: 13 | cv2.ocl.setUseOpenCL(False) 14 | # check if cv is built with cuda or openmp 15 | info = cv2.getBuildInformation().split('\n') 16 | for line in info: 17 | splits = line.split() 18 | if not len(splits): 19 | continue 20 | answer = splits[-1].lower() 21 | if answer in ['yes', 'no']: 22 | if 'cuda' in line.lower() and answer == 'yes': 23 | # issue#1197 24 | print("OpenCV is built with CUDA support. " 25 | "This may cause slow initialization or sometimes segfault with TensorFlow.") 26 | if answer == 'openmp': 27 | print("OpenCV is built with OpenMP support. This usually results in poor performance. For details, see " 28 | "https://github.com/tensorpack/benchmarks/blob/master/ImageNet/benchmark-opencv-resize.py") 29 | except (ImportError, TypeError): 30 | pass 31 | 32 | os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' # issue#9339 33 | os.environ['TF_AUTOTUNE_THRESHOLD'] = '2' # use more warm-up 34 | 35 | # Since 1.3, this is not needed 36 | os.environ['TF_AVGPOOL_USE_CUDNN'] = '1' # issue#8566 37 | 38 | # TF1.5 features 39 | os.environ['TF_SYNC_ON_FINISH'] = '0' # will become default 40 | os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private' 41 | os.environ['TF_GPU_THREAD_COUNT'] = '2' 42 | 43 | # Available in TF1.6+ & cudnn7. Haven't seen different performance on R50. 44 | # NOTE we disable it because: 45 | # this mode may use scaled atomic integer reduction that may cause a numerical 46 | # overflow for certain input data range. 47 | os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '0' 48 | 49 | # Available since 1.12. issue#15874 50 | os.environ['TF_ENABLE_WHILE_V2'] = '1' 51 | os.environ['TF_ENABLE_COND_V2'] = '1' 52 | 53 | try: 54 | import tensorflow as tf # noqa 55 | _version = tf.__version__.split('.') 56 | assert int(_version[0]) >= 1 and int(_version[1]) >= 3, "TF>=1.3 is required!" 57 | _HAS_TF = True 58 | except ImportError: 59 | print("Failed to import tensorflow.") 60 | _HAS_TF = False 61 | 62 | 63 | # These lines will be programatically read/write by setup.py 64 | # Don't touch them. 65 | __version__ = '0.9.0.1' 66 | __git_version__ = __version__ 67 | -------------------------------------------------------------------------------- /infra/ami/no_batch_train_1node_16xl_convergence.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env bash 4 | 5 | # Set timestamp and logging directory, begin writing to it. 
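# (Illustrative note) The lines below produce a directory such as
# /home/ubuntu/logs/train_log_20190704_153012 and tee all of this script's
# output into nohup.out inside it.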
6 | TS=`date +'%Y%m%d_%H%M%S'` 7 | LOG_DIR=/home/ubuntu/logs/train_log_${TS} 8 | mkdir -p ${LOG_DIR} 9 | exec &> >(tee ${LOG_DIR}/nohup.out) 10 | 11 | # Print evaluated script commands 12 | set -x 13 | 14 | # Set VENV 15 | VENV=${CONDA_DEFAULT_ENV} 16 | 17 | # Write current branch and commit hash to log directory 18 | git branch | grep \* | awk '{print $2}' > ${LOG_DIR}/git_info 19 | git log | head -1 >> ${LOG_DIR}/git_info 20 | git diff >> ${LOG_DIR}/git_info 21 | 22 | # Copy this script into logging directory 23 | cp `basename $0` ${LOG_DIR} 24 | 25 | # Record environment variables 26 | env > ${LOG_DIR}/env.txt 27 | 28 | # Record python libaries 29 | pip freeze > ${LOG_DIR}/requirements.txt 30 | 31 | # Record tensorflow shared object linkages (CUDA version?) 32 | ldd /home/ubuntu/anaconda3/envs/${VENV}/lib/python3.6/site-packages/tensorflow/libtensorflow_framework.so > ${LOG_DIR}/tf_so_links.txt 33 | 34 | # Execute training job 35 | # HOROVOD_TIMELINE=${LOG_DIR}/htimeline.json \ 36 | #HOROVOD_AUTOTUNE=1 \ 37 | #HOROVOD_AUTOTUNE_LOG=${LOG_DIR}/hvd_autotune.log \ 38 | HOROVOD_CYCLE_TIME=0.5 \ 39 | HOROVOD_FUSION_THRESHOLD=67108864 \ 40 | HOROVOD_LOG_LEVEL=INFO \ 41 | TENSORPACK_FP16=1 \ 42 | /home/ubuntu/anaconda3/envs/${VENV}/bin/mpirun -np 8 -H localhost:8 \ 43 | --mca plm_rsh_no_tree_spawn 1 -bind-to none -map-by slot -mca pml ob1 -mca btl ^openib \ 44 | -mca btl_tcp_if_exclude lo,docker0 \ 45 | -mca btl_vader_single_copy_mechanism none \ 46 | -x NCCL_SOCKET_IFNAME=^docker0,lo \ 47 | -x NCCL_MIN_NRINGS=8 -x NCCL_DEBUG=INFO \ 48 | -x HOROVOD_CYCLE_TIME \ 49 | -x HOROVOD_FUSION_THRESHOLD \ 50 | -x TENSORPACK_FP16 \ 51 | -x LD_LIBRARY_PATH -x PATH \ 52 | --output-filename ${LOG_DIR}/mpirun_logs \ 53 | /home/ubuntu/anaconda3/envs/${VENV}/bin/python3 /home/ubuntu/tensorpack-mask-rcnn/MaskRCNN/train.py \ 54 | --logdir ${LOG_DIR} \ 55 | --fp16 \ 56 | --throughput_log_freq 2000 \ 57 | --config MODE_MASK=True \ 58 | MODE_FPN=True \ 59 | DATA.BASEDIR=/home/ubuntu/data \ 60 | DATA.TRAIN='["train2017"]' \ 61 | DATA.VAL='("val2017",)' \ 62 | TRAIN.BATCH_SIZE_PER_GPU=1 \ 63 | TRAIN.LR_EPOCH_SCHEDULE='[(8, 0.1), (10, 0.01), (12, None)]' \ 64 | BACKBONE.WEIGHTS=/home/ubuntu/data/pretrained-models/ImageNet-R50-AlignPadding.npz \ 65 | BACKBONE.NORM=FreezeBN \ 66 | TRAIN.EVAL_PERIOD=12 \ 67 | TRAINER=horovod 68 | 69 | #For 32x4 70 | #TRAIN.GRADIENT_CLIP=1.5 71 | 72 | #-x HOROVOD_AUTOTUNE \ 73 | #-x HOROVOD_AUTOTUNE_LOG \ 74 | #-x HOROVOD_LOG_LEVEL=INFO \ 75 | #-x HOROVOD_CYCLE_TIME -x HOROVOD_FUSION_THRESHOLD \ 76 | #TRAIN.EVAL_PERIOD=1 \ 77 | #TRAIN.STEPS_PER_EPOCH=15000 \ 78 | #TRAIN.LR_SCHEDULE='[120000, 160000, 180000]' \ 79 | -------------------------------------------------------------------------------- /tensorpack/predict/feedfree.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env python 4 | 5 | from tensorflow.python.training.monitored_session import _HookedSession as HookedSession 6 | 7 | from ..callbacks import Callbacks 8 | from ..tfutils.tower import PredictTowerContext 9 | from .base import PredictorBase 10 | 11 | __all__ = ['FeedfreePredictor'] 12 | 13 | 14 | class FeedfreePredictor(PredictorBase): 15 | """ 16 | Create a predictor that takes inputs from an :class:`InputSource`, instead of from feeds. 
17 | An instance `pred` of :class:`FeedfreePredictor` can be called only by `pred()`, which returns 18 | a list of output values as defined in config.output_names. 19 | """ 20 | 21 | def __init__(self, config, input_source): 22 | """ 23 | Args: 24 | config (PredictConfig): the config to use. 25 | input_source (InputSource): the feedfree InputSource to use. 26 | Must match the inputs_desc in config. 27 | """ 28 | self._config = config 29 | self._input_source = input_source 30 | assert config.return_input is False, \ 31 | "return_input is not supported in FeedfreePredictor! " \ 32 | "If you need to fetch inputs, add the names to the output_names!" 33 | 34 | self._hooks = [] 35 | self.graph = config._maybe_create_graph() 36 | with self.graph.as_default(): 37 | self._input_callbacks = Callbacks( 38 | self._input_source.setup(config.inputs_desc)) 39 | with PredictTowerContext(''): 40 | self._input_tensors = self._input_source.get_input_tensors() 41 | config.tower_func(*self._input_tensors) 42 | self._tower_handle = config.tower_func.towers[-1] 43 | 44 | self._output_tensors = self._tower_handle.get_tensors(config.output_names) 45 | 46 | self._input_callbacks.setup_graph(None) 47 | 48 | for h in self._input_callbacks.get_hooks(): 49 | self._register_hook(h) 50 | self._initialize_session() 51 | 52 | def _register_hook(self, hook): 53 | """ 54 | Args: 55 | hook (tf.train.SessionRunHook): 56 | """ 57 | self._hooks.append(hook) 58 | 59 | def _initialize_session(self): 60 | # init the session 61 | self._config.session_init._setup_graph() 62 | self._sess = self._config.session_creator.create_session() 63 | self._config.session_init._run_init(self._sess) 64 | 65 | with self._sess.as_default(): 66 | self._input_callbacks.before_train() 67 | self._hooked_sess = HookedSession(self._sess, self._hooks) 68 | 69 | def __call__(self): 70 | return self._hooked_sess.run(self._output_tensors) 71 | 72 | def _do_call(self): 73 | raise NotImplementedError("You're calling the wrong function!") 74 | -------------------------------------------------------------------------------- /tensorpack/dataflow/dataset/svhn.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: svhn.py 5 | 6 | 7 | import numpy as np 8 | import os 9 | 10 | from ...utils import logger 11 | from ...utils.fs import download, get_dataset_path 12 | from ..base import RNGDataFlow 13 | 14 | __all__ = ['SVHNDigit'] 15 | 16 | SVHN_URL = "http://ufldl.stanford.edu/housenumbers/" 17 | 18 | 19 | class SVHNDigit(RNGDataFlow): 20 | """ 21 | `SVHN `_ Cropped Digit Dataset. 22 | Produces [img, label], img of 32x32x3 in range [0,255], label of 0-9 23 | """ 24 | _Cache = {} 25 | 26 | def __init__(self, name, data_dir=None, shuffle=True): 27 | """ 28 | Args: 29 | name (str): 'train', 'test', or 'extra'. 30 | data_dir (str): a directory containing the original {train,test,extra}_32x32.mat. 31 | shuffle (bool): shuffle the dataset. 
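
        Example (sketch; if the .mat files are missing they are downloaded to the
        default dataset directory first):

        .. code-block:: python

            ds = SVHNDigit('train', shuffle=False)
            ds.reset_state()
            for img, label in ds:
                # img is a 32x32x3 array in [0, 255], label is an int in 0-9
                break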
32 | """ 33 | self.shuffle = shuffle 34 | 35 | if name in SVHNDigit._Cache: 36 | self.X, self.Y = SVHNDigit._Cache[name] 37 | return 38 | if data_dir is None: 39 | data_dir = get_dataset_path('svhn_data') 40 | assert name in ['train', 'test', 'extra'], name 41 | filename = os.path.join(data_dir, name + '_32x32.mat') 42 | if not os.path.isfile(filename): 43 | url = SVHN_URL + os.path.basename(filename) 44 | logger.info("File {} not found!".format(filename)) 45 | logger.info("Downloading from {} ...".format(url)) 46 | download(url, os.path.dirname(filename)) 47 | logger.info("Loading {} ...".format(filename)) 48 | data = scipy.io.loadmat(filename) 49 | self.X = data['X'].transpose(3, 0, 1, 2) 50 | self.Y = data['y'].reshape((-1)) 51 | self.Y[self.Y == 10] = 0 52 | SVHNDigit._Cache[name] = (self.X, self.Y) 53 | 54 | def __len__(self): 55 | return self.X.shape[0] 56 | 57 | def __iter__(self): 58 | n = self.X.shape[0] 59 | idxs = np.arange(n) 60 | if self.shuffle: 61 | self.rng.shuffle(idxs) 62 | for k in idxs: 63 | # since svhn is quite small, just do it for safety 64 | yield [self.X[k], self.Y[k]] 65 | 66 | @staticmethod 67 | def get_per_pixel_mean(): 68 | """ 69 | Returns: 70 | a 32x32x3 image 71 | """ 72 | a = SVHNDigit('train') 73 | b = SVHNDigit('test') 74 | c = SVHNDigit('extra') 75 | return np.concatenate((a.X, b.X, c.X)).mean(axis=0) 76 | 77 | 78 | try: 79 | import scipy.io 80 | except ImportError: 81 | from ...utils.develop import create_dummy_class 82 | SVHNDigit = create_dummy_class('SVHNDigit', 'scipy.io') # noqa 83 | 84 | if __name__ == '__main__': 85 | a = SVHNDigit('train') 86 | b = SVHNDigit.get_per_pixel_mean() 87 | -------------------------------------------------------------------------------- /tensorpack/tfutils/sesscreate.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: sesscreate.py 5 | 6 | 7 | import tensorflow as tf 8 | 9 | from ..tfutils.common import tfv1 10 | from ..utils import logger 11 | from .common import get_default_sess_config 12 | 13 | __all__ = ['NewSessionCreator', 'ReuseSessionCreator', 'SessionCreatorAdapter'] 14 | 15 | """ 16 | A SessionCreator should: 17 | create the session 18 | initialize all variables 19 | return a session that is ready to use 20 | not finalize the graph 21 | """ 22 | 23 | 24 | class NewSessionCreator(tfv1.train.SessionCreator): 25 | def __init__(self, target='', config=None): 26 | """ 27 | Args: 28 | target, config: same as :meth:`Session.__init__()`. 29 | config: a :class:`tf.ConfigProto` instance, defaults to :func:`tfutils.get_default_sess_config()` 30 | """ 31 | self.target = target 32 | 33 | if config is None: 34 | # distributed trainer doesn't support user-provided config 35 | # we set this attribute so that they can check 36 | self.user_provided_config = False 37 | config = get_default_sess_config() 38 | else: 39 | self.user_provided_config = True 40 | logger.warn( 41 | "User-provided custom session config may not work due to TF \ 42 | bugs. 
See https://github.com/tensorpack/tensorpack/issues/497 for workarounds.") 43 | self.config = config 44 | 45 | def create_session(self): 46 | sess = tf.Session(target=self.target, config=self.config) 47 | sess.run(tf.global_variables_initializer()) 48 | sess.run(tf.local_variables_initializer()) 49 | sess.run(tf.tables_initializer()) 50 | return sess 51 | 52 | 53 | class ReuseSessionCreator(tfv1.train.SessionCreator): 54 | """ 55 | Returns an existing session. 56 | """ 57 | def __init__(self, sess): 58 | """ 59 | Args: 60 | sess (tf.Session): the session to reuse 61 | """ 62 | self.sess = sess 63 | 64 | def create_session(self): 65 | return self.sess 66 | 67 | 68 | class SessionCreatorAdapter(tfv1.train.SessionCreator): 69 | """ 70 | Apply a function on the output of a SessionCreator. Can be used to create a debug session. 71 | """ 72 | def __init__(self, session_creator, func): 73 | """ 74 | Args: 75 | session_creator (tf.train.SessionCreator): a session creator 76 | func (tf.Session -> tf.Session): takes a session created by 77 | ``session_creator``, and return a new session to be returned by ``self.create_session`` 78 | """ 79 | self._creator = session_creator 80 | self._func = func 81 | 82 | def create_session(self): 83 | sess = self._creator.create_session() 84 | return self._func(sess) 85 | -------------------------------------------------------------------------------- /tensorpack/dataflow/imgaug/external.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env python 4 | 5 | import numpy as np 6 | 7 | from .base import ImageAugmentor 8 | 9 | __all__ = ['IAAugmentor', 'Albumentations'] 10 | 11 | 12 | class IAAugmentor(ImageAugmentor): 13 | """ 14 | Wrap an augmentor form the IAA library: https://github.com/aleju/imgaug. 15 | Both images and coordinates are supported. 16 | 17 | Note: 18 | 1. It's NOT RECOMMENDED 19 | to use coordinates because the IAA library does not handle coordinates accurately. 20 | 21 | 2. Only uint8 images are supported by the IAA library. 22 | 23 | 3. The IAA library can only produces images of the same shape. 24 | 25 | Example: 26 | 27 | .. code-block:: python 28 | 29 | from tensorpack import imgaug # this is not the aleju/imgaug library 30 | from imgaug import augmentors as iaa # this is the aleju/imgaug library 31 | myaug = imgaug.IAAugmentor( 32 | iaa.Sequential([ 33 | iaa.Sharpen(alpha=(0, 1), lightness=(0.75, 1.5)), 34 | iaa.Fliplr(0.5), 35 | iaa.Crop(px=(0, 100)), 36 | ]) 37 | """ 38 | 39 | def __init__(self, augmentor): 40 | """ 41 | Args: 42 | augmentor (iaa.Augmenter): 43 | """ 44 | super(IAAugmentor, self).__init__() 45 | self._aug = augmentor 46 | 47 | def _get_augment_params(self, img): 48 | return (self._aug.to_deterministic(), img.shape) 49 | 50 | def _augment(self, img, param): 51 | aug, _ = param 52 | return aug.augment_image(img) 53 | 54 | def _augment_coords(self, coords, param): 55 | import imgaug as IA 56 | aug, shape = param 57 | points = [IA.Keypoint(x=x, y=y) for x, y in coords] 58 | points = IA.KeypointsOnImage(points, shape=shape) 59 | augmented = aug.augment_keypoints([points])[0].keypoints 60 | return np.asarray([[p.x, p.y] for p in augmented]) 61 | 62 | 63 | class Albumentations(ImageAugmentor): 64 | """ 65 | Wrap an augmentor form the albumentations library: https://github.com/albu/albumentations. 66 | Coordinate augmentation is not supported by the library. 
67 | 68 | Example: 69 | 70 | .. code-block:: python 71 | 72 | from tensorpack import imgaug 73 | import albumentations as AB 74 | myaug = imgaug.Albumentations(AB.RandomRotate90(p=1)) 75 | """ 76 | def __init__(self, augmentor): 77 | """ 78 | Args: 79 | augmentor (albumentations.BasicTransform): 80 | """ 81 | super(Albumentations, self).__init__() 82 | self._aug = augmentor 83 | 84 | def _get_augment_params(self, img): 85 | return self._aug.get_params() 86 | 87 | def _augment(self, img, param): 88 | return self._aug.apply(img, **param) 89 | 90 | def _augment_coords(self, coords, param): 91 | raise NotImplementedError() 92 | -------------------------------------------------------------------------------- /tensorpack/tfutils/model_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: model_utils.py 5 | # Author: tensorpack contributors 6 | 7 | import tensorflow as tf 8 | from tabulate import tabulate 9 | from termcolor import colored 10 | 11 | from ..utils import logger 12 | 13 | __all__ = [] 14 | 15 | 16 | # TODO should also describe model_variables 17 | def describe_trainable_vars(): 18 | """ 19 | Print a description of the current model parameters. 20 | Skip variables starting with "tower", as they are just duplicates built by data-parallel logic. 21 | """ 22 | train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) 23 | if len(train_vars) == 0: 24 | logger.warn("No trainable variables in the graph!") 25 | return 26 | total = 0 27 | total_bytes = 0 28 | data = [] 29 | for v in train_vars: 30 | if v.name.startswith('tower'): 31 | continue 32 | shape = v.get_shape() 33 | ele = shape.num_elements() 34 | if ele is None: 35 | logger.warn("Shape of variable {} is not fully defined but {}.".format(v.name, shape)) 36 | ele = 0 37 | try: 38 | shape = shape.as_list() 39 | except ValueError: 40 | shape = '' 41 | 42 | total += ele 43 | total_bytes += ele * v.dtype.size 44 | data.append([v.name, shape, ele, v.device, v.dtype.base_dtype.name]) 45 | headers = ['name', 'shape', 'dim', 'device', 'dtype'] 46 | 47 | dtypes = set([x[4] for x in data]) 48 | if len(dtypes) == 1: 49 | for x in data: 50 | del x[4] 51 | del headers[4] 52 | 53 | devices = set([x[3] for x in data]) 54 | if len(devices) == 1: 55 | # don't log the device if all vars on the same device 56 | for x in data: 57 | del x[3] 58 | del headers[3] 59 | 60 | table = tabulate(data, headers=headers) 61 | 62 | size_mb = total_bytes / 1024.0**2 63 | summary_msg = colored( 64 | "\nTotal #vars={}, #params={}, size={:.02f}MB".format( 65 | len(data), total, size_mb), 'cyan') 66 | logger.info(colored("Trainable Variables: \n", 'cyan') + table + summary_msg) 67 | 68 | 69 | def get_shape_str(tensors): 70 | """ 71 | Internally used by layer registry, to print shapes of inputs/outputs of layers. 
72 | 
73 |     Args:
74 |         tensors (list or tf.Tensor): a tensor or a list of tensors
75 |     Returns:
76 |         str: a string to describe the shape
77 |     """
78 |     if isinstance(tensors, (list, tuple)):
79 |         for v in tensors:
80 |             assert isinstance(v, (tf.Tensor, tf.Variable)), "Not a tensor: {}".format(type(v))
81 |         shape_str = ",".join(
82 |             map(lambda x: str(x.get_shape().as_list()), tensors))
83 |     else:
84 |         assert isinstance(tensors, (tf.Tensor, tf.Variable)), "Not a tensor: {}".format(type(tensors))
85 |         shape_str = str(tensors.get_shape().as_list())
86 |     return shape_str
87 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Mask RCNN
2 | 
3 | ## NOTE: This repository is archived. This project will continue to be worked on here - https://github.com/aws-samples/mask-rcnn-tensorflow
4 | 
5 | Performance focused implementation of Mask RCNN based on the [Tensorpack implementation](https://github.com/tensorpack/tensorpack/tree/master/examples/FasterRCNN).
6 | The original paper: [Mask R-CNN](https://arxiv.org/abs/1703.06870)
7 | ### Overview
8 | 
9 | This implementation of Mask RCNN is focused on increasing training throughput without sacrificing any accuracy. We do this by training with a batch size > 1 per GPU, using FP16 and two custom TF ops.
10 | 
11 | ### Status
12 | 
13 | Training on N GPUs (V100s in our experiments) with a per-GPU batch size of M is referred to as NxM training.
14 | 
15 | Training converges to target accuracy for configurations from 8x1 up to 32x4 training. Training throughput is substantially improved over the original Tensorpack code.
16 | 
17 | A pre-built Docker image is available on DockerHub under `armandmcqueen/tensorpack-mask-rcnn:master-latest`. It is automatically built on each commit to master.
18 | 
19 | ### Notes
20 | 
21 | - Running this codebase requires a custom TF binary - available under GitHub releases (custom ops and a fix for a bug introduced in TF 1.13)
22 | - We give some details on the codebase and optimizations in `CODEBASE.md`
23 | 
24 | ### To launch training
25 | - Data preprocessing
26 |   - Follow the [data preprocessing steps](https://github.com/tensorpack/tensorpack/tree/master/examples/FasterRCNN)
27 |   - If you want to use EKS or SageMaker, you need to create your own S3 bucket which contains the data, and change the S3 bucket name in the following files:
28 |     - EKS: [P3 config](https://github.com/armandmcqueen/tensorpack-mask-rcnn/blob/master/infra/eks/fsx/p3/stage-data.yaml), [P3dn config](https://github.com/armandmcqueen/tensorpack-mask-rcnn/blob/master/infra/eks/fsx/p3dn/stage-data.yaml)
29 |     - SageMaker: [S3 download](https://github.com/armandmcqueen/tensorpack-mask-rcnn/blob/master/infra/sm/run_mpi.py#L122)
30 | - Using a container is recommended for training
31 |   - To train with Docker, refer to [Docker](https://github.com/armandmcqueen/tensorpack-mask-rcnn/tree/master/infra/docker)
32 |   - To train with Amazon EKS, refer to [EKS](https://github.com/armandmcqueen/tensorpack-mask-rcnn/tree/master/infra/eks)
33 |   - To train with Amazon SageMaker, refer to [SageMaker](https://github.com/armandmcqueen/tensorpack-mask-rcnn/tree/master/infra/sm)
34 | 
35 | ### Training results
36 | The following results were obtained on P3dn.24xl instances using EKS.
37 | 12 epochs training: 38 | 39 | | Num_GPUs x Images_Per_GPU | Training time | Box mAP | Mask mAP | 40 | | ------------- | ------------- | ------------- | ------------- | 41 | | 8x4 | 5.09h | 37.47% | 34.45% | 42 | | 16x4 | 3.11h | 37.41% | 34.47% | 43 | | 32x4 | 1.94h | 37.20% | 34.25% | 44 | 45 | 24 epochs training: 46 | 47 | | Num_GPUs x Images_Per_GPU | Training time | Box mAP | Mask mAP | 48 | | ------------- | ------------- | ------------- | ------------- | 49 | | 8x4 | 9.78h | 38.25% | 35.08% | 50 | | 16x4 | 5.60h | 38.44% | 35.18% | 51 | | 32x4 | 3.33h | 38.33% | 35.12% | 52 | 53 | ### Tensorpack fork point 54 | 55 | Forked from the excellent Tensorpack repo at commit a9dce5b220dca34b15122a9329ba9ff055e8edc6 56 | -------------------------------------------------------------------------------- /tensorpack/callbacks/group.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: group.py 5 | 6 | 7 | import traceback 8 | from contextlib import contextmanager 9 | from time import time as timer 10 | import six 11 | import tensorflow as tf 12 | 13 | from ..utils import logger 14 | from ..utils.utils import humanize_time_delta 15 | from .base import Callback 16 | from .hooks import CallbackToHook 17 | 18 | if six.PY3: 19 | from time import perf_counter as timer # noqa 20 | 21 | __all__ = ['Callbacks'] 22 | 23 | 24 | class CallbackTimeLogger(object): 25 | def __init__(self): 26 | self.times = [] 27 | self.tot = 0 28 | 29 | def add(self, name, time): 30 | self.tot += time 31 | self.times.append((name, time)) 32 | 33 | @contextmanager 34 | def timed_callback(self, name): 35 | s = timer() 36 | yield 37 | self.add(name, timer() - s) 38 | 39 | def log(self): 40 | 41 | """ log the time of some heavy callbacks """ 42 | if self.tot < 3: 43 | return 44 | msgs = [] 45 | for name, t in self.times: 46 | if t / self.tot > 0.3 and t > 1: 47 | msgs.append(name + ": " + humanize_time_delta(t)) 48 | logger.info( 49 | "Callbacks took {:.3f} sec in total. {}".format( 50 | self.tot, '; '.join(msgs))) 51 | 52 | 53 | class Callbacks(Callback): 54 | """ 55 | A container to hold all callbacks, and trigger them iteratively. 56 | Note that it does nothing to before_run/after_run. 57 | """ 58 | 59 | def __init__(self, cbs): 60 | """ 61 | Args: 62 | cbs(list): a list of :class:`Callback` instances. 
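
        Example (sketch only; in normal use the trainer assembles this container
        from the callbacks you pass to it, so you rarely construct it yourself --
        ``my_proc`` and ``some_other_callback`` are placeholders):

        .. code-block:: python

            cbs = Callbacks([StartProcOrThread(my_proc), some_other_callback])
            cbs.setup_graph(trainer)   # afterwards the training loop triggers them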
63 | """ 64 | # check type 65 | for cb in cbs: 66 | assert isinstance(cb, Callback), cb.__class__ 67 | self.cbs = cbs 68 | 69 | def _setup_graph(self): 70 | with tf.name_scope(None): # clear the name scope 71 | for cb in self.cbs: 72 | cb.setup_graph(self.trainer) 73 | 74 | def _before_train(self): 75 | for cb in self.cbs: 76 | cb.before_train() 77 | 78 | def _after_train(self): 79 | for cb in self.cbs: 80 | # make sure callbacks are properly finalized 81 | try: 82 | cb.after_train() 83 | except Exception: 84 | traceback.print_exc() 85 | 86 | def get_hooks(self): 87 | return [CallbackToHook(cb) for cb in self.cbs] 88 | 89 | def trigger_step(self): 90 | for cb in self.cbs: 91 | cb.trigger_step() 92 | 93 | def _trigger_epoch(self): 94 | tm = CallbackTimeLogger() 95 | 96 | for cb in self.cbs: 97 | display_name = str(cb) 98 | with tm.timed_callback(display_name): 99 | cb.trigger_epoch() 100 | tm.log() 101 | 102 | def _before_epoch(self): 103 | for cb in self.cbs: 104 | cb.before_epoch() 105 | 106 | def _after_epoch(self): 107 | for cb in self.cbs: 108 | cb.after_epoch() 109 | -------------------------------------------------------------------------------- /tensorpack/utils/timer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: timer.py 5 | 6 | 7 | import atexit 8 | from collections import defaultdict 9 | from contextlib import contextmanager 10 | from time import time as timer 11 | import six 12 | 13 | from . import logger 14 | from .stats import StatCounter 15 | 16 | if six.PY3: 17 | from time import perf_counter as timer # noqa 18 | 19 | 20 | __all__ = ['total_timer', 'timed_operation', 21 | 'print_total_timer', 'IterSpeedCounter'] 22 | 23 | 24 | @contextmanager 25 | def timed_operation(msg, log_start=False): 26 | """ 27 | Surround a context with a timer. 28 | 29 | Args: 30 | msg(str): the log to print. 31 | log_start(bool): whether to print also at the beginning. 32 | 33 | Example: 34 | .. code-block:: python 35 | 36 | with timed_operation('Good Stuff'): 37 | time.sleep(1) 38 | 39 | Will print: 40 | 41 | .. code-block:: python 42 | 43 | Good stuff finished, time:1sec. 44 | """ 45 | if log_start: 46 | logger.info('Start {} ...'.format(msg)) 47 | start = timer() 48 | yield 49 | logger.info('{} finished, time:{:.4f}sec.'.format( 50 | msg, timer() - start)) 51 | 52 | 53 | _TOTAL_TIMER_DATA = defaultdict(StatCounter) 54 | 55 | 56 | @contextmanager 57 | def total_timer(msg): 58 | """ A context which add the time spent inside to TotalTimer. """ 59 | start = timer() 60 | yield 61 | t = timer() - start 62 | _TOTAL_TIMER_DATA[msg].feed(t) 63 | 64 | 65 | def print_total_timer(): 66 | """ 67 | Print the content of the TotalTimer, if it's not empty. This function will automatically get 68 | called when program exits. 69 | """ 70 | if len(_TOTAL_TIMER_DATA) == 0: 71 | return 72 | for k, v in six.iteritems(_TOTAL_TIMER_DATA): 73 | logger.info("Total Time: {} -> {:.2f} sec, {} times, {:.3g} sec/time".format( 74 | k, v.sum, v.count, v.average)) 75 | 76 | 77 | atexit.register(print_total_timer) 78 | 79 | 80 | class IterSpeedCounter(object): 81 | """ Test how often some code gets reached. 82 | 83 | Example: 84 | Print the speed of the iteration every 100 times. 85 | 86 | .. 
code-block:: python 87 | 88 | speed = IterSpeedCounter(100) 89 | for k in range(1000): 90 | # do something 91 | speed() 92 | """ 93 | 94 | def __init__(self, print_every, name=None): 95 | """ 96 | Args: 97 | print_every(int): interval to print. 98 | name(str): name to used when print. 99 | """ 100 | self.cnt = 0 101 | self.print_every = int(print_every) 102 | self.name = name if name else 'IterSpeed' 103 | 104 | def reset(self): 105 | self.start = timer() 106 | 107 | def __call__(self): 108 | if self.cnt == 0: 109 | self.reset() 110 | self.cnt += 1 111 | if self.cnt % self.print_every != 0: 112 | return 113 | t = timer() - self.start 114 | logger.info("{}: {:.2f} sec, {} times, {:.3g} sec/time".format( 115 | self.name, t, self.cnt, t / self.cnt)) 116 | -------------------------------------------------------------------------------- /MaskRCNN/viz.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: viz.py 5 | 6 | import numpy as np 7 | from six.moves import zip 8 | 9 | from tensorpack.utils import viz 10 | from tensorpack.utils.palette import PALETTE_RGB 11 | 12 | from config import config as cfg 13 | from utils.np_box_ops import iou as np_iou 14 | 15 | 16 | def draw_annotation(img, boxes, klass, is_crowd=None): 17 | """Will not modify img""" 18 | labels = [] 19 | assert len(boxes) == len(klass) 20 | if is_crowd is not None: 21 | assert len(boxes) == len(is_crowd) 22 | for cls, crd in zip(klass, is_crowd): 23 | clsname = cfg.DATA.CLASS_NAMES[cls] 24 | if crd == 1: 25 | clsname += ';Crowd' 26 | labels.append(clsname) 27 | else: 28 | for cls in klass: 29 | labels.append(cfg.DATA.CLASS_NAMES[cls]) 30 | img = viz.draw_boxes(img, boxes, labels) 31 | return img 32 | 33 | 34 | def draw_proposal_recall(img, proposals, proposal_scores, gt_boxes): 35 | """ 36 | Draw top3 proposals for each gt. 37 | Args: 38 | proposals: NPx4 39 | proposal_scores: NP 40 | gt_boxes: NG 41 | """ 42 | box_ious = np_iou(gt_boxes, proposals) # ng x np 43 | box_ious_argsort = np.argsort(-box_ious, axis=1) 44 | good_proposals_ind = box_ious_argsort[:, :3] # for each gt, find 3 best proposals 45 | good_proposals_ind = np.unique(good_proposals_ind.ravel()) 46 | 47 | proposals = proposals[good_proposals_ind, :] 48 | tags = list(map(str, proposal_scores[good_proposals_ind])) 49 | img = viz.draw_boxes(img, proposals, tags) 50 | return img, good_proposals_ind 51 | 52 | 53 | def draw_predictions(img, boxes, scores): 54 | """ 55 | Args: 56 | boxes: kx4 57 | scores: kxC 58 | """ 59 | if len(boxes) == 0: 60 | return img 61 | labels = scores.argmax(axis=1) 62 | scores = scores.max(axis=1) 63 | tags = ["{},{:.2f}".format(cfg.DATA.CLASS_NAMES[lb], score) for lb, score in zip(labels, scores)] 64 | return viz.draw_boxes(img, boxes, tags) 65 | 66 | 67 | def draw_final_outputs(img, results): 68 | """ 69 | Args: 70 | results: [DetectionResult] 71 | """ 72 | if len(results) == 0: 73 | return img 74 | 75 | tags = [] 76 | for r in results: 77 | tags.append( 78 | "{},{:.2f}".format(cfg.DATA.CLASS_NAMES[r.class_id], r.score)) 79 | boxes = np.asarray([r.box for r in results]) 80 | ret = viz.draw_boxes(img, boxes, tags) 81 | 82 | for r in results: 83 | if r.mask is not None: 84 | ret = draw_mask(ret, r.mask) 85 | return ret 86 | 87 | 88 | def draw_mask(im, mask, alpha=0.5, color=None): 89 | """ 90 | Overlay a mask on top of the image. 
91 | 92 | Args: 93 | im: a 3-channel uint8 image in BGR 94 | mask: a binary 1-channel image of the same size 95 | color: if None, will choose automatically 96 | """ 97 | if color is None: 98 | color = PALETTE_RGB[np.random.choice(len(PALETTE_RGB))][::-1] 99 | im = np.where(np.repeat((mask > 0)[:, :, None], 3, axis=2), 100 | im * (1 - alpha) + color * alpha, im) 101 | im = im.astype('uint8') 102 | return im 103 | -------------------------------------------------------------------------------- /MaskRCNN/NOTES.md: -------------------------------------------------------------------------------- 1 | 2 | ### File Structure 3 | This is a minimal implementation that simply contains these files: 4 | + dataset.py: load and evaluate COCO dataset 5 | + data.py: prepare data for training & inference 6 | + common.py: common data preparation utilities 7 | + basemodel.py: implement backbones 8 | + model_box.py: implement box-related symbolic functions 9 | + model_{fpn,rpn,frcnn,mrcnn,cascade}.py: implement FPN,RPN,Fast-/Mask-/Cascade-RCNN models. 10 | + train.py: main entry script 11 | + utils/: third-party helper functions 12 | + eval.py: evaluation utilities 13 | + viz.py: visualization utilities 14 | 15 | ### Implementation Notes 16 | 17 | Data: 18 | 19 | 1. It's easy to train on your own data by changing `dataset.py`. 20 | 21 | + If your data is in COCO format, modify `COCODetection` 22 | to change the class names and the id mapping. 23 | + If your data is not in COCO format, ignore `COCODetection` completely and 24 | rewrite all the methods of 25 | `DetectionDataset` following its documents. 26 | You'll implement the logic to load your dataset and evaluate predictions. 27 | 28 | 2. You can easily add more augmentations such as rotation, but be careful how a box should be 29 | augmented. The code now will always use the minimal axis-aligned bounding box of the 4 corners, 30 | which is probably not the optimal way. 31 | A TODO is to generate bounding box from segmentation, so more augmentations can be naturally supported. 32 | 33 | Model: 34 | 35 | 1. Floating-point boxes are defined like this: 36 | 37 |
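
   (The illustration that belongs here is not included in this text dump. As a rough
   textual stand-in -- a sketch of the convention as we understand it, not an
   authoritative definition -- the idea is that boxes live on a continuous coordinate
   grid in which each pixel is a 1x1 cell:)

   ```python
   # pixel (row=i, col=j) occupies the cell spanning x in [j, j+1), y in [i, i+1)
   box = [0.0, 0.0, 3.0, 3.0]          # exactly covers the top-left 3x3 pixels
   width = box[2] - box[0]             # 3.0 -- note: no "+1" as with integer boxes
   height = box[3] - box[1]            # 3.0
   ```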

38 | 39 | 2. We use ROIAlign, and `tf.image.crop_and_resize` is __NOT__ ROIAlign. 40 | 41 | 3. We currently only support single image per GPU. 42 | 43 | 4. Because of (3), BatchNorm statistics are supposed to be freezed during fine-tuning. 44 | 45 | 5. An alternative to freezing BatchNorm is to sync BatchNorm statistics across 46 | GPUs (the `BACKBONE.NORM=SyncBN` option). This would require [my bugfix](https://github.com/tensorflow/tensorflow/pull/20360) 47 | which is available since TF 1.10. You can manually apply the patch to use it. 48 | For now the total batch size is at most 8, so this option does not improve the model by much. 49 | 50 | 6. Another alternative to BatchNorm is GroupNorm (`BACKBONE.NORM=GN`) which has better performance. 51 | 52 | Speed: 53 | 54 | 1. If CuDNN warmup is on, the training will start very slowly, until about 55 | 10k steps (or more if scale augmentation is used) to reach a maximum speed. 56 | As a result, the ETA is also inaccurate at the beginning. 57 | Warmup is by default on when no scale augmentation is used. 58 | 59 | 1. After warmup, the training speed will slowly decrease due to more accurate proposals. 60 | 61 | 1. The code should have around 70% GPU utilization on V100s, and 85%~90% scaling 62 | efficiency from 1 V100 to 8 V100s. 63 | 64 | 1. This implementation does not contain specialized CUDA ops (e.g. AffineChannel, ROIAlign), 65 | so it can be slightly (~10%) slower than Detectron (Caffe2) and 66 | maskrcnn-benchmark (PyTorch). 67 | 68 | Possible Future Enhancements: 69 | 70 | 1. Define a better interface to load custom dataset. 71 | 72 | 1. Support batch>1 per GPU. 73 | 74 | 1. Use dedicated ops to improve speed. (e.g. a TF implementation of ROIAlign op 75 | can be found in [light-head RCNN](https://github.com/zengarden/light_head_rcnn/tree/master/lib/lib_kernel)) 76 | -------------------------------------------------------------------------------- /MaskRCNN/utils/box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: box_ops.py 5 | 6 | import tensorflow as tf 7 | 8 | from tensorpack.tfutils.scope_utils import under_name_scope 9 | 10 | 11 | """ 12 | This file is modified from 13 | https://github.com/tensorflow/models/blob/master/object_detection/core/box_list_ops.py 14 | """ 15 | 16 | 17 | @under_name_scope() 18 | def area(boxes): 19 | """ 20 | Args: 21 | boxes: nx4 floatbox 22 | 23 | Returns: 24 | n 25 | """ 26 | x_min, y_min, x_max, y_max = tf.split(boxes, 4, axis=1) 27 | return tf.squeeze((y_max - y_min) * (x_max - x_min), [1]) 28 | 29 | 30 | @under_name_scope() 31 | def pairwise_intersection(boxlist1, boxlist2): 32 | """Compute pairwise intersection areas between boxes. 
33 | 34 | Args: 35 | boxlist1: Nx4 floatbox 36 | boxlist2: Mx4 37 | 38 | Returns: 39 | a tensor with shape [N, M] representing pairwise intersections 40 | """ 41 | x_min1, y_min1, x_max1, y_max1 = tf.split(boxlist1, 4, axis=1) 42 | x_min2, y_min2, x_max2, y_max2 = tf.split(boxlist2, 4, axis=1) 43 | all_pairs_min_ymax = tf.minimum(y_max1, tf.transpose(y_max2)) 44 | all_pairs_max_ymin = tf.maximum(y_min1, tf.transpose(y_min2)) 45 | intersect_heights = tf.maximum(0.0, all_pairs_min_ymax - all_pairs_max_ymin) 46 | all_pairs_min_xmax = tf.minimum(x_max1, tf.transpose(x_max2)) 47 | all_pairs_max_xmin = tf.maximum(x_min1, tf.transpose(x_min2)) 48 | intersect_widths = tf.maximum(0.0, all_pairs_min_xmax - all_pairs_max_xmin) 49 | return intersect_heights * intersect_widths 50 | 51 | 52 | @under_name_scope() 53 | def pairwise_iou(boxlist1, boxlist2): 54 | """Computes pairwise intersection-over-union between box collections. 55 | 56 | Args: 57 | boxlist1: Nx4 floatbox 58 | boxlist2: Mx4 59 | 60 | Returns: 61 | a tensor with shape [N, M] representing pairwise iou scores. 62 | """ 63 | intersections = pairwise_intersection(boxlist1, boxlist2) 64 | areas1 = area(boxlist1) 65 | areas2 = area(boxlist2) 66 | unions = ( 67 | tf.expand_dims(areas1, 1) + tf.expand_dims(areas2, 0) - intersections) 68 | return tf.where( 69 | tf.equal(intersections, 0.0), 70 | tf.zeros_like(intersections), tf.truediv(intersections, unions)) 71 | 72 | 73 | 74 | @under_name_scope() 75 | def pairwise_iou_batch(proposal_boxes, gt_boxes, orig_gt_counts, batch_size): 76 | """Computes pairwise intersection-over-union between box collections. 77 | Args: 78 | proposal_boxes: K x 5 (batch_index, x1, y1, x2, y2) 79 | gt_boxes: BS x MaxNumGTs x 4 80 | orig_gt_counts: BS 81 | Returns: 82 | list of length BS, each element is output of pairwise_iou: N x M 83 | (where N is number of boxes for image and M is number of GTs for image) 84 | """ 85 | 86 | prefix = "pairwise_iou_batch" 87 | 88 | # For each image index, extract a ?x4 boxlist and gt_boxlist 89 | 90 | per_images_iou = [] 91 | for batch_idx in range(batch_size): 92 | 93 | box_mask_for_image = tf.equal(proposal_boxes[:, 0], batch_idx) 94 | 95 | single_image_boxes = tf.boolean_mask(proposal_boxes, box_mask_for_image) 96 | single_image_boxes = single_image_boxes[:, 1:] 97 | single_image_gt_boxes = gt_boxes[batch_idx, 0:orig_gt_counts[batch_idx], :] 98 | single_image_iou = pairwise_iou(single_image_boxes, single_image_gt_boxes) 99 | 100 | per_images_iou.append(single_image_iou) 101 | 102 | return per_images_iou 103 | -------------------------------------------------------------------------------- /MaskRCNN/utils/generate_anchors.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | # https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/rpn/generate_anchors.py 4 | 5 | # -------------------------------------------------------- 6 | # Faster R-CNN 7 | # Copyright (c) 2015 Microsoft 8 | # Licensed under The MIT License [see LICENSE for details] 9 | # Written by Ross Girshick and Sean Bell 10 | # -------------------------------------------------------- 11 | 12 | import numpy as np 13 | from six.moves import range 14 | 15 | # Verify that we compute the same anchors as Shaoqing's matlab implementation: 16 | # 17 | # >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat 18 | # >> anchors 19 | # 20 | # anchors = 21 | # 22 | # -83 -39 100 56 23 | # -175 -87 192 104 24 | # -359 -183 376 200 25 | # -55 -55 72 72 26 | # -119 -119 136 136 27 | # -247 -247 264 264 28 | # -35 -79 52 96 29 | # -79 -167 96 184 30 | # -167 -343 184 360 31 | 32 | # array([[ -83., -39., 100., 56.], 33 | # [-175., -87., 192., 104.], 34 | # [-359., -183., 376., 200.], 35 | # [ -55., -55., 72., 72.], 36 | # [-119., -119., 136., 136.], 37 | # [-247., -247., 264., 264.], 38 | # [ -35., -79., 52., 96.], 39 | # [ -79., -167., 96., 184.], 40 | # [-167., -343., 184., 360.]]) 41 | 42 | 43 | def generate_anchors(base_size=16, ratios=[0.5, 1, 2], 44 | scales=2**np.arange(3, 6)): 45 | """ 46 | Generate anchor (reference) windows by enumerating aspect ratios X 47 | scales wrt a reference (0, 0, 15, 15) window. 48 | """ 49 | 50 | base_anchor = np.array([1, 1, base_size, base_size], dtype='float32') - 1 51 | ratio_anchors = _ratio_enum(base_anchor, ratios) 52 | anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) 53 | for i in range(ratio_anchors.shape[0])]) 54 | return anchors 55 | 56 | 57 | def _whctrs(anchor): 58 | """ 59 | Return width, height, x center, and y center for an anchor (window). 60 | """ 61 | 62 | w = anchor[2] - anchor[0] + 1 63 | h = anchor[3] - anchor[1] + 1 64 | x_ctr = anchor[0] + 0.5 * (w - 1) 65 | y_ctr = anchor[1] + 0.5 * (h - 1) 66 | return w, h, x_ctr, y_ctr 67 | 68 | 69 | def _mkanchors(ws, hs, x_ctr, y_ctr): 70 | """ 71 | Given a vector of widths (ws) and heights (hs) around a center 72 | (x_ctr, y_ctr), output a set of anchors (windows). 73 | """ 74 | 75 | ws = ws[:, np.newaxis] 76 | hs = hs[:, np.newaxis] 77 | anchors = np.hstack((x_ctr - 0.5 * (ws - 1), 78 | y_ctr - 0.5 * (hs - 1), 79 | x_ctr + 0.5 * (ws - 1), 80 | y_ctr + 0.5 * (hs - 1))) 81 | return anchors 82 | 83 | 84 | def _ratio_enum(anchor, ratios): 85 | """ 86 | Enumerate a set of anchors for each aspect ratio wrt an anchor. 87 | """ 88 | 89 | w, h, x_ctr, y_ctr = _whctrs(anchor) 90 | size = w * h 91 | size_ratios = size / ratios 92 | ws = np.round(np.sqrt(size_ratios)) 93 | hs = np.round(ws * ratios) 94 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 95 | return anchors 96 | 97 | 98 | def _scale_enum(anchor, scales): 99 | """ 100 | Enumerate a set of anchors for each scale wrt an anchor. 101 | """ 102 | 103 | w, h, x_ctr, y_ctr = _whctrs(anchor) 104 | ws = w * scales 105 | hs = h * scales 106 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 107 | return anchors 108 | -------------------------------------------------------------------------------- /tensorpack/dataflow/imgaug/paste.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: paste.py 5 | 6 | 7 | import numpy as np 8 | from abc import abstractmethod 9 | 10 | from .base import ImageAugmentor 11 | 12 | __all__ = ['CenterPaste', 'BackgroundFiller', 'ConstantBackgroundFiller', 13 | 'RandomPaste'] 14 | 15 | 16 | class BackgroundFiller(object): 17 | """ Base class for all BackgroundFiller""" 18 | 19 | def fill(self, background_shape, img): 20 | """ 21 | Return a proper background image of background_shape, given img. 22 | 23 | Args: 24 | background_shape (tuple): a shape (h, w) 25 | img: an image 26 | Returns: 27 | a background image 28 | """ 29 | background_shape = tuple(background_shape) 30 | return self._fill(background_shape, img) 31 | 32 | @abstractmethod 33 | def _fill(self, background_shape, img): 34 | pass 35 | 36 | 37 | class ConstantBackgroundFiller(BackgroundFiller): 38 | """ Fill the background by a constant """ 39 | 40 | def __init__(self, value): 41 | """ 42 | Args: 43 | value (float): the value to fill the background. 44 | """ 45 | self.value = value 46 | 47 | def _fill(self, background_shape, img): 48 | assert img.ndim in [3, 2] 49 | if img.ndim == 3: 50 | return_shape = background_shape + (img.shape[2],) 51 | else: 52 | return_shape = background_shape 53 | return np.zeros(return_shape, dtype=img.dtype) + self.value 54 | 55 | 56 | class CenterPaste(ImageAugmentor): 57 | """ 58 | Paste the image onto the center of a background canvas. 59 | """ 60 | 61 | def __init__(self, background_shape, background_filler=None): 62 | """ 63 | Args: 64 | background_shape (tuple): shape of the background canvas. 65 | background_filler (BackgroundFiller): How to fill the background. Defaults to zero-filler. 66 | """ 67 | if background_filler is None: 68 | background_filler = ConstantBackgroundFiller(0) 69 | 70 | self._init(locals()) 71 | 72 | def _augment(self, img, _): 73 | img_shape = img.shape[:2] 74 | assert self.background_shape[0] >= img_shape[0] and self.background_shape[1] >= img_shape[1] 75 | 76 | background = self.background_filler.fill( 77 | self.background_shape, img) 78 | y0 = int((self.background_shape[0] - img_shape[0]) * 0.5) 79 | x0 = int((self.background_shape[1] - img_shape[1]) * 0.5) 80 | background[y0:y0 + img_shape[0], x0:x0 + img_shape[1]] = img 81 | return background 82 | 83 | def _augment_coords(self, coords, param): 84 | raise NotImplementedError() 85 | 86 | 87 | class RandomPaste(CenterPaste): 88 | """ 89 | Randomly paste the image onto a background canvas. 90 | """ 91 | 92 | def _get_augment_params(self, img): 93 | img_shape = img.shape[:2] 94 | assert self.background_shape[0] > img_shape[0] and self.background_shape[1] > img_shape[1] 95 | 96 | y0 = self._rand_range(self.background_shape[0] - img_shape[0]) 97 | x0 = self._rand_range(self.background_shape[1] - img_shape[1]) 98 | return int(x0), int(y0) 99 | 100 | def _augment(self, img, loc): 101 | x0, y0 = loc 102 | img_shape = img.shape[:2] 103 | background = self.background_filler.fill( 104 | self.background_shape, img) 105 | background[y0:y0 + img_shape[0], x0:x0 + img_shape[1]] = img 106 | return background 107 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. 
Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check [existing open](https://github.com/armandmcqueen/tensorpack-mask-rcnn/issues), or [recently closed](https://github.com/armandmcqueen/tensorpack-mask-rcnn/issues?q=is%3Aissue+is%3Aclosed), issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/armandmcqueen/tensorpack-mask-rcnn/labels/help%20wanted) issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 
55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](https://github.com/aws-samples/amazon-sagemaker-script-mode/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | 61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 62 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/templates/maskrcnn.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kubeflow.org/v1alpha1 2 | kind: MPIJob 3 | metadata: 4 | name: {{ .Values.global.name }} 5 | namespace: {{ .Values.global.namespace }} 6 | labels: 7 | app.kubernetes.io/name: {{ .Values.global.name }} 8 | app.kubernetes.io/instance: {{ .Release.Name }} 9 | app.kubernetes.io/managed-by: {{ .Release.Service }} 10 | spec: 11 | gpus: {{ .Values.maskrcnn.gpus }} 12 | template: 13 | spec: 14 | restartPolicy: Never 15 | volumes: 16 | - name: {{ .Values.maskrcnn.shared_fs }} 17 | persistentVolumeClaim: 18 | claimName: {{ .Values.maskrcnn.shared_pvc }} 19 | - name: ebs 20 | hostPath: 21 | path: /ebs 22 | type: DirectoryOrCreate 23 | containers: 24 | - name: {{ .Values.global.name }} 25 | env: 26 | - name: HOROVOD_CYCLE_TIME 27 | value: "{{ .Values.maskrcnn.horovod_cycle_time }}" 28 | - name: HOROVOD_FUSION_THRESHOLD 29 | value: "{{ .Values.maskrcnn.horovod_fusion_threshold }}" 30 | - name: NCCL_SOCKET_IFNAME 31 | value: "{{ .Values.maskrcnn.nccl_socket_ifname }}" 32 | - name: NCCL_MIN_NRINGS 33 | value: "{{ .Values.maskrcnn.nccl_min_rings }}" 34 | - name: NCCL_DEBUG 35 | value: "{{ .Values.maskrcnn.nccl_debug }}" 36 | - name: TENSORPACK_FP16 37 | value: "{{ .Values.maskrcnn.fp_16 }}" 38 | command: 39 | - mpirun 40 | workingDir: {{ .Values.maskrcnn.working_dir }} 41 | args: 42 | - --output-filename 43 | - /{{ .Values.maskrcnn.shared_fs }}/logs/{{ .Values.maskrcnn.experiment_group }}/{{ .Release.Name }} 44 | - --allow-run-as-root 45 | - --display-map 46 | - --tag-output 47 | - --timestamp-output 48 | - python3 49 | - {{ .Values.maskrcnn.train_script }} 50 | - --logdir 51 | - /{{ .Values.maskrcnn.shared_fs }}/logs/{{ .Values.maskrcnn.experiment_group }}/{{ .Release.Name }}/train_log/ 52 | - --fp16 53 | - --images_per_epoch 54 | - "{{ .Values.maskrcnn.images_per_epoch }}" 55 | - --config 56 | - MODE_MASK={{ .Values.maskrcnn.mode_mask }} 57 | - MODE_FPN={{ .Values.maskrcnn.mode_fpn }} 58 | - DATA.BASEDIR=/{{ .Values.maskrcnn.data_fs }}/{{ .Values.maskrcnn.data_dir }} 59 | - DATA.TRAIN={{ .Values.maskrcnn.data_train }} 60 | - DATA.VAL={{ .Values.maskrcnn.data_val }} 61 | - TRAIN.GRADIENT_CLIP={{ .Values.maskrcnn.gradient_clip }} 62 | - TRAIN.BATCH_SIZE_PER_GPU={{ .Values.maskrcnn.batch_size_per_gpu }} 63 | - TRAIN.EVAL_PERIOD={{ .Values.maskrcnn.eval_period_in_epochs }} 64 | - TRAIN.BASE_LR={{ .Values.maskrcnn.base_lr }} 65 | - TRAIN.WARMUP_INIT_LR={{ .Values.maskrcnn.warmup_lr }} 66 | - TRAIN.LR_EPOCH_SCHEDULE={{ .Values.maskrcnn.lr_epoch_schedule }} 67 | - RPN.TOPK_PER_IMAGE={{ .Values.maskrcnn.topk_per_image }} 68 | - PREPROC.PREDEFINED_PADDING={{ .Values.maskrcnn.predefined_padding }} 69 | - FRCNN.BBOX_REG_WEIGHTS={{ .Values.maskrcnn.bbox_reg_weights }} 70 | - TEST.RESULT_SCORE_THRESH={{ .Values.maskrcnn.result_score_thresh }} 71 | - BACKBONE.WEIGHTS=/{{ .Values.maskrcnn.data_fs }}/{{ .Values.maskrcnn.backbone_weights }} 72 | - BACKBONE.NORM={{ .Values.maskrcnn.backbone_norm }} 73 | - 
TRAINER=horovod 74 | image: {{ .Values.maskrcnn.image }} 75 | imagePullPolicy: {{ .Values.maskrcnn.image_pull_policy }} 76 | volumeMounts: 77 | - mountPath: /{{ .Values.maskrcnn.shared_fs }} 78 | name: {{ .Values.maskrcnn.shared_fs }} 79 | - mountPath: /ebs 80 | name: ebs 81 | resources: 82 | limits: 83 | nvidia.com/gpu: {{ .Values.maskrcnn.gpus_per_node }} 84 | -------------------------------------------------------------------------------- /tensorpack/utils/rect.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: rect.py 5 | 6 | 7 | import numpy as np 8 | 9 | from .develop import log_deprecated 10 | 11 | __all__ = ['IntBox', 'FloatBox'] 12 | 13 | 14 | class BoxBase(object): 15 | __slots__ = ['x1', 'y1', 'x2', 'y2'] 16 | 17 | def __init__(self, x1, y1, x2, y2): 18 | log_deprecated("IntBox and FloatBox", "Please implement them by your own.", "2019-02-28") 19 | self.x1 = x1 20 | self.y1 = y1 21 | self.x2 = x2 22 | self.y2 = y2 23 | 24 | def copy(self): 25 | new = type(self)() 26 | for i in self.__slots__: 27 | setattr(new, i, getattr(self, i)) 28 | return new 29 | 30 | def __str__(self): 31 | return '{}(x1={}, y1={}, x2={}, y2={})'.format( 32 | type(self).__name__, self.x1, self.y1, self.x2, self.y2) 33 | 34 | __repr__ = __str__ 35 | 36 | def area(self): 37 | return self.w * self.h 38 | 39 | def is_box(self): 40 | return self.w > 0 and self.h > 0 41 | 42 | def to_list(self): 43 | return [self.x1, self.y1, self.x2, self.y2] 44 | 45 | 46 | class IntBox(BoxBase): 47 | def __init__(self, x1, y1, x2, y2): 48 | for k in [x1, y1, x2, y2]: 49 | assert isinstance(k, int) 50 | super(IntBox, self).__init__(x1, y1, x2, y2) 51 | 52 | @property 53 | def w(self): 54 | return self.x2 - self.x1 + 1 55 | 56 | @property 57 | def h(self): 58 | return self.y2 - self.y1 + 1 59 | 60 | def is_valid_box(self, shape): 61 | """ 62 | Check that this rect is a valid bounding box within this shape. 63 | 64 | Args: 65 | shape: int [h, w] or None. 66 | Returns: 67 | bool 68 | """ 69 | if min(self.x1, self.y1) < 0: 70 | return False 71 | if min(self.w, self.h) <= 0: 72 | return False 73 | if self.x2 >= shape[1]: 74 | return False 75 | if self.y2 >= shape[0]: 76 | return False 77 | return True 78 | 79 | def clip_by_shape(self, shape): 80 | """ 81 | Clip xs and ys to be valid coordinates inside shape 82 | 83 | Args: 84 | shape: int [h, w] or None. 
85 | """ 86 | self.x1 = np.clip(self.x1, 0, shape[1] - 1) 87 | self.x2 = np.clip(self.x2, 0, shape[1] - 1) 88 | self.y1 = np.clip(self.y1, 0, shape[0] - 1) 89 | self.y2 = np.clip(self.y2, 0, shape[0] - 1) 90 | 91 | def roi(self, img): 92 | assert self.is_valid_box(img.shape[:2]), "{} vs {}".format(self, img.shape[:2]) 93 | return img[self.y1:self.y2 + 1, self.x1:self.x2 + 1] 94 | 95 | 96 | class FloatBox(BoxBase): 97 | def __init__(self, x1, y1, x2, y2): 98 | for k in [x1, y1, x2, y2]: 99 | assert isinstance(k, float), "type={},value={}".format(type(k), k) 100 | super(FloatBox, self).__init__(x1, y1, x2, y2) 101 | 102 | @property 103 | def w(self): 104 | return self.x2 - self.x1 105 | 106 | @property 107 | def h(self): 108 | return self.y2 - self.y1 109 | 110 | @staticmethod 111 | def from_intbox(intbox): 112 | return FloatBox(intbox.x1, intbox.y1, 113 | intbox.x2 + 1, intbox.y2 + 1) 114 | 115 | def clip_by_shape(self, shape): 116 | self.x1 = np.clip(self.x1, 0, shape[1]) 117 | self.x2 = np.clip(self.x2, 0, shape[1]) 118 | self.y1 = np.clip(self.y1, 0, shape[0]) 119 | self.y2 = np.clip(self.y2, 0, shape[0]) 120 | 121 | 122 | if __name__ == '__main__': 123 | x = IntBox(2, 1, 3, 3) 124 | img = np.random.rand(3, 3) 125 | print(img) 126 | -------------------------------------------------------------------------------- /tensorpack/utils/fs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: fs.py 5 | 6 | 7 | import errno 8 | import os 9 | import tqdm 10 | from six.moves import urllib 11 | 12 | from . import logger 13 | from .utils import execute_only_once 14 | 15 | __all__ = ['mkdir_p', 'download', 'recursive_walk', 'get_dataset_path'] 16 | 17 | 18 | def mkdir_p(dirname): 19 | """ Like "mkdir -p", make a dir recursively, but do nothing if the dir exists 20 | 21 | Args: 22 | dirname(str): 23 | """ 24 | assert dirname is not None 25 | if dirname == '' or os.path.isdir(dirname): 26 | return 27 | try: 28 | os.makedirs(dirname) 29 | except OSError as e: 30 | if e.errno != errno.EEXIST: 31 | raise e 32 | 33 | 34 | def download(url, dir, filename=None, expect_size=None): 35 | """ 36 | Download URL to a directory. 37 | Will figure out the filename automatically from URL, if not given. 38 | """ 39 | mkdir_p(dir) 40 | if filename is None: 41 | filename = url.split('/')[-1] 42 | fpath = os.path.join(dir, filename) 43 | 44 | if os.path.isfile(fpath): 45 | if expect_size is not None and os.stat(fpath).st_size == expect_size: 46 | logger.info("File {} exists! Skip download.".format(filename)) 47 | return fpath 48 | else: 49 | logger.warn("File {} exists. 
Will overwrite with a new download!".format(filename)) 50 | 51 | def hook(t): 52 | last_b = [0] 53 | 54 | def inner(b, bsize, tsize=None): 55 | if tsize is not None: 56 | t.total = tsize 57 | t.update((b - last_b[0]) * bsize) 58 | last_b[0] = b 59 | return inner 60 | try: 61 | with tqdm.tqdm(unit='B', unit_scale=True, miniters=1, desc=filename) as t: 62 | fpath, _ = urllib.request.urlretrieve(url, fpath, reporthook=hook(t)) 63 | statinfo = os.stat(fpath) 64 | size = statinfo.st_size 65 | except IOError: 66 | logger.error("Failed to download {}".format(url)) 67 | raise 68 | assert size > 0, "Downloaded an empty file from {}!".format(url) 69 | 70 | if expect_size is not None and size != expect_size: 71 | logger.error("File downloaded from {} does not match the expected size!".format(url)) 72 | logger.error("You may have downloaded a broken file, or the upstream may have modified the file.") 73 | 74 | # TODO human-readable size 75 | logger.info('Succesfully downloaded ' + filename + ". " + str(size) + ' bytes.') 76 | return fpath 77 | 78 | 79 | def recursive_walk(rootdir): 80 | """ 81 | Yields: 82 | str: All files in rootdir, recursively. 83 | """ 84 | for r, dirs, files in os.walk(rootdir): 85 | for f in files: 86 | yield os.path.join(r, f) 87 | 88 | 89 | def get_dataset_path(*args): 90 | """ 91 | Get the path to some dataset under ``$TENSORPACK_DATASET``. 92 | 93 | Args: 94 | args: strings to be joined to form path. 95 | 96 | Returns: 97 | str: path to the dataset. 98 | """ 99 | d = os.environ.get('TENSORPACK_DATASET', None) 100 | if d is None: 101 | d = os.path.join(os.path.expanduser('~'), 'tensorpack_data') 102 | if execute_only_once(): 103 | logger.warn("Env var $TENSORPACK_DATASET not set, using {} for datasets.".format(d)) 104 | if not os.path.isdir(d): 105 | mkdir_p(d) 106 | logger.info("Created the directory {}.".format(d)) 107 | assert os.path.isdir(d), d 108 | return os.path.join(d, *args) 109 | 110 | 111 | if __name__ == '__main__': 112 | download('http://dl.caffe.berkeleyvision.org/caffe_ilsvrc12.tar.gz', '.') 113 | -------------------------------------------------------------------------------- /MaskRCNN/utils/np_box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | """Operations for [N, 4] numpy arrays representing bounding boxes. 19 | 20 | Example box operations that are supported: 21 | * Areas: compute bounding box areas 22 | * IOU: pairwise intersection-over-union scores 23 | """ 24 | import numpy as np 25 | 26 | 27 | def area(boxes): 28 | """Computes area of boxes. 
29 | 30 | Args: 31 | boxes: Numpy array with shape [N, 4] holding N boxes 32 | 33 | Returns: 34 | a numpy array with shape [N] representing box areas 35 | """ 36 | return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) 37 | 38 | 39 | def intersection(boxes1, boxes2): 40 | """Compute pairwise intersection areas between boxes. 41 | 42 | Args: 43 | boxes1: a numpy array with shape [N, 4] holding N boxes 44 | boxes2: a numpy array with shape [M, 4] holding M boxes 45 | 46 | Returns: 47 | a numpy array with shape [N, M] representing pairwise intersection areas 48 | """ 49 | [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1) 50 | [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1) 51 | 52 | all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2)) 53 | all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2)) 54 | intersect_heights = np.maximum( 55 | np.zeros(all_pairs_max_ymin.shape, dtype='f4'), 56 | all_pairs_min_ymax - all_pairs_max_ymin) 57 | all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2)) 58 | all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2)) 59 | intersect_widths = np.maximum( 60 | np.zeros(all_pairs_max_xmin.shape, dtype='f4'), 61 | all_pairs_min_xmax - all_pairs_max_xmin) 62 | return intersect_heights * intersect_widths 63 | 64 | 65 | def iou(boxes1, boxes2): 66 | """Computes pairwise intersection-over-union between box collections. 67 | 68 | Args: 69 | boxes1: a numpy array with shape [N, 4] holding N boxes. 70 | boxes2: a numpy array with shape [M, 4] holding M boxes. 71 | 72 | Returns: 73 | a numpy array with shape [N, M] representing pairwise iou scores. 74 | """ 75 | intersect = intersection(boxes1, boxes2) 76 | area1 = area(boxes1) 77 | area2 = area(boxes2) 78 | union = np.expand_dims(area1, axis=1) + np.expand_dims( 79 | area2, axis=0) - intersect 80 | return intersect / union 81 | 82 | 83 | def ioa(boxes1, boxes2): 84 | """Computes pairwise intersection-over-area between box collections. 85 | 86 | Intersection-over-area (ioa) between two boxes box1 and box2 is defined as 87 | their intersection area over box2's area. Note that ioa is not symmetric, 88 | that is, IOA(box1, box2) != IOA(box2, box1). 89 | 90 | Args: 91 | boxes1: a numpy array with shape [N, 4] holding N boxes. 92 | boxes2: a numpy array with shape [M, 4] holding M boxes. 93 | 94 | Returns: 95 | a numpy array with shape [N, M] representing pairwise ioa scores. 96 | """ 97 | intersect = intersection(boxes1, boxes2) 98 | inv_areas = np.expand_dims(1.0 / area(boxes2), axis=0) 99 | return intersect * inv_areas 100 | -------------------------------------------------------------------------------- /tensorpack/tfutils/varreplace.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: varreplace.py 5 | # Credit: Qinyao He 6 | 7 | from contextlib import contextmanager 8 | import tensorflow as tf 9 | 10 | from .common import get_tf_version_tuple 11 | 12 | __all__ = ['custom_getter_scope', 'freeze_variables', 'remap_variables'] 13 | 14 | 15 | @contextmanager 16 | def custom_getter_scope(custom_getter): 17 | """ 18 | Args: 19 | custom_getter: the same as in :func:`tf.get_variable` 20 | 21 | Returns: 22 | The current variable scope with a custom_getter.
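    Example (illustrative only; the getter below simply logs every variable it
    returns, and the layer name is hypothetical):

    .. code-block:: python

        def logging_getter(getter, *args, **kwargs):
            v = getter(*args, **kwargs)
            print("created/reused variable:", v.name)
            return v

        with custom_getter_scope(logging_getter):
            x = FullyConnected('fc', x, 1000)   # prints fc/W, fc/b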
23 | """ 24 | scope = tf.get_variable_scope() 25 | if get_tf_version_tuple() >= (1, 5): 26 | with tf.variable_scope( 27 | scope, custom_getter=custom_getter, 28 | auxiliary_name_scope=False): 29 | yield 30 | else: 31 | ns = tf.get_default_graph().get_name_scope() 32 | with tf.variable_scope( 33 | scope, custom_getter=custom_getter): 34 | with tf.name_scope(ns + '/' if ns else ''): 35 | yield 36 | 37 | 38 | def remap_variables(fn): 39 | """ 40 | Use fn to map the output of any variable getter. 41 | 42 | Args: 43 | fn (tf.Variable -> tf.Tensor) 44 | 45 | Returns: 46 | The current variable scope with a custom_getter that maps 47 | all the variables by fn. 48 | 49 | Example: 50 | .. code-block:: python 51 | 52 | with varreplace.remap_variables(lambda var: quantize(var)): 53 | x = FullyConnected('fc', x, 1000) # fc/{W,b} will be quantized 54 | """ 55 | def custom_getter(getter, *args, **kwargs): 56 | v = getter(*args, **kwargs) 57 | return fn(v) 58 | return custom_getter_scope(custom_getter) 59 | 60 | 61 | def freeze_variables(stop_gradient=True, skip_collection=False): 62 | """ 63 | Return a context to freeze variables, 64 | by wrapping ``tf.get_variable`` with a custom getter. 65 | It works by either applying ``tf.stop_gradient`` on the variables, 66 | or by keeping them out of the ``TRAINABLE_VARIABLES`` collection, or 67 | both. 68 | 69 | Example: 70 | .. code-block:: python 71 | 72 | with varreplace.freeze_variable(stop_gradient=False, skip_collection=True): 73 | x = FullyConnected('fc', x, 1000) # fc/* will not be trained 74 | 75 | Args: 76 | stop_gradient (bool): if True, variables returned from `get_variable` 77 | will be wrapped with `tf.stop_gradient` and therefore has no 78 | gradient when used later. 79 | Note that the created variables may still have gradient when accessed 80 | by other approaches (e.g. by name, or by collection). 81 | Also note that this makes `tf.get_variable` returns a Tensor instead of a Variable, 82 | which may break existing code. 83 | Therefore, it's recommended to use the `skip_collection` option instead. 84 | skip_collection (bool): if True, do not add the variable to 85 | ``TRAINABLE_VARIABLES`` collection, but to ``MODEL_VARIABLES`` 86 | collection. As a result they will not be trained by default. 87 | """ 88 | def custom_getter(getter, *args, **kwargs): 89 | trainable = kwargs.get('trainable', True) 90 | name = args[0] if len(args) else kwargs.get('name') 91 | if skip_collection: 92 | kwargs['trainable'] = False 93 | v = getter(*args, **kwargs) 94 | if skip_collection: 95 | tf.add_to_collection(tf.GraphKeys.MODEL_VARIABLES, v) 96 | if trainable and stop_gradient: 97 | v = tf.stop_gradient(v, name='freezed_' + name) 98 | return v 99 | return custom_getter_scope(custom_getter) 100 | -------------------------------------------------------------------------------- /tensorpack/callbacks/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: misc.py 5 | 6 | 7 | import numpy as np 8 | import os 9 | import time 10 | from collections import deque 11 | 12 | from ..utils import logger 13 | from ..utils.utils import humanize_time_delta 14 | from .base import Callback 15 | 16 | __all__ = ['SendStat', 'InjectShell', 'EstimatedTimeLeft'] 17 | 18 | 19 | class SendStat(Callback): 20 | """ An equivalent of :class:`SendMonitorData`, but as a normal callback. 
""" 21 | def __init__(self, command, names): 22 | self.command = command 23 | if not isinstance(names, list): 24 | names = [names] 25 | self.names = names 26 | 27 | def _trigger(self): 28 | M = self.trainer.monitors 29 | v = {k: M.get_latest(k) for k in self.names} 30 | cmd = self.command.format(**v) 31 | ret = os.system(cmd) 32 | if ret != 0: 33 | logger.error("Command {} failed with ret={}!".format(cmd, ret)) 34 | 35 | 36 | class InjectShell(Callback): 37 | """ 38 | Allow users to create a specific file as a signal to pause 39 | and iteratively debug the training. 40 | Once the :meth:`trigger` method is called, it detects whether the file exists, and opens an 41 | IPython/pdb shell if yes. 42 | In the shell, `self` is this callback, `self.trainer` is the trainer, and 43 | from that you can access everything else. 44 | 45 | Example: 46 | 47 | .. code-block:: none 48 | 49 | callbacks=[InjectShell('/path/to/pause-training.tmp'), ...] 50 | 51 | # the following command will pause the training when the epoch finishes: 52 | $ touch /path/to/pause-training.tmp 53 | 54 | """ 55 | 56 | def __init__(self, file='INJECT_SHELL.tmp', shell='ipython'): 57 | """ 58 | Args: 59 | file (str): if this file exists, will open a shell. 60 | shell (str): one of 'ipython', 'pdb' 61 | """ 62 | self._file = file 63 | assert shell in ['ipython', 'pdb'] 64 | self._shell = shell 65 | logger.info("Create a file '{}' to open {} shell.".format(file, shell)) 66 | 67 | def _trigger(self): 68 | if os.path.isfile(self._file): 69 | logger.info("File {} exists, entering shell.".format(self._file)) 70 | self._inject() 71 | 72 | def _inject(self): 73 | trainer = self.trainer # noqa 74 | if self._shell == 'ipython': 75 | import IPython as IP # noqa 76 | IP.embed() 77 | elif self._shell == 'pdb': 78 | import pdb # noqa 79 | pdb.set_trace() 80 | 81 | def _after_train(self): 82 | if os.path.isfile(self._file): 83 | os.unlink(self._file) 84 | 85 | 86 | class EstimatedTimeLeft(Callback): 87 | """ 88 | Estimate the time left until completion of training. 89 | """ 90 | def __init__(self, last_k_epochs=5, median=False): 91 | """ 92 | Args: 93 | last_k_epochs (int): Use the time spent on last k epochs to estimate total time left. 94 | median (bool): Use mean by default. If True, use the median time spent on last k epochs. 95 | """ 96 | self._times = deque(maxlen=last_k_epochs) 97 | self._median = median 98 | 99 | def _before_train(self): 100 | self._max_epoch = self.trainer.max_epoch 101 | self._last_time = time.time() 102 | 103 | def _trigger_epoch(self): 104 | duration = time.time() - self._last_time 105 | self._last_time = time.time() 106 | self._times.append(duration) 107 | 108 | epoch_time = np.median(self._times) if self._median else np.mean(self._times) 109 | time_left = (self._max_epoch - self.epoch_num) * epoch_time 110 | if time_left > 0: 111 | logger.info("Estimated Time Left: " + humanize_time_delta(time_left)) 112 | -------------------------------------------------------------------------------- /tensorpack/models/layer_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: layer_norm.py 5 | 6 | 7 | import tensorflow as tf 8 | 9 | from ..utils.argtools import get_data_format 10 | from .common import VariableHolder, layer_register 11 | 12 | __all__ = ['LayerNorm', 'InstanceNorm'] 13 | 14 | 15 | @layer_register() 16 | def LayerNorm( 17 | x, epsilon=1e-5, 18 | use_bias=True, use_scale=True, 19 | gamma_init=None, data_format='channels_last'): 20 | """ 21 | Layer Normalization layer, as described in the paper: 22 | `Layer Normalization `_. 23 | 24 | Args: 25 | x (tf.Tensor): a 4D or 2D tensor. When 4D, the layout should match data_format. 26 | epsilon (float): epsilon to avoid divide-by-zero. 27 | use_scale, use_bias (bool): whether to use the extra affine transformation or not. 28 | """ 29 | data_format = get_data_format(data_format, tfmode=False) 30 | shape = x.get_shape().as_list() 31 | ndims = len(shape) 32 | assert ndims in [2, 4] 33 | 34 | mean, var = tf.nn.moments(x, list(range(1, len(shape))), keep_dims=True) 35 | 36 | if data_format == 'NCHW': 37 | chan = shape[1] 38 | new_shape = [1, chan, 1, 1] 39 | else: 40 | chan = shape[-1] 41 | new_shape = [1, 1, 1, chan] 42 | if ndims == 2: 43 | new_shape = [1, chan] 44 | 45 | if use_bias: 46 | beta = tf.get_variable('beta', [chan], initializer=tf.constant_initializer()) 47 | beta = tf.reshape(beta, new_shape) 48 | else: 49 | beta = tf.zeros([1] * ndims, name='beta') 50 | if use_scale: 51 | if gamma_init is None: 52 | gamma_init = tf.constant_initializer(1.0) 53 | gamma = tf.get_variable('gamma', [chan], initializer=gamma_init) 54 | gamma = tf.reshape(gamma, new_shape) 55 | else: 56 | gamma = tf.ones([1] * ndims, name='gamma') 57 | 58 | ret = tf.nn.batch_normalization(x, mean, var, beta, gamma, epsilon, name='output') 59 | 60 | vh = ret.variables = VariableHolder() 61 | if use_scale: 62 | vh.gamma = gamma 63 | if use_bias: 64 | vh.beta = beta 65 | return ret 66 | 67 | 68 | @layer_register() 69 | def InstanceNorm(x, epsilon=1e-5, use_affine=True, gamma_init=None, data_format='channels_last'): 70 | """ 71 | Instance Normalization, as in the paper: 72 | `Instance Normalization: The Missing Ingredient for Fast Stylization 73 | `_. 74 | 75 | Args: 76 | x (tf.Tensor): a 4D tensor. 77 | epsilon (float): avoid divide-by-zero 78 | use_affine (bool): whether to apply learnable affine transformation 79 | """ 80 | data_format = get_data_format(data_format, tfmode=False) 81 | shape = x.get_shape().as_list() 82 | assert len(shape) == 4, "Input of InstanceNorm has to be 4D!" 83 | 84 | if data_format == 'NHWC': 85 | axis = [1, 2] 86 | ch = shape[3] 87 | new_shape = [1, 1, 1, ch] 88 | else: 89 | axis = [2, 3] 90 | ch = shape[1] 91 | new_shape = [1, ch, 1, 1] 92 | assert ch is not None, "Input of InstanceNorm require known channel!" 
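    # Statistics are computed per sample and per channel, over the spatial axes only;
    # the input is then normalized as (x - mean) / sqrt(var + epsilon), and the optional
    # learnable per-channel affine transform (gamma, beta) is applied below.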
93 | 94 | mean, var = tf.nn.moments(x, axis, keep_dims=True) 95 | 96 | if not use_affine: 97 | return tf.divide(x - mean, tf.sqrt(var + epsilon), name='output') 98 | 99 | beta = tf.get_variable('beta', [ch], initializer=tf.constant_initializer()) 100 | beta = tf.reshape(beta, new_shape) 101 | if gamma_init is None: 102 | gamma_init = tf.constant_initializer(1.0) 103 | gamma = tf.get_variable('gamma', [ch], initializer=gamma_init) 104 | gamma = tf.reshape(gamma, new_shape) 105 | ret = tf.nn.batch_normalization(x, mean, var, beta, gamma, epsilon, name='output') 106 | 107 | vh = ret.variables = VariableHolder() 108 | if use_affine: 109 | vh.gamma = gamma 110 | vh.beta = beta 111 | return ret 112 | -------------------------------------------------------------------------------- /infra/eks/maskrcnn/charts/mpi-operator/templates/mpi-operator.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 5 | labels: 6 | app.kubernetes.io/name: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 7 | app.kubernetes.io/instance: {{ .Release.Name }} 8 | app.kubernetes.io/managed-by: {{ .Release.Service }} 9 | rules: 10 | - apiGroups: 11 | - "" 12 | resources: 13 | - configmaps 14 | - serviceaccounts 15 | verbs: 16 | - create 17 | - list 18 | - watch 19 | - apiGroups: 20 | - "" 21 | resources: 22 | - pods 23 | verbs: 24 | - get 25 | - apiGroups: 26 | - "" 27 | resources: 28 | - pods/exec 29 | verbs: 30 | - create 31 | - apiGroups: 32 | - "" 33 | resources: 34 | - events 35 | verbs: 36 | - create 37 | - patch 38 | - apiGroups: 39 | - rbac.authorization.k8s.io 40 | resources: 41 | - roles 42 | - rolebindings 43 | verbs: 44 | - create 45 | - list 46 | - watch 47 | - apiGroups: 48 | - apps 49 | resources: 50 | - statefulsets 51 | verbs: 52 | - create 53 | - list 54 | - update 55 | - watch 56 | - apiGroups: 57 | - batch 58 | resources: 59 | - jobs 60 | verbs: 61 | - create 62 | - list 63 | - update 64 | - watch 65 | - apiGroups: 66 | - apiextensions.k8s.io 67 | resources: 68 | - customresourcedefinitions 69 | verbs: 70 | - create 71 | - get 72 | - apiGroups: 73 | - kubeflow.org 74 | resources: 75 | - mpijobs 76 | verbs: 77 | - '*' 78 | --- 79 | apiVersion: v1 80 | kind: ServiceAccount 81 | metadata: 82 | name: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 83 | namespace: {{ .Values.global.namespace }} 84 | labels: 85 | app.kubernetes.io/name: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 86 | app.kubernetes.io/instance: {{ .Release.Name }} 87 | app.kubernetes.io/managed-by: {{ .Release.Service }} 88 | --- 89 | apiVersion: rbac.authorization.k8s.io/v1 90 | kind: ClusterRoleBinding 91 | metadata: 92 | name: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 93 | namespace: {{ .Values.global.namespace }} 94 | labels: 95 | app.kubernetes.io/name: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 96 | app.kubernetes.io/instance: {{ .Release.Name }} 97 | app.kubernetes.io/managed-by: {{ .Release.Service }} 98 | roleRef: 99 | apiGroup: rbac.authorization.k8s.io 100 | kind: ClusterRole 101 | name: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 102 | subjects: 103 | - kind: ServiceAccount 104 | name: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 105 | namespace: {{ .Values.global.namespace }} 106 | --- 107 | apiVersion: apps/v1 108 | kind: Deployment 109 | metadata: 110 | name: {{ 
.Values.mpioperator.name }}-{{ .Values.global.name }} 111 | namespace: {{ .Values.global.namespace }} 112 | labels: 113 | app.kubernetes.io/name: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 114 | app.kubernetes.io/instance: {{ .Release.Name }} 115 | app.kubernetes.io/managed-by: {{ .Release.Service }} 116 | spec: 117 | replicas: 1 118 | selector: 119 | matchLabels: 120 | app.kubernetes.io/name: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 121 | template: 122 | metadata: 123 | labels: 124 | app.kubernetes.io/name: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 125 | app.kubernetes.io/instance: {{ .Release.Name }} 126 | app.kubernetes.io/managed-by: {{ .Release.Service }} 127 | spec: 128 | containers: 129 | - args: 130 | - --gpus-per-node 131 | - "{{ .Values.mpioperator.gpuspernode }}" 132 | - --kubectl-delivery-image 133 | - {{ .Values.mpioperator.deliveryimage }} 134 | image: {{ .Values.mpioperator.image }} 135 | imagePullPolicy: {{ .Values.mpioperator.pullpolicy }} 136 | name: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 137 | serviceAccountName: {{ .Values.mpioperator.name }}-{{ .Values.global.name }} 138 | --- -------------------------------------------------------------------------------- /tensorpack/dataflow/dataset/bsds500.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # -*- coding: utf-8 -*- 4 | # File: bsds500.py 5 | 6 | 7 | import glob 8 | import numpy as np 9 | import os 10 | 11 | from ...utils.fs import download, get_dataset_path 12 | from ..base import RNGDataFlow 13 | 14 | __all__ = ['BSDS500'] 15 | 16 | 17 | DATA_URL = "http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/BSR/BSR_bsds500.tgz" 18 | DATA_SIZE = 70763455 19 | IMG_W, IMG_H = 481, 321 20 | 21 | 22 | class BSDS500(RNGDataFlow): 23 | """ 24 | `Berkeley Segmentation Data Set and Benchmarks 500 dataset 25 | `_. 26 | 27 | Produce ``(image, label)`` pair, where ``image`` has shape (321, 481, 3(BGR)) and 28 | ranges in [0,255]. 29 | ``Label`` is a floating point image of shape (321, 481) in range [0, 1]. 30 | The value of each pixel is ``number of times it is annotated as edge / total number of annotators for this image``. 31 | """ 32 | 33 | def __init__(self, name, data_dir=None, shuffle=True): 34 | """ 35 | Args: 36 | name (str): 'train', 'test', 'val' 37 | data_dir (str): a directory containing the original 'BSR' directory. 
38 | """ 39 | # check and download data 40 | if data_dir is None: 41 | data_dir = get_dataset_path('bsds500_data') 42 | if not os.path.isdir(os.path.join(data_dir, 'BSR')): 43 | download(DATA_URL, data_dir, expect_size=DATA_SIZE) 44 | filename = DATA_URL.split('/')[-1] 45 | filepath = os.path.join(data_dir, filename) 46 | import tarfile 47 | tarfile.open(filepath, 'r:gz').extractall(data_dir) 48 | self.data_root = os.path.join(data_dir, 'BSR', 'BSDS500', 'data') 49 | assert os.path.isdir(self.data_root) 50 | 51 | self.shuffle = shuffle 52 | assert name in ['train', 'test', 'val'] 53 | self._load(name) 54 | 55 | def _load(self, name): 56 | image_glob = os.path.join(self.data_root, 'images', name, '*.jpg') 57 | image_files = glob.glob(image_glob) 58 | gt_dir = os.path.join(self.data_root, 'groundTruth', name) 59 | self.data = np.zeros((len(image_files), IMG_H, IMG_W, 3), dtype='uint8') 60 | self.label = np.zeros((len(image_files), IMG_H, IMG_W), dtype='float32') 61 | 62 | for idx, f in enumerate(image_files): 63 | im = cv2.imread(f, cv2.IMREAD_COLOR) 64 | assert im is not None 65 | if im.shape[0] > im.shape[1]: 66 | im = np.transpose(im, (1, 0, 2)) 67 | assert im.shape[:2] == (IMG_H, IMG_W), "{} != {}".format(im.shape[:2], (IMG_H, IMG_W)) 68 | 69 | imgid = os.path.basename(f).split('.')[0] 70 | gt_file = os.path.join(gt_dir, imgid) 71 | gt = loadmat(gt_file)['groundTruth'][0] 72 | n_annot = gt.shape[0] 73 | gt = sum(gt[k]['Boundaries'][0][0] for k in range(n_annot)) 74 | gt = gt.astype('float32') 75 | gt *= 1.0 / n_annot 76 | if gt.shape[0] > gt.shape[1]: 77 | gt = gt.transpose() 78 | assert gt.shape == (IMG_H, IMG_W) 79 | 80 | self.data[idx] = im 81 | self.label[idx] = gt 82 | 83 | def __len__(self): 84 | return self.data.shape[0] 85 | 86 | def __iter__(self): 87 | idxs = np.arange(self.data.shape[0]) 88 | if self.shuffle: 89 | self.rng.shuffle(idxs) 90 | for k in idxs: 91 | yield [self.data[k], self.label[k]] 92 | 93 | 94 | try: 95 | from scipy.io import loadmat 96 | import cv2 97 | except ImportError: 98 | from ...utils.develop import create_dummy_class 99 | BSDS500 = create_dummy_class('BSDS500', ['scipy.io', 'cv2']) # noqa 100 | 101 | if __name__ == '__main__': 102 | a = BSDS500('val') 103 | a.reset_state() 104 | for k in a: 105 | cv2.imshow("haha", k[1].astype('uint8') * 255) 106 | cv2.waitKey(1000) 107 | -------------------------------------------------------------------------------- /infra/eks/yaml_overlay: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | #!/usr/bin/env python3 4 | 5 | import argparse 6 | import os 7 | import sys 8 | import yaml 9 | import yamlloader 10 | 11 | 12 | 13 | 14 | 15 | def apply_overlay(base, overlay, append=False): 16 | """ 17 | 18 | :param base: Dict of yaml to apply changes to. Gets mutated 19 | :param overlay: Dict of changes. Identical structure to base 20 | :param append: True to append, false to replace values 21 | :return: base dict with changes applied. 
Mutation of base input dict 22 | """ 23 | 24 | 25 | for k1, v1 in overlay.items(): 26 | if not isinstance(v1, dict): 27 | if append: 28 | base[k1] += v1 29 | else: 30 | base[k1] = v1 31 | 32 | 33 | else: 34 | for k2, v2 in v1.items(): 35 | if not isinstance(v2, dict): 36 | if append: 37 | base[k1][k2] += v2 38 | else: 39 | base[k1][k2] = v2 40 | 41 | 42 | else: 43 | for k3, v3 in v2.items(): 44 | if not isinstance(v3, dict): 45 | if append: 46 | base[k1][k2][k3] += v3 47 | else: 48 | base[k1][k2][k3] = v3 49 | 50 | 51 | else: 52 | for k4, v4 in v3.items(): 53 | if not isinstance(v4, dict): 54 | if append: 55 | base[k1][k2][k3][k4] += v4 56 | else: 57 | base[k1][k2][k3][k4] = v4 58 | else: 59 | raise NotImplementedError("Exceeds current yaml max depth") 60 | return base 61 | 62 | 63 | 64 | 65 | 66 | 67 | if __name__ == '__main__': 68 | 69 | parser = argparse.ArgumentParser(description='Create a variant of a yaml by applying overlays which describe changes') 70 | parser.add_argument('base_yaml_path', 71 | help="Yaml to use as base. If '-' is given, will read from stdin instead.") 72 | parser.add_argument('overlays', metavar='N', type=str, nargs='+', 73 | help='Overlays to apply in sequential order') 74 | parser.add_argument('--overlay_dir', type=str, 75 | help='Path to dir containing all overlays. Can be passed in through the OVERLAY_DIR ' 76 | 'environment variable. If both env var and cli arg are present, cli arg wins.') 77 | 78 | args = parser.parse_args() 79 | 80 | overlay_base_dir = None 81 | if "OVERLAY_DIR" in os.environ.keys() and os.environ['OVERLAY_DIR'] is not None: 82 | overlay_base_dir = os.environ['OVERLAY_DIR'] 83 | if args.overlay_dir is not None: 84 | overlay_base_dir = args.overlay_dir 85 | 86 | if args.base_yaml_path == '-': 87 | s = "".join([l for l in sys.stdin]) 88 | values = yaml.load(s, Loader=yamlloader.ordereddict.CLoader) 89 | else: 90 | with open(args.base_yaml_path, 'r') as f: 91 | values = yaml.load(f, Loader=yamlloader.ordereddict.CLoader) 92 | 93 | overlay_dicts = [] 94 | for overlay in args.overlays: 95 | overlay_path = f'{overlay}.yaml' 96 | if overlay_base_dir is not None: 97 | overlay_path = os.path.join(overlay_base_dir, overlay_path) 98 | 99 | with open(overlay_path) as f: 100 | overlay_dicts.append(yaml.load(f, Loader=yamlloader.ordereddict.CLoader)) 101 | 102 | for overlay_dict in overlay_dicts: 103 | if 'append' in overlay_dict.keys(): 104 | values = apply_overlay(values, overlay_dict['append'], append=True) 105 | if 'set' in overlay_dict.keys(): 106 | values = apply_overlay(values, overlay_dict['set'], append=False) 107 | 108 | print(yaml.dump(values, 109 | Dumper=yamlloader.ordereddict.CDumper, 110 | default_flow_style=False)) 111 | --------------------------------------------------------------------------------
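To make the overlay semantics of `apply_overlay` concrete, here is a minimal sketch. It assumes `apply_overlay` from the script above is in scope, and the dictionary contents are made up for illustration: an `append` overlay concatenates onto the existing value, while a `set` overlay replaces it.

```python
# Minimal illustration of the overlay semantics in infra/eks/yaml_overlay.
# Assumes apply_overlay (defined in the script above) is importable or in scope;
# the base/overlay values below are hypothetical.
base = {"global": {"name": "maskrcnn", "namespace": "default"},
        "maskrcnn": {"gpus": 8}}

# 'append' overlays use +=, so strings are concatenated and numbers are added.
base = apply_overlay(base, {"global": {"name": "-experiment1"}}, append=True)

# 'set' overlays replace the existing value outright.
base = apply_overlay(base, {"maskrcnn": {"gpus": 32}}, append=False)

assert base == {"global": {"name": "maskrcnn-experiment1", "namespace": "default"},
                "maskrcnn": {"gpus": 32}}
```

On the command line, the script reads the base yaml (or stdin when `-` is given), resolves each positional overlay name to `<name>.yaml` under `--overlay_dir` (or the `OVERLAY_DIR` environment variable), applies the `append` section of each overlay before its `set` section, and prints the merged yaml to stdout.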