├── .gitignore ├── LICENSE ├── README.md ├── anomaly-detection └── PaDiM │ ├── 0-preparation │ └── preparation.ipynb │ ├── 1-training │ ├── source │ │ ├── mvtec.py │ │ ├── requirements2.txt │ │ └── training.py │ └── training.ipynb │ ├── 2-inference │ ├── Dockerfile │ ├── aws │ │ └── config │ ├── inference.ipynb │ └── source │ │ ├── __init__.py │ │ ├── inference.py │ │ ├── mvtec.py │ │ ├── mytime.py │ │ ├── nginx.conf │ │ ├── predictor.py │ │ ├── requirements.txt │ │ ├── serve │ │ ├── test.py │ │ └── wsgi.py │ ├── LICENSE │ ├── README.md │ └── images │ └── detection_example.png ├── distributed-training ├── PyTorch │ ├── README.md │ ├── code │ │ ├── mnist.py │ │ └── requirements.txt │ └── pytorch_mnist.ipynb └── TensorFlow │ └── data-parallel │ ├── README.md │ ├── code │ ├── requirements.txt │ └── train_tensorflow_smdataparallel_mnist.py │ └── tensorflow2_smdataparallel_mnist_demo.ipynb ├── encapsulation ├── Dockerfile ├── LICENSE ├── README.md ├── hyperparameter-tuning.ipynb ├── inference-custom-image.ipynb ├── inference-default-image.ipynb ├── nginx.conf ├── source │ ├── export_model.py │ ├── processing.py │ ├── requirements.txt │ └── train.py ├── test │ ├── cat.681.jpg │ └── dog.592.jpg └── train.ipynb ├── hyperspectral └── DeepHyperX │ ├── 1-preparation │ ├── explore_data.ipynb │ ├── preparation.ipynb │ └── preprocess.py │ ├── 2-training │ ├── source │ │ ├── custom_datasets.py │ │ ├── datasets.py │ │ ├── inference.py │ │ ├── main.py │ │ ├── models.py │ │ ├── requirements2.txt │ │ └── utils.py │ └── training.ipynb │ ├── 3-inference │ └── inference.ipynb │ ├── LICENSE │ └── README.md ├── image-classification ├── Image-classification-lst-format.ipynb ├── LICENSE ├── README.md └── im2rec.py ├── images └── sagemaker_notebook.png ├── object-detection └── yolov5-on-sagemaker │ ├── 0-preparation │ └── preparation.ipynb │ ├── 1-training │ ├── container │ │ ├── Dockerfile │ │ ├── changehostname.c │ │ ├── local_test │ │ │ └── input │ │ │ │ ├── config │ │ │ │ ├── hyperparameters.json │ │ │ │ └── resourceconfig.json │ │ │ │ └── data │ │ │ │ └── training │ │ │ │ ├── cfg │ │ │ │ ├── hyp.yaml │ │ │ │ └── yolov5s.yaml │ │ │ │ └── weights │ │ │ │ └── yolov5s.pt │ │ ├── sources.list │ │ ├── start_with_right_hostname.sh │ │ └── train │ ├── training-build.ipynb │ └── training.ipynb │ ├── 2-inference │ ├── Dockerfile │ ├── aws │ │ └── config │ ├── inference-build.ipynb │ ├── inference.ipynb │ └── source │ │ ├── detect.py │ │ ├── models │ │ ├── __init__.py │ │ ├── common.py │ │ ├── experimental.py │ │ ├── export.py │ │ ├── hub │ │ │ ├── yolov3-spp.yaml │ │ │ ├── yolov5-fpn.yaml │ │ │ └── yolov5-panet.yaml │ │ ├── yolo.py │ │ ├── yolov5l.yaml │ │ ├── yolov5m.yaml │ │ ├── yolov5s.yaml │ │ └── yolov5x.yaml │ │ ├── nginx.conf │ │ ├── predictor.py │ │ ├── requirements.txt │ │ ├── serve │ │ ├── utils │ │ ├── __init__.py │ │ ├── activations.py │ │ ├── datasets.py │ │ ├── evolve.sh │ │ ├── general.py │ │ ├── google_app_engine │ │ │ ├── Dockerfile │ │ │ ├── additional_requirements.txt │ │ │ └── app.yaml │ │ ├── google_utils.py │ │ └── torch_utils.py │ │ └── wsgi.py │ ├── LICENSE │ ├── README.md │ └── images │ └── detection_example.jpg ├── runtime ├── Java │ ├── Inference.java │ └── pom.xml ├── Java2 │ ├── Inference.java │ └── pom.xml └── LICENSE ├── training-data-input ├── EFS.ipynb ├── FSx.ipynb ├── ListFile.py └── README.md └── update-endpoint └── UpdateEndpoint.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | #this project specific 2 | 
anomaly-detection/PaDiM/0-preparation/bottle 3 | anomaly-detection/PaDiM/1-training/result 4 | anomaly-detection/PaDiM/2-inference/result 5 | anomaly-detection/PaDiM/2-inference/source/train.pkl 6 | image-classification/image-* 7 | object-detection/yolov5-on-sagemaker/0-preparation/biaozhu/* 8 | object-detection/yolov5-on-sagemaker/1-training/container/dockersource/ 9 | object-detection/yolov5-on-sagemaker/1-training/container/local_test/model/ 10 | object-detection/yolov5-on-sagemaker/1-training/container/local_test/input/data/training/cfg/data.yaml 11 | object-detection/yolov5-on-sagemaker/1-training/container/local_test/input/data/training/images/* 12 | object-detection/yolov5-on-sagemaker/1-training/container/local_test/input/data/training/labels/* 13 | object-detection/yolov5-on-sagemaker/1-training/runs/* 14 | object-detection/yolov5-on-sagemaker/1-training/model_data.txt 15 | object-detection/yolov5-on-sagemaker/2-inference/source/yolov5s.pt 16 | object-detection/yolov5-on-sagemaker/2-inference/result/* 17 | object-detection/yolov5-on-sagemaker/2-inference/model/ 18 | hyperspectral/DeepHyperX/1-preparation/dataset/ 19 | hyperspectral/DeepHyperX/1-preparation/Datasets/ 20 | hyperspectral/DeepHyperX/2-training/result/ 21 | distributed-training/PyTorch/data/ 22 | update-endpoint/data/ 23 | 24 | 25 | 26 | venv/* 27 | */cdk.out/* 28 | 29 | 30 | # Compiled class file 31 | *.class 32 | 33 | # Log file 34 | *.log 35 | 36 | # BlueJ files 37 | *.ctxt 38 | 39 | # Mobile Tools for Java (J2ME) 40 | .mtj.tmp/ 41 | 42 | # Package Files # 43 | #*.jar 44 | *.war 45 | *.nar 46 | *.ear 47 | *.zip 48 | *.tar.gz 49 | *.rar 50 | *.pth 51 | 52 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 53 | hs_err_pid* 54 | 55 | 56 | __pycache__/ 57 | */__pycache__ 58 | /sample_data/* 59 | */dataset/* 60 | /models/ 61 | */models 62 | /.idea/* 63 | /logs/* 64 | /classes/* 65 | !.mvn/wrapper/maven-wrapper.jar 66 | .DS_Store 67 | .idea/ 68 | 69 | 70 | 71 | *Dataset/ 72 | *Datasets/ 73 | *checkpoints/ 74 | *checkpoint/ 75 | 76 | .DS_Store 77 | ### STS ### 78 | .apt_generated 79 | .classpath 80 | .factorypath 81 | .project 82 | .settings 83 | .springBeans 84 | .sts4-cache 85 | 86 | .ipynb_checkpoints/ 87 | .ipynb_checkpoints/* 88 | .ipynb_checkpoints 89 | 90 | ### IntelliJ IDEA ### 91 | .idea 92 | *.iws 93 | *.iml 94 | *.ipr 95 | */target/* 96 | */target 97 | /target/* 98 | /target/ 99 | */dataset 100 | /sample_data/* 101 | 102 | /raw-data/* 103 | */cdk.context.json 104 | cdk-infra/cdk.context.json 105 | /output/* 106 | cdk.out* 107 | *cdk.out* 108 | build.sh 109 | 110 | ### NetBeans ### 111 | /nbproject/private/ 112 | /build/ 113 | /nbbuild/ 114 | /dist/ 115 | /nbdist/ 116 | /.nb-gradle/ 117 | .idea/ 118 | .idea/* 119 | 120 | */.env/ 121 | .env/* 122 | */.env/* 123 | 124 | /cdk.out/ 125 | */cdk.out/ 126 | 127 | */temp/* 128 | */temp 129 | /temp/* 130 | /temp/* 131 | /temp/ 132 | ./temp/ 133 | cdk.context.json 134 | 135 | *.zip 136 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 本repository为多个软件(算法或项目)的目录,不是一个具体软件。各个软件(算法或项目)在具体文件夹中,比如[anomaly-detection/PaDiM](anomaly-detection/PaDiM),由于上游软件(算法或项目)的LICENSE不同,各个软件(算法或项目)的LICENSE在具体软件(算法或项目)中提供。 2 | 本目录收集的各个软件(算法或项目)互相独立,无依赖。 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 
Amazon SageMaker Workshop 2 | 利用Amazon SageMaker进行机器学习和深度学习开发。 3 | ## 版权说明 4 | 本repository为多个软件(算法或项目)的目录,不是一个具体软件。各个软件(算法或项目)在具体文件夹中,比如[anomaly-detection/PaDiM](anomaly-detection/PaDiM),由于上游软件(算法或项目)的LICENSE不同,各个软件(算法或项目)的LICENSE在具体软件(算法或项目)中提供。 5 | 本目录收集的各个软件(算法或项目)互相独立,无依赖。 6 | ## 免责声明 7 | 建议测试过程中使用此方案,生产环境使用请自行考虑评估。 8 | 9 | 当您对方案需要进一步的沟通和反馈后,可以联系 nwcd_labs@nwcdcloud.cn 获得更进一步的支持。 10 | 11 | 欢迎联系参与方案共建和提交方案需求, 也欢迎在 github 项目 issue 中留言反馈 bugs。 12 | 13 | ## 内容简介 14 | 以深度学习中的常用场景,介绍如何使用Amazon SageMaker进行模型训练和推理部署。 15 | 16 | 本目录有以下内容: 17 | - [异常检测anomaly-detection](anomaly-detection/PaDiM/README.md),使用PaDiM演示异常检测 18 | - [图片分类image-classification](image-classification/README.md),使用Amazon SageMaker内置的图片分类算法进行模型训练和部署 19 | - [对象检测object-detection](object-detection/yolov5-on-sagemaker/README.md),使用YOLOv5算法演示对象检测 20 | - [高光谱hyperspectral](hyperspectral/DeepHyperX/README.md),使用DeepHyperX算法对高光谱进行处理 21 | - [封装自定义算法encapsulation](encapsulation/README.md),使用自定义算法,通过Amazon SageMaker进行封装在AWS平台上进行模型训练和部署 22 | - [分布式训练distributed-training](distributed-training),多机多卡分布式训练,[PyTorch](distributed-training/PyTorch/README.md)、[TensorFlow](distributed-training/TensorFlow/data-parallel/README.md) 23 | - [训练数据输入](training-data-input/README.md),解决直接从S3上下载训练数据耗时过长问题;也支持从EFS获取训练数据 24 | - [在线更新模型](update-endpoint/UpdateEndpoint.ipynb),在不停止endpoint服务情况下,更新模型 25 | - [运行时客户端调用runtime](runtime),[Java SDK2调用推理示例](runtime/Java2)(推荐)、[Java SDK1调用推理示例](runtime/Java) 26 | 27 | ## 准备工作 28 | 为了使用Amazon SageMaker您只需要拥有一个AWS的账号,我们就可以实践起来。 29 | 30 | ## 常见问题 31 | ### 1.升级相应Kernel中sagemaker版本 32 | 以升级tensorflow_p36 kernal中sagemaker为例,可先使用`conda env list`查看当前所有虚拟环境 33 | ``` 34 | conda env list 35 | source activate tensorflow_p36 36 | pip install sagemaker --upgrade 37 | ``` 38 | 执行完以上命令重启kernel 39 | ### 2.提示`ResourceLimitExceeded` 40 | 如果训练时,提示类似以下内容: 41 | ``` 42 | ResourceLimitExceeded: An error occurred (ResourceLimitExceeded) when calling the CreateTrainingJob operation: The account-level service limit 'ml.p3.2xlarge for spot training job usage' is 0 Instances, with current utilization of 0 Instances and a request delta of 1 Instances. Please contact AWS support to request an increase for this limit. 
43 | ``` 44 | 为避免误操作造成浪费,默认未开通ml机型大型实例,需要在支持控制面板创建案例,选择提高服务限制。 45 | 限制类型选择`SageMaker`,根据需要选择对应区域,资源类型选择`SageMaker培训`,限制选择期望的机型。 46 | 如果要使用Spot实例进行训练,在描述中说明,参考:`希望提升宁夏区域的 Sagemaker Managed Spot Training ml.p3.2xlarge 限额为1。` 47 | 如果要对推理的机型进行提高服务限制,资源类型选择`SageMaker托管`。 -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/0-preparation/preparation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "palestinian-oriental", 6 | "metadata": {}, 7 | "source": [ 8 | "# PaDiM on SageMaker--数据准备" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "still-tonight", 14 | "metadata": {}, 15 | "source": [ 16 | "## 说明\n", 17 | "本章内容为准备需要所需数据\n", 18 | "## 运行环境\n", 19 | "Kernel 选择pytorch_latest_p37。 \n", 20 | "### S3目录存放格式\n", 21 | "```\n", 22 | "training\n", 23 | "├── ground_truth\n", 24 | "│ ├── broken_1\n", 25 | "│ │ ├── image001.jpg\n", 26 | "│ │ ├── image002.jpg\n", 27 | "│ │ └── ...\n", 28 | "│ └── broken_2\n", 29 | "│ ├── image101.jpg\n", 30 | "│ ├── image102.jpg\n", 31 | "│ └── ...\n", 32 | "├── test\n", 33 | "│ ├── broken_1\n", 34 | "│ │ ├── image001.jpg\n", 35 | "│ │ ├── image002.jpg\n", 36 | "│ │ └── ...\n", 37 | "│ ├── broken_2\n", 38 | "│ │ ├── image101.jpg\n", 39 | "│ │ ├── image102.jpg\n", 40 | "│ │ └── ...\n", 41 | "│ └── good\n", 42 | "│ ├── image201.jpg\n", 43 | "│ ├── image202.jpg\n", 44 | "│ └── ...\n", 45 | "└── train\n", 46 | " └── good\n", 47 | " ├── image301.txt\n", 48 | " ├── image302.txt\n", 49 | " └── ...\n", 50 | "```\n", 51 | "### SageMaker输入数据根目录\n", 52 | "运行SageMaker时,SageMaker会从S3拷贝数据放到到运行容器的`/opt/ml/input/data/training/`下。即`ground_truth/broken_1/image001.jpg`对应全路径为`/opt/ml/input/data/training/ground_truth/broken_1/image001.jpg`\n", 53 | "### 文件说明\n", 54 | "- train目录下只能有一个good目录\n", 55 | "- test目录下除了有一个good目录,还需要至少1个非good目录 \n", 56 | "- ground_truth目录下只有非good目录,且和test目录、文件名一致" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "id": "similar-projection", 62 | "metadata": {}, 63 | "source": [ 64 | "## 下载示例数据\n", 65 | "访问https://www.mvtec.com/company/research/datasets/mvtec-ad/ 下载数据,本文有所修改,请下载单独类别。\n", 66 | "本文以bottle.tar.xz为例进行介绍。" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "id": "capable-barrel", 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "!xz -d bottle.tar.xz" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "id": "possible-prediction", 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "!tar -xf bottle.tar" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "id": "supposed-backing", 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "#增加写权限\n", 97 | "!chmod -R u+w bottle" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "id": "bored-google", 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "#去掉ground_truth文件名称中的_mask\n", 108 | "import os\n", 109 | "gt_dir=os.path.join(\"bottle\",'ground_truth')\n", 110 | "img_types = sorted(os.listdir(gt_dir))\n", 111 | "for img_type in img_types:\n", 112 | " img_type_dir = os.path.join(gt_dir, img_type)\n", 113 | " for f in sorted(os.listdir(img_type_dir)):\n", 114 | " if(f.find(\"_mask\")!=-1):\n", 115 | " os.rename(os.path.join(img_type_dir,f),os.path.join(img_type_dir,f.replace(\"_mask\",\"\")))" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | 
"execution_count": null, 121 | "id": "fantastic-richmond", 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "#修改input_data,训练章节中会继续用到该地址\n", 126 | "input_data = 's3://junzhong/data/mvtec/bottle/'\n", 127 | "!aws s3 sync --quiet bottle $input_data" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "id": "falling-drama", 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [] 137 | } 138 | ], 139 | "metadata": { 140 | "kernelspec": { 141 | "display_name": "Environment (conda_pytorch_latest_p37)", 142 | "language": "python", 143 | "name": "conda_pytorch_latest_p37" 144 | }, 145 | "language_info": { 146 | "codemirror_mode": { 147 | "name": "ipython", 148 | "version": 3 149 | }, 150 | "file_extension": ".py", 151 | "mimetype": "text/x-python", 152 | "name": "python", 153 | "nbconvert_exporter": "python", 154 | "pygments_lexer": "ipython3", 155 | "version": "3.7.10" 156 | } 157 | }, 158 | "nbformat": 4, 159 | "nbformat_minor": 5 160 | } 161 | -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/1-training/source/mvtec.py: -------------------------------------------------------------------------------- 1 | import os 2 | # import tarfile 3 | from PIL import Image 4 | from tqdm import tqdm 5 | # import urllib.request 6 | 7 | import torch 8 | from torch.utils.data import Dataset 9 | from torchvision import transforms as T 10 | 11 | 12 | # URL = 'ftp://guest:GU.205dldo@ftp.softronics.ch/mvtec_anomaly_detection/mvtec_anomaly_detection.tar.xz' 13 | 14 | 15 | class MVTecDataset(Dataset): 16 | def __init__(self, dataset_path, is_train=True, 17 | resize=256, cropsize=224): 18 | #assert class_name in CLASS_NAMES, 'class_name: {}, should be in {}'.format(class_name, CLASS_NAMES) 19 | self.dataset_path = dataset_path 20 | #self.class_name = class_name 21 | self.is_train = is_train 22 | self.resize = resize 23 | self.cropsize = cropsize 24 | # self.mvtec_folder_path = os.path.join(root_path, 'mvtec_anomaly_detection') 25 | 26 | # download dataset if not exist 27 | # self.download() 28 | 29 | # load dataset 30 | self.x, self.y, self.mask = self.load_dataset_folder() 31 | 32 | # set transforms 33 | self.transform_x = T.Compose([T.Resize(resize, Image.ANTIALIAS), 34 | T.CenterCrop(cropsize), 35 | T.ToTensor(), 36 | T.Normalize(mean=[0.485, 0.456, 0.406], 37 | std=[0.229, 0.224, 0.225])]) 38 | self.transform_mask = T.Compose([T.Resize(resize, Image.NEAREST), 39 | T.CenterCrop(cropsize), 40 | T.ToTensor()]) 41 | 42 | def __getitem__(self, idx): 43 | x, y, mask = self.x[idx], self.y[idx], self.mask[idx] 44 | 45 | x = Image.open(x).convert('RGB') 46 | x = self.transform_x(x) 47 | 48 | if y == 0 or not self.hasGt:#good or not has gt 49 | mask = torch.zeros([1, self.cropsize, self.cropsize]) 50 | else: 51 | mask = Image.open(mask) 52 | mask = self.transform_mask(mask) 53 | 54 | return x, y, mask 55 | 56 | def __len__(self): 57 | return len(self.x) 58 | 59 | def load_dataset_folder(self): 60 | phase = 'train' if self.is_train else 'test' 61 | x, y, mask = [], [], [] 62 | 63 | #img_dir = os.path.join(self.dataset_path, self.class_name, phase) 64 | #gt_dir = os.path.join(self.dataset_path, self.class_name, 'ground_truth') 65 | img_dir = os.path.join(self.dataset_path, phase) 66 | gt_dir = os.path.join(self.dataset_path, 'ground_truth') 67 | hasGt = True if os.path.exists(gt_dir) else False 68 | self.hasGt = hasGt 69 | 70 | img_types = sorted(os.listdir(img_dir)) 71 | for img_type in img_types: 72 | 73 | # 
load images 74 | img_type_dir = os.path.join(img_dir, img_type) 75 | if not os.path.isdir(img_type_dir): 76 | continue 77 | img_fpath_list = sorted([os.path.join(img_type_dir, f) 78 | for f in os.listdir(img_type_dir) 79 | if f.endswith('.jpg') or f.endswith('.png')]) 80 | x.extend(img_fpath_list) 81 | 82 | # load gt labels 83 | if img_type == 'good': 84 | y.extend([0] * len(img_fpath_list)) 85 | mask.extend([None] * len(img_fpath_list)) 86 | else: 87 | y.extend([1] * len(img_fpath_list)) 88 | if hasGt: 89 | gt_type_dir = os.path.join(gt_dir, img_type) 90 | #img_fname_list = [os.path.splitext(os.path.basename(f))[0] for f in img_fpath_list] 91 | img_fname_list = [os.path.basename(f) for f in img_fpath_list] 92 | gt_fpath_list = [os.path.join(gt_type_dir, img_fname) 93 | for img_fname in img_fname_list] 94 | mask.extend(gt_fpath_list) 95 | else: 96 | mask.extend([None] * len(img_fpath_list)) 97 | 98 | 99 | assert len(x) == len(y), 'number of x and y should be same' 100 | 101 | return list(x), list(y), list(mask) 102 | 103 | # def download(self): 104 | # """Download dataset if not exist""" 105 | 106 | # if not os.path.exists(self.mvtec_folder_path): 107 | # tar_file_path = self.mvtec_folder_path + '.tar.xz' 108 | # if not os.path.exists(tar_file_path): 109 | # download_url(URL, tar_file_path) 110 | # print('unzip downloaded dataset: %s' % tar_file_path) 111 | # tar = tarfile.open(tar_file_path, 'r:xz') 112 | # tar.extractall(self.mvtec_folder_path) 113 | # tar.close() 114 | 115 | # return 116 | 117 | 118 | # class DownloadProgressBar(tqdm): 119 | # def update_to(self, b=1, bsize=1, tsize=None): 120 | # if tsize is not None: 121 | # self.total = tsize 122 | # self.update(b * bsize - self.n) 123 | 124 | 125 | # def download_url(url, output_path): 126 | # with DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc=url.split('/')[-1]) as t: 127 | # urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to) 128 | -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/1-training/source/requirements2.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | sklearn 3 | matplotlib 4 | scikit-image -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/1-training/training.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "applicable-zimbabwe", 6 | "metadata": {}, 7 | "source": [ 8 | "# PaDiM on SageMaker--训练" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "ethical-commitment", 14 | "metadata": {}, 15 | "source": [ 16 | "## 说明\n", 17 | "本章内容为调用SageMaker进行训练,数据来自S3,训练后的模型放到S3。" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "id": "southeast-pasta", 23 | "metadata": {}, 24 | "source": [ 25 | "## 运行环境\n", 26 | "Kernel 选择pytorch_latest_p37。 \n", 27 | "本文在boto3 1.17.12和sagemaker 2.26.0下测试通过。" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "alert-trial", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "import boto3,sagemaker\n", 38 | "print(boto3.__version__)\n", 39 | "print(sagemaker.__version__)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "acute-instrumentation", 45 | "metadata": {}, 46 | "source": [ 47 | "## 训练" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "id": "german-protein", 54 | "metadata": 
{}, 55 | "outputs": [], 56 | "source": [ 57 | "#修改为自己的路径\n", 58 | "input_data = 's3://junzhong/data/mvtec/bottle/'\n", 59 | "output_data = 's3://junzhong/result/mvtec/'" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "id": "independent-migration", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "import boto3\n", 70 | "iam = boto3.client('iam')\n", 71 | "roles = iam.list_roles(PathPrefix='/service-role')\n", 72 | "role=\"\"\n", 73 | "for current_role in roles[\"Roles\"]:\n", 74 | " if current_role[\"RoleName\"].startswith(\"AmazonSageMaker-ExecutionRole-\"):\n", 75 | " role=current_role[\"Arn\"]\n", 76 | " break\n", 77 | "#如果role为空表示有问题,需要先打开https://cn-northwest-1.console.amazonaws.cn/sagemaker/home?region=cn-northwest-1#/notebook-instances/create以创建IAM Role\n", 78 | "print(role)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "id": "sapphire-ordinary", 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "from sagemaker.pytorch import PyTorch\n", 89 | "\n", 90 | "instance_type='ml.m5.2xlarge'\n", 91 | "\n", 92 | "estimator = PyTorch(entry_point='training.py',\n", 93 | " source_dir='./source',\n", 94 | " role=role,\n", 95 | " output_path=output_data,\n", 96 | " framework_version='1.6.0',\n", 97 | " hyperparameters={'data_path':\"/opt/ml/input/data/training/\", 'save_path':'/opt/ml/model'}, \n", 98 | " py_version='py3',\n", 99 | " instance_count=1,\n", 100 | " instance_type=instance_type,\n", 101 | " use_spot_instances=True,\n", 102 | " max_wait=432000,\n", 103 | " )" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "id": "solved-network", 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "result = estimator.fit(input_data)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "id": "academic-parts", 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "import os\n", 124 | "os.makedirs(\"result\", exist_ok=True)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "id": "spatial-republic", 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "!aws s3 cp $estimator.model_data ./result" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "id": "confirmed-replacement", 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "%%sh\n", 145 | "cd result\n", 146 | "tar zxvf model.tar.gz" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "id": "illegal-billion", 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "!pwd" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "id": "mysterious-publicity", 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "!mv result/temp_wide_resnet50_2/train.pkl ../2-inference/source/" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "id": "primary-fifty", 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [] 176 | } 177 | ], 178 | "metadata": { 179 | "kernelspec": { 180 | "display_name": "Environment (conda_pytorch_latest_p37)", 181 | "language": "python", 182 | "name": "conda_pytorch_latest_p37" 183 | }, 184 | "language_info": { 185 | "codemirror_mode": { 186 | "name": "ipython", 187 | "version": 3 188 | }, 189 | "file_extension": ".py", 190 | "mimetype": "text/x-python", 191 | "name": "python", 192 | 
"nbconvert_exporter": "python", 193 | "pygments_lexer": "ipython3", 194 | "version": "3.7.10" 195 | } 196 | }, 197 | "nbformat": 4, 198 | "nbformat_minor": 5 199 | } 200 | -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/2-inference/Dockerfile: -------------------------------------------------------------------------------- 1 | #ARG BASE_IMG=763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference:1.6.0-gpu-py36-cu101-ubuntu16.04 2 | ARG BASE_IMG=${BASE_IMG} 3 | FROM ${BASE_IMG} 4 | 5 | RUN apt-get update 6 | RUN apt-get install -y --no-install-recommends nginx net-tools\ 7 | && rm -rf /var/lib/apt/lists/* 8 | 9 | RUN pip install flask gevent gunicorn boto3 -i https://opentuna.cn/pypi/web/simple/ && \ 10 | rm -rf /root/.cache 11 | 12 | COPY aws /root/.aws 13 | # RUN mkdir /opt/ml/code 14 | WORKDIR /opt/ml/code 15 | COPY source ./ 16 | 17 | RUN pip install -r requirements.txt -i https://opentuna.cn/pypi/web/simple/ 18 | 19 | # Set some environment variables. PYTHONUNBUFFERED keeps Python from buffering our standard 20 | # output stream, which means that logs can be delivered to the user quickly. PYTHONDONTWRITEBYTECODE 21 | # keeps Python from writing the .pyc files which are unnecessary in this case. We also update 22 | # PATH so that the train and serve programs are found when the container is invoked. 23 | 24 | ENV PYTHONUNBUFFERED=TRUE 25 | ENV PYTHONDONTWRITEBYTECODE=TRUE 26 | ENV PATH="/opt/ml/code/:${PATH}" 27 | 28 | ENTRYPOINT ["python3"] -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/2-inference/aws/config: -------------------------------------------------------------------------------- 1 | [default] 2 | region = cn-northwest-1 3 | -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/2-inference/source/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nwcdheap/sagemaker-workshop/9bcbc4c787da346fcf10aa6fd97a8a100328719a/anomaly-detection/PaDiM/2-inference/source/__init__.py -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/2-inference/source/inference.py: -------------------------------------------------------------------------------- 1 | import random 2 | from random import sample 3 | import argparse 4 | import numpy as np 5 | import os 6 | import pickle 7 | from tqdm import tqdm 8 | from collections import OrderedDict 9 | from sklearn.metrics import roc_auc_score 10 | from sklearn.metrics import roc_curve 11 | from sklearn.metrics import precision_recall_curve 12 | from sklearn.covariance import LedoitWolf 13 | from scipy.spatial.distance import mahalanobis 14 | from scipy.ndimage import gaussian_filter 15 | from skimage import morphology 16 | from skimage.segmentation import mark_boundaries 17 | import matplotlib.pyplot as plt 18 | import matplotlib 19 | 20 | import torch 21 | import torch.nn.functional as F 22 | from torch.utils.data import DataLoader 23 | from torchvision.models import wide_resnet50_2, resnet18 24 | import mvtec as mvtec 25 | from mytime import get_current_time 26 | import boto3 27 | import shutil 28 | 29 | 30 | class Config(object): 31 | def __init__(self): 32 | self.arch = "wide_resnet50_2" 33 | self.data_path = "../dataset/mvtec_anomaly_detection/bottle" 34 | 35 | class DetectionSystem(object): 36 | def __init__(self): 37 | self.s3_client = 
boto3.client("s3") 38 | # extract train set features 39 | train_feature_filepath = 'train.pkl' 40 | print('load train set feature from: %s' % train_feature_filepath) 41 | with open(train_feature_filepath, 'rb') as f: 42 | self.train_outputs = pickle.load(f) 43 | 44 | def predict(self,data_path,upload_bucket,upload_path,threshold=0.673): 45 | args = Config() 46 | save_path = data_path+"save" 47 | 48 | use_cuda = torch.cuda.is_available() 49 | device = torch.device('cuda' if use_cuda else 'cpu') 50 | # load model 51 | if args.arch == 'resnet18': 52 | model = resnet18(pretrained=True, progress=True) 53 | t_d = 448 54 | d = 100 55 | elif args.arch == 'wide_resnet50_2': 56 | model = wide_resnet50_2(pretrained=True, progress=True) 57 | t_d = 1792 58 | d = 550 59 | model.to(device) 60 | model.eval() 61 | random.seed(1024) 62 | torch.manual_seed(1024) 63 | if use_cuda: 64 | torch.cuda.manual_seed_all(1024) 65 | 66 | idx = torch.tensor(sample(range(0, t_d), d)) 67 | 68 | # set model's intermediate outputs 69 | outputs = [] 70 | 71 | def hook(module, input, output): 72 | outputs.append(output) 73 | 74 | model.layer1[-1].register_forward_hook(hook) 75 | model.layer2[-1].register_forward_hook(hook) 76 | model.layer3[-1].register_forward_hook(hook) 77 | 78 | test_outputs = OrderedDict([('layer1', []), ('layer2', []), ('layer3', [])]) 79 | 80 | test_dataset = mvtec.MVTecDataset(data_path, is_train=False) 81 | test_dataloader = DataLoader(test_dataset, batch_size=32, pin_memory=True) 82 | test_imgs = [] 83 | 84 | # extract test set features 85 | for (x, y, mask) in test_dataloader: 86 | test_imgs.extend(x.cpu().detach().numpy()) 87 | # model prediction 88 | with torch.no_grad(): 89 | _ = model(x.to(device)) 90 | # get intermediate layer outputs 91 | for k, v in zip(test_outputs.keys(), outputs): 92 | test_outputs[k].append(v.cpu().detach()) 93 | # initialize hook outputs 94 | outputs = [] 95 | for k, v in test_outputs.items(): 96 | test_outputs[k] = torch.cat(v, 0) 97 | 98 | # Embedding concat 99 | embedding_vectors = test_outputs['layer1'] 100 | for layer_name in ['layer2', 'layer3']: 101 | embedding_vectors = self.embedding_concat(embedding_vectors, test_outputs[layer_name]) 102 | 103 | # randomly select d dimension 104 | embedding_vectors = torch.index_select(embedding_vectors, 1, idx) 105 | 106 | # calculate distance matrix 107 | B, C, H, W = embedding_vectors.size() 108 | embedding_vectors = embedding_vectors.view(B, C, H * W).numpy() 109 | dist_list = [] 110 | for i in range(H * W): 111 | mean = self.train_outputs[0][:, i] 112 | conv_inv = np.linalg.inv(self.train_outputs[1][:, :, i]) 113 | dist = [mahalanobis(sample[:, i], mean, conv_inv) for sample in embedding_vectors] 114 | dist_list.append(dist) 115 | 116 | dist_list = np.array(dist_list).transpose(1, 0).reshape(B, H, W) 117 | 118 | # upsample 119 | dist_list = torch.tensor(dist_list) 120 | score_map = F.interpolate(dist_list.unsqueeze(1), size=x.size(2), mode='bilinear', 121 | align_corners=False).squeeze().numpy() 122 | 123 | # apply gaussian smoothing on the score map 124 | for i in range(score_map.shape[0]): 125 | score_map[i] = gaussian_filter(score_map[i], sigma=4) 126 | 127 | # Normalization 128 | max_score = score_map.max() 129 | min_score = score_map.min() 130 | scores = (score_map - min_score) / (max_score - min_score) 131 | 132 | os.makedirs(save_path, exist_ok=True) 133 | self.plot_fig(test_imgs, scores, threshold, save_path) 134 | 135 | self.upload(save_path,upload_bucket,upload_path) 136 | shutil.rmtree(data_path) 137 | 
shutil.rmtree(save_path) 138 | print(data_path+"推理完毕") 139 | 140 | return "{'result':'OK'}" 141 | 142 | def upload(self,save_path,upload_bucket,upload_path): 143 | if not upload_path.endswith("/"): 144 | upload_path = upload_path + "/" 145 | for f in os.listdir(save_path): 146 | file_name = os.path.join(save_path,f) 147 | self.s3_client.upload_file(file_name,upload_bucket,upload_path+f) 148 | 149 | def plot_fig(self,test_img, scores, threshold, save_dir): 150 | num = len(scores) 151 | vmax = scores.max() * 255. 152 | vmin = scores.min() * 255. 153 | for i in range(num): 154 | img = test_img[i] 155 | img = self.denormalization(img) 156 | heat_map = scores[i] * 255 157 | mask = scores[i] 158 | mask[mask > threshold] = 1 159 | mask[mask <= threshold] = 0 160 | kernel = morphology.disk(4) 161 | mask = morphology.opening(mask, kernel) 162 | mask *= 255 163 | vis_img = mark_boundaries(img, mask, color=(1, 0, 0), mode='thick') 164 | fig_img, ax_img = plt.subplots(1, 4, figsize=(10, 3)) 165 | fig_img.subplots_adjust(right=0.9) 166 | norm = matplotlib.colors.Normalize(vmin=vmin, vmax=vmax) 167 | for ax_i in ax_img: 168 | ax_i.axes.xaxis.set_visible(False) 169 | ax_i.axes.yaxis.set_visible(False) 170 | ax_img[0].imshow(img) 171 | ax_img[0].title.set_text('Image') 172 | ax = ax_img[1].imshow(heat_map, cmap='jet', norm=norm) 173 | ax_img[1].imshow(img, cmap='gray', interpolation='none') 174 | ax_img[1].imshow(heat_map, cmap='jet', alpha=0.5, interpolation='none') 175 | ax_img[1].title.set_text('Predicted heat map') 176 | ax_img[2].imshow(mask, cmap='gray') 177 | ax_img[2].title.set_text('Predicted mask') 178 | ax_img[3].imshow(vis_img) 179 | ax_img[3].title.set_text('Segmentation result') 180 | left = 0.92 181 | bottom = 0.15 182 | width = 0.015 183 | height = 1 - 2 * bottom 184 | rect = [left, bottom, width, height] 185 | cbar_ax = fig_img.add_axes(rect) 186 | cb = plt.colorbar(ax, shrink=0.6, cax=cbar_ax, fraction=0.046) 187 | cb.ax.tick_params(labelsize=8) 188 | font = { 189 | 'family': 'serif', 190 | 'color': 'black', 191 | 'weight': 'normal', 192 | 'size': 8, 193 | } 194 | cb.set_label('Anomaly Score', fontdict=font) 195 | 196 | fig_img.savefig(os.path.join(save_dir, '{}'.format(i)), dpi=100) 197 | plt.close() 198 | 199 | 200 | def denormalization(self,x): 201 | mean = np.array([0.485, 0.456, 0.406]) 202 | std = np.array([0.229, 0.224, 0.225]) 203 | x = (((x.transpose(1, 2, 0) * std) + mean) * 255.).astype(np.uint8) 204 | 205 | return x 206 | 207 | 208 | def embedding_concat(self,x, y): 209 | B, C1, H1, W1 = x.size() 210 | _, C2, H2, W2 = y.size() 211 | s = int(H1 / H2) 212 | x = F.unfold(x, kernel_size=s, dilation=1, stride=s) 213 | x = x.view(B, C1, -1, H2, W2) 214 | z = torch.zeros(B, C1 + C2, x.size(2), H2, W2) 215 | for i in range(x.size(2)): 216 | z[:, :, i, :, :] = torch.cat((x[:, :, i, :, :], y), 1) 217 | z = z.view(B, -1, H2 * W2) 218 | z = F.fold(z, kernel_size=s, output_size=(H1, W1), stride=s) 219 | 220 | return z 221 | 222 | -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/2-inference/source/mvtec.py: -------------------------------------------------------------------------------- 1 | import os 2 | # import tarfile 3 | from PIL import Image 4 | from tqdm import tqdm 5 | # import urllib.request 6 | 7 | import torch 8 | from torch.utils.data import Dataset 9 | from torchvision import transforms as T 10 | 11 | 12 | # URL = 'ftp://guest:GU.205dldo@ftp.softronics.ch/mvtec_anomaly_detection/mvtec_anomaly_detection.tar.xz' 13 | 14 | 
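# Inference-time variant of the training MVTecDataset: instead of the
# train/test/ground_truth layout, it loads every .jpg/.png found directly in
# dataset_path, labels each image as anomalous (y=1) and returns an all-zero
# mask, since no ground-truth masks are available when serving requests.
# Typical use (as in inference.py):
#   DataLoader(MVTecDataset(data_path, is_train=False), batch_size=32, pin_memory=True)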
15 | class MVTecDataset(Dataset): 16 | def __init__(self, dataset_path, is_train=True, 17 | resize=256, cropsize=224): 18 | #assert class_name in CLASS_NAMES, 'class_name: {}, should be in {}'.format(class_name, CLASS_NAMES) 19 | self.dataset_path = dataset_path 20 | #self.class_name = class_name 21 | self.is_train = is_train 22 | self.resize = resize 23 | self.cropsize = cropsize 24 | # self.mvtec_folder_path = os.path.join(root_path, 'mvtec_anomaly_detection') 25 | 26 | # download dataset if not exist 27 | # self.download() 28 | 29 | # load dataset 30 | self.x, self.y, self.mask = self.load_dataset_folder() 31 | 32 | # set transforms 33 | self.transform_x = T.Compose([T.Resize(resize, Image.ANTIALIAS), 34 | T.CenterCrop(cropsize), 35 | T.ToTensor(), 36 | T.Normalize(mean=[0.485, 0.456, 0.406], 37 | std=[0.229, 0.224, 0.225])]) 38 | self.transform_mask = T.Compose([T.Resize(resize, Image.NEAREST), 39 | T.CenterCrop(cropsize), 40 | T.ToTensor()]) 41 | 42 | def __getitem__(self, idx): 43 | x, y, mask = self.x[idx], self.y[idx], self.mask[idx] 44 | 45 | x = Image.open(x).convert('RGB') 46 | x = self.transform_x(x) 47 | 48 | mask = torch.zeros([1, self.cropsize, self.cropsize]) 49 | 50 | return x, y, mask 51 | 52 | def __len__(self): 53 | return len(self.x) 54 | 55 | def load_dataset_folder(self): 56 | x, y, mask = [], [], [] 57 | 58 | img_fpath_list = sorted([os.path.join(self.dataset_path, f) 59 | for f in os.listdir(self.dataset_path) 60 | if f.endswith('.jpg') or f.endswith('.png')]) 61 | x.extend(img_fpath_list) 62 | y.extend([1] * len(img_fpath_list)) 63 | mask.extend([None] * len(img_fpath_list)) 64 | 65 | 66 | assert len(x) == len(y), 'number of x and y should be same' 67 | 68 | return list(x), list(y), list(mask) 69 | -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/2-inference/source/mytime.py: -------------------------------------------------------------------------------- 1 | import time 2 | def get_current_time(): 3 | ct = time.time() 4 | local_time = time.localtime(ct) 5 | data_head = time.strftime("%Y-%m-%d %H:%M:%S", local_time) 6 | data_secs = (ct - int(ct)) * 1000 7 | time_stamp = "%s.%03d" % (data_head, data_secs) 8 | return time_stamp -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/2-inference/source/nginx.conf: -------------------------------------------------------------------------------- 1 | worker_processes 1; 2 | daemon off; # Prevent forking 3 | 4 | 5 | pid /tmp/nginx.pid; 6 | error_log /var/log/nginx/error.log; 7 | 8 | events { 9 | # defaults 10 | } 11 | 12 | http { 13 | include /etc/nginx/mime.types; 14 | default_type application/octet-stream; 15 | access_log /var/log/nginx/access.log combined; 16 | 17 | upstream gunicorn { 18 | server unix:/tmp/gunicorn.sock; 19 | } 20 | 21 | server { 22 | listen 8080 deferred; 23 | client_max_body_size 5m; 24 | 25 | keepalive_timeout 5; 26 | proxy_read_timeout 1200s; 27 | 28 | location ~ ^/(ping|invocations) { 29 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 30 | proxy_set_header Host $http_host; 31 | proxy_redirect off; 32 | proxy_pass http://gunicorn; 33 | } 34 | 35 | location / { 36 | return 404 "{}"; 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/2-inference/source/predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import boto3 4 | 
import flask 5 | import json 6 | import shutil 7 | import time,datetime 8 | import random 9 | from inference import DetectionSystem 10 | import _thread 11 | 12 | DEBUG = False 13 | 14 | # The flask app for serving predictions 15 | app = flask.Flask(__name__) 16 | 17 | import logging 18 | logger = logging.getLogger(__name__) 19 | logger.setLevel(logging.DEBUG) 20 | logger.addHandler(logging.StreamHandler(sys.stdout)) 21 | 22 | @app.route('/ping', methods=['GET']) 23 | def ping(): 24 | """Determine if the container is working and healthy. In this sample container, we declare 25 | it healthy if we can load the model successfully.""" 26 | #health = boto3.client('s3') is not None # You can insert a health check here 27 | 28 | #status = 200 if health else 404 29 | status = 200 30 | return flask.Response(response='\n', status=status, mimetype='application/json') 31 | 32 | 33 | @app.route('/') 34 | def hello_world(): 35 | return 'PaDiM endpoint' 36 | 37 | 38 | @app.route('/invocations', methods=['POST']) 39 | def invocations(): 40 | content_type = flask.request.content_type 41 | if content_type != 'application/json' : 42 | return flask.Response(response='This predictor only supports JSON data', status=415, mimetype='text/plain') 43 | 44 | tt = time.strftime("%Y%m%d%H%M%S", time.localtime()) 45 | for i in range(0,5): 46 | randomstr = str(random.randint(1000,9999)) 47 | current_data_dir = os.path.join(init_data_dir,tt+randomstr) 48 | if not os.path.exists(current_data_dir): 49 | try: 50 | os.mkdir(current_data_dir) 51 | break 52 | except FileExistsError: 53 | logger.info("Dir Exist."+current_data_dir) 54 | else: 55 | return flask.Response(response='Make dir error', status=500, mimetype='text/plain') 56 | 57 | data = flask.request.data.decode('utf-8') 58 | logger.info("invocations params [{}]".format(data)) 59 | try: 60 | data = json.loads(data) 61 | except: 62 | return flask.Response(response='This predictor only supports JSON data', status=415, mimetype='text/plain') 63 | 64 | bucket = data['bucket'] 65 | for image_uri in data['image_uri']: 66 | download_file_name = image_uri.split('/')[-1] 67 | download_file_name = os.path.join(current_data_dir, download_file_name) 68 | s3_client.download_file(bucket, image_uri, download_file_name) 69 | upload_bucket = data['upload_bucket'] 70 | upload_path = data['upload_path'] 71 | 72 | #inference_result = detection.predict(current_data_dir) 73 | #shutil.rmtree(current_data_dir) 74 | _thread.start_new_thread( asyncPredict, (current_data_dir,upload_bucket,upload_path) ) 75 | 76 | _payload = json.dumps({'code': 1, 'msg': 'async Predict'}) 77 | return flask.Response(response=_payload, status=200, mimetype='application/json') 78 | 79 | def asyncPredict(current_data_dir,bucket,path): 80 | inference_result = detection.predict(current_data_dir,bucket,path) 81 | 82 | #--------------------------------------- 83 | init_data_dir = '/opt/ml/data_dir' 84 | 85 | if not os.path.exists(init_data_dir): 86 | try: 87 | os.mkdir(init_data_dir) 88 | except FileExistsError: 89 | logger.info("Dir Exist.") 90 | 91 | s3_client = boto3.client("s3") 92 | detection = DetectionSystem() 93 | #--------------------------------------- 94 | 95 | 96 | if __name__ == '__main__': 97 | app.run() -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/2-inference/source/requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | sklearn 3 | matplotlib 4 | scikit-image 
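predictor.py above accepts an asynchronous JSON request: it downloads the listed images from S3, scores them in a background thread, and uploads the rendered result figures to the given S3 prefix. A minimal client sketch for invoking the deployed endpoint could look like the following; the endpoint name and the S3 buckets/keys are placeholders, not values from this repository.

```python
import json
import boto3

# Placeholder names -- substitute your own endpoint, buckets and keys.
endpoint_name = "padim-endpoint"
payload = {
    "bucket": "my-input-bucket",           # bucket that holds the images to score
    "image_uri": ["data/bottle/000.png"],  # one or more keys inside that bucket
    "upload_bucket": "my-output-bucket",   # bucket for the rendered result figures
    "upload_path": "result/ad",            # key prefix for the uploaded figures
}

runtime = boto3.client("sagemaker-runtime")
response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="application/json",  # predictor.py returns HTTP 415 for other content types
    Body=json.dumps(payload),
)

# The endpoint answers immediately; scoring runs in a background thread and the
# result images later appear under s3://my-output-bucket/result/ad/.
print(response["Body"].read().decode("utf-8"))  # {"code": 1, "msg": "async Predict"}
```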
-------------------------------------------------------------------------------- /anomaly-detection/PaDiM/2-inference/source/serve: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # This file implements the scoring service shell. You don't necessarily need to modify it for various 4 | # algorithms. It starts nginx and gunicorn with the correct configurations and then simply waits until 5 | # gunicorn exits. 6 | # 7 | # The flask server is specified to be the app object in wsgi.py 8 | # 9 | # We set the following parameters: 10 | # 11 | # Parameter Environment Variable Default Value 12 | # --------- -------------------- ------------- 13 | # number of workers MODEL_SERVER_WORKERS the number of CPU cores 14 | # timeout MODEL_SERVER_TIMEOUT 60 seconds 15 | 16 | from __future__ import print_function 17 | import multiprocessing 18 | import os 19 | import signal 20 | import subprocess 21 | import sys 22 | 23 | 24 | cpu_count = multiprocessing.cpu_count() 25 | 26 | model_server_timeout = os.environ.get('MODEL_SERVER_TIMEOUT', 120) 27 | #model_server_workers = int(os.environ.get('MODEL_SERVER_WORKERS', cpu_count)) 28 | model_server_workers = int(os.environ.get('MODEL_SERVER_WORKERS', 1)) 29 | 30 | def sigterm_handler(nginx_pid, gunicorn_pid): 31 | try: 32 | os.kill(nginx_pid, signal.SIGQUIT) 33 | except OSError: 34 | pass 35 | try: 36 | os.kill(gunicorn_pid, signal.SIGTERM) 37 | except OSError: 38 | pass 39 | 40 | sys.exit(0) 41 | 42 | def start_server(): 43 | print('Starting the inference server with {} workers.'.format(model_server_workers)) 44 | 45 | 46 | # link the log streams to stdout/err so they will be logged to the container logs 47 | subprocess.check_call(['ln', '-sf', '/dev/stdout', '/var/log/nginx/access.log']) 48 | subprocess.check_call(['ln', '-sf', '/dev/stderr', '/var/log/nginx/error.log']) 49 | 50 | nginx = subprocess.Popen(['nginx', '-c', '/opt/ml/code/nginx.conf']) 51 | gunicorn = subprocess.Popen(['gunicorn', 52 | '--timeout', str(model_server_timeout), 53 | '-k', 'gevent', 54 | '-b', 'unix:/tmp/gunicorn.sock', 55 | '-w', str(model_server_workers), 56 | 'wsgi:app']) 57 | 58 | signal.signal(signal.SIGTERM, lambda a, b: sigterm_handler(nginx.pid, gunicorn.pid)) 59 | 60 | # If either subprocess exits, so do we. 61 | pids = set([nginx.pid, gunicorn.pid]) 62 | while True: 63 | pid, _ = os.wait() 64 | if pid in pids: 65 | break 66 | 67 | sigterm_handler(nginx.pid, gunicorn.pid) 68 | print('Inference server exiting') 69 | 70 | # The main routine just invokes the start function. 
71 | 72 | if __name__ == '__main__': 73 | start_server() 74 | -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/2-inference/source/test.py: -------------------------------------------------------------------------------- 1 | import _thread 2 | from inference import DetectionSystem 3 | 4 | detection = DetectionSystem() 5 | def asyncPredict(current_data_dir,bucket,path): 6 | inference_result = detection.predict(current_data_dir,bucket,path) 7 | 8 | current_data_dir="/opt/ml/data_dir/202103180920572600" 9 | _thread.start_new_thread( asyncPredict, (current_data_dir,"junzhong","result/ad") ) 10 | while 1: 11 | pass -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/2-inference/source/wsgi.py: -------------------------------------------------------------------------------- 1 | import predictor as myapp 2 | 3 | # This is just a simple wrapper for gunicorn to find your app. 4 | # If you want to change the algorithm file, simply change "predictor" above to the 5 | # new file. 6 | 7 | app = myapp.app 8 | -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/README.md: -------------------------------------------------------------------------------- 1 | # PaDiM on SageMaker 2 | This workshop demonstrates how to train and run inference with PaDiM on SageMaker. 3 | PaDiM performs anomaly detection. 4 | ![Detection example](images/detection_example.png) 5 | ## Data preparation 6 | [0-preparation](0-preparation) explains how to prepare the data in the required format and upload it to S3. 7 | ## Training 8 | [1-training](1-training) demonstrates training on SageMaker. 9 | ## Inference 10 | [2-inference](2-inference) demonstrates deploying an Endpoint on SageMaker and invoking it for inference. -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/images/detection_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nwcdheap/sagemaker-workshop/9bcbc4c787da346fcf10aa6fd97a8a100328719a/anomaly-detection/PaDiM/images/detection_example.png -------------------------------------------------------------------------------- /distributed-training/PyTorch/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch distributed training on SageMaker 2 | This workshop demonstrates distributed training with PyTorch on SageMaker. 3 | Original source: https://github.com/aws/amazon-sagemaker-examples/tree/master/sagemaker-python-sdk/pytorch_mnist 4 | This workshop uses PyTorch's built-in DistributedDataParallel rather than SageMaker's smdistributed. -------------------------------------------------------------------------------- /distributed-training/PyTorch/code/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nwcdheap/sagemaker-workshop/9bcbc4c787da346fcf10aa6fd97a8a100328719a/distributed-training/PyTorch/code/requirements.txt -------------------------------------------------------------------------------- /distributed-training/TensorFlow/data-parallel/README.md: -------------------------------------------------------------------------------- 1 | # TensorFlow distributed training on SageMaker 2 | This workshop demonstrates distributed training with TensorFlow on SageMaker using smdistributed. -------------------------------------------------------------------------------- /distributed-training/TensorFlow/data-parallel/code/requirements.txt: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/nwcdheap/sagemaker-workshop/9bcbc4c787da346fcf10aa6fd97a8a100328719a/distributed-training/TensorFlow/data-parallel/code/requirements.txt -------------------------------------------------------------------------------- /distributed-training/TensorFlow/data-parallel/code/train_tensorflow_smdataparallel_mnist.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | import os 19 | 20 | # Import SMDataParallel TensorFlow2 Modules 21 | import smdistributed.dataparallel.tensorflow as dist 22 | import tensorflow as tf 23 | 24 | import argparse 25 | if __name__ == '__main__': 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument('--batch_size', type=int, default=128) 28 | 29 | #不能使用 args = parser.parse_args() 30 | args, _ = parser.parse_known_args() 31 | print("args.batch_size="+str(args.batch_size)) 32 | 33 | tf.random.set_seed(42) 34 | 35 | # SMDataParallel: Initialize 36 | dist.init() 37 | 38 | print("dist.size()="+str(dist.size())) 39 | print("dist.rank()="+str(dist.rank())) 40 | 41 | gpus = tf.config.experimental.list_physical_devices("GPU") 42 | for gpu in gpus: 43 | tf.config.experimental.set_memory_growth(gpu, True) 44 | if gpus: 45 | # SMDataParallel: Pin GPUs to a single SMDataParallel process [use SMDataParallel local_rank() API] 46 | tf.config.experimental.set_visible_devices(gpus[dist.local_rank()], "GPU") 47 | 48 | #(mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data(path="mnist-%d.npz" % dist.rank()) 49 | (mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data(path="/opt/ml/input/data/training/mnist.npz") 50 | 51 | dataset = tf.data.Dataset.from_tensor_slices( 52 | (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), tf.cast(mnist_labels, tf.int64)) 53 | ) 54 | dataset = dataset.repeat().shuffle(10000).batch(args.batch_size) 55 | 56 | mnist_model = tf.keras.Sequential( 57 | [ 58 | tf.keras.layers.Conv2D(32, [3, 3], activation="relu"), 59 | tf.keras.layers.Conv2D(64, [3, 3], activation="relu"), 60 | tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), 61 | tf.keras.layers.Dropout(0.25), 62 | tf.keras.layers.Flatten(), 63 | tf.keras.layers.Dense(128, activation="relu"), 64 | tf.keras.layers.Dropout(0.5), 65 | tf.keras.layers.Dense(10, activation="softmax"), 66 | ] 67 | ) 68 | loss = tf.losses.SparseCategoricalCrossentropy() 69 | 70 | # SMDataParallel: dist.size() 71 | # LR for 8 node run : 0.000125 72 | # LR for single node run : 0.001 73 | opt = tf.optimizers.Adam(0.000125 * dist.size()) 74 | 75 | checkpoint_dir = os.environ["SM_MODEL_DIR"] 76 | 77 | checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt) 78 | 79 | 80 | 
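# A note on the learning-rate choice above: dist.size() is the total number of
# data-parallel workers (one process per GPU across all instances). The accompanying
# notebook launches this script on 2 x ml.p3.16xlarge instances (8 GPUs each), so
# dist.size() is 16 and the effective learning rate is 0.000125 * 16 = 0.002 -- the
# usual linear-scaling heuristic for keeping convergence behaviour comparable as the
# global batch size (128 per worker) grows with the number of workers.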
@tf.function 81 | def training_step(images, labels, first_batch): 82 | with tf.GradientTape() as tape: 83 | probs = mnist_model(images, training=True) 84 | loss_value = loss(labels, probs) 85 | 86 | # SMDataParallel: Wrap tf.GradientTape with SMDataParallel's DistributedGradientTape 87 | tape = dist.DistributedGradientTape(tape) 88 | 89 | grads = tape.gradient(loss_value, mnist_model.trainable_variables) 90 | opt.apply_gradients(zip(grads, mnist_model.trainable_variables)) 91 | 92 | if first_batch: 93 | # SMDataParallel: Broadcast model and optimizer variables 94 | dist.broadcast_variables(mnist_model.variables, root_rank=0) 95 | dist.broadcast_variables(opt.variables(), root_rank=0) 96 | 97 | # SMDataParallel: all_reduce call 98 | loss_value = dist.oob_allreduce(loss_value) # Average the loss across workers 99 | return loss_value 100 | 101 | 102 | for batch, (images, labels) in enumerate(dataset.take(10000 // dist.size())): 103 | loss_value = training_step(images, labels, batch == 0) 104 | 105 | if batch % 50 == 0 and dist.rank() == 0: 106 | print("Step #%d\tLoss: %.6f" % (batch, loss_value)) 107 | 108 | # SMDataParallel: Save checkpoints only from master node. 109 | if dist.rank() == 0: 110 | mnist_model.save(os.path.join(checkpoint_dir, "1")) 111 | -------------------------------------------------------------------------------- /distributed-training/TensorFlow/data-parallel/tensorflow2_smdataparallel_mnist_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# TensorFlow distributed training on SageMaker" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 1 说明\n", 15 | "本章内容为演示TensorFlow在SageMaker上使用smdistributed进行分布式训练。 " 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## 2 运行环境\n", 23 | "Kernel 选择tensorflow2_p36。 \n", 24 | "本文在boto3 1.17.109和sagemaker 2.48.1下测试通过。" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "import boto3,sagemaker\n", 34 | "print(boto3.__version__)\n", 35 | "print(sagemaker.__version__)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "如果版本较低,请执行以下命令,重启kernal后再检查版本" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "!pip install -U boto3 -i https://opentuna.cn/pypi/web/simple/" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!pip install -U sagemaker -i https://opentuna.cn/pypi/web/simple/" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## 3 设置/获取相关参数" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "import boto3\n", 77 | "import sagemaker\n", 78 | "from sagemaker.image_uris import retrieve\n", 79 | "\n", 80 | "sagemaker_session = sagemaker.Session()\n", 81 | "iam = boto3.client('iam')\n", 82 | "roles = iam.list_roles(PathPrefix='/service-role')\n", 83 | "role=\"\"\n", 84 | "for current_role in roles[\"Roles\"]:\n", 85 | " if current_role[\"RoleName\"].startswith(\"AmazonSageMaker-ExecutionRole-\"):\n", 86 | " role=current_role[\"Arn\"]\n", 87 | " break\n", 88 | 
"#如果role为空表示有问题,需要先打开https://cn-northwest-1.console.amazonaws.cn/sagemaker/home?region=cn-northwest-1#/notebook-instances/create以创建IAM Role\n", 89 | "print(role)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "data_input=\"s3://junzhong/data/mnist.npz\"" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "use_spot = True" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "## 4 训练" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "from sagemaker.tensorflow import TensorFlow\n", 124 | "\n", 125 | "estimator = TensorFlow(\n", 126 | " base_job_name=\"tensorflow2-smdataparallel-mnist\",\n", 127 | " source_dir=\"code\",\n", 128 | " entry_point=\"train_tensorflow_smdataparallel_mnist.py\",\n", 129 | " role=role,\n", 130 | " py_version=\"py37\",\n", 131 | " framework_version=\"2.3.1\",\n", 132 | " instance_count=2,\n", 133 | " instance_type=\"ml.p3.16xlarge\",\n", 134 | " sagemaker_session=sagemaker_session,\n", 135 | " hyperparameters={'batch_size':128},\n", 136 | " use_spot_instances=use_spot,\n", 137 | " max_wait=7200 if use_spot else None,\n", 138 | " max_run=7200,\n", 139 | " # Training using SMDataParallel Distributed Training Framework\n", 140 | " distribution={\"smdistributed\": {\"dataparallel\": {\"enabled\": True}}},\n", 141 | ")\n", 142 | "#日志都会输出到第1个node上\n", 143 | "estimator.fit(data_input)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "model_data = estimator.model_data\n", 153 | "model_data" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "## 5 部署" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "from sagemaker.tensorflow.model import TensorFlowModel\n", 170 | "model = TensorFlowModel(\n", 171 | " model_data=model_data, \n", 172 | " role=role,\n", 173 | " framework_version='2.3.1')\n", 174 | "predictor = model.deploy(initial_instance_count=1, instance_type=\"ml.m5.large\",endpoint_name=\"tensorflowmnist\")" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "## 6 推理" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "import tensorflow as tf\n", 191 | "(mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data()" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "import numpy as np\n", 201 | "import random\n", 202 | "image_size = 3\n", 203 | "mask1 = random.sample(range(len(mnist_images)), image_size)\n", 204 | "mask2 = np.array(mask1, dtype=np.int)\n", 205 | "data = mnist_images[mask2]" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "from matplotlib import pyplot as plt\n", 215 | "plt.figure(figsize=(2,2))\n", 216 | "for index, mask in enumerate(mask1):\n", 217 | " plt.subplot(1,image_size,index+1)\n", 218 | " 
plt.imshow(mnist_images[mask])\n", 219 | " plt.axis('off')\n", 220 | "plt.show()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "from sagemaker.tensorflow.model import TensorFlowPredictor\n", 230 | "endpoint_name = \"tensorflowmnist\"\n", 231 | "predictor = TensorFlowPredictor(\n", 232 | " endpoint_name=endpoint_name,\n", 233 | " sagemaker_session=sagemaker.Session())" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "predict输入和输出数据的格式直接对应Predict于TensorFlow Serving REST API 中方法的请求和响应格式 \n", 241 | "除此外,还支持简化的 json 格式、行分隔的 json 对象(“jsons”或“jsonlines”)和 CSV 数据" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "response = predictor.predict(np.expand_dims(data, axis=3))\n", 251 | "for i in range(0,image_size):\n", 252 | " print(\"Most likely answer: {}\".format(np.argmax(response[\"predictions\"][i])))" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "## 7 清理" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "predictor.delete_endpoint()" 269 | ] 270 | } 271 | ], 272 | "metadata": { 273 | "kernelspec": { 274 | "display_name": "Environment (conda_tensorflow2_p36)", 275 | "language": "python", 276 | "name": "conda_tensorflow2_p36" 277 | }, 278 | "language_info": { 279 | "codemirror_mode": { 280 | "name": "ipython", 281 | "version": 3 282 | }, 283 | "file_extension": ".py", 284 | "mimetype": "text/x-python", 285 | "name": "python", 286 | "nbconvert_exporter": "python", 287 | "pygments_lexer": "ipython3", 288 | "version": "3.6.13" 289 | } 290 | }, 291 | "nbformat": 4, 292 | "nbformat_minor": 4 293 | } 294 | -------------------------------------------------------------------------------- /encapsulation/Dockerfile: -------------------------------------------------------------------------------- 1 | # Using the official tensorflow serving image from docker hub as base image 2 | FROM tensorflow/serving 3 | 4 | # Installing NGINX, used to rever proxy the predictions from SageMaker to TF Serving 5 | RUN apt-get update && apt-get install -y --no-install-recommends nginx git 6 | 7 | # Copy our model folder to the container 8 | COPY ./output/tf_server /model 9 | 10 | # Copy NGINX configuration to the container 11 | COPY nginx.conf /etc/nginx/nginx.conf 12 | 13 | # starts NGINX and TF serving pointing to our model 14 | ENTRYPOINT service nginx start | tensorflow_model_server --rest_api_port=8501 \ 15 | --model_name=sagemaker-demo \ 16 | --model_base_path=/model -------------------------------------------------------------------------------- /encapsulation/README.md: -------------------------------------------------------------------------------- 1 | # 说明 2 | 3 | 演示使用Sagemaker 封装图片分类算法, 使用TensorFlow-server 部署模型,并在客户端进行调用。 4 | 5 | 6 | ## 使用sagemaker 训练 7 | 8 | 打开 [train.ipynb](train.ipynb) 进行训练,然后打开[inference-custom-image.ipynb](inference-custom-image.ipynb)或[inference-default-image.ipynb](inference-default-image.ipynb)进行部署和使用 9 | 10 | ## 超级参数优化 11 | 打开 [hyperparameter-tuning.ipynb](hyperparameter-tuning.ipynb) 进行参数优化 -------------------------------------------------------------------------------- /encapsulation/inference-default-image.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 使用TensorFlow默认Image进行推理" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "需要用到Tensorflow 和Keras , kernel 选择conda_tensorflow_p36\n", 15 | "\n", 16 | "## 把训练好的模型存放S3上\n", 17 | "model.tar.gz内目录结构如下\n", 18 | "```\n", 19 | "model.tar.gz\n", 20 | "└── tf_server\n", 21 | " └── 1\n", 22 | " ├── saved_model.pb\n", 23 | " └── variables\n", 24 | " ├── variables.data-00000-of-00001\n", 25 | " └── variables.index\n", 26 | "```" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## 部署模型到SageMaker" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "from sagemaker.tensorflow.model import TensorFlowModel\n", 43 | "from sagemaker import get_execution_role\n", 44 | "\n", 45 | "#role = get_execution_role()\n", 46 | "role=\"arn:aws-cn:iam::315505707008:role/service-role/AmazonSageMaker-ExecutionRole-20200430T124235\" \n", 47 | "\n", 48 | "model_uri = \"s3://nowfox/data/cat-vs-dog-output/tensorflow-training-2020-09-16-06-27-15-538/output/model.tar.gz\"\n", 49 | "endpoint_name = \"sagemaker-cat-vs-dog-2\"\n", 50 | "my_model = TensorFlowModel(\n", 51 | " model_data=model_uri, \n", 52 | " role=role,\n", 53 | " framework_version='1.15.2')\n", 54 | "\n", 55 | "#该步骤大概需要10分钟\n", 56 | "my_model.deploy(initial_instance_count=1,\n", 57 | " endpoint_name=endpoint_name,\n", 58 | " instance_type='ml.t2.medium'\n", 59 | " )" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## 推理\n", 67 | "### 读取数据" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "from keras.preprocessing import image\n", 77 | "import json\n", 78 | "import numpy as np\n", 79 | "\n", 80 | "\n", 81 | "IMAGE_WIDTH = 150\n", 82 | "IMAGE_HEIGHT = 150\n", 83 | "# 修改测试图片地址\n", 84 | "image_paths = 'test/cat.681.jpg'\n", 85 | "#image_paths = 'test/dog.592.jpg'\n", 86 | "images = image.load_img(image_paths, target_size=(IMAGE_WIDTH, IMAGE_HEIGHT))\n", 87 | "input_image = image.img_to_array(images)\n", 88 | "input_image = np.expand_dims(input_image, axis=0)\n", 89 | "input_image /= 255.\n", 90 | "\n", 91 | "input_images = input_image.tolist()\n", 92 | "\n", 93 | "data = {\"name\": 'tensorflow/serving/predict',\"signature_name\":'predict',\"inputs\":input_images}" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "import sagemaker\n", 103 | "from sagemaker.tensorflow.model import TensorFlowPredictor\n", 104 | "\n", 105 | "endpoint_name = \"sagemaker-cat-vs-dog-2\"\n", 106 | "predictor = TensorFlowPredictor(\n", 107 | " endpoint_name=endpoint_name,\n", 108 | " sagemaker_session=sagemaker.Session())" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "result = predictor.predict(data)\n", 118 | "print(result)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "## 删除Endpoint" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "import boto3\n", 
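"# Note: delete_endpoint removes only the endpoint itself; the endpoint configuration and model created by deploy() remain and can be deleted separately (delete_endpoint_config / delete_model) if no longer needed\n",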
135 | "sage = boto3.Session().client(service_name='sagemaker') \n", 136 | "sage.delete_endpoint(EndpointName=endpoint_name)" 137 | ] 138 | } 139 | ], 140 | "metadata": { 141 | "kernelspec": { 142 | "display_name": "Environment (conda_tensorflow_p37)", 143 | "language": "python", 144 | "name": "conda_tensorflow_p37" 145 | }, 146 | "language_info": { 147 | "codemirror_mode": { 148 | "name": "ipython", 149 | "version": 3 150 | }, 151 | "file_extension": ".py", 152 | "mimetype": "text/x-python", 153 | "name": "python", 154 | "nbconvert_exporter": "python", 155 | "pygments_lexer": "ipython3", 156 | "version": "3.7.10" 157 | } 158 | }, 159 | "nbformat": 4, 160 | "nbformat_minor": 4 161 | } 162 | -------------------------------------------------------------------------------- /encapsulation/nginx.conf: -------------------------------------------------------------------------------- 1 | events { 2 | # determines how many requests can simultaneously be served 3 | # https://www.digitalocean.com/community/tutorials/how-to-optimize-nginx-configuration 4 | # for more information 5 | worker_connections 2048; 6 | } 7 | 8 | http { 9 | client_max_body_size 100m; 10 | client_body_buffer_size 128k; 11 | server { 12 | # configures the server to listen to the port 8080 13 | # Amazon SageMaker sends inference requests to port 8080. 14 | # For more information: https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-code-container-response 15 | listen 8080 deferred; 16 | 17 | # redirects requests from SageMaker to TF Serving 18 | location /invocations { 19 | proxy_pass http://localhost:8501/v1/models/sagemaker-demo:predict; 20 | } 21 | 22 | # Used by SageMaker to confirm if server is alive. 23 | # https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-algo-ping-requests 24 | location /ping { 25 | return 200 "OK"; 26 | } 27 | } 28 | } -------------------------------------------------------------------------------- /encapsulation/source/export_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import os 3 | import tensorflow.keras.backend as K 4 | from tensorflow.keras.losses import categorical_crossentropy 5 | from tensorflow.keras.optimizers import Adadelta 6 | 7 | 8 | def export_model(model, 9 | export_model_dir, 10 | model_version 11 | ): 12 | """ 13 | :param export_model_dir: type string, save dir for exported model url 14 | :param model_version: type int best 15 | :return:no return 16 | """ 17 | with tf.get_default_graph().as_default(): 18 | # prediction_signature 19 | tensor_info_input = tf.saved_model.utils.build_tensor_info(model.input) 20 | tensor_info_output = tf.saved_model.utils.build_tensor_info(model.output) 21 | print(model.output.shape, '**', tensor_info_output) 22 | prediction_signature = ( 23 | tf.saved_model.signature_def_utils.build_signature_def( 24 | inputs={'images': tensor_info_input}, # Tensorflow.TensorInfo 25 | outputs={'result': tensor_info_output}, 26 | #method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME) 27 | method_name= "tensorflow/serving/predict") 28 | 29 | ) 30 | print('step1 => prediction_signature created successfully') 31 | # set-up a builder 32 | 33 | export_path_base = export_model_dir 34 | export_path = os.path.join( 35 | tf.compat.as_bytes(export_path_base), 36 | tf.compat.as_bytes(str(model_version))) 37 | builder = tf.saved_model.builder.SavedModelBuilder(export_path) 38 | 
builder.add_meta_graph_and_variables( 39 | # tags:SERVING,TRAINING,EVAL,GPU,TPU 40 | sess=K.get_session(), 41 | tags=[tf.saved_model.tag_constants.SERVING], 42 | signature_def_map={ 43 | 'predict': 44 | prediction_signature, 45 | tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: 46 | prediction_signature, 47 | 48 | }, 49 | ) 50 | print('step2 => Export path(%s) ready to export trained model' % export_path, '\n starting to export model...') 51 | #builder.save(as_text=True) 52 | builder.save() 53 | print('Done exporting!') 54 | -------------------------------------------------------------------------------- /encapsulation/source/processing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import numpy as np 4 | import glob 5 | import random 6 | 7 | 8 | def processing_data(training_dir, validation_dir, testing_dir, validation_rate=0.1, testing_rate=0.1): 9 | class_equal,class_list = check_class(training_dir, validation_dir) 10 | if not class_equal: 11 | files = os.listdir(training_dir) 12 | for file in files : 13 | dir_path = os.path.join(os.path.join(training_dir, file)) 14 | if os.path.isdir(dir_path): 15 | print('处理 :', dir_path) 16 | get_filelist(dir_path, file, validation_dir, validation_rate, testing_dir,testing_rate) 17 | 18 | 19 | training_count = get_all_count(training_dir) 20 | validation_count = get_all_count(validation_dir) 21 | testing_count = get_all_count(testing_dir) 22 | 23 | print("==================================================") 24 | print('training count : ', training_count) 25 | print('validation count : ', validation_count) 26 | print('testing count : ', testing_count) 27 | print("class: ", class_list) 28 | print("==================================================") 29 | return class_list,training_count 30 | 31 | 32 | def check_class(training_dir, validation_dir): 33 | class_list = [] 34 | for file in os.listdir(training_dir): 35 | if os.path.isdir(os.path.join(training_dir , file)): 36 | class_list.append(file) 37 | class_list.sort() 38 | 39 | if not os.path.exists(validation_dir): 40 | return False,class_list 41 | 42 | class_list_validation = [] 43 | for file in os.listdir(validation_dir): 44 | if os.path.isdir(os.path.join(validation_dir , file)): 45 | class_list_validation.append(file) 46 | class_list_validation.sort() 47 | 48 | class_equal = (class_list == class_list_validation) 49 | return class_equal,class_list 50 | 51 | 52 | def get_all_count(dir_path): 53 | files = os.listdir(dir_path) 54 | count = 0 55 | for file in files : 56 | if os.path.isdir(dir_path): 57 | label_dir = os.path.join(dir_path, file) 58 | images = os.listdir(label_dir) 59 | tmp_count = len(images) 60 | #print('{} {}'.format(label_dir, tmp_count)) 61 | count += tmp_count 62 | return count 63 | 64 | 65 | 66 | def move_file(file_path, target_path, class_name, item ): 67 | target_dir = os.path.join(target_path, class_name) 68 | if not os.path.exists(target_dir): 69 | os.makedirs(target_dir) 70 | shutil.move(file_path, os.path.join(target_dir, item)) 71 | 72 | def get_filelist(dir_path, class_name, validation_dir, validation_rate, testing_dir,testing_rate): 73 | files = os.listdir(dir_path) 74 | random.shuffle(files) 75 | count = len(files) 76 | 77 | validation_count = int(count * validation_rate) 78 | testing_count = int(count * testing_rate) 79 | 80 | validation_list = files[0:validation_count] 81 | testing_list = files[validation_count: validation_count + testing_count] 82 | training_list = 
files[validation_count + testing_count: ] 83 | 84 | for item in validation_list: 85 | move_file(os.path.join(os.path.join(dir_path, item)) , validation_dir, class_name, item ) 86 | for item in testing_list: 87 | move_file(os.path.join(os.path.join(dir_path, item)) , testing_dir,class_name, item) 88 | 89 | 90 | 91 | 92 | if __name__ == '__main__': 93 | processing_data("/opt/ml/input/data/training","/opt/ml/input/data/validation","/opt/ml/input/data/testing") -------------------------------------------------------------------------------- /encapsulation/source/requirements.txt: -------------------------------------------------------------------------------- 1 | Keras==2.3.1 2 | Keras-Applications==1.0.8 3 | Keras-Preprocessing==1.1.2 4 | tensorboard==1.15.0 5 | tensorflow==1.15.2 6 | tensorflow-estimator==1.15.1 7 | tensorflow-gpu==1.15.2 8 | 9 | -------------------------------------------------------------------------------- /encapsulation/source/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import argparse 4 | import keras 5 | from keras import models 6 | from keras import layers 7 | from keras import optimizers 8 | from keras.applications import VGG16 9 | import tensorflow as tf 10 | from processing import processing_data 11 | from export_model import export_model 12 | from keras.preprocessing.image import ImageDataGenerator 13 | from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping 14 | from keras.preprocessing.image import ImageDataGenerator 15 | import sys 16 | import logging 17 | logger = logging.getLogger(__name__) 18 | logger.setLevel(logging.DEBUG) 19 | logger.addHandler(logging.StreamHandler(sys.stdout)) 20 | 21 | 22 | IMAGE_WIDTH = 150 23 | IMAGE_HEIGHT = 150 24 | 25 | 26 | logger.info('tensorflow version:{}'.format(tf.__version__)) 27 | logger.info('keras version:{}'.format(keras.__version__)) 28 | logger.info("gpu_device_name:{}".format(tf.test.gpu_device_name())) 29 | logger.info("tf.test.is_gpu_available():{}".format(str(tf.test.is_gpu_available()))) 30 | 31 | 32 | 33 | def train(train_dir, validation_dir, test_dir, log_dir, model_dir, tf_server_dir, checkpoint_dir, class_list ,training_count, args): 34 | class_count = len(class_list) 35 | print("class_count:"+str(class_count)) 36 | 37 | # 创建模型 38 | conv_base = VGG16(weights='imagenet', 39 | include_top=False, 40 | input_shape=(IMAGE_WIDTH, IMAGE_HEIGHT, 3)) 41 | model = models.Sequential() 42 | model.add(conv_base) 43 | model.add(layers.Flatten()) 44 | model.add(layers.Dense(256, activation='relu')) 45 | model.add(layers.Dense(class_count, activation='softmax')) 46 | 47 | # 查看模型结构 48 | logger.info('This is the number of trainable weights ' 49 | 'before freezing the conv base:'+ str(len(model.trainable_weights))) 50 | conv_base.trainable = False 51 | logger.info('This is the number of trainable weights ' 52 | 'after freezing the conv base:'+ str(len(model.trainable_weights))) 53 | #conv_base.summary() 54 | 55 | # 准备训练参数 56 | RUN = RUN + 1 if 'RUN' in locals() else 1 57 | EPOCHS = args.epoch_count 58 | batch_size = args.batch_size 59 | lr = args.lr 60 | steps_per_epoch = training_count // batch_size 61 | logger.info("RUN:"+str(RUN)) 62 | logger.info("steps_per_epoch:"+str(steps_per_epoch)) 63 | logger.info("learning rate:"+str(lr)) 64 | 65 | # 载入图片数据 66 | train_datagen = ImageDataGenerator( 67 | rescale=1./255, 68 | rotation_range=20, 69 | width_shift_range=0.30, 70 | height_shift_range=0.30, 71 | shear_range=0.20, 72 | 
zoom_range=0.40, 73 | horizontal_flip=True, 74 | fill_mode='nearest') 75 | 76 | # Note that the validation data should not be augmented! 77 | test_datagen = ImageDataGenerator(rescale=1./255) 78 | 79 | train_generator = train_datagen.flow_from_directory( 80 | # This is the target directory 81 | train_dir, 82 | classes=class_list, 83 | # All images will be resized to 150x150 84 | target_size=(IMAGE_WIDTH, IMAGE_HEIGHT), 85 | batch_size=batch_size, 86 | # Since we use binary_crossentropy loss, we need binary labels 87 | class_mode='categorical') 88 | 89 | validation_generator = test_datagen.flow_from_directory( 90 | validation_dir, 91 | classes=class_list, 92 | target_size=(IMAGE_WIDTH, IMAGE_HEIGHT), 93 | batch_size=batch_size, 94 | class_mode='categorical') 95 | 96 | # 第一次训练 97 | LOG_DIR_1 = os.path.join(log_dir, 'run{}-1'.format(RUN)) 98 | LOG_FILE_PATH_1 = os.path.join(checkpoint_dir, 'checkpoint-1-{epoch:02d}-{val_acc:.4f}.hdf5') 99 | 100 | model.compile(loss='categorical_crossentropy', 101 | optimizer=optimizers.Adam(lr=2e-5) , 102 | metrics=['acc']) 103 | tensorboard = TensorBoard(log_dir=LOG_DIR_1, write_images=True) 104 | checkpoint = ModelCheckpoint(filepath=LOG_FILE_PATH_1, monitor='val_acc', verbose=1, save_best_only=True) 105 | early_stopping = EarlyStopping(monitor='val_acc', patience=5, verbose=1) 106 | 107 | history = model.fit_generator( 108 | train_generator, 109 | steps_per_epoch=steps_per_epoch, 110 | epochs=EPOCHS, 111 | validation_data=validation_generator, 112 | validation_steps=50, 113 | verbose=args.verbose, 114 | callbacks=[tensorboard, checkpoint, early_stopping]) 115 | 116 | # 微调模型 117 | conv_base.trainable = True 118 | 119 | set_trainable = False 120 | for layer in conv_base.layers: 121 | if layer.name == 'block5_conv1': 122 | set_trainable = True 123 | layer.trainable = set_trainable 124 | #conv_base.summary() 125 | 126 | # 第二次训练 127 | LOG_DIR_2 = os.path.join(log_dir, 'run{}-2'.format(RUN)) 128 | LOG_FILE_PATH_2 = os.path.join(checkpoint_dir, 'checkpoint-2-{epoch:02d}-{val_acc:.4f}.hdf5') 129 | model.compile(loss='categorical_crossentropy', 130 | optimizer=optimizers.RMSprop(lr=lr) , 131 | metrics=['acc']) 132 | tensorboard = TensorBoard(log_dir=LOG_DIR_2, write_images=True) 133 | checkpoint = ModelCheckpoint(filepath=LOG_FILE_PATH_2, monitor='val_acc', verbose=1, save_best_only=True) 134 | early_stopping = EarlyStopping(monitor='val_acc', patience=5, verbose=1) 135 | 136 | history = model.fit_generator( 137 | train_generator, 138 | steps_per_epoch=steps_per_epoch, 139 | epochs=EPOCHS, 140 | validation_data=validation_generator, 141 | validation_steps=50, 142 | verbose=args.verbose, 143 | callbacks=[tensorboard, checkpoint, early_stopping]) 144 | 145 | # 测试 146 | test_generator = test_datagen.flow_from_directory( 147 | test_dir, 148 | target_size=(IMAGE_WIDTH, IMAGE_HEIGHT), 149 | batch_size=32, 150 | class_mode='categorical') 151 | 152 | test_loss, test_acc = model.evaluate_generator(test_generator, steps=50) 153 | logger.info('test acc:'+str(test_acc)) 154 | 155 | 156 | 157 | logger.info("保存模型") 158 | model.save(os.path.join(model_dir, 'model.h5')) 159 | 160 | #model = keras.models.load_model(os.path.join(model_dir, 'model.h5')) 161 | export_model( 162 | model, 163 | tf_server_dir, 164 | 1 165 | ) 166 | 167 | 168 | 169 | 170 | def main(args): 171 | os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_list 172 | 173 | input_dir = args.input_dir 174 | training_dir = os.path.join(input_dir, "data","training") 175 | validation_dir = os.path.join(input_dir, 
"data","validation") 176 | testing_dir = os.path.join(input_dir, "data","testing") 177 | log_dir = os.path.join(args.output_dir, "log") 178 | model_dir = args.model_dir 179 | tf_server_dir = os.path.join(model_dir, "tf_server") 180 | checkpoint_dir = os.path.join(args.output_dir, "log/checkpoint") 181 | 182 | if not os.path.exists(validation_dir): 183 | os.makedirs(validation_dir) 184 | if not os.path.exists(testing_dir): 185 | os.makedirs(testing_dir) 186 | if not os.path.exists(log_dir): 187 | os.makedirs(log_dir) 188 | if not os.path.exists(model_dir): 189 | os.makedirs(model_dir) 190 | if not os.path.exists(tf_server_dir): 191 | os.makedirs(tf_server_dir) 192 | if not os.path.exists(checkpoint_dir): 193 | os.makedirs(checkpoint_dir) 194 | 195 | 196 | logger.info("epoch_count:"+str(args.epoch_count)) 197 | logger.info("batch_size:"+str(args.batch_size)) 198 | #logger.info("--------------processing data ----------------- ") 199 | class_list,training_count = processing_data(training_dir, validation_dir, testing_dir) 200 | #logger.info("--------------start train --------------------- ") 201 | train(training_dir, validation_dir, testing_dir, log_dir, model_dir, tf_server_dir, checkpoint_dir, class_list ,training_count, args) 202 | logger.info(" 训练完成 ") 203 | 204 | 205 | 206 | 207 | if __name__ == '__main__': 208 | parser = argparse.ArgumentParser(description='Train model.') 209 | 210 | parser.add_argument( 211 | "-e", 212 | "--epoch_count", 213 | type=int, 214 | nargs="?", 215 | help="Epoch count", 216 | default=30, 217 | ) 218 | parser.add_argument( 219 | "-r", 220 | "--lr", 221 | type=float, 222 | nargs="?", 223 | help="learning rate (default: 1e-5)", 224 | default=1e-5, 225 | ) 226 | parser.add_argument( 227 | "-b", 228 | "--batch_size", 229 | type=int, 230 | nargs="?", 231 | help="Batch size (default: 32)", 232 | default=32, 233 | ) 234 | parser.add_argument( 235 | "-m", 236 | "--model_dir", 237 | type=str, 238 | help="Model保存路径.", 239 | default="/opt/ml/model/", 240 | ) 241 | parser.add_argument( 242 | "-i", 243 | "--input_dir", 244 | type=str, 245 | help="input dir", 246 | default="/opt/ml/input/", 247 | ) 248 | parser.add_argument( 249 | "-o", 250 | "--output_dir", 251 | type=str, 252 | help="outpudif", 253 | default="/opt/ml/output/", 254 | ) 255 | parser.add_argument( 256 | "-g", 257 | "--gpu_list", 258 | type=str, 259 | help="gpu list", 260 | default="0", 261 | ) 262 | parser.add_argument( 263 | "-v", 264 | "--verbose", 265 | type=int, 266 | help="log level", 267 | default=2, 268 | ) 269 | args = parser.parse_args() 270 | main(args) 271 | 272 | -------------------------------------------------------------------------------- /encapsulation/test/cat.681.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nwcdheap/sagemaker-workshop/9bcbc4c787da346fcf10aa6fd97a8a100328719a/encapsulation/test/cat.681.jpg -------------------------------------------------------------------------------- /encapsulation/test/dog.592.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nwcdheap/sagemaker-workshop/9bcbc4c787da346fcf10aa6fd97a8a100328719a/encapsulation/test/dog.592.jpg -------------------------------------------------------------------------------- /encapsulation/train.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | 
"outputs": [], 8 | "source": [ 9 | "import boto3\n", 10 | "import sagemaker\n", 11 | "import os\n", 12 | "from sagemaker import get_execution_role\n", 13 | "\n", 14 | "region = boto3.session.Session().region_name\n", 15 | "\n", 16 | "#如果使用SageMaker的笔记本实例使用下一行\n", 17 | "role = get_execution_role()\n", 18 | "#如果使用自建的笔记本实例请自行获取Role,可从IAM控制台获取到\n", 19 | "#role = \"arn:aws-cn:iam::315505707008:role/service-role/AmazonSageMaker-ExecutionRole-20200430T124235\"" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "#确保sagemaker版本为2.4.0及以上\n", 29 | "print(sagemaker.__version__)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "#修改bucket\n", 39 | "input_data = 's3://<>/data/cat-vs-dog-1000/'\n", 40 | "output_data = 's3://<>/data/cat-vs-dog-output/'" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "## 准备图片\n", 48 | "\n", 49 | "原始数据按不同分类上传到input_data目录\n", 50 | "```\n", 51 | "input_data\n", 52 | "├── class1\n", 53 | "│ ├── image001.jpg\n", 54 | "│ ├── image002.jpg\n", 55 | "│ └── ...\n", 56 | "├── class2\n", 57 | "│ ├── image001.jpg\n", 58 | "│ ├── image002.jpg\n", 59 | "│ └── ...\n", 60 | "└── classn\n", 61 | " ├── image001.jpg\n", 62 | " ├── image002.jpg\n", 63 | " └── ...\n", 64 | "```\n", 65 | "可从Kaggle获取[猫狗图片](https://www.kaggle.com/c/dogs-vs-cats/data),然后按目录存放图片。" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "from sagemaker.tensorflow import TensorFlow\n", 75 | "\n", 76 | "# 建议使用gpu类型的实例\n", 77 | "instance_type='ml.p3.2xlarge'\n", 78 | "#instance_type='local'\n", 79 | "model_dir = '/opt/ml/model'\n", 80 | "\n", 81 | "# 可以修改epoch_count,batch_size\n", 82 | "estimator = TensorFlow(entry_point='train.py',\n", 83 | " source_dir='./source',\n", 84 | " role=role,\n", 85 | " output_path=output_data,\n", 86 | " model_dir=model_dir,\n", 87 | " framework_version='1.15.2',\n", 88 | " hyperparameters={'epoch_count':30, 'batch_size':32}, \n", 89 | " py_version='py3',\n", 90 | " instance_count=1,\n", 91 | " instance_type=instance_type,\n", 92 | "# train_volume_size=50,\n", 93 | "# train_max_run=432000,\n", 94 | " use_spot_instances=True,\n", 95 | " max_wait=432000,\n", 96 | "# metric_definitions=[{'Name': 'loss', 'Regex': 'loss = (.*?),'},\n", 97 | "# {'Name':'epoch','Regex': 'Step_Train = (.*?),'}]\n", 98 | " )\n" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "result = estimator.fit(input_data)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "### 打印 model_data 路径, 下载并且解压" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "print(estimator.model_data)\n", 124 | "os.environ['S3_URL']=str(estimator.model_data) #environ的键值必须是字符串" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "%%sh\n", 134 | "echo ${S3_URL}" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "%%sh\n", 144 | "\n", 145 | "if [ ! 
-d \"output\" ];then\n", 146 | "mkdir output\n", 147 | "fi\n", 148 | "\n", 149 | "cd output\n", 150 | "aws s3 cp ${S3_URL} ./model.tar.gz\n", 151 | "\n", 152 | "# aws s3 cp {sli_estimator.model_data} ./model.tar.gz\n", 153 | "\n", 154 | "tar -xvzf ./model.tar.gz " 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "确保模型文件保存到以下目录\n", 162 | "\n", 163 | "`inference.ipynb` 里面会用到\n", 164 | "\n", 165 | "```\n", 166 | "output\n", 167 | "└── tf_server\n", 168 | " └── 1\n", 169 | " ├── saved_model.pb\n", 170 | " └── variables\n", 171 | " ├── variables.data-00000-of-00001\n", 172 | " └── variables.index\n", 173 | "```\n", 174 | "\n", 175 | "```" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [] 184 | } 185 | ], 186 | "metadata": { 187 | "kernelspec": { 188 | "display_name": "Environment (conda_tensorflow_p37)", 189 | "language": "python", 190 | "name": "conda_tensorflow_p37" 191 | }, 192 | "language_info": { 193 | "codemirror_mode": { 194 | "name": "ipython", 195 | "version": 3 196 | }, 197 | "file_extension": ".py", 198 | "mimetype": "text/x-python", 199 | "name": "python", 200 | "nbconvert_exporter": "python", 201 | "pygments_lexer": "ipython3", 202 | "version": "3.7.10" 203 | } 204 | }, 205 | "nbformat": 4, 206 | "nbformat_minor": 4 207 | } 208 | -------------------------------------------------------------------------------- /hyperspectral/DeepHyperX/1-preparation/explore_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import scipy.io as sio\n", 10 | "import os\n", 11 | "import scipy.io\n", 12 | "import scipy.ndimage\n", 13 | "import spectral\n", 14 | "import spectral.io.envi as envi\n", 15 | "import pandas as pd\n", 16 | "import numpy as np\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "import matplotlib.colors as colors\n", 19 | "import matplotlib.cm as cmx\n", 20 | "from random import shuffle\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## 分类\n", 28 | "0. 其他\n", 29 | "1. 较低油分\n", 30 | "2. 低油分\n", 31 | "3. 中油分\n", 32 | "4. 
高油分\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "def loadData(flieName, dataIndex, temp_split=4):\n", 42 | " \n", 43 | " print(\"------------ loadData \", dataIndex)\n", 44 | " # 原始数据路径\n", 45 | " DATA_PATH = os.path.join(os.getcwd(), flieName)\n", 46 | "\n", 47 | " index = str(dataIndex)\n", 48 | " data = envi.open( os.path.join(DATA_PATH, \"{}.hdr\".format(index)) ,os.path.join(DATA_PATH, \"{}.dat\".format(index)))\n", 49 | " mask_data = envi.open( os.path.join(DATA_PATH, \"mask_{}.hdr\".format(index)) ,os.path.join(DATA_PATH, \"mask_{}.tiff\".format(index)))\n", 50 | "\n", 51 | " HEIGHT = data.shape[0] //temp_split\n", 52 | " WIDTH = data.shape[1] //temp_split\n", 53 | " BAND = data.shape[2]\n", 54 | "# BAND = BAND_SIZE\n", 55 | " new_shape=(BAND,HEIGHT,WIDTH)\n", 56 | " new_data = np.zeros(new_shape, dtype = float)\n", 57 | " label = np.zeros((HEIGHT, WIDTH), dtype = int)\n", 58 | " \n", 59 | "\n", 60 | " sample_count = 0\n", 61 | " for h in range(HEIGHT): \n", 62 | " for w in range(WIDTH):\n", 63 | " x = h*temp_split\n", 64 | " y = w*temp_split\n", 65 | " for b in range(BAND):\n", 66 | " new_data[b][h][w] = data[x,y][b]\n", 67 | "\n", 68 | " if(sum(mask_data[x, y]) > 0.01 ):\n", 69 | " label[h][w] = dataIndex \n", 70 | " sample_count += 1\n", 71 | " else:\n", 72 | " label[h][w] = 0\n", 73 | " \n", 74 | " \n", 75 | " new_data = np.transpose(new_data, (1, 2, 0)) # 将通道数提前,便于数组处理操作\n", 76 | " print(\"sample_count = {} \".format(sample_count))\n", 77 | " print(\"data shape : \", new_data.shape)\n", 78 | " print(\"label shape : \", label.shape)\n", 79 | " return new_data, label" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "PATCH 样本数量 * 通道 * 高 * 宽" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "def create_sample_data(index):\n", 96 | " \n", 97 | " data, label = loadData(\"dataset\", index)\n", 98 | "\n", 99 | " height = data.shape[0]\n", 100 | " width = data.shape[1]\n", 101 | " band = data.shape[2]\n", 102 | " \n", 103 | " \n", 104 | " print(\"band : \", band)\n", 105 | " print(\"height : \", height)\n", 106 | " print(\"width : \", width)\n", 107 | " sample_count = 0\n", 108 | " for h in range(height):\n", 109 | " for w in range(width):\n", 110 | " if label[h][w] == index:\n", 111 | " sample_count += 1\n", 112 | "\n", 113 | " print(\"count : \", sample_count)\n", 114 | " new_shape= (sample_count, band)\n", 115 | " temp_data = np.zeros(new_shape, dtype = float) \n", 116 | " \n", 117 | " count = 0 \n", 118 | " for h in range(height):\n", 119 | " for w in range(width):\n", 120 | " if label[h][w] == index:\n", 121 | " for b in range(band):\n", 122 | " temp_data[count][b] = data[h][w][b]\n", 123 | " count += 1\n", 124 | " \n", 125 | " return temp_data\n", 126 | " \n", 127 | " " 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "%%time\n", 137 | "\n", 138 | "\n", 139 | "new_data1 = create_sample_data(1)\n", 140 | "new_data2 = create_sample_data(2)\n", 141 | "new_data3 = create_sample_data(3)\n", 142 | "new_data4 = create_sample_data(4)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "def drawLine(new_data):\n", 152 | "\n", 153 | " plt.figure(figsize=(22, 
3))\n", 154 | " new_data1.shape\n", 155 | " size = new_data1.shape[0]\n", 156 | " split = size // 4\n", 157 | " \n", 158 | "\n", 159 | " #第一行第一列图形\n", 160 | " ax1 = plt.subplot(1,4,1)\n", 161 | " ax2 = plt.subplot(1,4,2)\n", 162 | " ax3 = plt.subplot(1,4,3)\n", 163 | " ax4 = plt.subplot(1,4,4)\n", 164 | " x= np.linspace(0, 100 ,new_data.shape[1])\n", 165 | "\n", 166 | " \n", 167 | " plt.sca(ax1)\n", 168 | " plt.plot(x,new_data[split * 0])\n", 169 | " \n", 170 | " \n", 171 | " plt.sca(ax2)\n", 172 | " plt.plot(x,new_data[split * 1])\n", 173 | " \n", 174 | " \n", 175 | " plt.sca(ax3)\n", 176 | " plt.plot(x,new_data[split * 2])\n", 177 | " \n", 178 | " \n", 179 | " plt.sca(ax4)\n", 180 | " plt.plot(x,new_data[split * 3])\n", 181 | "\n", 182 | " plt.show()\n", 183 | " \n" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "drawLine(new_data1)\n", 193 | "drawLine(new_data2)\n", 194 | "drawLine(new_data3)\n", 195 | "drawLine(new_data4)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "def drawLine(d1, d2):\n", 205 | "\n", 206 | " plt.figure(figsize=(22, 5))\n", 207 | " size_1 = d1.shape[0]\n", 208 | " split_1 = size_1 // 4\n", 209 | " \n", 210 | " size_2 = d2.shape[0]\n", 211 | " split_2 = size_2 // 4\n", 212 | " \n", 213 | "\n", 214 | " #第一行第一列图形\n", 215 | " ax1 = plt.subplot(1,4,1)\n", 216 | " ax2 = plt.subplot(1,4,2)\n", 217 | " ax3 = plt.subplot(1,4,3)\n", 218 | " ax4 = plt.subplot(1,4,4)\n", 219 | " x= np.linspace(0, d1.shape[1] ,d1.shape[1])\n", 220 | "\n", 221 | " \n", 222 | " plt.sca(ax1)\n", 223 | " plt.plot(x,d1[split_1 * 0])\n", 224 | " plt.plot(x,d2[split_2 * 0], color='red')\n", 225 | " \n", 226 | " \n", 227 | " plt.sca(ax2)\n", 228 | " plt.plot(x,d1[split_1 * 1])\n", 229 | " plt.plot(x,d2[split_2 * 1], color='red')\n", 230 | " \n", 231 | " \n", 232 | " plt.sca(ax3)\n", 233 | " plt.plot(x,d1[split_1 * 2])\n", 234 | " plt.plot(x,d2[split_2 * 2], color='red')\n", 235 | " \n", 236 | " \n", 237 | " plt.sca(ax4)\n", 238 | " plt.plot(x,d1[split_1 * 3])\n", 239 | " plt.plot(x,d2[split_2 * 3], color='red')\n", 240 | " \n", 241 | " plt.show()\n" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "### 中油分和高油分" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "drawLine(new_data3, new_data4)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "### 低油分和较低油分" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "drawLine(new_data1, new_data2)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [] 282 | } 283 | ], 284 | "metadata": { 285 | "kernelspec": { 286 | "display_name": "Environment (conda_pytorch_latest_p37)", 287 | "language": "python", 288 | "name": "conda_pytorch_latest_p37" 289 | }, 290 | "language_info": { 291 | "codemirror_mode": { 292 | "name": "ipython", 293 | "version": 3 294 | }, 295 | "file_extension": ".py", 296 | "mimetype": "text/x-python", 297 | "name": "python", 298 | "nbconvert_exporter": "python", 299 | "pygments_lexer": "ipython3", 300 | "version": "3.7.10" 301 | } 302 | }, 303 | 
"nbformat": 4, 304 | "nbformat_minor": 4 305 | } 306 | -------------------------------------------------------------------------------- /hyperspectral/DeepHyperX/1-preparation/preparation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "exposed-selection", 6 | "metadata": {}, 7 | "source": [ 8 | "# DeepHyperX on SageMaker--数据准备" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "interstate-routine", 14 | "metadata": {}, 15 | "source": [ 16 | "## 1 说明\n", 17 | "本章内容主要是把原始数据格式转化为mat格式。\n", 18 | "## 2 运行环境\n", 19 | "Kernel 选择pytorch_latest_p36。 \n", 20 | "## 3 已有mat格式数据\n", 21 | "如果已有YOLOv5格式的数据,可跳过数据准备,把数据放入S3即可。 \n", 22 | "### 3.1 S3目录存放格式\n", 23 | "```\n", 24 | "deephyper\n", 25 | "├── class1\n", 26 | "│ ├── class1_gt.mat\n", 27 | "│ └── class1.mat\n", 28 | "├── class2\n", 29 | "│ ├── class2_gt.mat\n", 30 | "│ └── class2.mat\n", 31 | "...\n", 32 | "└── classn\n", 33 | " ├── classn_gt.mat\n", 34 | " └── classn.mat\n", 35 | "```\n", 36 | "### 3.2 SageMaker输入数据根目录\n", 37 | "运行SageMaker时,SageMaker会从S3拷贝数据放到到运行容器的`/opt/ml/input/data/training/`下。即`deephyper/class1/class1.mat`对应全路径为`/opt/ml/input/data/training/class1/class1.mat`" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "polyphonic-estimate", 43 | "metadata": {}, 44 | "source": [ 45 | "## 4 没有mat格式数据" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "id": "multiple-archives", 51 | "metadata": {}, 52 | "source": [ 53 | "### 4.1 拷贝数据到本地" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "opponent-first", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "import os\n", 64 | "if not os.path.exists(\"dataset\"):\n", 65 | " os.mkdir(\"dataset\")" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "strange-medication", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "!aws s3 sync s3://junzhong/data/hyper_leaf/ ./dataset" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "id": "looking-triumph", 81 | "metadata": {}, 82 | "source": [ 83 | "### 4.2 转化为mat格式" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "id": "theoretical-sheriff", 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "!pip install spectral -i https://opentuna.cn/pypi/web/simple/" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "id": "rising-lunch", 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "!python preprocess.py" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "id": "facial-stable", 109 | "metadata": {}, 110 | "source": [ 111 | "### 4.3 拷贝到S3" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "id": "considered-museum", 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "# 设置数据存放S3 bucket\n", 122 | "bucket = 'junzhong'" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "id": "fitting-opinion", 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "!aws s3 sync Datasets/ s3://{bucket}/data/deephyper/" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "id": "pressed-composite", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [] 142 | } 143 | ], 144 | "metadata": { 145 | "kernelspec": { 146 | "display_name": "Environment (conda_pytorch_latest_p37)", 147 | 
"language": "python", 148 | "name": "conda_pytorch_latest_p37" 149 | }, 150 | "language_info": { 151 | "codemirror_mode": { 152 | "name": "ipython", 153 | "version": 3 154 | }, 155 | "file_extension": ".py", 156 | "mimetype": "text/x-python", 157 | "name": "python", 158 | "nbconvert_exporter": "python", 159 | "pygments_lexer": "ipython3", 160 | "version": "3.7.10" 161 | } 162 | }, 163 | "nbformat": 4, 164 | "nbformat_minor": 5 165 | } 166 | -------------------------------------------------------------------------------- /hyperspectral/DeepHyperX/1-preparation/preprocess.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import scipy.io 4 | import spectral.io.envi as envi 5 | 6 | 7 | DATASET_NAME = 'leaf' 8 | NEW_DATA_PATH = os.path.join(os.getcwd(), "Datasets/"+DATASET_NAME) # 存放数据路径 patch是文件夹名称 9 | 10 | 11 | """ 12 | temp_split: 对数据进行拆分 13 | """ 14 | def loadData(flieName, dataIndex, temp_split=4): 15 | 16 | print("------------ loadData ", dataIndex) 17 | # 原始数据路径 18 | DATA_PATH = os.path.join(os.getcwd(), flieName) 19 | 20 | index = str(dataIndex) 21 | data = envi.open( os.path.join(DATA_PATH, "{}.hdr".format(index)) ,os.path.join(DATA_PATH, "{}.dat".format(index))) 22 | mask_data = envi.open( os.path.join(DATA_PATH, "mask_{}.hdr".format(index)) ,os.path.join(DATA_PATH, "mask_{}.tiff".format(index))) 23 | 24 | HEIGHT = data.shape[0] //temp_split 25 | WIDTH = data.shape[1] //temp_split 26 | BAND = data.shape[2] 27 | # BAND = BAND_SIZE 28 | new_shape=(BAND,HEIGHT,WIDTH) 29 | new_data = np.zeros(new_shape, dtype = float) 30 | label = np.zeros((HEIGHT, WIDTH), dtype = int) 31 | 32 | 33 | sample_count = 0 34 | for h in range(HEIGHT): 35 | for w in range(WIDTH): 36 | x = h*temp_split 37 | y = w*temp_split 38 | for b in range(BAND): 39 | new_data[b][h][w] = data[x,y][b] 40 | 41 | if(sum(mask_data[x, y]) > 0.01 ): 42 | label[h][w] = dataIndex 43 | sample_count += 1 44 | else: 45 | label[h][w] = 0 46 | 47 | 48 | new_data = np.transpose(new_data, (1, 2, 0)) # 将通道数提前,便于数组处理操作 49 | print("sample_count = {} ".format(sample_count)) 50 | print("data shape : ", new_data.shape) 51 | print("label shape : ", label.shape) 52 | return new_data, label 53 | 54 | if not os.path.exists(NEW_DATA_PATH): 55 | print(" ", NEW_DATA_PATH) 56 | os.makedirs(NEW_DATA_PATH) 57 | print("create dataset dir success.") 58 | 59 | data1, label1 = loadData("dataset", 1) 60 | data2, label2 = loadData("dataset", 2) 61 | data3, label3 = loadData("dataset", 3) 62 | data4, label4 = loadData("dataset", 4) 63 | 64 | 65 | 66 | X1 = np.hstack((data1, data2)) 67 | X2 = np.hstack((data3, data4)) 68 | 69 | gt1 = np.hstack((label1, label2)) 70 | gt2 = np.hstack((label3, label4)) 71 | 72 | X = np.vstack((X1, X2)) 73 | 74 | gt = np.vstack((gt1, gt2)) 75 | 76 | 77 | 78 | 79 | 80 | 81 | train_dict, test_dict = {}, {} 82 | train_dict[DATASET_NAME] = X 83 | file_name = "{}.mat".format(DATASET_NAME) 84 | scipy.io.savemat(os.path.join(NEW_DATA_PATH, file_name), train_dict) 85 | test_dict["{}_gt".format(DATASET_NAME)] = gt 86 | file_name = "{}_gt.mat".format(DATASET_NAME) 87 | scipy.io.savemat(os.path.join(NEW_DATA_PATH, file_name), test_dict) 88 | print("Save target data success ---------------------------------\n") -------------------------------------------------------------------------------- /hyperspectral/DeepHyperX/2-training/source/custom_datasets.py: -------------------------------------------------------------------------------- 1 | from utils import open_file 2 | 
import numpy as np 3 | 4 | CUSTOM_DATASETS_CONFIG = { 5 | "leaf": { 6 | "img": "2018_IEEE_GRSS_DFC_HSI_TR.HDR", 7 | "gt": "2018_IEEE_GRSS_DFC_GT_TR.tif", 8 | "download": False, 9 | "loader": lambda folder: leaf_loader(folder), 10 | } 11 | } 12 | 13 | 14 | def leaf_loader(folder): 15 | img = open_file(folder + "leaf.mat") 16 | img = img["leaf"] 17 | 18 | rgb_bands = (43, 21, 11) # AVIRIS sensor 19 | 20 | gt = open_file(folder + "leaf_gt.mat")["leaf_gt"] 21 | label_values = [ 22 | "Undefined", 23 | "lowest", 24 | "lower", 25 | "middle", 26 | "high", 27 | ] 28 | 29 | ignored_labels = [0] 30 | # ignored_labels = [] 31 | palette = None 32 | return img, gt, rgb_bands, ignored_labels, label_values, palette 33 | -------------------------------------------------------------------------------- /hyperspectral/DeepHyperX/2-training/source/inference.py: -------------------------------------------------------------------------------- 1 | # Python 2/3 compatiblity 2 | from __future__ import print_function 3 | from __future__ import division 4 | import joblib 5 | import os 6 | from utils import convert_to_color_, convert_from_color_, get_device 7 | from datasets import open_file 8 | from models import get_model, test 9 | import numpy as np 10 | import seaborn as sns 11 | from skimage import io 12 | import argparse 13 | import torch 14 | 15 | # Test options 16 | parser = argparse.ArgumentParser( 17 | description="Run deep learning experiments on" " various hyperspectral datasets" 18 | ) 19 | parser.add_argument( 20 | "--model", 21 | type=str, 22 | default=None, 23 | help="Model to train. Available:\n" 24 | "SVM (linear), " 25 | "SVM_grid (grid search on linear, poly and RBF kernels), " 26 | "baseline (fully connected NN), " 27 | "hu (1D CNN), " 28 | "hamida (3D CNN + 1D classifier), " 29 | "lee (3D FCN), " 30 | "chen (3D CNN), " 31 | "li (3D CNN), " 32 | "he (3D CNN), " 33 | "luo (3D CNN), " 34 | "sharma (2D CNN), " 35 | "boulch (1D semi-supervised CNN), " 36 | "liu (3D semi-supervised CNN), " 37 | "mou (1D RNN)", 38 | ) 39 | parser.add_argument( 40 | "--cuda", 41 | type=int, 42 | default=-1, 43 | help="Specify CUDA device (defaults to -1, which learns on CPU)", 44 | ) 45 | parser.add_argument( 46 | "--checkpoint", 47 | type=str, 48 | default=None, 49 | help="Weights to use for initialization, e.g. 
a checkpoint", 50 | ) 51 | 52 | group_test = parser.add_argument_group("Test") 53 | group_test.add_argument( 54 | "--test_stride", 55 | type=int, 56 | default=1, 57 | help="Sliding window step stride during inference (default = 1)", 58 | ) 59 | group_test.add_argument( 60 | "--image", 61 | type=str, 62 | default=None, 63 | nargs="?", 64 | help="Path to an image on which to run inference.", 65 | ) 66 | group_test.add_argument( 67 | "--only_test", 68 | type=str, 69 | default=None, 70 | nargs="?", 71 | help="Choose the data on which to test the trained algorithm ", 72 | ) 73 | group_test.add_argument( 74 | "--mat", 75 | type=str, 76 | default=None, 77 | nargs="?", 78 | help="In case of a .mat file, define the variable to call inside the file", 79 | ) 80 | group_test.add_argument( 81 | "--n_classes", 82 | type=int, 83 | default=None, 84 | nargs="?", 85 | help="When using a trained algorithm, specified the number of classes of this algorithm", 86 | ) 87 | # Training options 88 | group_train = parser.add_argument_group("Model") 89 | group_train.add_argument( 90 | "--patch_size", 91 | type=int, 92 | help="Size of the spatial neighbourhood (optional, if " 93 | "absent will be set by the model)", 94 | ) 95 | group_train.add_argument( 96 | "--batch_size", 97 | type=int, 98 | help="Batch size (optional, if absent will be set by the model", 99 | ) 100 | 101 | args = parser.parse_args() 102 | CUDA_DEVICE = get_device(args.cuda) 103 | MODEL = args.model 104 | # Testing file 105 | MAT = args.mat 106 | N_CLASSES = args.n_classes 107 | INFERENCE = args.image 108 | TEST_STRIDE = args.test_stride 109 | CHECKPOINT = args.checkpoint 110 | 111 | img_filename = os.path.basename(INFERENCE) 112 | basename = MODEL + img_filename 113 | dirname = os.path.dirname(INFERENCE) 114 | 115 | img = open_file(INFERENCE) 116 | if MAT is not None: 117 | img = img[MAT] 118 | # Normalization 119 | img = np.asarray(img, dtype="float32") 120 | img = (img - np.min(img)) / (np.max(img) - np.min(img)) 121 | N_BANDS = img.shape[-1] 122 | hyperparams = vars(args) 123 | hyperparams.update( 124 | { 125 | "n_classes": N_CLASSES, 126 | "n_bands": N_BANDS, 127 | "device": CUDA_DEVICE, 128 | "ignored_labels": [0], 129 | } 130 | ) 131 | hyperparams = dict((k, v) for k, v in hyperparams.items() if v is not None) 132 | 133 | palette = {0: (0, 0, 0)} 134 | for k, color in enumerate(sns.color_palette("hls", N_CLASSES)): 135 | palette[k + 1] = tuple(np.asarray(255 * np.array(color), dtype="uint8")) 136 | invert_palette = {v: k for k, v in palette.items()} 137 | 138 | 139 | def convert_to_color(x): 140 | return convert_to_color_(x, palette=palette) 141 | 142 | 143 | def convert_from_color(x): 144 | return convert_from_color_(x, palette=invert_palette) 145 | 146 | 147 | if MODEL in ["SVM", "SVM_grid", "SGD", "nearest"]: 148 | model = joblib.load(CHECKPOINT) 149 | w, h = img.shape[:2] 150 | X = img.reshape((w * h, N_BANDS)) 151 | prediction = model.predict(X) 152 | prediction = prediction.reshape(img.shape[:2]) 153 | else: 154 | model, _, _, hyperparams = get_model(MODEL, **hyperparams) 155 | 156 | if CUDA_DEVICE == -1: 157 | model.load_state_dict(torch.load(CHECKPOINT)) 158 | else: 159 | model.load_state_dict(torch.load(CHECKPOINT,map_location='cpu')) 160 | 161 | 162 | 163 | probabilities = test(model, img, hyperparams) 164 | prediction = np.argmax(probabilities, axis=-1) 165 | 166 | filename = dirname + "/" + basename + ".tif" 167 | io.imsave(filename, prediction) 168 | basename = "color_" + basename 169 | filename = dirname + "/" + basename + 
".tif" 170 | io.imsave(filename, convert_to_color(prediction)) 171 | -------------------------------------------------------------------------------- /hyperspectral/DeepHyperX/2-training/source/requirements2.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.10.0 2 | spectral==0.19 3 | scipy>=0.19.0 4 | tqdm>=4.15.0 5 | visdom>=0.1.5 6 | seaborn>=0.8 7 | scikit-learn>=0.19.0 8 | scikit-image>=0.13.1 9 | torch>=0.4.0 10 | matplotlib>=2.0.2 11 | torchsummary>=1.5 12 | joblib==0.14.1 13 | -------------------------------------------------------------------------------- /hyperspectral/DeepHyperX/2-training/training.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "fb63bd02", 6 | "metadata": {}, 7 | "source": [ 8 | "# DeepHyperX on SageMaker--训练" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "45ac89f3", 14 | "metadata": {}, 15 | "source": [ 16 | "## 1 说明\n", 17 | "本章内容为用SageMaker进行训练,数据来自S3。" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "id": "79225937", 23 | "metadata": {}, 24 | "source": [ 25 | "## 2 运行环境\n", 26 | "Kernel 选择pytorch_latest_p36。 \n", 27 | "本文在boto3 1.17.84和sagemaker 2.43.0下测试通过。" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "f580feb0", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "import boto3,sagemaker\n", 38 | "print(boto3.__version__)\n", 39 | "print(sagemaker.__version__)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "a797946f", 45 | "metadata": {}, 46 | "source": [ 47 | "## 3 在SageMaker上训练" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "id": "7cb5b9a4", 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "# 设置数据存放S3 bucket\n", 58 | "bucket = 'junzhong'" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "8645c318", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "input_path='s3://{}/data/deephyper/'.format(bucket)\n", 69 | "output_path='s3://{}/result/deephyper/'.format(bucket)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "id": "a806601d", 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "import boto3\n", 80 | "iam = boto3.client('iam')\n", 81 | "roles = iam.list_roles(PathPrefix='/service-role')\n", 82 | "role=\"\"\n", 83 | "for current_role in roles[\"Roles\"]:\n", 84 | " if current_role[\"RoleName\"].startswith(\"AmazonSageMaker-ExecutionRole-\"):\n", 85 | " role=current_role[\"Arn\"]\n", 86 | " break\n", 87 | "#如果role为空表示有问题,需要先打开https://cn-northwest-1.console.amazonaws.cn/sagemaker/home?region=cn-northwest-1#/notebook-instances/create以创建IAM Role\n", 88 | "print(role)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "id": "bb2fb92a", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "from sagemaker.pytorch import PyTorch\n", 99 | "\n", 100 | "#根据需要修改训练实例,和是否使用Spot实例\n", 101 | "instance_type=\"ml.p3.2xlarge\"\n", 102 | "use_spot_instances=False\n", 103 | "\n", 104 | "estimator = PyTorch(entry_point=\"main.py\",\n", 105 | " source_dir=\"./source\",\n", 106 | " role=role,\n", 107 | " output_path=output_path,\n", 108 | " framework_version='1.6.0',\n", 109 | " hyperparameters={\"folder\":\"/opt/ml/input/data/training/\",\n", 110 | " \"model\":\"he\",\n", 111 | " \"dataset\":\"leaf\",\n", 112 | " \"cuda\":\"0\",\n", 113 | 
" \"training_sample\":0.7,\n", 114 | " \"patch_size\":17,\n", 115 | " \"epoch\":20,\n", 116 | " \"batch_size\":32}, \n", 117 | " py_version=\"py3\",\n", 118 | " instance_count=1,\n", 119 | " instance_type=instance_type,\n", 120 | " use_spot_instances=use_spot_instances,\n", 121 | " max_wait=432000 if use_spot_instances else None,\n", 122 | " )" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "id": "b8b2da8b", 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "estimator.fit(input_path)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "id": "9bdc5e9e", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "import os\n", 143 | "os.makedirs(\"result\", exist_ok=True)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "id": "c951ca17", 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "!aws s3 cp $estimator.model_data ./result" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "id": "00cf390e", 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "%%sh\n", 164 | "cd result\n", 165 | "tar zxvf model.tar.gz" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "id": "760a50c0", 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [] 175 | } 176 | ], 177 | "metadata": { 178 | "kernelspec": { 179 | "display_name": "conda_pytorch_latest_p36", 180 | "language": "python", 181 | "name": "conda_pytorch_latest_p36" 182 | }, 183 | "language_info": { 184 | "codemirror_mode": { 185 | "name": "ipython", 186 | "version": 3 187 | }, 188 | "file_extension": ".py", 189 | "mimetype": "text/x-python", 190 | "name": "python", 191 | "nbconvert_exporter": "python", 192 | "pygments_lexer": "ipython3", 193 | "version": "3.6.13" 194 | } 195 | }, 196 | "nbformat": 4, 197 | "nbformat_minor": 5 198 | } 199 | -------------------------------------------------------------------------------- /hyperspectral/DeepHyperX/3-inference/inference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "c8c0ca15", 6 | "metadata": {}, 7 | "source": [ 8 | "# 推理" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "d2c11e6c", 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "/home/ec2-user/SageMaker/github/sagemaker-workshop/hyperspectral/DeepHyperX/3-inference\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "!pwd" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "f8c5cb10", 32 | "metadata": {}, 33 | "source": [ 34 | "### 进入工作目录" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "826aebf6", 40 | "metadata": {}, 41 | "source": [ 42 | "`cd sagemaker-workshop/hyperspectral/DeepHyperX/2-training`" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "id": "a007e347", 48 | "metadata": {}, 49 | "source": [ 50 | "### 创建环境" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "id": "1e81f2b7", 56 | "metadata": {}, 57 | "source": [ 58 | "```\n", 59 | "conda create -n hyper python=3.6 pip scipy numpy \n", 60 | "source activate hyper\n", 61 | "pip install -r requirements2.txt -i https://mirrors.163.com/pypi/simple/\n", 62 | " \n", 63 | "```" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "id": "71155594", 69 | "metadata": {}, 70 | "source": [ 71 | 
"### 推理" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "id": "f23e4b6a", 77 | "metadata": {}, 78 | "source": [ 79 | "`python3 inference.py --model he --checkpoint '../result/checkpoint/he_et_al/leaf/2021_06_06_08_18_30_epoch11_1.00.pth' --image 'Datasets/leaf/leaf.mat' --mat leaf --n_classes=5 --patch_size=17 `" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "id": "3618c1c3", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [] 89 | } 90 | ], 91 | "metadata": { 92 | "kernelspec": { 93 | "display_name": "conda_pytorch_latest_p36", 94 | "language": "python", 95 | "name": "conda_pytorch_latest_p36" 96 | }, 97 | "language_info": { 98 | "codemirror_mode": { 99 | "name": "ipython", 100 | "version": 3 101 | }, 102 | "file_extension": ".py", 103 | "mimetype": "text/x-python", 104 | "name": "python", 105 | "nbconvert_exporter": "python", 106 | "pygments_lexer": "ipython3", 107 | "version": "3.6.13" 108 | } 109 | }, 110 | "nbformat": 4, 111 | "nbformat_minor": 5 112 | } 113 | -------------------------------------------------------------------------------- /hyperspectral/DeepHyperX/LICENSE: -------------------------------------------------------------------------------- 1 | # License information 2 | 3 | Code for the DeepHyperX toolbox is dual licensed depending on applications, research or commercial. 4 | 5 | --- 6 | 7 | ## COMMERCIAL PURPOSES 8 | 9 | Please contact the ONERA [www.onera.fr/en/contact-us](www.onera.fr/en/contact-us) for additional information or directly the authors Nicolas Audebert or Bertrand Le Saux. 10 | 11 | --- 12 | 13 | ## RESEARCH AND NON COMMERCIAL PURPOSES 14 | 15 | #### Code license 16 | 17 | For research and non commercial purposes, all the code and documentation is released under the GPLv3 license: 18 | 19 | Copyright (c) 2018 ONERA and IRISA, Nicolas Audebert, Bertrand Le Saux, Sébastien Lefèvre. 20 | 21 | This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 22 | 23 | PLEASE ACKNOWLEDGE THE ORIGINAL AUTHORS AND PUBLICATION ACCORDING TO THE REPOSITORY github.com/nshaud/DeepHyperx OR IF NOT AVAILABLE: 24 | Nicolas Audebert, Bertrand Le Saux and Sébastien Lefèvre 25 | "Deep Learning for Classification of Hyperspectral Data: A comparative review", 26 | IEEE Geosciences and Remote Sensing Magazine, 2019. 
-------------------------------------------------------------------------------- /hyperspectral/DeepHyperX/README.md: -------------------------------------------------------------------------------- 1 | # DeepHyperX on SageMaker 2 | 本workshop演示如何使用DeepHyperX在SageMaker上进行训练。 3 | 原地址:https://github.com/nshaud/DeepHyperX 4 | DeepHyperX是在各种高光谱数据集上执行深度学习实验的 Python 工具。 5 | ## 数据准备 6 | [1-preparation](1-preparation)演示把原始数据格式转化为mat格式,如果已有mat格式的数据,可跳过数据准备,把数据按要求放入到S3即可。 7 | ## 训练 8 | [2-training](2-training)演示在SageMaker上进行训练。 -------------------------------------------------------------------------------- /image-classification/README.md: -------------------------------------------------------------------------------- 1 | # 利用Amazon SageMaker内置算法进行图片分类 2 | Image-classification-lst-format.ipynb演示了利用Amazon SageMaker内置算法进行图片分类模型的训练和部署。 3 | 4 | ## 启动Amazon SageMaker笔记本实例 5 | 通过以下步骤启动Amazon SageMaker的笔记本实例 6 | * 访问SageMaker主页,点击左边栏目笔记本实例链接 7 | * 创建笔记本实例 8 | * 当笔记本实例处于InService状态时,可以通过点击JupyterLab链接进入到实例中 9 | 10 | ## 上传源文件到笔记本实例 11 | 点击左上角上传按钮,将Image-classification-lst-format.ipynb文件上传到笔记本实例中。 12 | 13 | ## 升级相应Kernel中sagemaker版本 14 | ``` 15 | source activate mxnet_p36 16 | pip install sagemaker --upgrade 17 | ``` 18 | 执行完以上命令后重启kernel 19 | 20 | 21 | ## 运行笔记本实例中的每个Cell 22 | 阅读每个Cell运行相关程序进行模型训练和推理 -------------------------------------------------------------------------------- /images/sagemaker_notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nwcdheap/sagemaker-workshop/9bcbc4c787da346fcf10aa6fd97a8a100328719a/images/sagemaker_notebook.png -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/1-training/container/Dockerfile: -------------------------------------------------------------------------------- 1 | #ARG BASE_IMG=763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.6.0-gpu-py36-cu101-ubuntu16.04 2 | ARG BASE_IMG=${BASE_IMG} 3 | FROM ${BASE_IMG} 4 | 5 | ENV PATH="/opt/code:${PATH}" 6 | 7 | COPY sources.list /etc/apt/ 8 | 9 | #RUN wget -qO - https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub | apt-key add - 10 | 11 | RUN apt-get update \ 12 | && apt-get install -y --no-install-recommends --allow-unauthenticated \ 13 | jq 14 | 15 | ## fix /usr/local/cuda-10.0/compat/libcuda.so 16 | ## RUN bash -c 'echo "/usr/local/cuda-10.0/compat" > /etc/ld.so.conf.d/cuda.conf' 17 | RUN ldconfig -v 18 | 19 | 20 | WORKDIR /opt/code 21 | COPY dockersource ./ 22 | RUN pip install -r /opt/code/requirements.txt -i https://opentuna.cn/pypi/web/simple/ 23 | ## https://github.com/aws/sagemaker-pytorch-training-toolkit/issues/143#issuecomment-566776288 24 | ## https://github.com/aws/sagemaker-pytorch-training-toolkit/blob/upgrade-training-toolkit/docker/build_artifacts/start_with_right_hostname.sh 25 | ## https://github.com/aws/deep-learning-containers/blob/v2.0-pt-1.5.1-py36/pytorch/training/docker/1.5.1/py3/Dockerfile.gpu#L181 26 | COPY changehostname.c /opt/code 27 | COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh 28 | COPY train /opt/code -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/1-training/container/changehostname.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <string.h> 3 | 4 | /* 5 | * Modifies gethostname to return algo-1, algo-2, etc. when running on SageMaker. 
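 *   (Wiring, per start_with_right_hostname.sh in this folder: at container start, sed replaces PLACEHOLDER_HOSTNAME with the current_host value, gcc compiles this file into libchangehostname.so, and the library is injected via LD_PRELOAD before train runs.)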
6 | * 7 | * Without this gethostname() on SageMaker returns 'aws', leading NCCL/MPI to think there is only one host, 8 | * not realizing that it needs to use NET/Socket. 9 | * 10 | * When docker container starts we read 'current_host' value from /opt/ml/input/config/resourceconfig.json 11 | * and replace PLACEHOLDER_HOSTNAME with it before compiling this code into a shared library. 12 | */ 13 | int gethostname(char *name, size_t len) 14 | { 15 | const char *val = PLACEHOLDER_HOSTNAME; 16 | strncpy(name, val, len); 17 | return 0; 18 | } 19 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/1-training/container/local_test/input/config/hyperparameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": "/opt/ml/input/data/training/cfg/data.yaml", 3 | "cfg": "/opt/ml/input/data/training/cfg/yolov5s.yaml", 4 | "hyp": "/opt/ml/input/data/training/cfg/hyp.yaml", 5 | "weights": "/opt/ml/input/data/training/weights/yolov5s.pt", 6 | "img": "640", 7 | "epochs": "2", 8 | "batch": "16" 9 | } 10 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/1-training/container/local_test/input/config/resourceconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "current_host": "algo-1", 3 | "hosts": ["algo-1","algo-2","algo-3"], 4 | "network_interface_name":"eth1" 5 | } -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/1-training/container/local_test/input/data/training/cfg/hyp.yaml: -------------------------------------------------------------------------------- 1 | lr0: 0.01 2 | lrf: 0.2 3 | momentum: 0.937 4 | weight_decay: 0.0005 5 | warmup_epochs: 3.0 6 | warmup_momentum: 0.8 7 | warmup_bias_lr: 0.1 8 | box: 0.05 9 | cls: 0.5 10 | cls_pw: 1.0 11 | obj: 1.0 12 | obj_pw: 1.0 13 | iou_t: 0.2 14 | anchor_t: 4.0 15 | fl_gamma: 0.0 16 | hsv_h: 0.015 17 | hsv_s: 0.7 18 | hsv_v: 0.4 19 | degrees: 0.0 20 | translate: 0.1 21 | scale: 0.5 22 | shear: 0.0 23 | perspective: 0.0 24 | flipud: 0.0 25 | fliplr: 0.5 26 | mosaic: 1.0 27 | mixup: 0.0 28 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/1-training/container/local_test/input/data/training/cfg/yolov5s.yaml: -------------------------------------------------------------------------------- 1 | # parameters 2 | nc: 80 # number of classes 3 | depth_multiple: 0.33 # model depth multiple 4 | width_multiple: 0.50 # layer channel multiple 5 | 6 | # anchors 7 | anchors: 8 | - [10,13, 16,30, 33,23] # P3/8 9 | - [30,61, 62,45, 59,119] # P4/16 10 | - [116,90, 156,198, 373,326] # P5/32 11 | 12 | # YOLOv5 backbone 13 | backbone: 14 | # [from, number, module, args] 15 | [[-1, 1, Focus, [64, 3]], # 0-P1/2 16 | [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 17 | [-1, 3, BottleneckCSP, [128]], 18 | [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 19 | [-1, 9, BottleneckCSP, [256]], 20 | [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 21 | [-1, 9, BottleneckCSP, [512]], 22 | [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 23 | [-1, 1, SPP, [1024, [5, 9, 13]]], 24 | [-1, 3, BottleneckCSP, [1024, False]], # 9 25 | ] 26 | 27 | # YOLOv5 head 28 | head: 29 | [[-1, 1, Conv, [512, 1, 1]], 30 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 31 | [[-1, 6], 1, Concat, [1]], # cat backbone P4 32 | [-1, 3, BottleneckCSP, [512, False]], # 13 33 | 34 | [-1, 1, Conv, 
[256, 1, 1]], 35 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 36 | [[-1, 4], 1, Concat, [1]], # cat backbone P3 37 | [-1, 3, BottleneckCSP, [256, False]], # 17 (P3/8-small) 38 | 39 | [-1, 1, Conv, [256, 3, 2]], 40 | [[-1, 14], 1, Concat, [1]], # cat head P4 41 | [-1, 3, BottleneckCSP, [512, False]], # 20 (P4/16-medium) 42 | 43 | [-1, 1, Conv, [512, 3, 2]], 44 | [[-1, 10], 1, Concat, [1]], # cat head P5 45 | [-1, 3, BottleneckCSP, [1024, False]], # 23 (P5/32-large) 46 | 47 | [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) 48 | ] 49 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/1-training/container/local_test/input/data/training/weights/yolov5s.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nwcdheap/sagemaker-workshop/9bcbc4c787da346fcf10aa6fd97a8a100328719a/object-detection/yolov5-on-sagemaker/1-training/container/local_test/input/data/training/weights/yolov5s.pt -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/1-training/container/sources.list: -------------------------------------------------------------------------------- 1 | deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ xenial main restricted universe multiverse 2 | # deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ xenial main restricted universe multiverse 3 | deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ xenial-updates main restricted universe multiverse 4 | # deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ xenial-updates main restricted universe multiverse 5 | deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ xenial-backports main restricted universe multiverse 6 | # deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ xenial-backports main restricted universe multiverse 7 | deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ xenial-security main restricted universe multiverse 8 | # deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ xenial-security main restricted universe multiverse 9 | 10 | # 预发布软件源,不建议启用 11 | # deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ xenial-proposed main restricted universe multiverse 12 | # deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ xenial-proposed main restricted universe multiverse -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/1-training/container/start_with_right_hostname.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [[ "$1" = "train" ]]; then 4 | CURRENT_HOST=$(jq .current_host /opt/ml/input/config/resourceconfig.json) 5 | sed -ie "s/PLACEHOLDER_HOSTNAME/$CURRENT_HOST/g" changehostname.c 6 | gcc -o changehostname.o -c -fPIC -Wall changehostname.c 7 | gcc -o libchangehostname.so -shared -export-dynamic changehostname.o -ldl 8 | LD_PRELOAD=/opt/code/libchangehostname.so train 9 | else 10 | eval "$@" 11 | fi 12 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/1-training/container/train: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | hpfile=/opt/ml/input/config/hyperparameters.json 3 | echo "========hyperparameters=======" 4 | cat $hpfile 5 | hp=$(cat $hpfile |jq -r -c 'to_entries | .[] |"--"+ .key + " " + .value ' | tr '\n' ' ') 6 | echo "=============" 7 | echo python /opt/code/train.py $hp 8 | echo "=============" 9 | python /opt/code/train.py $hp 10 | cp -r /opt/code/runs /opt/ml/model/ 11 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/1-training/training-build.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# YOLOv5 on SageMaker--Build 训练镜像" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 1 说明\n", 15 | "本章内容为build训练镜像,推送到AWS ECR,用户可直接使用build完毕的image,不用自己build。" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## 2 运行环境\n", 23 | "Kernel 选择pytorch_latest_p36。 \n", 24 | "本文在boto3 1.17.12和sagemaker 2.26.0下测试通过。" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "import boto3,sagemaker\n", 34 | "print(boto3.__version__)\n", 35 | "print(sagemaker.__version__)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## 3 Amazon 深度学习容器\n", 43 | "\n", 44 | "* [容器镜像清单](https://github.com/aws/deep-learning-containers/blob/master/available_images.md)\n", 45 | "* 本文基于pytorch training: `727897471807.dkr.ecr.cn-northwest-1.amazonaws.com.cn/pytorch-training:1.6.0-gpu-py36-cu101-ubuntu16.04`" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 4 下载YOLOv5" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "!git clone https://github.com/ultralytics/yolov5 container/dockersource" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "## 5 设置相关名称" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "ecr_repository = 'yolov5-training'\n", 78 | "tag = 'latest'" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "## 6 Build image" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "#国内pytorch training基础镜像地址,不要修改\n", 95 | "base_img='727897471807.dkr.ecr.cn-northwest-1.amazonaws.com.cn/pytorch-training:1.6.0-gpu-py36-cu101-ubuntu16.04'\n", 96 | "#登录基础镜像ECR,不要修改\n", 97 | "!aws ecr get-login-password --region cn-northwest-1 | docker login --username AWS --password-stdin 727897471807.dkr.ecr.cn-northwest-1.amazonaws.com.cn" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "%%time\n", 107 | "%cd container\n", 108 | "!docker build -t $ecr_repository -f Dockerfile --build-arg BASE_IMG=$base_img .\n", 109 | "%cd ../" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "## 7 在本地使用容器进行训练(可选)\n", 117 | "本地机器如果带GPU,使用`nvidia-docker run`;如果不带GPU,使用`docker run`,建议使用2xlarge以上机型,否则可能不足以分配内存。 \n", 118 | "训练模型结果存放在`container/local_test/model/runs/train/exp/weights`" 119 | 
] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "!nvidia-docker run -v $(pwd)/container/local_test/:/opt/ml/ --shm-size=8g --rm $ecr_repository train\n", 128 | "# !docker run -v $(pwd)/container/local_test/:/opt/ml/ --shm-size=8g --rm $ecr_repository train" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "## 8 推送到ECR" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "!aws ecr create-repository --repository-name $ecr_repository" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "import boto3\n", 154 | "region = boto3.session.Session().region_name\n", 155 | "account_id = boto3.client('sts').get_caller_identity().get('Account')\n", 156 | "image_uri = '{}.dkr.ecr.{}.amazonaws.com.cn/{}'.format(account_id, region, ecr_repository + \":\" + tag)\n", 157 | "!docker tag $ecr_repository:$tag $image_uri\n", 158 | "!$(aws ecr get-login --no-include-email)\n", 159 | "!docker push $image_uri" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [] 168 | } 169 | ], 170 | "metadata": { 171 | "kernelspec": { 172 | "display_name": "Environment (conda_pytorch_latest_p37)", 173 | "language": "python", 174 | "name": "conda_pytorch_latest_p37" 175 | }, 176 | "language_info": { 177 | "codemirror_mode": { 178 | "name": "ipython", 179 | "version": 3 180 | }, 181 | "file_extension": ".py", 182 | "mimetype": "text/x-python", 183 | "name": "python", 184 | "nbconvert_exporter": "python", 185 | "pygments_lexer": "ipython3", 186 | "version": "3.7.10" 187 | } 188 | }, 189 | "nbformat": 4, 190 | "nbformat_minor": 4 191 | } 192 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/1-training/training.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# YOLOv5 on SageMaker--训练" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 1 说明\n", 15 | "本章内容为用SageMaker进行训练,数据来自S3。" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## 2 运行环境\n", 23 | "Kernel 选择pytorch_latest_p36。 \n", 24 | "本文在boto3 1.17.12和sagemaker 2.26.0下测试通过。" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "import boto3,sagemaker\n", 34 | "print(boto3.__version__)\n", 35 | "print(sagemaker.__version__)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## 3 获取image" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "本项目已build完毕image,存放到ECR中,可直接部署到SageMaker。请选择选择合适版本。" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "tag = \"v3.1\"" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "import boto3\n", 68 | "region = boto3.session.Session().region_name\n", 69 | "image_uri = 
'048912060910.dkr.ecr.{}.amazonaws.com.cn/nwcd/yolov5-training:{}'.format(region,tag)\n", 70 | "image_uri" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "## 4 在SageMaker上训练" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "# 设置数据存放S3 bucket和前缀\n", 87 | "bucket = 'junzhong'\n", 88 | "pre_key = 'yolov5'" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "training_uri='s3://{}/{}/training/'.format(bucket, pre_key)\n", 98 | "outpath='s3://{}/{}/results/'.format(bucket, pre_key)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "import sagemaker,boto3\n", 108 | "\n", 109 | "iam = boto3.client('iam')\n", 110 | "roles = iam.list_roles(PathPrefix='/service-role')\n", 111 | "role=\"\"\n", 112 | "for current_role in roles[\"Roles\"]:\n", 113 | " if current_role[\"RoleName\"].startswith(\"AmazonSageMaker-ExecutionRole-\"):\n", 114 | " role=current_role[\"Arn\"]\n", 115 | " break\n", 116 | "#如果role为空表示有问题\n", 117 | "print(role)\n", 118 | "sm = boto3.client('sagemaker')" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "#设置是否使用spot实例进行训练\n", 128 | "use_spot = True" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "YOLOv5相关参数在`contariner/local_test/input/data/training/cfg/`目录下的`hyp.yaml`中,如需修改,请先修改。每次修改完后需要再同步。" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "!aws s3 sync container/local_test/input/data/training/ s3://{bucket}/{pre_key}/training/" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "from datetime import datetime\n", 154 | "now = datetime.now()\n", 155 | "job_name = 'yolov5-' + now.strftime(\"%Y-%m-%d-%H-%M-%S\")\n", 156 | "job_name" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "response = sm.create_training_job(\n", 166 | " TrainingJobName=job_name,\n", 167 | " HyperParameters={\n", 168 | " 'img':\"640\",\n", 169 | " 'batch':\"16\",\n", 170 | " 'epochs':\"15\",\n", 171 | " 'hyp':\"/opt/ml/input/data/training/cfg/hyp.yaml\",\n", 172 | " 'data':\"/opt/ml/input/data/training/cfg/data.yaml\",\n", 173 | " 'cfg':\"/opt/ml/input/data/training/cfg/yolov5s.yaml\",\n", 174 | " 'weights':\"/opt/ml/input/data/training/weights/yolov5s.pt\"\n", 175 | " },\n", 176 | " AlgorithmSpecification={\n", 177 | " 'TrainingImage': image_uri,\n", 178 | " 'TrainingInputMode': 'File',\n", 179 | " },\n", 180 | " RoleArn=role,\n", 181 | " InputDataConfig=[\n", 182 | " {\n", 183 | " 'ChannelName': 'training',\n", 184 | " 'DataSource': {\n", 185 | " 'S3DataSource': {\n", 186 | " 'S3DataType': 'S3Prefix',\n", 187 | " 'S3Uri': training_uri,\n", 188 | " 'S3DataDistributionType': 'FullyReplicated',\n", 189 | " },\n", 190 | " },\n", 191 | " 'InputMode': 'File'\n", 192 | " }\n", 193 | " ],\n", 194 | " OutputDataConfig={\n", 195 | " 'S3OutputPath': outpath\n", 196 | " },\n", 197 | " ResourceConfig={\n", 198 | " 'InstanceType': 'ml.p3.2xlarge',\n", 
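"        # 注:ml.p3.2xlarge 为单卡 V100 GPU 实例,可根据数据量与预算改用其他 GPU 实例类型\n",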
199 | " 'InstanceCount': 1,\n", 200 | " 'VolumeSizeInGB': 100,\n", 201 | " },\n", 202 | " EnableManagedSpotTraining=use_spot,\n", 203 | " StoppingCondition={\"MaxWaitTimeInSeconds\": 3600,\"MaxRuntimeInSeconds\": 3600} if use_spot else {\"MaxRuntimeInSeconds\": 3600}\n", 204 | " )\n", 205 | "response" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "查看状态,也可到SageMaker控制台查看。使用本Workshop提供的数据,大概需要15分钟。 \n", 213 | "每120秒获取一次状态,因此最多可能有2分钟的延迟。" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "status = sm.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']\n", 223 | "print('Training job current status: {}'.format(status))\n", 224 | "\n", 225 | "try:\n", 226 | " sm.get_waiter('training_job_completed_or_stopped').wait(TrainingJobName=job_name)\n", 227 | " training_info = sm.describe_training_job(TrainingJobName=job_name)\n", 228 | " status = training_info['TrainingJobStatus']\n", 229 | " print(\"Training job ended with status: \" + status)\n", 230 | "except:\n", 231 | " print('Training failed to start')\n", 232 | " message = sm.describe_training_job(TrainingJobName=job_name)['FailureReason']\n", 233 | " print('Training failed with the following error: {}'.format(message))" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "如果看到,\n", 241 | "\n", 242 | "> `Training job ended with status: Completed`\n", 243 | "\n", 244 | "这意味着训练成功完成。" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "## 5 下载训练结果" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "复制下面代码输出的`model_data`,在推理中要使用" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "respone = sm.describe_training_job(TrainingJobName=job_name)\n", 268 | "model_data = respone['ModelArtifacts']['S3ModelArtifacts']\n", 269 | "!echo -n $model_data > model_data.txt" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "!aws s3 cp {model_data} model.tar.gz" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "!tar -xvf model.tar.gz" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [] 296 | } 297 | ], 298 | "metadata": { 299 | "kernelspec": { 300 | "display_name": "Environment (conda_pytorch_latest_p37)", 301 | "language": "python", 302 | "name": "conda_pytorch_latest_p37" 303 | }, 304 | "language_info": { 305 | "codemirror_mode": { 306 | "name": "ipython", 307 | "version": 3 308 | }, 309 | "file_extension": ".py", 310 | "mimetype": "text/x-python", 311 | "name": "python", 312 | "nbconvert_exporter": "python", 313 | "pygments_lexer": "ipython3", 314 | "version": "3.7.10" 315 | } 316 | }, 317 | "nbformat": 4, 318 | "nbformat_minor": 4 319 | } 320 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/Dockerfile: -------------------------------------------------------------------------------- 1 | #ARG 
BASE_IMG=763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference:1.6.0-gpu-py36-cu101-ubuntu16.04 2 | ARG BASE_IMG=${BASE_IMG} 3 | FROM ${BASE_IMG} 4 | 5 | RUN apt-get update 6 | RUN apt-get install -y --no-install-recommends nginx net-tools\ 7 | && rm -rf /var/lib/apt/lists/* 8 | 9 | RUN pip install flask gevent gunicorn boto3 -i https://opentuna.cn/pypi/web/simple/ && \ 10 | rm -rf /root/.cache 11 | 12 | COPY aws /root/.aws 13 | # RUN mkdir /opt/ml/code 14 | WORKDIR /opt/ml/code 15 | COPY source ./ 16 | 17 | RUN pip install -r requirements.txt -i https://opentuna.cn/pypi/web/simple/ 18 | 19 | # Set some environment variables. PYTHONUNBUFFERED keeps Python from buffering our standard 20 | # output stream, which means that logs can be delivered to the user quickly. PYTHONDONTWRITEBYTECODE 21 | # keeps Python from writing the .pyc files which are unnecessary in this case. We also update 22 | # PATH so that the train and serve programs are found when the container is invoked. 23 | 24 | ENV PYTHONUNBUFFERED=TRUE 25 | ENV PYTHONDONTWRITEBYTECODE=TRUE 26 | ENV PATH="/opt/ml/code/:${PATH}" 27 | 28 | ENTRYPOINT ["python3"] -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/aws/config: -------------------------------------------------------------------------------- 1 | [default] 2 | region = cn-northwest-1 3 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/inference-build.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# YOLOv5 on SageMaker--Build 推理镜像" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 1 说明\n", 15 | "本章内容为build推理镜像,推送到AWS ECR,用户可直接使用build完毕的镜像,不用自己build。" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## 2 运行环境\n", 23 | "Kernel 选择pytorch_latest_p36。 \n", 24 | "本文在boto3 1.17.12和sagemaker 2.26.0下测试通过。" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "import boto3,sagemaker\n", 34 | "print(boto3.__version__)\n", 35 | "print(sagemaker.__version__)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## 3 本地推理(可选)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 34, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "mkdir: cannot create directory ‘/opt/ml’: File exists\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "!sudo mkdir /opt/ml\n", 60 | "!sudo chmod 777 /opt/ml" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "import os\n", 70 | "if not os.path.exists(\"/opt/ml/model\"):\n", 71 | " os.mkdir(\"/opt/ml/model\")" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "!cp -r ../1-training/runs/ /opt/ml/model/" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "新启动一个shell窗口,运行`conda activate pytorch_latest_p36`,然后必须cd到`2-inference/source`目录,再运行`python predictor.py`,正常启动会输出以下内容:\n", 88 | "```\n", 89 | "-------------init_output_dir 
/opt/ml/output_dir\n", 90 | " * Serving Flask app \"predictor\" (lazy loading)\n", 91 | " * Environment: production\n", 92 | " WARNING: This is a development server. Do not use it in a production deployment.\n", 93 | " Use a production WSGI server instead.\n", 94 | " * Debug mode: off\n", 95 | " * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)\n", 96 | "```" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "#修改请求图片\n", 106 | "!curl -H \"Content-Type: application/json\" -X POST --data '{\"bucket\":\"junzhong\",\"image_uri\":\"yolov5/training/images/val/000729.jpeg\"}' http://127.0.0.1:5000/invocations" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "#删除model文件,实际运行时,通过S3动态传入model\n", 116 | "import os\n", 117 | "model_file = \"source/yolov5s.pt\"\n", 118 | "if os.path.isfile(model_file):\n", 119 | " os.remove(model_file)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "## 4 Amazon 深度学习容器\n", 127 | "\n", 128 | "* [容器镜像清单](https://github.com/aws/deep-learning-containers/blob/master/available_images.md)\n", 129 | "* 本文基于pytorch inference: `727897471807.dkr.ecr.cn-northwest-1.amazonaws.com.cn/pytorch-inference:1.6.0-gpu-py36-cu101-ubuntu16.04`" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "## 5 设置相关名称" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "ecr_repository = 'yolov5-inference'\n", 146 | "tag = 'latest'" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "## 6 Build image" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "#国内pytorch inference基础镜像地址,不要修改\n", 163 | "base_img='727897471807.dkr.ecr.cn-northwest-1.amazonaws.com.cn/pytorch-inference:1.6.0-gpu-py36-cu101-ubuntu16.04'\n", 164 | "#登录基础镜像ECR,不要修改\n", 165 | "!aws ecr get-login-password --region cn-northwest-1 | docker login --username AWS --password-stdin 727897471807.dkr.ecr.cn-northwest-1.amazonaws.com.cn" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "!docker build -t $ecr_repository:$tag -f Dockerfile --build-arg BASE_IMG=$base_img ." 
175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "## 7 在本地使用容器进行推理(可选)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "import os\n", 191 | "if not os.path.exists(\"model\"):\n", 192 | " os.mkdir(\"model\")" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "!cp -r ../1-training/runs/ model/" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "本地机器如果带GPU,使用`nvidia-docker run`;如果不带GPU,使用`docker run`。" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "!docker run -v $(pwd)/model/:/opt/ml/model/ -p 8080:8080 -d --rm $ecr_repository:$tag serve" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "#修改请求图片\n", 227 | "!curl -H \"Content-Type: application/json\" -X POST --data '{\"bucket\":\"junzhong\",\"image_uri\":\"yolov5/training/images/val/000729.jpeg\"}' http://127.0.0.1:8080/invocations" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "## 8 推送到ECR" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "!aws ecr create-repository --repository-name $ecr_repository" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "import boto3\n", 253 | "region = boto3.session.Session().region_name\n", 254 | "account_id = boto3.client('sts').get_caller_identity().get('Account')\n", 255 | "image_uri = '{}.dkr.ecr.{}.amazonaws.com.cn/{}'.format(account_id, region, ecr_repository + \":\" + tag)\n", 256 | "!docker tag $ecr_repository:$tag $image_uri\n", 257 | "!$(aws ecr get-login --no-include-email)\n", 258 | "!docker push $image_uri" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [] 267 | } 268 | ], 269 | "metadata": { 270 | "kernelspec": { 271 | "display_name": "Environment (conda_pytorch_latest_p37)", 272 | "language": "python", 273 | "name": "conda_pytorch_latest_p37" 274 | }, 275 | "language_info": { 276 | "codemirror_mode": { 277 | "name": "ipython", 278 | "version": 3 279 | }, 280 | "file_extension": ".py", 281 | "mimetype": "text/x-python", 282 | "name": "python", 283 | "nbconvert_exporter": "python", 284 | "pygments_lexer": "ipython3", 285 | "version": "3.7.10" 286 | } 287 | }, 288 | "nbformat": 4, 289 | "nbformat_minor": 4 290 | } 291 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/detect.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import platform 4 | import shutil 5 | import time 6 | from pathlib import Path 7 | 8 | import cv2 9 | import torch 10 | import torch.backends.cudnn as cudnn 11 | from numpy import random 12 | 13 | from models.experimental import attempt_load 14 | from utils.datasets import LoadStreams, LoadImages 15 | from utils.general import ( 16 | check_img_size, non_max_suppression, 
apply_classifier, scale_coords, 17 | xyxy2xywh, plot_one_box, strip_optimizer, set_logging) 18 | from utils.torch_utils import select_device, load_classifier, time_synchronized 19 | 20 | 21 | def detect(source, img_size): 22 | webcam = source.isnumeric() or source.startswith(('rtsp://', 'rtmp://', 'http://')) or source.endswith('.txt') 23 | 24 | weights = "yolov5s.pt" 25 | 26 | # Initialize 27 | set_logging() 28 | device = select_device("") 29 | half = device.type != 'cpu' # half precision only supported on CUDA 30 | 31 | # Load model 32 | model = attempt_load(weights, map_location=device) # load FP32 model 33 | imgsz = check_img_size(img_size, s=model.stride.max()) # check img_size 34 | if half: 35 | model.half() # to FP16 36 | 37 | # Set Dataloader 38 | vid_path, vid_writer = None, None 39 | if webcam: 40 | view_img = True 41 | cudnn.benchmark = True # set True to speed up constant image size inference 42 | dataset = LoadStreams(source, img_size=imgsz) 43 | else: 44 | dataset = LoadImages(source, img_size=imgsz) 45 | 46 | # Get names and colors 47 | names = model.module.names if hasattr(model, 'module') else model.names 48 | colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(names))] 49 | 50 | # Run inference 51 | t0 = time.time() 52 | img = torch.zeros((1, 3, imgsz, imgsz), device=device) # init img 53 | _ = model(img.half() if half else img) if device.type != 'cpu' else None # run once 54 | 55 | result=[] 56 | 57 | for path, img, im0s, vid_cap in dataset: 58 | img = torch.from_numpy(img).to(device) 59 | img = img.half() if half else img.float() # uint8 to fp16/32 60 | img /= 255.0 # 0 - 255 to 0.0 - 1.0 61 | if img.ndimension() == 3: 62 | img = img.unsqueeze(0) 63 | 64 | # Inference 65 | t1 = time_synchronized() 66 | pred = model(img, augment=False)[0] 67 | 68 | # Apply NMS 69 | pred = non_max_suppression(pred, 0.4, 0.5) 70 | t2 = time_synchronized() 71 | 72 | # Process detections 73 | for i, det in enumerate(pred): # detections per image 74 | if webcam: # batch_size >= 1 75 | p, s, im0 = path[i], '%g: ' % i, im0s[i].copy() 76 | else: 77 | p, s, im0 = path, '', im0s 78 | 79 | gn = torch.tensor(im0.shape)[[1, 0, 1, 0]] # normalization gain whwh 80 | if det is not None and len(det): 81 | # Rescale boxes from img_size to im0 size 82 | det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round() 83 | 84 | # Write results 85 | for *xyxy, conf, cls in reversed(det): 86 | xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist() # normalized xywh 87 | item = {} 88 | item["class_name"] = names[int(cls)] 89 | item["class"] = int(cls) 90 | item["confidence"] = float(conf) 91 | item["xywh"] = xywh 92 | result.append(item) 93 | return result 94 | 95 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nwcdheap/sagemaker-workshop/9bcbc4c787da346fcf10aa6fd97a8a100328719a/object-detection/yolov5-on-sagemaker/2-inference/source/models/__init__.py -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/models/common.py: -------------------------------------------------------------------------------- 1 | # This file contains modules common to various models 2 | import math 3 | 4 | import torch 5 | import torch.nn as nn 6 | from utils.general import 
non_max_suppression 7 | 8 | 9 | def autopad(k, p=None): # kernel, padding 10 | # Pad to 'same' 11 | if p is None: 12 | p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad 13 | return p 14 | 15 | 16 | def DWConv(c1, c2, k=1, s=1, act=True): 17 | # Depthwise convolution 18 | return Conv(c1, c2, k, s, g=math.gcd(c1, c2), act=act) 19 | 20 | 21 | class Conv(nn.Module): 22 | # Standard convolution 23 | def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups 24 | super(Conv, self).__init__() 25 | self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False) 26 | self.bn = nn.BatchNorm2d(c2) 27 | self.act = nn.Hardswish() if act else nn.Identity() 28 | 29 | def forward(self, x): 30 | return self.act(self.bn(self.conv(x))) 31 | 32 | def fuseforward(self, x): 33 | return self.act(self.conv(x)) 34 | 35 | 36 | class Bottleneck(nn.Module): 37 | # Standard bottleneck 38 | def __init__(self, c1, c2, shortcut=True, g=1, e=0.5): # ch_in, ch_out, shortcut, groups, expansion 39 | super(Bottleneck, self).__init__() 40 | c_ = int(c2 * e) # hidden channels 41 | self.cv1 = Conv(c1, c_, 1, 1) 42 | self.cv2 = Conv(c_, c2, 3, 1, g=g) 43 | self.add = shortcut and c1 == c2 44 | 45 | def forward(self, x): 46 | return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) 47 | 48 | 49 | class BottleneckCSP(nn.Module): 50 | # CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks 51 | def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion 52 | super(BottleneckCSP, self).__init__() 53 | c_ = int(c2 * e) # hidden channels 54 | self.cv1 = Conv(c1, c_, 1, 1) 55 | self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False) 56 | self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False) 57 | self.cv4 = Conv(2 * c_, c2, 1, 1) 58 | self.bn = nn.BatchNorm2d(2 * c_) # applied to cat(cv2, cv3) 59 | self.act = nn.LeakyReLU(0.1, inplace=True) 60 | self.m = nn.Sequential(*[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)]) 61 | 62 | def forward(self, x): 63 | y1 = self.cv3(self.m(self.cv1(x))) 64 | y2 = self.cv2(x) 65 | return self.cv4(self.act(self.bn(torch.cat((y1, y2), dim=1)))) 66 | 67 | 68 | class SPP(nn.Module): 69 | # Spatial pyramid pooling layer used in YOLOv3-SPP 70 | def __init__(self, c1, c2, k=(5, 9, 13)): 71 | super(SPP, self).__init__() 72 | c_ = c1 // 2 # hidden channels 73 | self.cv1 = Conv(c1, c_, 1, 1) 74 | self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1) 75 | self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k]) 76 | 77 | def forward(self, x): 78 | x = self.cv1(x) 79 | return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1)) 80 | 81 | 82 | class Focus(nn.Module): 83 | # Focus wh information into c-space 84 | def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups 85 | super(Focus, self).__init__() 86 | self.conv = Conv(c1 * 4, c2, k, s, p, g, act) 87 | 88 | def forward(self, x): # x(b,c,w,h) -> y(b,4c,w/2,h/2) 89 | return self.conv(torch.cat([x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]], 1)) 90 | 91 | 92 | class Concat(nn.Module): 93 | # Concatenate a list of tensors along dimension 94 | def __init__(self, dimension=1): 95 | super(Concat, self).__init__() 96 | self.d = dimension 97 | 98 | def forward(self, x): 99 | return torch.cat(x, self.d) 100 | 101 | 102 | class NMS(nn.Module): 103 | # Non-Maximum Suppression (NMS) module 104 | conf = 0.3 # 
confidence threshold 105 | iou = 0.6 # IoU threshold 106 | classes = None # (optional list) filter by class 107 | 108 | def __init__(self, dimension=1): 109 | super(NMS, self).__init__() 110 | 111 | def forward(self, x): 112 | return non_max_suppression(x[0], conf_thres=self.conf, iou_thres=self.iou, classes=self.classes) 113 | 114 | 115 | class Flatten(nn.Module): 116 | # Use after nn.AdaptiveAvgPool2d(1) to remove last 2 dimensions 117 | @staticmethod 118 | def forward(x): 119 | return x.view(x.size(0), -1) 120 | 121 | 122 | class Classify(nn.Module): 123 | # Classification head, i.e. x(b,c1,20,20) to x(b,c2) 124 | def __init__(self, c1, c2, k=1, s=1, p=None, g=1): # ch_in, ch_out, kernel, stride, padding, groups 125 | super(Classify, self).__init__() 126 | self.aap = nn.AdaptiveAvgPool2d(1) # to x(b,c1,1,1) 127 | self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False) # to x(b,c2,1,1) 128 | self.flat = Flatten() 129 | 130 | def forward(self, x): 131 | z = torch.cat([self.aap(y) for y in (x if isinstance(x, list) else [x])], 1) # cat if list 132 | return self.flat(self.conv(z)) # flatten to x(b,c2) 133 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/models/experimental.py: -------------------------------------------------------------------------------- 1 | # This file contains experimental modules 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | 7 | from models.common import Conv, DWConv 8 | from utils.google_utils import attempt_download 9 | 10 | 11 | class CrossConv(nn.Module): 12 | # Cross Convolution Downsample 13 | def __init__(self, c1, c2, k=3, s=1, g=1, e=1.0, shortcut=False): 14 | # ch_in, ch_out, kernel, stride, groups, expansion, shortcut 15 | super(CrossConv, self).__init__() 16 | c_ = int(c2 * e) # hidden channels 17 | self.cv1 = Conv(c1, c_, (1, k), (1, s)) 18 | self.cv2 = Conv(c_, c2, (k, 1), (s, 1), g=g) 19 | self.add = shortcut and c1 == c2 20 | 21 | def forward(self, x): 22 | return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) 23 | 24 | 25 | class C3(nn.Module): 26 | # Cross Convolution CSP 27 | def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion 28 | super(C3, self).__init__() 29 | c_ = int(c2 * e) # hidden channels 30 | self.cv1 = Conv(c1, c_, 1, 1) 31 | self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False) 32 | self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False) 33 | self.cv4 = Conv(2 * c_, c2, 1, 1) 34 | self.bn = nn.BatchNorm2d(2 * c_) # applied to cat(cv2, cv3) 35 | self.act = nn.LeakyReLU(0.1, inplace=True) 36 | self.m = nn.Sequential(*[CrossConv(c_, c_, 3, 1, g, 1.0, shortcut) for _ in range(n)]) 37 | 38 | def forward(self, x): 39 | y1 = self.cv3(self.m(self.cv1(x))) 40 | y2 = self.cv2(x) 41 | return self.cv4(self.act(self.bn(torch.cat((y1, y2), dim=1)))) 42 | 43 | 44 | class Sum(nn.Module): 45 | # Weighted sum of 2 or more layers https://arxiv.org/abs/1911.09070 46 | def __init__(self, n, weight=False): # n: number of inputs 47 | super(Sum, self).__init__() 48 | self.weight = weight # apply weights boolean 49 | self.iter = range(n - 1) # iter object 50 | if weight: 51 | self.w = nn.Parameter(-torch.arange(1., n) / 2, requires_grad=True) # layer weights 52 | 53 | def forward(self, x): 54 | y = x[0] # no weight 55 | if self.weight: 56 | w = torch.sigmoid(self.w) * 2 57 | for i in self.iter: 58 | y = y + x[i + 1] * w[i] 59 | else: 60 | for i in self.iter: 61 | y = 
y + x[i + 1] 62 | return y 63 | 64 | 65 | class GhostConv(nn.Module): 66 | # Ghost Convolution https://github.com/huawei-noah/ghostnet 67 | def __init__(self, c1, c2, k=1, s=1, g=1, act=True): # ch_in, ch_out, kernel, stride, groups 68 | super(GhostConv, self).__init__() 69 | c_ = c2 // 2 # hidden channels 70 | self.cv1 = Conv(c1, c_, k, s, g, act) 71 | self.cv2 = Conv(c_, c_, 5, 1, c_, act) 72 | 73 | def forward(self, x): 74 | y = self.cv1(x) 75 | return torch.cat([y, self.cv2(y)], 1) 76 | 77 | 78 | class GhostBottleneck(nn.Module): 79 | # Ghost Bottleneck https://github.com/huawei-noah/ghostnet 80 | def __init__(self, c1, c2, k, s): 81 | super(GhostBottleneck, self).__init__() 82 | c_ = c2 // 2 83 | self.conv = nn.Sequential(GhostConv(c1, c_, 1, 1), # pw 84 | DWConv(c_, c_, k, s, act=False) if s == 2 else nn.Identity(), # dw 85 | GhostConv(c_, c2, 1, 1, act=False)) # pw-linear 86 | self.shortcut = nn.Sequential(DWConv(c1, c1, k, s, act=False), 87 | Conv(c1, c2, 1, 1, act=False)) if s == 2 else nn.Identity() 88 | 89 | def forward(self, x): 90 | return self.conv(x) + self.shortcut(x) 91 | 92 | 93 | class MixConv2d(nn.Module): 94 | # Mixed Depthwise Conv https://arxiv.org/abs/1907.09595 95 | def __init__(self, c1, c2, k=(1, 3), s=1, equal_ch=True): 96 | super(MixConv2d, self).__init__() 97 | groups = len(k) 98 | if equal_ch: # equal c_ per group 99 | i = torch.linspace(0, groups - 1E-6, c2).floor() # c2 indices 100 | c_ = [(i == g).sum() for g in range(groups)] # intermediate channels 101 | else: # equal weight.numel() per group 102 | b = [c2] + [0] * groups 103 | a = np.eye(groups + 1, groups, k=-1) 104 | a -= np.roll(a, 1, axis=1) 105 | a *= np.array(k) ** 2 106 | a[0] = 1 107 | c_ = np.linalg.lstsq(a, b, rcond=None)[0].round() # solve for equal weight indices, ax = b 108 | 109 | self.m = nn.ModuleList([nn.Conv2d(c1, int(c_[g]), k[g], s, k[g] // 2, bias=False) for g in range(groups)]) 110 | self.bn = nn.BatchNorm2d(c2) 111 | self.act = nn.LeakyReLU(0.1, inplace=True) 112 | 113 | def forward(self, x): 114 | return x + self.act(self.bn(torch.cat([m(x) for m in self.m], 1))) 115 | 116 | 117 | class Ensemble(nn.ModuleList): 118 | # Ensemble of models 119 | def __init__(self): 120 | super(Ensemble, self).__init__() 121 | 122 | def forward(self, x, augment=False): 123 | y = [] 124 | for module in self: 125 | y.append(module(x, augment)[0]) 126 | # y = torch.stack(y).max(0)[0] # max ensemble 127 | # y = torch.cat(y, 1) # nms ensemble 128 | y = torch.stack(y).mean(0) # mean ensemble 129 | return y, None # inference, train output 130 | 131 | 132 | def attempt_load(weights, map_location=None): 133 | # Loads an ensemble of models weights=[a,b,c] or a single model weights=[a] or weights=a 134 | model = Ensemble() 135 | for w in weights if isinstance(weights, list) else [weights]: 136 | attempt_download(w) 137 | model.append(torch.load(w, map_location=map_location)['model'].float().fuse().eval()) # load FP32 model 138 | 139 | if len(model) == 1: 140 | return model[-1] # return model 141 | else: 142 | print('Ensemble created with %s\n' % weights) 143 | for k in ['names', 'stride']: 144 | setattr(model, k, getattr(model[-1], k)) 145 | return model # return ensemble 146 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/models/export.py: -------------------------------------------------------------------------------- 1 | """Exports a YOLOv5 *.pt model to ONNX and TorchScript formats 2 | 3 | Usage: 4 | $ export 
PYTHONPATH="$PWD" && python models/export.py --weights ./weights/yolov5s.pt --img 640 --batch 1 5 | """ 6 | 7 | import argparse 8 | 9 | import torch 10 | import torch.nn as nn 11 | 12 | import models 13 | from models.experimental import attempt_load 14 | from utils.activations import Hardswish 15 | from utils.general import set_logging 16 | 17 | if __name__ == '__main__': 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--weights', type=str, default='./yolov5s.pt', help='weights path') # from yolov5/models/ 20 | parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='image size') # height, width 21 | parser.add_argument('--batch-size', type=int, default=1, help='batch size') 22 | opt = parser.parse_args() 23 | opt.img_size *= 2 if len(opt.img_size) == 1 else 1 # expand 24 | print(opt) 25 | set_logging() 26 | 27 | # Input 28 | img = torch.zeros((opt.batch_size, 3, *opt.img_size)) # image size(1,3,320,192) iDetection 29 | 30 | # Load PyTorch model 31 | model = attempt_load(opt.weights, map_location=torch.device('cpu')) # load FP32 model 32 | 33 | # Update model 34 | for k, m in model.named_modules(): 35 | m._non_persistent_buffers_set = set() # pytorch 1.6.0 compatability 36 | if isinstance(m, models.common.Conv) and isinstance(m.act, nn.Hardswish): 37 | m.act = Hardswish() # assign activation 38 | # if isinstance(m, models.yolo.Detect): 39 | # m.forward = m.forward_export # assign forward (optional) 40 | model.model[-1].export = True # set Detect() layer export=True 41 | y = model(img) # dry run 42 | 43 | # TorchScript export 44 | try: 45 | print('\nStarting TorchScript export with torch %s...' % torch.__version__) 46 | f = opt.weights.replace('.pt', '.torchscript.pt') # filename 47 | ts = torch.jit.trace(model, img) 48 | ts.save(f) 49 | print('TorchScript export success, saved as %s' % f) 50 | except Exception as e: 51 | print('TorchScript export failure: %s' % e) 52 | 53 | # ONNX export 54 | try: 55 | import onnx 56 | 57 | print('\nStarting ONNX export with onnx %s...' % onnx.__version__) 58 | f = opt.weights.replace('.pt', '.onnx') # filename 59 | torch.onnx.export(model, img, f, verbose=False, opset_version=12, input_names=['images'], 60 | output_names=['classes', 'boxes'] if y is None else ['output']) 61 | 62 | # Checks 63 | onnx_model = onnx.load(f) # load onnx model 64 | onnx.checker.check_model(onnx_model) # check onnx model 65 | # print(onnx.helper.printable_graph(onnx_model.graph)) # print a human readable model 66 | print('ONNX export success, saved as %s' % f) 67 | except Exception as e: 68 | print('ONNX export failure: %s' % e) 69 | 70 | # CoreML export 71 | try: 72 | import coremltools as ct 73 | 74 | print('\nStarting CoreML export with coremltools %s...' % ct.__version__) 75 | # convert model from torchscript and apply pixel scaling as per detect.py 76 | model = ct.convert(ts, inputs=[ct.ImageType(name='images', shape=img.shape, scale=1 / 255.0, bias=[0, 0, 0])]) 77 | f = opt.weights.replace('.pt', '.mlmodel') # filename 78 | model.save(f) 79 | print('CoreML export success, saved as %s' % f) 80 | except Exception as e: 81 | print('CoreML export failure: %s' % e) 82 | 83 | # Finish 84 | print('\nExport complete. 
Visualize with https://github.com/lutzroeder/netron.') 85 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/models/hub/yolov3-spp.yaml: -------------------------------------------------------------------------------- 1 | # parameters 2 | nc: 80 # number of classes 3 | depth_multiple: 1.0 # model depth multiple 4 | width_multiple: 1.0 # layer channel multiple 5 | 6 | # anchors 7 | anchors: 8 | - [10,13, 16,30, 33,23] # P3/8 9 | - [30,61, 62,45, 59,119] # P4/16 10 | - [116,90, 156,198, 373,326] # P5/32 11 | 12 | # darknet53 backbone 13 | backbone: 14 | # [from, number, module, args] 15 | [[-1, 1, Conv, [32, 3, 1]], # 0 16 | [-1, 1, Conv, [64, 3, 2]], # 1-P1/2 17 | [-1, 1, Bottleneck, [64]], 18 | [-1, 1, Conv, [128, 3, 2]], # 3-P2/4 19 | [-1, 2, Bottleneck, [128]], 20 | [-1, 1, Conv, [256, 3, 2]], # 5-P3/8 21 | [-1, 8, Bottleneck, [256]], 22 | [-1, 1, Conv, [512, 3, 2]], # 7-P4/16 23 | [-1, 8, Bottleneck, [512]], 24 | [-1, 1, Conv, [1024, 3, 2]], # 9-P5/32 25 | [-1, 4, Bottleneck, [1024]], # 10 26 | ] 27 | 28 | # YOLOv3-SPP head 29 | head: 30 | [[-1, 1, Bottleneck, [1024, False]], 31 | [-1, 1, SPP, [512, [5, 9, 13]]], 32 | [-1, 1, Conv, [1024, 3, 1]], 33 | [-1, 1, Conv, [512, 1, 1]], 34 | [-1, 1, Conv, [1024, 3, 1]], # 15 (P5/32-large) 35 | 36 | [-2, 1, Conv, [256, 1, 1]], 37 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 38 | [[-1, 8], 1, Concat, [1]], # cat backbone P4 39 | [-1, 1, Bottleneck, [512, False]], 40 | [-1, 1, Bottleneck, [512, False]], 41 | [-1, 1, Conv, [256, 1, 1]], 42 | [-1, 1, Conv, [512, 3, 1]], # 22 (P4/16-medium) 43 | 44 | [-2, 1, Conv, [128, 1, 1]], 45 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 46 | [[-1, 6], 1, Concat, [1]], # cat backbone P3 47 | [-1, 1, Bottleneck, [256, False]], 48 | [-1, 2, Bottleneck, [256, False]], # 27 (P3/8-small) 49 | 50 | [[27, 22, 15], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) 51 | ] 52 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/models/hub/yolov5-fpn.yaml: -------------------------------------------------------------------------------- 1 | # parameters 2 | nc: 80 # number of classes 3 | depth_multiple: 1.0 # model depth multiple 4 | width_multiple: 1.0 # layer channel multiple 5 | 6 | # anchors 7 | anchors: 8 | - [10,13, 16,30, 33,23] # P3/8 9 | - [30,61, 62,45, 59,119] # P4/16 10 | - [116,90, 156,198, 373,326] # P5/32 11 | 12 | # YOLOv5 backbone 13 | backbone: 14 | # [from, number, module, args] 15 | [[-1, 1, Focus, [64, 3]], # 0-P1/2 16 | [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 17 | [-1, 3, Bottleneck, [128]], 18 | [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 19 | [-1, 9, BottleneckCSP, [256]], 20 | [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 21 | [-1, 9, BottleneckCSP, [512]], 22 | [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 23 | [-1, 1, SPP, [1024, [5, 9, 13]]], 24 | [-1, 6, BottleneckCSP, [1024]], # 9 25 | ] 26 | 27 | # YOLOv5 FPN head 28 | head: 29 | [[-1, 3, BottleneckCSP, [1024, False]], # 10 (P5/32-large) 30 | 31 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 32 | [[-1, 6], 1, Concat, [1]], # cat backbone P4 33 | [-1, 1, Conv, [512, 1, 1]], 34 | [-1, 3, BottleneckCSP, [512, False]], # 14 (P4/16-medium) 35 | 36 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 37 | [[-1, 4], 1, Concat, [1]], # cat backbone P3 38 | [-1, 1, Conv, [256, 1, 1]], 39 | [-1, 3, BottleneckCSP, [256, False]], # 18 (P3/8-small) 40 | 41 | [[18, 14, 10], 1, Detect, [nc, anchors]], # Detect(P3, P4, 
P5) 42 | ] 43 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/models/hub/yolov5-panet.yaml: -------------------------------------------------------------------------------- 1 | # parameters 2 | nc: 80 # number of classes 3 | depth_multiple: 1.0 # model depth multiple 4 | width_multiple: 1.0 # layer channel multiple 5 | 6 | # anchors 7 | anchors: 8 | - [116,90, 156,198, 373,326] # P5/32 9 | - [30,61, 62,45, 59,119] # P4/16 10 | - [10,13, 16,30, 33,23] # P3/8 11 | 12 | # YOLOv5 backbone 13 | backbone: 14 | # [from, number, module, args] 15 | [[-1, 1, Focus, [64, 3]], # 0-P1/2 16 | [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 17 | [-1, 3, BottleneckCSP, [128]], 18 | [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 19 | [-1, 9, BottleneckCSP, [256]], 20 | [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 21 | [-1, 9, BottleneckCSP, [512]], 22 | [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 23 | [-1, 1, SPP, [1024, [5, 9, 13]]], 24 | [-1, 3, BottleneckCSP, [1024, False]], # 9 25 | ] 26 | 27 | # YOLOv5 PANet head 28 | head: 29 | [[-1, 1, Conv, [512, 1, 1]], 30 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 31 | [[-1, 6], 1, Concat, [1]], # cat backbone P4 32 | [-1, 3, BottleneckCSP, [512, False]], # 13 33 | 34 | [-1, 1, Conv, [256, 1, 1]], 35 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 36 | [[-1, 4], 1, Concat, [1]], # cat backbone P3 37 | [-1, 3, BottleneckCSP, [256, False]], # 17 (P3/8-small) 38 | 39 | [-1, 1, Conv, [256, 3, 2]], 40 | [[-1, 14], 1, Concat, [1]], # cat head P4 41 | [-1, 3, BottleneckCSP, [512, False]], # 20 (P4/16-medium) 42 | 43 | [-1, 1, Conv, [512, 3, 2]], 44 | [[-1, 10], 1, Concat, [1]], # cat head P5 45 | [-1, 3, BottleneckCSP, [1024, False]], # 23 (P5/32-large) 46 | 47 | [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P5, P4, P3) 48 | ] 49 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/models/yolov5l.yaml: -------------------------------------------------------------------------------- 1 | # parameters 2 | nc: 80 # number of classes 3 | depth_multiple: 1.0 # model depth multiple 4 | width_multiple: 1.0 # layer channel multiple 5 | 6 | # anchors 7 | anchors: 8 | - [10,13, 16,30, 33,23] # P3/8 9 | - [30,61, 62,45, 59,119] # P4/16 10 | - [116,90, 156,198, 373,326] # P5/32 11 | 12 | # YOLOv5 backbone 13 | backbone: 14 | # [from, number, module, args] 15 | [[-1, 1, Focus, [64, 3]], # 0-P1/2 16 | [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 17 | [-1, 3, BottleneckCSP, [128]], 18 | [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 19 | [-1, 9, BottleneckCSP, [256]], 20 | [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 21 | [-1, 9, BottleneckCSP, [512]], 22 | [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 23 | [-1, 1, SPP, [1024, [5, 9, 13]]], 24 | [-1, 3, BottleneckCSP, [1024, False]], # 9 25 | ] 26 | 27 | # YOLOv5 head 28 | head: 29 | [[-1, 1, Conv, [512, 1, 1]], 30 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 31 | [[-1, 6], 1, Concat, [1]], # cat backbone P4 32 | [-1, 3, BottleneckCSP, [512, False]], # 13 33 | 34 | [-1, 1, Conv, [256, 1, 1]], 35 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 36 | [[-1, 4], 1, Concat, [1]], # cat backbone P3 37 | [-1, 3, BottleneckCSP, [256, False]], # 17 (P3/8-small) 38 | 39 | [-1, 1, Conv, [256, 3, 2]], 40 | [[-1, 14], 1, Concat, [1]], # cat head P4 41 | [-1, 3, BottleneckCSP, [512, False]], # 20 (P4/16-medium) 42 | 43 | [-1, 1, Conv, [512, 3, 2]], 44 | [[-1, 10], 1, Concat, [1]], # cat head P5 45 | [-1, 3, BottleneckCSP, [1024, 
False]], # 23 (P5/32-large) 46 | 47 | [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) 48 | ] 49 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/models/yolov5m.yaml: -------------------------------------------------------------------------------- 1 | # parameters 2 | nc: 80 # number of classes 3 | depth_multiple: 0.67 # model depth multiple 4 | width_multiple: 0.75 # layer channel multiple 5 | 6 | # anchors 7 | anchors: 8 | - [10,13, 16,30, 33,23] # P3/8 9 | - [30,61, 62,45, 59,119] # P4/16 10 | - [116,90, 156,198, 373,326] # P5/32 11 | 12 | # YOLOv5 backbone 13 | backbone: 14 | # [from, number, module, args] 15 | [[-1, 1, Focus, [64, 3]], # 0-P1/2 16 | [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 17 | [-1, 3, BottleneckCSP, [128]], 18 | [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 19 | [-1, 9, BottleneckCSP, [256]], 20 | [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 21 | [-1, 9, BottleneckCSP, [512]], 22 | [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 23 | [-1, 1, SPP, [1024, [5, 9, 13]]], 24 | [-1, 3, BottleneckCSP, [1024, False]], # 9 25 | ] 26 | 27 | # YOLOv5 head 28 | head: 29 | [[-1, 1, Conv, [512, 1, 1]], 30 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 31 | [[-1, 6], 1, Concat, [1]], # cat backbone P4 32 | [-1, 3, BottleneckCSP, [512, False]], # 13 33 | 34 | [-1, 1, Conv, [256, 1, 1]], 35 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 36 | [[-1, 4], 1, Concat, [1]], # cat backbone P3 37 | [-1, 3, BottleneckCSP, [256, False]], # 17 (P3/8-small) 38 | 39 | [-1, 1, Conv, [256, 3, 2]], 40 | [[-1, 14], 1, Concat, [1]], # cat head P4 41 | [-1, 3, BottleneckCSP, [512, False]], # 20 (P4/16-medium) 42 | 43 | [-1, 1, Conv, [512, 3, 2]], 44 | [[-1, 10], 1, Concat, [1]], # cat head P5 45 | [-1, 3, BottleneckCSP, [1024, False]], # 23 (P5/32-large) 46 | 47 | [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) 48 | ] 49 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/models/yolov5s.yaml: -------------------------------------------------------------------------------- 1 | # parameters 2 | nc: 80 # number of classes 3 | depth_multiple: 0.33 # model depth multiple 4 | width_multiple: 0.50 # layer channel multiple 5 | 6 | # anchors 7 | anchors: 8 | - [10,13, 16,30, 33,23] # P3/8 9 | - [30,61, 62,45, 59,119] # P4/16 10 | - [116,90, 156,198, 373,326] # P5/32 11 | 12 | # YOLOv5 backbone 13 | backbone: 14 | # [from, number, module, args] 15 | [[-1, 1, Focus, [64, 3]], # 0-P1/2 16 | [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 17 | [-1, 3, BottleneckCSP, [128]], 18 | [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 19 | [-1, 9, BottleneckCSP, [256]], 20 | [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 21 | [-1, 9, BottleneckCSP, [512]], 22 | [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 23 | [-1, 1, SPP, [1024, [5, 9, 13]]], 24 | [-1, 3, BottleneckCSP, [1024, False]], # 9 25 | ] 26 | 27 | # YOLOv5 head 28 | head: 29 | [[-1, 1, Conv, [512, 1, 1]], 30 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 31 | [[-1, 6], 1, Concat, [1]], # cat backbone P4 32 | [-1, 3, BottleneckCSP, [512, False]], # 13 33 | 34 | [-1, 1, Conv, [256, 1, 1]], 35 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 36 | [[-1, 4], 1, Concat, [1]], # cat backbone P3 37 | [-1, 3, BottleneckCSP, [256, False]], # 17 (P3/8-small) 38 | 39 | [-1, 1, Conv, [256, 3, 2]], 40 | [[-1, 14], 1, Concat, [1]], # cat head P4 41 | [-1, 3, BottleneckCSP, [512, False]], # 20 (P4/16-medium) 42 | 43 | [-1, 1, Conv, [512, 3, 2]], 
44 | [[-1, 10], 1, Concat, [1]], # cat head P5 45 | [-1, 3, BottleneckCSP, [1024, False]], # 23 (P5/32-large) 46 | 47 | [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) 48 | ] 49 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/models/yolov5x.yaml: -------------------------------------------------------------------------------- 1 | # parameters 2 | nc: 80 # number of classes 3 | depth_multiple: 1.33 # model depth multiple 4 | width_multiple: 1.25 # layer channel multiple 5 | 6 | # anchors 7 | anchors: 8 | - [10,13, 16,30, 33,23] # P3/8 9 | - [30,61, 62,45, 59,119] # P4/16 10 | - [116,90, 156,198, 373,326] # P5/32 11 | 12 | # YOLOv5 backbone 13 | backbone: 14 | # [from, number, module, args] 15 | [[-1, 1, Focus, [64, 3]], # 0-P1/2 16 | [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 17 | [-1, 3, BottleneckCSP, [128]], 18 | [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 19 | [-1, 9, BottleneckCSP, [256]], 20 | [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 21 | [-1, 9, BottleneckCSP, [512]], 22 | [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 23 | [-1, 1, SPP, [1024, [5, 9, 13]]], 24 | [-1, 3, BottleneckCSP, [1024, False]], # 9 25 | ] 26 | 27 | # YOLOv5 head 28 | head: 29 | [[-1, 1, Conv, [512, 1, 1]], 30 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 31 | [[-1, 6], 1, Concat, [1]], # cat backbone P4 32 | [-1, 3, BottleneckCSP, [512, False]], # 13 33 | 34 | [-1, 1, Conv, [256, 1, 1]], 35 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 36 | [[-1, 4], 1, Concat, [1]], # cat backbone P3 37 | [-1, 3, BottleneckCSP, [256, False]], # 17 (P3/8-small) 38 | 39 | [-1, 1, Conv, [256, 3, 2]], 40 | [[-1, 14], 1, Concat, [1]], # cat head P4 41 | [-1, 3, BottleneckCSP, [512, False]], # 20 (P4/16-medium) 42 | 43 | [-1, 1, Conv, [512, 3, 2]], 44 | [[-1, 10], 1, Concat, [1]], # cat head P5 45 | [-1, 3, BottleneckCSP, [1024, False]], # 23 (P5/32-large) 46 | 47 | [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) 48 | ] 49 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/nginx.conf: -------------------------------------------------------------------------------- 1 | worker_processes 1; 2 | daemon off; # Prevent forking 3 | 4 | 5 | pid /tmp/nginx.pid; 6 | error_log /var/log/nginx/error.log; 7 | 8 | events { 9 | # defaults 10 | } 11 | 12 | http { 13 | include /etc/nginx/mime.types; 14 | default_type application/octet-stream; 15 | access_log /var/log/nginx/access.log combined; 16 | 17 | upstream gunicorn { 18 | server unix:/tmp/gunicorn.sock; 19 | } 20 | 21 | server { 22 | listen 8080 deferred; 23 | client_max_body_size 5m; 24 | 25 | keepalive_timeout 5; 26 | proxy_read_timeout 1200s; 27 | 28 | location ~ ^/(ping|invocations) { 29 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 30 | proxy_set_header Host $http_host; 31 | proxy_redirect off; 32 | proxy_pass http://gunicorn; 33 | } 34 | 35 | location / { 36 | return 404 "{}"; 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import boto3 4 | import flask 5 | import json 6 | import shutil 7 | import time 8 | import random 9 | from detect import detect 10 | 11 | DEBUG = False 12 | 13 | app = flask.Flask(__name__) 14 | 15 | import logging 16 | logger = 
logging.getLogger(__name__) 17 | logger.setLevel(logging.DEBUG) 18 | logger.addHandler(logging.StreamHandler(sys.stdout)) 19 | 20 | @app.route('/ping', methods=['GET']) 21 | def ping(): 22 | """Determine if the container is working and healthy. In this sample container, we declare 23 | it healthy if we can load the model successfully.""" 24 | #health = boto3.client('s3') is not None # You can insert a health check here 25 | 26 | #status = 200 if health else 404 27 | status = 200 28 | return flask.Response(response='\n', status=status, mimetype='application/json') 29 | 30 | 31 | @app.route('/') 32 | def hello_world(): 33 | return 'YOLOv5 endpoint' 34 | 35 | 36 | @app.route('/invocations', methods=['POST']) 37 | def invocations(): 38 | data = None 39 | #解析json, 40 | if flask.request.content_type == 'application/json': 41 | data = flask.request.data.decode('utf-8') 42 | data = json.loads(data) 43 | logger.info("invocations params [{}]".format(data)) 44 | bucket = data['bucket'] 45 | image_uri = data['image_uri'] 46 | else: 47 | return flask.Response(response='This predictor only supports JSON data', status=415, mimetype='text/plain') 48 | 49 | tt = time.strftime("%Y%m%d%H%M%S", time.localtime()) 50 | for i in range(0,5): 51 | current_output_dir = os.path.join(init_output_dir,tt+str(random.randint(1000,9999))) 52 | if not os.path.exists(current_output_dir): 53 | try: 54 | os.mkdir(current_output_dir) 55 | break 56 | except FileExistsError: 57 | logger.info("Dir Exist."+current_output_dir) 58 | else: 59 | return flask.Response(response='Make dir error', status=500, mimetype='text/plain') 60 | 61 | download_file_name = image_uri.split('/')[-1] 62 | download_file_name = os.path.join(current_output_dir, download_file_name) 63 | s3_client.download_file(bucket, image_uri, download_file_name) 64 | 65 | img_size = 640 66 | if "img_size" in data: 67 | img_size = data["img_size"] 68 | inference_result = detect(download_file_name, img_size) 69 | 70 | 71 | _payload = json.dumps({'status': 500, 'message': 'YOLOv5 failed!'}) 72 | if inference_result: 73 | _payload = json.dumps(inference_result) 74 | 75 | 76 | shutil.rmtree(current_output_dir) 77 | 78 | return flask.Response(response=_payload, status=200, mimetype='application/json') 79 | 80 | 81 | #--------------------------------------- 82 | init_output_dir = '/opt/ml/output_dir' 83 | if not os.path.exists(init_output_dir): 84 | try: 85 | os.mkdir(init_output_dir) 86 | except FileExistsError: 87 | logger.info("Dir Exist.") 88 | 89 | #load model 90 | source_file = '/opt/ml/model/runs/train/exp/weights/best.pt' 91 | destination_file = "yolov5s.pt" 92 | if os.path.isfile(source_file) and not os.path.isfile(destination_file): 93 | shutil.copy(source_file,destination_file) 94 | logger.info("Model file copied.") 95 | else: 96 | logger.info("Model file not copy.") 97 | 98 | s3_client = boto3.client('s3') 99 | 100 | if __name__ == '__main__': 101 | app.run() -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/requirements.txt: -------------------------------------------------------------------------------- 1 | # pip install -r requirements.txt 2 | 3 | # base ---------------------------------------- 4 | Cython 5 | matplotlib>=3.2.2 6 | numpy>=1.18.5 7 | opencv-python>=4.1.2 8 | Pillow 9 | PyYAML>=5.3.1 10 | scipy>=1.4.1 11 | tensorboard>=2.2 12 | torch>=1.7.0 13 | torchvision>=0.8.1 14 | tqdm>=4.41.0 15 | 16 | # logging ------------------------------------- 17 | # wandb 18 | 19 | 
# plotting ------------------------------------ 20 | seaborn>=0.11.0 21 | pandas 22 | 23 | # export -------------------------------------- 24 | # coremltools>=4.1 25 | # onnx>=1.8.1 26 | # scikit-learn==0.19.2 # for coreml quantization 27 | 28 | # extras -------------------------------------- 29 | thop # FLOPS computation 30 | pycocotools>=2.0 # COCO mAP 31 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/serve: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # This file implements the scoring service shell. You don't necessarily need to modify it for various 4 | # algorithms. It starts nginx and gunicorn with the correct configurations and then simply waits until 5 | # gunicorn exits. 6 | # 7 | # The flask server is specified to be the app object in wsgi.py 8 | # 9 | # We set the following parameters: 10 | # 11 | # Parameter Environment Variable Default Value 12 | # --------- -------------------- ------------- 13 | # number of workers MODEL_SERVER_WORKERS the number of CPU cores 14 | # timeout MODEL_SERVER_TIMEOUT 60 seconds 15 | 16 | from __future__ import print_function 17 | import multiprocessing 18 | import os 19 | import signal 20 | import subprocess 21 | import sys 22 | 23 | 24 | cpu_count = multiprocessing.cpu_count() 25 | 26 | model_server_timeout = os.environ.get('MODEL_SERVER_TIMEOUT', 60) 27 | model_server_workers = int(os.environ.get('MODEL_SERVER_WORKERS', cpu_count)) 28 | # # shishuai comment, this is for g4dn.12xlarge 29 | #model_server_workers = int(os.environ.get('MODEL_SERVER_WORKERS', 4)) 30 | 31 | def sigterm_handler(nginx_pid, gunicorn_pid): 32 | try: 33 | os.kill(nginx_pid, signal.SIGQUIT) 34 | except OSError: 35 | pass 36 | try: 37 | os.kill(gunicorn_pid, signal.SIGTERM) 38 | except OSError: 39 | pass 40 | 41 | sys.exit(0) 42 | 43 | def start_server(): 44 | print('Starting the inference server with {} workers.'.format(model_server_workers)) 45 | 46 | 47 | # link the log streams to stdout/err so they will be logged to the container logs 48 | subprocess.check_call(['ln', '-sf', '/dev/stdout', '/var/log/nginx/access.log']) 49 | subprocess.check_call(['ln', '-sf', '/dev/stderr', '/var/log/nginx/error.log']) 50 | 51 | nginx = subprocess.Popen(['nginx', '-c', '/opt/ml/code/nginx.conf']) 52 | gunicorn = subprocess.Popen(['gunicorn', 53 | '--timeout', str(model_server_timeout), 54 | '-k', 'gevent', 55 | '-b', 'unix:/tmp/gunicorn.sock', 56 | '-w', str(model_server_workers), 57 | 'wsgi:app']) 58 | 59 | signal.signal(signal.SIGTERM, lambda a, b: sigterm_handler(nginx.pid, gunicorn.pid)) 60 | 61 | # If either subprocess exits, so do we. 62 | pids = set([nginx.pid, gunicorn.pid]) 63 | while True: 64 | pid, _ = os.wait() 65 | if pid in pids: 66 | break 67 | 68 | sigterm_handler(nginx.pid, gunicorn.pid) 69 | print('Inference server exiting') 70 | 71 | # The main routine just invokes the start function. 
72 | 73 | if __name__ == '__main__': 74 | start_server() 75 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nwcdheap/sagemaker-workshop/9bcbc4c787da346fcf10aa6fd97a8a100328719a/object-detection/yolov5-on-sagemaker/2-inference/source/utils/__init__.py -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/utils/activations.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | # Swish https://arxiv.org/pdf/1905.02244.pdf --------------------------------------------------------------------------- 7 | class Swish(nn.Module): # 8 | @staticmethod 9 | def forward(x): 10 | return x * torch.sigmoid(x) 11 | 12 | 13 | class Hardswish(nn.Module): # export-friendly version of nn.Hardswish() 14 | @staticmethod 15 | def forward(x): 16 | # return x * F.hardsigmoid(x) # for torchscript and CoreML 17 | return x * F.hardtanh(x + 3, 0., 6.) / 6. # for torchscript, CoreML and ONNX 18 | 19 | 20 | class MemoryEfficientSwish(nn.Module): 21 | class F(torch.autograd.Function): 22 | @staticmethod 23 | def forward(ctx, x): 24 | ctx.save_for_backward(x) 25 | return x * torch.sigmoid(x) 26 | 27 | @staticmethod 28 | def backward(ctx, grad_output): 29 | x = ctx.saved_tensors[0] 30 | sx = torch.sigmoid(x) 31 | return grad_output * (sx * (1 + x * (1 - sx))) 32 | 33 | def forward(self, x): 34 | return self.F.apply(x) 35 | 36 | 37 | # Mish https://github.com/digantamisra98/Mish -------------------------------------------------------------------------- 38 | class Mish(nn.Module): 39 | @staticmethod 40 | def forward(x): 41 | return x * F.softplus(x).tanh() 42 | 43 | 44 | class MemoryEfficientMish(nn.Module): 45 | class F(torch.autograd.Function): 46 | @staticmethod 47 | def forward(ctx, x): 48 | ctx.save_for_backward(x) 49 | return x.mul(torch.tanh(F.softplus(x))) # x * tanh(ln(1 + exp(x))) 50 | 51 | @staticmethod 52 | def backward(ctx, grad_output): 53 | x = ctx.saved_tensors[0] 54 | sx = torch.sigmoid(x) 55 | fx = F.softplus(x).tanh() 56 | return grad_output * (fx + x * sx * (1 - fx * fx)) 57 | 58 | def forward(self, x): 59 | return self.F.apply(x) 60 | 61 | 62 | # FReLU https://arxiv.org/abs/2007.11824 ------------------------------------------------------------------------------- 63 | class FReLU(nn.Module): 64 | def __init__(self, c1, k=3): # ch_in, kernel 65 | super().__init__() 66 | self.conv = nn.Conv2d(c1, c1, k, 1, 1, groups=c1) 67 | self.bn = nn.BatchNorm2d(c1) 68 | 69 | def forward(self, x): 70 | return torch.max(x, self.bn(self.conv(x))) 71 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/utils/evolve.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Hyperparameter evolution commands (avoids CUDA memory leakage issues) 3 | # Replaces train.py python generations 'for' loop with a bash 'for' loop 4 | 5 | # Start on 4-GPU machine 6 | #for i in 0 1 2 3; do 7 | # t=ultralytics/yolov5:evolve && sudo docker pull $t && sudo docker run -d --ipc=host --gpus all -v "$(pwd)"/VOC:/usr/src/VOC $t bash utils/evolve.sh $i 8 | # sleep 60 # avoid simultaneous 
evolve.txt read/write 9 | #done 10 | 11 | # Hyperparameter evolution commands 12 | while true; do 13 | # python train.py --batch 64 --weights yolov5m.pt --data voc.yaml --img 512 --epochs 50 --evolve --bucket ult/evolve/voc --device $1 14 | python train.py --batch 40 --weights yolov5m.pt --data coco.yaml --img 640 --epochs 30 --evolve --bucket ult/evolve/coco --device $1 15 | done 16 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/utils/google_app_engine/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/google-appengine/python 2 | 3 | # Create a virtualenv for dependencies. This isolates these packages from 4 | # system-level packages. 5 | # Use -p python3 or -p python3.7 to select python version. Default is version 2. 6 | RUN virtualenv /env -p python3 7 | 8 | # Setting these environment variables are the same as running 9 | # source /env/bin/activate. 10 | ENV VIRTUAL_ENV /env 11 | ENV PATH /env/bin:$PATH 12 | 13 | RUN apt-get update && apt-get install -y python-opencv 14 | 15 | # Copy the application's requirements.txt and run pip to install all 16 | # dependencies into the virtualenv. 17 | ADD requirements.txt /app/requirements.txt 18 | RUN pip install -r /app/requirements.txt 19 | 20 | # Add the application source code. 21 | ADD . /app 22 | 23 | # Run a WSGI server to serve the application. gunicorn must be declared as 24 | # a dependency in requirements.txt. 25 | CMD gunicorn -b :$PORT main:app 26 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/utils/google_app_engine/additional_requirements.txt: -------------------------------------------------------------------------------- 1 | # add these requirements in your app on top of the existing ones 2 | pip==18.1 3 | Flask==1.0.2 4 | gunicorn==19.9.0 5 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/utils/google_app_engine/app.yaml: -------------------------------------------------------------------------------- 1 | runtime: custom 2 | env: flex 3 | 4 | service: yolov5app 5 | 6 | liveness_check: 7 | initial_delay_sec: 600 8 | 9 | manual_scaling: 10 | instances: 1 11 | resources: 12 | cpu: 1 13 | memory_gb: 4 14 | disk_size_gb: 20 -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/utils/google_utils.py: -------------------------------------------------------------------------------- 1 | # This file contains google utils: https://cloud.google.com/storage/docs/reference/libraries 2 | # pip install --upgrade google-cloud-storage 3 | # from google.cloud import storage 4 | 5 | import os 6 | import platform 7 | import subprocess 8 | import time 9 | from pathlib import Path 10 | 11 | import torch 12 | 13 | 14 | def gsutil_getsize(url=''): 15 | # gs://bucket/file size https://cloud.google.com/storage/docs/gsutil/commands/du 16 | s = subprocess.check_output('gsutil du %s' % url, shell=True).decode('utf-8') 17 | return eval(s.split(' ')[0]) if len(s) else 0 # bytes 18 | 19 | 20 | def attempt_download(weights): 21 | # Attempt to download pretrained weights if not found locally 22 | weights = weights.strip().replace("'", '') 23 | file = Path(weights).name 24 | 25 | msg = weights + ' missing, try downloading from 
https://github.com/ultralytics/yolov5/releases/' 26 | models = ['yolov5s.pt', 'yolov5m.pt', 'yolov5l.pt', 'yolov5x.pt'] # available models 27 | 28 | if file in models and not os.path.isfile(weights): 29 | # Google Drive 30 | # d = {'yolov5s.pt': '1R5T6rIyy3lLwgFXNms8whc-387H0tMQO', 31 | # 'yolov5m.pt': '1vobuEExpWQVpXExsJ2w-Mbf3HJjWkQJr', 32 | # 'yolov5l.pt': '1hrlqD1Wdei7UT4OgT785BEk1JwnSvNEV', 33 | # 'yolov5x.pt': '1mM8aZJlWTxOg7BZJvNUMrTnA2AbeCVzS'} 34 | # r = gdrive_download(id=d[file], name=weights) if file in d else 1 35 | # if r == 0 and os.path.exists(weights) and os.path.getsize(weights) > 1E6: # check 36 | # return 37 | 38 | try: # GitHub 39 | url = 'https://github.com/ultralytics/yolov5/releases/download/v3.0/' + file 40 | print('Downloading %s to %s...' % (url, weights)) 41 | torch.hub.download_url_to_file(url, weights) 42 | assert os.path.exists(weights) and os.path.getsize(weights) > 1E6 # check 43 | except Exception as e: # GCP 44 | print('Download error: %s' % e) 45 | url = 'https://storage.googleapis.com/ultralytics/yolov5/ckpt/' + file 46 | print('Downloading %s to %s...' % (url, weights)) 47 | r = os.system('curl -L %s -o %s' % (url, weights)) # torch.hub.download_url_to_file(url, weights) 48 | finally: 49 | if not (os.path.exists(weights) and os.path.getsize(weights) > 1E6): # check 50 | os.remove(weights) if os.path.exists(weights) else None # remove partial downloads 51 | print('ERROR: Download failure: %s' % msg) 52 | print('') 53 | return 54 | 55 | 56 | def gdrive_download(id='1n_oKgR81BJtqk75b00eAjdv03qVCQn2f', name='coco128.zip'): 57 | # Downloads a file from Google Drive. from utils.google_utils import *; gdrive_download() 58 | t = time.time() 59 | 60 | print('Downloading https://drive.google.com/uc?export=download&id=%s as %s... ' % (id, name), end='') 61 | os.remove(name) if os.path.exists(name) else None # remove existing 62 | os.remove('cookie') if os.path.exists('cookie') else None 63 | 64 | # Attempt file download 65 | out = "NUL" if platform.system() == "Windows" else "/dev/null" 66 | os.system('curl -c ./cookie -s -L "drive.google.com/uc?export=download&id=%s" > %s ' % (id, out)) 67 | if os.path.exists('cookie'): # large file 68 | s = 'curl -Lb ./cookie "drive.google.com/uc?export=download&confirm=%s&id=%s" -o %s' % (get_token(), id, name) 69 | else: # small file 70 | s = 'curl -s -L -o %s "drive.google.com/uc?export=download&id=%s"' % (name, id) 71 | r = os.system(s) # execute, capture return 72 | os.remove('cookie') if os.path.exists('cookie') else None 73 | 74 | # Error check 75 | if r != 0: 76 | os.remove(name) if os.path.exists(name) else None # remove partial 77 | print('Download error ') # raise Exception('Download error') 78 | return r 79 | 80 | # Unzip if archive 81 | if name.endswith('.zip'): 82 | print('unzipping... 
', end='') 83 | os.system('unzip -q %s' % name) # unzip 84 | os.remove(name) # remove zip to free space 85 | 86 | print('Done (%.1fs)' % (time.time() - t)) 87 | return r 88 | 89 | 90 | def get_token(cookie="./cookie"): 91 | with open(cookie) as f: 92 | for line in f: 93 | if "download" in line: 94 | return line.split()[-1] 95 | return "" 96 | 97 | # def upload_blob(bucket_name, source_file_name, destination_blob_name): 98 | # # Uploads a file to a bucket 99 | # # https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python 100 | # 101 | # storage_client = storage.Client() 102 | # bucket = storage_client.get_bucket(bucket_name) 103 | # blob = bucket.blob(destination_blob_name) 104 | # 105 | # blob.upload_from_filename(source_file_name) 106 | # 107 | # print('File {} uploaded to {}.'.format( 108 | # source_file_name, 109 | # destination_blob_name)) 110 | # 111 | # 112 | # def download_blob(bucket_name, source_blob_name, destination_file_name): 113 | # # Uploads a blob from a bucket 114 | # storage_client = storage.Client() 115 | # bucket = storage_client.get_bucket(bucket_name) 116 | # blob = bucket.blob(source_blob_name) 117 | # 118 | # blob.download_to_filename(destination_file_name) 119 | # 120 | # print('Blob {} downloaded to {}.'.format( 121 | # source_blob_name, 122 | # destination_file_name)) 123 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/wsgi.py: -------------------------------------------------------------------------------- 1 | import predictor as myapp 2 | 3 | # This is just a simple wrapper for gunicorn to find your app. 4 | # If you want to change the algorithm file, simply change "predictor" above to the 5 | # new file. 
6 | 7 | app = myapp.app 8 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/README.md: -------------------------------------------------------------------------------- 1 | # YOLOv5 on SageMaker 2 | This workshop demonstrates how to train and run inference with YOLOv5 on SageMaker. 3 | Official YOLOv5 repository: https://github.com/ultralytics/yolov5 4 | YOLOv5 is an object detection algorithm that recognizes objects in images and videos. 5 | ![Detection example](images/detection_example.jpg) 6 | ## Data preparation 7 | [0-preparation](0-preparation) shows how to convert labelme-format annotations to the YOLOv5 format; if your data is already in YOLOv5 format, you can skip this step and simply upload the data to S3 in the required layout. 8 | ## Training 9 | [1-training](1-training) shows how to train on SageMaker. 10 | ## Inference 11 | [2-inference](2-inference) shows how to deploy an endpoint on SageMaker and invoke it for inference. -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/images/detection_example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nwcdheap/sagemaker-workshop/9bcbc4c787da346fcf10aa6fd97a8a100328719a/object-detection/yolov5-on-sagemaker/images/detection_example.jpg -------------------------------------------------------------------------------- /runtime/Java/Inference.java: -------------------------------------------------------------------------------- 1 | import java.nio.ByteBuffer; 2 | 3 | import com.amazonaws.services.sagemakerruntime.AmazonSageMakerRuntime; 4 | import com.amazonaws.services.sagemakerruntime.AmazonSageMakerRuntimeClientBuilder; 5 | import com.amazonaws.services.sagemakerruntime.model.InvokeEndpointRequest; 6 | import com.amazonaws.services.sagemakerruntime.model.InvokeEndpointResult; 7 | 8 | public class Inference { 9 | public static void main(String[] args) { 10 | String request = "{\"bucket\":\"nowfox\",\"image_uri\":\"data/zidane.jpg\",\"img_size\":416}"; 11 | InvokeEndpointRequest invokeEndpointRequest = new InvokeEndpointRequest(); 12 | invokeEndpointRequest.setContentType("application/json"); 13 | ByteBuffer buf = ByteBuffer.wrap(request.getBytes()); 14 | 15 | invokeEndpointRequest.setBody(buf); 16 | invokeEndpointRequest.setEndpointName("yolov5"); 17 | invokeEndpointRequest.setAccept("application/json"); 18 | 19 | AmazonSageMakerRuntime amazonSageMaker = AmazonSageMakerRuntimeClientBuilder.defaultClient(); 20 | InvokeEndpointResult invokeEndpointResult = amazonSageMaker.invokeEndpoint(invokeEndpointRequest); 21 | byte[] response = invokeEndpointResult.getBody().array(); 22 | String result = new String(response); 23 | System.out.print(result); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /runtime/Java/pom.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 3 | <modelVersion>4.0.0</modelVersion> 4 | 5 | <groupId>cn.nwcdcloud.samples</groupId> 6 | <artifactId>sagemaker</artifactId> 7 | <version>0.0.1-SNAPSHOT</version> 8 | <packaging>jar</packaging> 9 | 10 | <name>sagemaker</name> 11 | <url>http://maven.apache.org</url> 12 | 13 | <properties> 14 | <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> 15 | </properties> 16 | 17 | <dependencies> 18 | <dependency> 19 | <groupId>com.amazonaws</groupId> 20 | <artifactId>aws-java-sdk-sagemakerruntime</artifactId> 21 | <version>1.11.879</version> 22 | </dependency> 23 | </dependencies> 24 | </project> 25 | -------------------------------------------------------------------------------- /runtime/Java2/Inference.java: -------------------------------------------------------------------------------- 1 | import software.amazon.awssdk.core.SdkBytes; 2 | import software.amazon.awssdk.services.sagemakerruntime.SageMakerRuntimeClient; 3 | import software.amazon.awssdk.services.sagemakerruntime.model.InvokeEndpointRequest; 4 | import software.amazon.awssdk.services.sagemakerruntime.model.InvokeEndpointResponse; 5 | 6 | public class Inference { 7 | public static void 
main(String[] args) { 8 | String requestBody = "{\"bucket\":\"nowfox\",\"image_uri\":\"data/zidane.jpg\",\"img_size\":416}"; 9 | SdkBytes body = SdkBytes.fromUtf8String(requestBody); 10 | InvokeEndpointRequest request = InvokeEndpointRequest.builder().endpointName("yolov5") 11 | .contentType("application/json").body(body).build(); 12 | SageMakerRuntimeClient client = SageMakerRuntimeClient.create(); 13 | InvokeEndpointResponse response = client.invokeEndpoint(request); 14 | System.out.print(response.body().asUtf8String()); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /runtime/Java2/pom.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 3 | <modelVersion>4.0.0</modelVersion> 4 | 5 | <groupId>cn.nwcdcloud.samples</groupId> 6 | <artifactId>sagemakerruntime</artifactId> 7 | <version>0.0.1-SNAPSHOT</version> 8 | <packaging>jar</packaging> 9 | 10 | <name>sagemakerruntime</name> 11 | <url>http://maven.apache.org</url> 12 | 13 | <properties> 14 | <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> 15 | </properties> 16 | 17 | <dependencies> 18 | <dependency> 19 | <groupId>software.amazon.awssdk</groupId> 20 | <artifactId>sagemakerruntime</artifactId> 21 | <version>2.15.7</version> 22 | </dependency> 23 | </dependencies> 24 | </project> 25 | -------------------------------------------------------------------------------- /training-data-input/EFS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Using EFS as the training data input for SageMaker" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 1 Overview\n", 15 | "This notebook uses EFS as the training data input for SageMaker. \n", 16 | "Note: this feature is not yet available in the China regions." 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## 2 Runtime environment\n", 24 | "Either the tensorflow2_p36 or the pytorch_p36 kernel works. \n", 25 | "Tested with boto3 1.17.99 and sagemaker 2.45.0. " 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import boto3,sagemaker\n", 35 | "print(boto3.__version__)\n", 36 | "print(sagemaker.__version__)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## 3 Configure EFS\n", 44 | "Refer to https://docs.aws.amazon.com/zh_cn/efs/latest/ug/gs-step-two-create-efs-resources.html to configure EFS, and make sure to set up an appropriate security group." 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## 4 Get/set parameters" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "import boto3\n", 61 | "import sagemaker\n", 62 | "from sagemaker.image_uris import retrieve\n", 63 | "\n", 64 | "sagemaker_session = sagemaker.Session()\n", 65 | "iam = boto3.client('iam')\n", 66 | "roles = iam.list_roles(PathPrefix='/service-role')\n", 67 | "role=\"\"\n", 68 | "for current_role in roles[\"Roles\"]:\n", 69 | " if current_role[\"RoleName\"].startswith(\"AmazonSageMaker-ExecutionRole-\"):\n", 70 | " role=current_role[\"Arn\"]\n", 71 | " break\n", 72 | "print(role)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "Notes:\n", 80 | "- 1. SageMaker must have permission to use EFS\n", 81 | "- 2. Make sure the EFS security group allows access from SageMaker" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "subnets = [\"subnet-0eecdb20\"] # Should be same as the subnet used for EFS. Example: subnet-0f9XXXX\n", 91 | "security_group_ids = [\"sg-6478f13a\"] # Should be same as the security group used for EFS. sg-03ZZZZZZ\n", 92 | "file_system_id = \"fs-8eafd93a\" # EFS file system ID with your training dataset. 
Example: 'fs-0bYYYYYY'\",\n", 93 | "efs_dir=\"/test\" #EFS directory" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "from sagemaker.inputs import FileSystemInput\n", 103 | "file_system_directory_path = efs_dir\n", 104 | "file_system_access_mode = \"ro\"#read only\n", 105 | "file_system_type = \"EFS\"\n", 106 | "train_fs = FileSystemInput(\n", 107 | " file_system_id=file_system_id,\n", 108 | " file_system_type=file_system_type,\n", 109 | " directory_path=file_system_directory_path,\n", 110 | " file_system_access_mode=file_system_access_mode,\n", 111 | ")" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "## 5 Training" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "This notebook only lists the first 100 files in the training directory and does not actually train; it mainly demonstrates reading data from EFS." 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "### 5.1 TensorFlow" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "from sagemaker.tensorflow import TensorFlow\n", 142 | "\n", 143 | "estimator = TensorFlow(\n", 144 | " base_job_name=\"tensorflow2-fsx-big\",\n", 145 | " entry_point=\"ListFile.py\",\n", 146 | " role=role,\n", 147 | " py_version=\"py37\",\n", 148 | " framework_version=\"2.4.1\",\n", 149 | " instance_count=1,\n", 150 | " instance_type=\"ml.m5.large\",\n", 151 | " sagemaker_session=sagemaker_session,\n", 152 | " hyperparameters={\"path\":\"/opt/ml/input/data/training\"},\n", 153 | " subnets=subnets,\n", 154 | " security_group_ids=security_group_ids,\n", 155 | ")\n", 156 | "estimator.fit(train_fs)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "### 5.2 PyTorch" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "from sagemaker.pytorch import PyTorch\n", 173 | "\n", 174 | "estimator = PyTorch(\n", 175 | " base_job_name=\"big-data-input\",\n", 176 | " entry_point=\"ListFile.py\",\n", 177 | " role=role,\n", 178 | " py_version=\"py36\",\n", 179 | " framework_version=\"1.6.0\",\n", 180 | " instance_count=1,\n", 181 | " instance_type=\"ml.m5.large\",\n", 182 | " sagemaker_session=sagemaker_session,\n", 183 | " hyperparameters={\"path\":\"/opt/ml/input/data/training\"},\n", 184 | " subnets=subnets,\n", 185 | " security_group_ids=security_group_ids,\n", 186 | ")\n", 187 | "estimator.fit(train_fs)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [] 196 | } 197 | ], 198 | "metadata": { 199 | "kernelspec": { 200 | "display_name": "Environment (conda_pytorch_p36)", 201 | "language": "python", 202 | "name": "conda_pytorch_p36" 203 | }, 204 | "language_info": { 205 | "codemirror_mode": { 206 | "name": "ipython", 207 | "version": 3 208 | }, 209 | "file_extension": ".py", 210 | "mimetype": "text/x-python", 211 | "name": "python", 212 | "nbconvert_exporter": "python", 213 | "pygments_lexer": "ipython3", 214 | "version": "3.6.13" 215 | } 216 | }, 217 | "nbformat": 4, 218 | "nbformat_minor": 4 219 | } 220 | -------------------------------------------------------------------------------- /training-data-input/FSx.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | 
"cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 使用FSx for Lustre作为SageMaker的训练数据输入" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 1 说明\n", 15 | "本文为通过FSx for Lustre把S3数据作为SageMaker的训练数据输入,以解决直接从S3上下载训练数据耗时过长问题。 \n", 16 | "注意:该功能暂不能在中国区使用。" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## 2 运行环境\n", 24 | "Kernel 选择tensorflow2_p36或pytorch_p36均可。 \n", 25 | "本文在boto3 1.17.99和sagemaker 2.45.0下测试通过。 " 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import boto3,sagemaker\n", 35 | "print(boto3.__version__)\n", 36 | "print(sagemaker.__version__)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## 3 配置FSx\n", 44 | "参考 https://docs.aws.amazon.com/zh_cn/fsx/latest/LustreGuide/create-fs-linked-data-repo.html 进行配置,将您的文件系统链接到S3存储桶。 \n", 45 | "配置导入S3数据时,不要输入prefix。" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 4 在VPC中创建S3终端节点\n", 53 | "打开VPC web控制台,在左边导航栏点击`终端节点`,再点击`创建终端节点`,在服务名称搜索框中输入`S3`,搜索结果选择类型为`Gateway`的记录,配置路由表中,勾选上主路由表的记录,再点击`创建终端节点`。 \n", 54 | "不配置这步会报 Failed. Reason: InternalServerError: We encountered an internal error. Please try again." 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## 5 获取/设置相关参数" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "import boto3\n", 71 | "import sagemaker\n", 72 | "from sagemaker.image_uris import retrieve\n", 73 | "\n", 74 | "sagemaker_session = sagemaker.Session()\n", 75 | "iam = boto3.client('iam')\n", 76 | "roles = iam.list_roles(PathPrefix='/service-role')\n", 77 | "role=\"\"\n", 78 | "for current_role in roles[\"Roles\"]:\n", 79 | " if current_role[\"RoleName\"].startswith(\"AmazonSageMaker-ExecutionRole-\"):\n", 80 | " role=current_role[\"Arn\"]\n", 81 | " break\n", 82 | "print(role)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "注意事项:\n", 90 | "- 1.SageMaker Role必须要有使用FSx的权限\n", 91 | "- 2.确认FSx的安全组,允许SageMaker访问" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "subnets = [\"subnet-0eecdb20\"] # Should be same as Subnet used for FSx. Example: subnet-0f9XXXX\n", 101 | "security_group_ids = [\"sg-6478f13a\"] # Should be same as Security group used for FSx. sg-03ZZZZZZ\n", 102 | "file_system_id = \"fs-011671baa391568ab\" # FSx file system ID with your training dataset. 
Example: 'fs-0bYYYYYY'\",\n", 103 | "mount_name=\"cm26jbmv\" #Mount name shown on the FSx console page\n", 104 | "s3_prefix=\"test\" #S3 prefix/directory" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "from sagemaker.inputs import FileSystemInput\n", 114 | "file_system_directory_path = \"/{}/{}\".format(mount_name,s3_prefix)\n", 115 | "file_system_access_mode = \"ro\"#read only\n", 116 | "file_system_type = \"FSxLustre\"\n", 117 | "train_fs = FileSystemInput(\n", 118 | " file_system_id=file_system_id,\n", 119 | " file_system_type=file_system_type,\n", 120 | " directory_path=file_system_directory_path,\n", 121 | " file_system_access_mode=file_system_access_mode,\n", 122 | ")" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "## 6 Training\n", 130 | "This notebook only lists the first 100 files in the training directory and does not actually train; it mainly demonstrates reading S3 data through FSx." 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "### 6.1 TensorFlow" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "from sagemaker.tensorflow import TensorFlow\n", 147 | "\n", 148 | "estimator = TensorFlow(\n", 149 | " base_job_name=\"big-data-input\",\n", 150 | " entry_point=\"ListFile.py\",\n", 151 | " role=role,\n", 152 | " py_version=\"py37\",\n", 153 | " framework_version=\"2.4.1\",\n", 154 | " instance_count=1,\n", 155 | " instance_type=\"ml.m5.large\",\n", 156 | " sagemaker_session=sagemaker_session,\n", 157 | " hyperparameters={\"path\":\"/opt/ml/input/data/training\"},\n", 158 | " subnets=subnets,\n", 159 | " security_group_ids=security_group_ids,\n", 160 | ")\n", 161 | "estimator.fit(train_fs)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "### 6.2 PyTorch" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "from sagemaker.pytorch import PyTorch\n", 178 | "\n", 179 | "estimator = PyTorch(\n", 180 | " base_job_name=\"big-data-input\",\n", 181 | " entry_point=\"ListFile.py\",\n", 182 | " role=role,\n", 183 | " py_version=\"py36\",\n", 184 | " framework_version=\"1.6.0\",\n", 185 | " instance_count=1,\n", 186 | " instance_type=\"ml.m5.large\",\n", 187 | " sagemaker_session=sagemaker_session,\n", 188 | " hyperparameters={\"path\":\"/opt/ml/input/data/training\"},\n", 189 | " subnets=subnets,\n", 190 | " security_group_ids=security_group_ids,\n", 191 | ")\n", 192 | "estimator.fit(train_fs)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [] 201 | } 202 | ], 203 | "metadata": { 204 | "kernelspec": { 205 | "display_name": "Environment (conda_pytorch_p36)", 206 | "language": "python", 207 | "name": "conda_pytorch_p36" 208 | }, 209 | "language_info": { 210 | "codemirror_mode": { 211 | "name": "ipython", 212 | "version": 3 213 | }, 214 | "file_extension": ".py", 215 | "mimetype": "text/x-python", 216 | "name": "python", 217 | "nbconvert_exporter": "python", 218 | "pygments_lexer": "ipython3", 219 | "version": "3.6.13" 220 | } 221 | }, 222 | "nbformat": 4, 223 | "nbformat_minor": 4 224 | } 225 | -------------------------------------------------------------------------------- /training-data-input/ListFile.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | if __name__ == '__main__': 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--path", type=str, default="./") 7 | # Do not use args = parser.parse_args(); parse_known_args tolerates the extra arguments SageMaker passes in 8 | args, _ = parser.parse_known_args() 9 | files = os.listdir(args.path) 10 | print("=============list file begin") 11 | count = 0 12 | for file in files: 13 | print(file) 14 | count = count + 1 15 | if count == 100: 16 | break 17 | print("=============list file end") 18 | -------------------------------------------------------------------------------- /training-data-input/README.md: -------------------------------------------------------------------------------- 1 | # Training data input 2 | When a large amount of data has to be trained on, having SageMaker download the data directly from S3 takes a long time. You can instead expose the S3 data to SageMaker through FSx for Lustre as the training data input, which avoids the long download from S3. Loading data from EFS is also supported. 3 | [FSx](FSx.ipynb) 4 | [EFS](EFS.ipynb) --------------------------------------------------------------------------------
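The two notebooks above share one pattern: build a `FileSystemInput` that points at the file system, then pass it to `estimator.fit()` in place of an S3 URI. Below is a minimal sketch condensing that pattern; every resource ID in it (role ARN, file system ID, mount name, subnet, security group) is a placeholder to replace with your own values, and for EFS you would use `file_system_type="EFS"` with a plain directory path.

```python
# Minimal sketch of the FSx-for-Lustre training-input pattern from FSx.ipynb.
# All IDs below are placeholders; replace them with your own resources.
from sagemaker.inputs import FileSystemInput
from sagemaker.pytorch import PyTorch

role = "arn:aws:iam::111122223333:role/service-role/AmazonSageMaker-ExecutionRole-xxxx"  # placeholder

# Mount the Lustre file system (linked to your S3 bucket) read-only for training.
# directory_path is /<mount_name>/<s3_prefix> for FSx; for EFS it is simply the EFS path.
train_fs = FileSystemInput(
    file_system_id="fs-0bYYYYYY",        # placeholder file system ID
    file_system_type="FSxLustre",        # or "EFS"
    directory_path="/cm26jbmv/test",
    file_system_access_mode="ro",
)

estimator = PyTorch(
    base_job_name="big-data-input",
    entry_point="ListFile.py",
    role=role,
    framework_version="1.6.0",
    py_version="py36",
    instance_count=1,
    instance_type="ml.m5.large",
    hyperparameters={"path": "/opt/ml/input/data/training"},
    subnets=["subnet-0f9XXXXX"],          # same subnet as the file system
    security_group_ids=["sg-03ZZZZZZ"],   # must allow traffic between SageMaker and the file system
)
estimator.fit(train_fs)  # the data appears under /opt/ml/input/data/training
```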
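For completeness, here is a hypothetical Python counterpart to the Java samples under runtime/. It assumes the same `yolov5` endpoint name and the JSON request format that predictor.py expects (`bucket`, `image_uri`, `img_size`); the bucket and image key are example values from this workshop, so adjust them to your deployment.

```python
# Hypothetical boto3 equivalent of runtime/Java/Inference.java.
import json
import boto3

runtime = boto3.client("sagemaker-runtime")
payload = {"bucket": "nowfox", "image_uri": "data/zidane.jpg", "img_size": 416}

response = runtime.invoke_endpoint(
    EndpointName="yolov5",
    ContentType="application/json",
    Accept="application/json",
    Body=json.dumps(payload),
)
# The endpoint returns a JSON document with the detection results.
print(response["Body"].read().decode("utf-8"))
```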