├── .gitignore ├── LICENSE ├── README.md ├── anomaly-detection └── PaDiM │ ├── 0-preparation │ └── preparation.ipynb │ ├── 1-training │ ├── source │ │ ├── mvtec.py │ │ ├── requirements2.txt │ │ └── training.py │ └── training.ipynb │ ├── 2-inference │ ├── Dockerfile │ ├── aws │ │ └── config │ ├── inference.ipynb │ └── source │ │ ├── __init__.py │ │ ├── inference.py │ │ ├── mvtec.py │ │ ├── mytime.py │ │ ├── nginx.conf │ │ ├── predictor.py │ │ ├── requirements.txt │ │ ├── serve │ │ ├── test.py │ │ └── wsgi.py │ ├── LICENSE │ ├── README.md │ └── images │ └── detection_example.png ├── distributed-training ├── PyTorch │ ├── README.md │ ├── code │ │ ├── mnist.py │ │ └── requirements.txt │ └── pytorch_mnist.ipynb └── TensorFlow │ └── data-parallel │ ├── README.md │ ├── code │ ├── requirements.txt │ └── train_tensorflow_smdataparallel_mnist.py │ └── tensorflow2_smdataparallel_mnist_demo.ipynb ├── encapsulation ├── Dockerfile ├── LICENSE ├── README.md ├── hyperparameter-tuning.ipynb ├── inference-custom-image.ipynb ├── inference-default-image.ipynb ├── nginx.conf ├── source │ ├── export_model.py │ ├── processing.py │ ├── requirements.txt │ └── train.py ├── test │ ├── cat.681.jpg │ └── dog.592.jpg └── train.ipynb ├── hyperspectral └── DeepHyperX │ ├── 1-preparation │ ├── explore_data.ipynb │ ├── preparation.ipynb │ └── preprocess.py │ ├── 2-training │ ├── source │ │ ├── custom_datasets.py │ │ ├── datasets.py │ │ ├── inference.py │ │ ├── main.py │ │ ├── models.py │ │ ├── requirements2.txt │ │ └── utils.py │ └── training.ipynb │ ├── 3-inference │ └── inference.ipynb │ ├── LICENSE │ └── README.md ├── image-classification ├── Image-classification-lst-format.ipynb ├── LICENSE ├── README.md └── im2rec.py ├── images └── sagemaker_notebook.png ├── object-detection └── yolov5-on-sagemaker │ ├── 0-preparation │ └── preparation.ipynb │ ├── 1-training │ ├── container │ │ ├── Dockerfile │ │ ├── changehostname.c │ │ ├── local_test │ │ │ └── input │ │ │ │ ├── config │ │ │ │ ├── hyperparameters.json │ │ │ │ └── resourceconfig.json │ │ │ │ └── data │ │ │ │ └── training │ │ │ │ ├── cfg │ │ │ │ ├── hyp.yaml │ │ │ │ └── yolov5s.yaml │ │ │ │ └── weights │ │ │ │ └── yolov5s.pt │ │ ├── sources.list │ │ ├── start_with_right_hostname.sh │ │ └── train │ ├── training-build.ipynb │ └── training.ipynb │ ├── 2-inference │ ├── Dockerfile │ ├── aws │ │ └── config │ ├── inference-build.ipynb │ ├── inference.ipynb │ └── source │ │ ├── detect.py │ │ ├── models │ │ ├── __init__.py │ │ ├── common.py │ │ ├── experimental.py │ │ ├── export.py │ │ ├── hub │ │ │ ├── yolov3-spp.yaml │ │ │ ├── yolov5-fpn.yaml │ │ │ └── yolov5-panet.yaml │ │ ├── yolo.py │ │ ├── yolov5l.yaml │ │ ├── yolov5m.yaml │ │ ├── yolov5s.yaml │ │ └── yolov5x.yaml │ │ ├── nginx.conf │ │ ├── predictor.py │ │ ├── requirements.txt │ │ ├── serve │ │ ├── utils │ │ ├── __init__.py │ │ ├── activations.py │ │ ├── datasets.py │ │ ├── evolve.sh │ │ ├── general.py │ │ ├── google_app_engine │ │ │ ├── Dockerfile │ │ │ ├── additional_requirements.txt │ │ │ └── app.yaml │ │ ├── google_utils.py │ │ └── torch_utils.py │ │ └── wsgi.py │ ├── LICENSE │ ├── README.md │ └── images │ └── detection_example.jpg ├── runtime ├── Java │ ├── Inference.java │ └── pom.xml ├── Java2 │ ├── Inference.java │ └── pom.xml └── LICENSE ├── training-data-input ├── EFS.ipynb ├── FSx.ipynb ├── ListFile.py └── README.md └── update-endpoint └── UpdateEndpoint.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | #this project specific 2 | 
anomaly-detection/PaDiM/0-preparation/bottle 3 | anomaly-detection/PaDiM/1-training/result 4 | anomaly-detection/PaDiM/2-inference/result 5 | anomaly-detection/PaDiM/2-inference/source/train.pkl 6 | image-classification/image-* 7 | object-detection/yolov5-on-sagemaker/0-preparation/biaozhu/* 8 | object-detection/yolov5-on-sagemaker/1-training/container/dockersource/ 9 | object-detection/yolov5-on-sagemaker/1-training/container/local_test/model/ 10 | object-detection/yolov5-on-sagemaker/1-training/container/local_test/input/data/training/cfg/data.yaml 11 | object-detection/yolov5-on-sagemaker/1-training/container/local_test/input/data/training/images/* 12 | object-detection/yolov5-on-sagemaker/1-training/container/local_test/input/data/training/labels/* 13 | object-detection/yolov5-on-sagemaker/1-training/runs/* 14 | object-detection/yolov5-on-sagemaker/1-training/model_data.txt 15 | object-detection/yolov5-on-sagemaker/2-inference/source/yolov5s.pt 16 | object-detection/yolov5-on-sagemaker/2-inference/result/* 17 | object-detection/yolov5-on-sagemaker/2-inference/model/ 18 | hyperspectral/DeepHyperX/1-preparation/dataset/ 19 | hyperspectral/DeepHyperX/1-preparation/Datasets/ 20 | hyperspectral/DeepHyperX/2-training/result/ 21 | distributed-training/PyTorch/data/ 22 | update-endpoint/data/ 23 | 24 | 25 | 26 | venv/* 27 | */cdk.out/* 28 | 29 | 30 | # Compiled class file 31 | *.class 32 | 33 | # Log file 34 | *.log 35 | 36 | # BlueJ files 37 | *.ctxt 38 | 39 | # Mobile Tools for Java (J2ME) 40 | .mtj.tmp/ 41 | 42 | # Package Files # 43 | #*.jar 44 | *.war 45 | *.nar 46 | *.ear 47 | *.zip 48 | *.tar.gz 49 | *.rar 50 | *.pth 51 | 52 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 53 | hs_err_pid* 54 | 55 | 56 | __pycache__/ 57 | */__pycache__ 58 | /sample_data/* 59 | */dataset/* 60 | /models/ 61 | */models 62 | /.idea/* 63 | /logs/* 64 | /classes/* 65 | !.mvn/wrapper/maven-wrapper.jar 66 | .DS_Store 67 | .idea/ 68 | 69 | 70 | 71 | *Dataset/ 72 | *Datasets/ 73 | *checkpoints/ 74 | *checkpoint/ 75 | 76 | .DS_Store 77 | ### STS ### 78 | .apt_generated 79 | .classpath 80 | .factorypath 81 | .project 82 | .settings 83 | .springBeans 84 | .sts4-cache 85 | 86 | .ipynb_checkpoints/ 87 | .ipynb_checkpoints/* 88 | .ipynb_checkpoints 89 | 90 | ### IntelliJ IDEA ### 91 | .idea 92 | *.iws 93 | *.iml 94 | *.ipr 95 | */target/* 96 | */target 97 | /target/* 98 | /target/ 99 | */dataset 100 | /sample_data/* 101 | 102 | /raw-data/* 103 | */cdk.context.json 104 | cdk-infra/cdk.context.json 105 | /output/* 106 | cdk.out* 107 | *cdk.out* 108 | build.sh 109 | 110 | ### NetBeans ### 111 | /nbproject/private/ 112 | /build/ 113 | /nbbuild/ 114 | /dist/ 115 | /nbdist/ 116 | /.nb-gradle/ 117 | .idea/ 118 | .idea/* 119 | 120 | */.env/ 121 | .env/* 122 | */.env/* 123 | 124 | /cdk.out/ 125 | */cdk.out/ 126 | 127 | */temp/* 128 | */temp 129 | /temp/* 130 | /temp/* 131 | /temp/ 132 | ./temp/ 133 | cdk.context.json 134 | 135 | *.zip 136 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 本repository为多个软件(算法或项目)的目录,不是一个具体软件。各个软件(算法或项目)在具体文件夹中,比如[anomaly-detection/PaDiM](anomaly-detection/PaDiM),由于上游软件(算法或项目)的LICENSE不同,各个软件(算法或项目)的LICENSE在具体软件(算法或项目)中提供。 2 | 本目录收集的各个软件(算法或项目)互相独立,无依赖。 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 
Amazon SageMaker Workshop 2 | 利用Amazon SageMaker进行机器学习和深度学习开发。 3 | ## 版权说明 4 | 本repository为多个软件(算法或项目)的目录,不是一个具体软件。各个软件(算法或项目)在具体文件夹中,比如[anomaly-detection/PaDiM](anomaly-detection/PaDiM),由于上游软件(算法或项目)的LICENSE不同,各个软件(算法或项目)的LICENSE在具体软件(算法或项目)中提供。 5 | 本目录收集的各个软件(算法或项目)互相独立,无依赖。 6 | ## 免责声明 7 | 建议测试过程中使用此方案,生产环境使用请自行考虑评估。 8 | 9 | 当您对方案需要进一步的沟通和反馈后,可以联系 nwcd_labs@nwcdcloud.cn 获得更进一步的支持。 10 | 11 | 欢迎联系参与方案共建和提交方案需求, 也欢迎在 github 项目 issue 中留言反馈 bugs。 12 | 13 | ## 内容简介 14 | 以深度学习中的常用场景,介绍如何使用Amazon SageMaker进行模型训练和推理部署。 15 | 16 | 本目录有以下内容: 17 | - [异常检测anomaly-detection](anomaly-detection/PaDiM/README.md),使用PaDiM演示异常检测 18 | - [图片分类image-classification](image-classification/README.md),使用Amazon SageMaker内置的图片分类算法进行模型训练和部署 19 | - [对象检测object-detection](object-detection/yolov5-on-sagemaker/README.md),使用YOLOv5算法演示对象检测 20 | - [高光谱hyperspectral](hyperspectral/DeepHyperX/README.md),使用DeepHyperX算法对高光谱进行处理 21 | - [封装自定义算法encapsulation](encapsulation/README.md),使用自定义算法,通过Amazon SageMaker进行封装在AWS平台上进行模型训练和部署 22 | - [分布式训练distributed-training](distributed-training),多机多卡分布式训练,[PyTorch](distributed-training/PyTorch/README.md)、[TensorFlow](distributed-training/TensorFlow/data-parallel/README.md) 23 | - [训练数据输入](training-data-input/README.md),解决直接从S3上下载训练数据耗时过长问题;也支持从EFS获取训练数据 24 | - [在线更新模型](update-endpoint/UpdateEndpoint.ipynb),在不停止endpoint服务情况下,更新模型 25 | - [运行时客户端调用runtime](runtime),[Java SDK2调用推理示例](runtime/Java2)(推荐)、[Java SDK1调用推理示例](runtime/Java) 26 | 27 | ## 准备工作 28 | 为了使用Amazon SageMaker您只需要拥有一个AWS的账号,我们就可以实践起来。 29 | 30 | ## 常见问题 31 | ### 1.升级相应Kernel中sagemaker版本 32 | 以升级tensorflow_p36 kernal中sagemaker为例,可先使用`conda env list`查看当前所有虚拟环境 33 | ``` 34 | conda env list 35 | source activate tensorflow_p36 36 | pip install sagemaker --upgrade 37 | ``` 38 | 执行完以上命令重启kernel 39 | ### 2.提示`ResourceLimitExceeded` 40 | 如果训练时,提示类似以下内容: 41 | ``` 42 | ResourceLimitExceeded: An error occurred (ResourceLimitExceeded) when calling the CreateTrainingJob operation: The account-level service limit 'ml.p3.2xlarge for spot training job usage' is 0 Instances, with current utilization of 0 Instances and a request delta of 1 Instances. Please contact AWS support to request an increase for this limit. 
43 | ``` 44 | 为避免误操作造成浪费,默认未开通ml机型大型实例,需要在支持控制面板创建案例,选择提高服务限制。 45 | 限制类型选择`SageMaker`,根据需要选择对应区域,资源类型选择`SageMaker培训`,限制选择期望的机型。 46 | 如果要使用Spot实例进行训练,在描述中说明,参考:`希望提升宁夏区域的 Sagemaker Managed Spot Training ml.p3.2xlarge 限额为1。` 47 | 如果要对推理的机型进行提高服务限制,资源类型选择`SageMaker托管`。 -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/0-preparation/preparation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "palestinian-oriental", 6 | "metadata": {}, 7 | "source": [ 8 | "# PaDiM on SageMaker--数据准备" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "still-tonight", 14 | "metadata": {}, 15 | "source": [ 16 | "## 说明\n", 17 | "本章内容为准备需要所需数据\n", 18 | "## 运行环境\n", 19 | "Kernel 选择pytorch_latest_p37。 \n", 20 | "### S3目录存放格式\n", 21 | "```\n", 22 | "training\n", 23 | "├── ground_truth\n", 24 | "│ ├── broken_1\n", 25 | "│ │ ├── image001.jpg\n", 26 | "│ │ ├── image002.jpg\n", 27 | "│ │ └── ...\n", 28 | "│ └── broken_2\n", 29 | "│ ├── image101.jpg\n", 30 | "│ ├── image102.jpg\n", 31 | "│ └── ...\n", 32 | "├── test\n", 33 | "│ ├── broken_1\n", 34 | "│ │ ├── image001.jpg\n", 35 | "│ │ ├── image002.jpg\n", 36 | "│ │ └── ...\n", 37 | "│ ├── broken_2\n", 38 | "│ │ ├── image101.jpg\n", 39 | "│ │ ├── image102.jpg\n", 40 | "│ │ └── ...\n", 41 | "│ └── good\n", 42 | "│ ├── image201.jpg\n", 43 | "│ ├── image202.jpg\n", 44 | "│ └── ...\n", 45 | "└── train\n", 46 | " └── good\n", 47 | " ├── image301.txt\n", 48 | " ├── image302.txt\n", 49 | " └── ...\n", 50 | "```\n", 51 | "### SageMaker输入数据根目录\n", 52 | "运行SageMaker时,SageMaker会从S3拷贝数据放到到运行容器的`/opt/ml/input/data/training/`下。即`ground_truth/broken_1/image001.jpg`对应全路径为`/opt/ml/input/data/training/ground_truth/broken_1/image001.jpg`\n", 53 | "### 文件说明\n", 54 | "- train目录下只能有一个good目录\n", 55 | "- test目录下除了有一个good目录,还需要至少1个非good目录 \n", 56 | "- ground_truth目录下只有非good目录,且和test目录、文件名一致" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "id": "similar-projection", 62 | "metadata": {}, 63 | "source": [ 64 | "## 下载示例数据\n", 65 | "访问https://www.mvtec.com/company/research/datasets/mvtec-ad/ 下载数据,本文有所修改,请下载单独类别。\n", 66 | "本文以bottle.tar.xz为例进行介绍。" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "id": "capable-barrel", 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "!xz -d bottle.tar.xz" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "id": "possible-prediction", 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "!tar -xf bottle.tar" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "id": "supposed-backing", 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "#增加写权限\n", 97 | "!chmod -R u+w bottle" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "id": "bored-google", 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "#去掉ground_truth文件名称中的_mask\n", 108 | "import os\n", 109 | "gt_dir=os.path.join(\"bottle\",'ground_truth')\n", 110 | "img_types = sorted(os.listdir(gt_dir))\n", 111 | "for img_type in img_types:\n", 112 | " img_type_dir = os.path.join(gt_dir, img_type)\n", 113 | " for f in sorted(os.listdir(img_type_dir)):\n", 114 | " if(f.find(\"_mask\")!=-1):\n", 115 | " os.rename(os.path.join(img_type_dir,f),os.path.join(img_type_dir,f.replace(\"_mask\",\"\")))" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | 
"execution_count": null, 121 | "id": "fantastic-richmond", 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "#修改input_data,训练章节中会继续用到该地址\n", 126 | "input_data = 's3://junzhong/data/mvtec/bottle/'\n", 127 | "!aws s3 sync --quiet bottle $input_data" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "id": "falling-drama", 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [] 137 | } 138 | ], 139 | "metadata": { 140 | "kernelspec": { 141 | "display_name": "Environment (conda_pytorch_latest_p37)", 142 | "language": "python", 143 | "name": "conda_pytorch_latest_p37" 144 | }, 145 | "language_info": { 146 | "codemirror_mode": { 147 | "name": "ipython", 148 | "version": 3 149 | }, 150 | "file_extension": ".py", 151 | "mimetype": "text/x-python", 152 | "name": "python", 153 | "nbconvert_exporter": "python", 154 | "pygments_lexer": "ipython3", 155 | "version": "3.7.10" 156 | } 157 | }, 158 | "nbformat": 4, 159 | "nbformat_minor": 5 160 | } 161 | -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/1-training/source/mvtec.py: -------------------------------------------------------------------------------- 1 | import os 2 | # import tarfile 3 | from PIL import Image 4 | from tqdm import tqdm 5 | # import urllib.request 6 | 7 | import torch 8 | from torch.utils.data import Dataset 9 | from torchvision import transforms as T 10 | 11 | 12 | # URL = 'ftp://guest:GU.205dldo@ftp.softronics.ch/mvtec_anomaly_detection/mvtec_anomaly_detection.tar.xz' 13 | 14 | 15 | class MVTecDataset(Dataset): 16 | def __init__(self, dataset_path, is_train=True, 17 | resize=256, cropsize=224): 18 | #assert class_name in CLASS_NAMES, 'class_name: {}, should be in {}'.format(class_name, CLASS_NAMES) 19 | self.dataset_path = dataset_path 20 | #self.class_name = class_name 21 | self.is_train = is_train 22 | self.resize = resize 23 | self.cropsize = cropsize 24 | # self.mvtec_folder_path = os.path.join(root_path, 'mvtec_anomaly_detection') 25 | 26 | # download dataset if not exist 27 | # self.download() 28 | 29 | # load dataset 30 | self.x, self.y, self.mask = self.load_dataset_folder() 31 | 32 | # set transforms 33 | self.transform_x = T.Compose([T.Resize(resize, Image.ANTIALIAS), 34 | T.CenterCrop(cropsize), 35 | T.ToTensor(), 36 | T.Normalize(mean=[0.485, 0.456, 0.406], 37 | std=[0.229, 0.224, 0.225])]) 38 | self.transform_mask = T.Compose([T.Resize(resize, Image.NEAREST), 39 | T.CenterCrop(cropsize), 40 | T.ToTensor()]) 41 | 42 | def __getitem__(self, idx): 43 | x, y, mask = self.x[idx], self.y[idx], self.mask[idx] 44 | 45 | x = Image.open(x).convert('RGB') 46 | x = self.transform_x(x) 47 | 48 | if y == 0 or not self.hasGt:#good or not has gt 49 | mask = torch.zeros([1, self.cropsize, self.cropsize]) 50 | else: 51 | mask = Image.open(mask) 52 | mask = self.transform_mask(mask) 53 | 54 | return x, y, mask 55 | 56 | def __len__(self): 57 | return len(self.x) 58 | 59 | def load_dataset_folder(self): 60 | phase = 'train' if self.is_train else 'test' 61 | x, y, mask = [], [], [] 62 | 63 | #img_dir = os.path.join(self.dataset_path, self.class_name, phase) 64 | #gt_dir = os.path.join(self.dataset_path, self.class_name, 'ground_truth') 65 | img_dir = os.path.join(self.dataset_path, phase) 66 | gt_dir = os.path.join(self.dataset_path, 'ground_truth') 67 | hasGt = True if os.path.exists(gt_dir) else False 68 | self.hasGt = hasGt 69 | 70 | img_types = sorted(os.listdir(img_dir)) 71 | for img_type in img_types: 72 | 73 | # 
load images 74 | img_type_dir = os.path.join(img_dir, img_type) 75 | if not os.path.isdir(img_type_dir): 76 | continue 77 | img_fpath_list = sorted([os.path.join(img_type_dir, f) 78 | for f in os.listdir(img_type_dir) 79 | if f.endswith('.jpg') or f.endswith('.png')]) 80 | x.extend(img_fpath_list) 81 | 82 | # load gt labels 83 | if img_type == 'good': 84 | y.extend([0] * len(img_fpath_list)) 85 | mask.extend([None] * len(img_fpath_list)) 86 | else: 87 | y.extend([1] * len(img_fpath_list)) 88 | if hasGt: 89 | gt_type_dir = os.path.join(gt_dir, img_type) 90 | #img_fname_list = [os.path.splitext(os.path.basename(f))[0] for f in img_fpath_list] 91 | img_fname_list = [os.path.basename(f) for f in img_fpath_list] 92 | gt_fpath_list = [os.path.join(gt_type_dir, img_fname) 93 | for img_fname in img_fname_list] 94 | mask.extend(gt_fpath_list) 95 | else: 96 | mask.extend([None] * len(img_fpath_list)) 97 | 98 | 99 | assert len(x) == len(y), 'number of x and y should be same' 100 | 101 | return list(x), list(y), list(mask) 102 | 103 | # def download(self): 104 | # """Download dataset if not exist""" 105 | 106 | # if not os.path.exists(self.mvtec_folder_path): 107 | # tar_file_path = self.mvtec_folder_path + '.tar.xz' 108 | # if not os.path.exists(tar_file_path): 109 | # download_url(URL, tar_file_path) 110 | # print('unzip downloaded dataset: %s' % tar_file_path) 111 | # tar = tarfile.open(tar_file_path, 'r:xz') 112 | # tar.extractall(self.mvtec_folder_path) 113 | # tar.close() 114 | 115 | # return 116 | 117 | 118 | # class DownloadProgressBar(tqdm): 119 | # def update_to(self, b=1, bsize=1, tsize=None): 120 | # if tsize is not None: 121 | # self.total = tsize 122 | # self.update(b * bsize - self.n) 123 | 124 | 125 | # def download_url(url, output_path): 126 | # with DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc=url.split('/')[-1]) as t: 127 | # urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to) 128 | -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/1-training/source/requirements2.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | sklearn 3 | matplotlib 4 | scikit-image -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/1-training/training.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "applicable-zimbabwe", 6 | "metadata": {}, 7 | "source": [ 8 | "# PaDiM on SageMaker--训练" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "ethical-commitment", 14 | "metadata": {}, 15 | "source": [ 16 | "## 说明\n", 17 | "本章内容为调用SageMaker进行训练,数据来自S3,训练后的模型放到S3。" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "id": "southeast-pasta", 23 | "metadata": {}, 24 | "source": [ 25 | "## 运行环境\n", 26 | "Kernel 选择pytorch_latest_p37。 \n", 27 | "本文在boto3 1.17.12和sagemaker 2.26.0下测试通过。" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "alert-trial", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "import boto3,sagemaker\n", 38 | "print(boto3.__version__)\n", 39 | "print(sagemaker.__version__)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "acute-instrumentation", 45 | "metadata": {}, 46 | "source": [ 47 | "## 训练" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "id": "german-protein", 54 | "metadata": 
{}, 55 | "outputs": [], 56 | "source": [ 57 | "#修改为自己的路径\n", 58 | "input_data = 's3://junzhong/data/mvtec/bottle/'\n", 59 | "output_data = 's3://junzhong/result/mvtec/'" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "id": "independent-migration", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "import boto3\n", 70 | "iam = boto3.client('iam')\n", 71 | "roles = iam.list_roles(PathPrefix='/service-role')\n", 72 | "role=\"\"\n", 73 | "for current_role in roles[\"Roles\"]:\n", 74 | " if current_role[\"RoleName\"].startswith(\"AmazonSageMaker-ExecutionRole-\"):\n", 75 | " role=current_role[\"Arn\"]\n", 76 | " break\n", 77 | "#如果role为空表示有问题,需要先打开https://cn-northwest-1.console.amazonaws.cn/sagemaker/home?region=cn-northwest-1#/notebook-instances/create以创建IAM Role\n", 78 | "print(role)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "id": "sapphire-ordinary", 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "from sagemaker.pytorch import PyTorch\n", 89 | "\n", 90 | "instance_type='ml.m5.2xlarge'\n", 91 | "\n", 92 | "estimator = PyTorch(entry_point='training.py',\n", 93 | " source_dir='./source',\n", 94 | " role=role,\n", 95 | " output_path=output_data,\n", 96 | " framework_version='1.6.0',\n", 97 | " hyperparameters={'data_path':\"/opt/ml/input/data/training/\", 'save_path':'/opt/ml/model'}, \n", 98 | " py_version='py3',\n", 99 | " instance_count=1,\n", 100 | " instance_type=instance_type,\n", 101 | " use_spot_instances=True,\n", 102 | " max_wait=432000,\n", 103 | " )" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "id": "solved-network", 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "result = estimator.fit(input_data)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "id": "academic-parts", 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "import os\n", 124 | "os.makedirs(\"result\", exist_ok=True)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "id": "spatial-republic", 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "!aws s3 cp $estimator.model_data ./result" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "id": "confirmed-replacement", 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "%%sh\n", 145 | "cd result\n", 146 | "tar zxvf model.tar.gz" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "id": "illegal-billion", 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "!pwd" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "id": "mysterious-publicity", 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "!mv result/temp_wide_resnet50_2/train.pkl ../2-inference/source/" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "id": "primary-fifty", 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [] 176 | } 177 | ], 178 | "metadata": { 179 | "kernelspec": { 180 | "display_name": "Environment (conda_pytorch_latest_p37)", 181 | "language": "python", 182 | "name": "conda_pytorch_latest_p37" 183 | }, 184 | "language_info": { 185 | "codemirror_mode": { 186 | "name": "ipython", 187 | "version": 3 188 | }, 189 | "file_extension": ".py", 190 | "mimetype": "text/x-python", 191 | "name": "python", 192 | 
"nbconvert_exporter": "python", 193 | "pygments_lexer": "ipython3", 194 | "version": "3.7.10" 195 | } 196 | }, 197 | "nbformat": 4, 198 | "nbformat_minor": 5 199 | } 200 | -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/2-inference/Dockerfile: -------------------------------------------------------------------------------- 1 | #ARG BASE_IMG=763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference:1.6.0-gpu-py36-cu101-ubuntu16.04 2 | ARG BASE_IMG=${BASE_IMG} 3 | FROM ${BASE_IMG} 4 | 5 | RUN apt-get update 6 | RUN apt-get install -y --no-install-recommends nginx net-tools\ 7 | && rm -rf /var/lib/apt/lists/* 8 | 9 | RUN pip install flask gevent gunicorn boto3 -i https://opentuna.cn/pypi/web/simple/ && \ 10 | rm -rf /root/.cache 11 | 12 | COPY aws /root/.aws 13 | # RUN mkdir /opt/ml/code 14 | WORKDIR /opt/ml/code 15 | COPY source ./ 16 | 17 | RUN pip install -r requirements.txt -i https://opentuna.cn/pypi/web/simple/ 18 | 19 | # Set some environment variables. PYTHONUNBUFFERED keeps Python from buffering our standard 20 | # output stream, which means that logs can be delivered to the user quickly. PYTHONDONTWRITEBYTECODE 21 | # keeps Python from writing the .pyc files which are unnecessary in this case. We also update 22 | # PATH so that the train and serve programs are found when the container is invoked. 23 | 24 | ENV PYTHONUNBUFFERED=TRUE 25 | ENV PYTHONDONTWRITEBYTECODE=TRUE 26 | ENV PATH="/opt/ml/code/:${PATH}" 27 | 28 | ENTRYPOINT ["python3"] -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/2-inference/aws/config: -------------------------------------------------------------------------------- 1 | [default] 2 | region = cn-northwest-1 3 | -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/2-inference/source/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nwcdheap/sagemaker-workshop/9bcbc4c787da346fcf10aa6fd97a8a100328719a/anomaly-detection/PaDiM/2-inference/source/__init__.py -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/2-inference/source/inference.py: -------------------------------------------------------------------------------- 1 | import random 2 | from random import sample 3 | import argparse 4 | import numpy as np 5 | import os 6 | import pickle 7 | from tqdm import tqdm 8 | from collections import OrderedDict 9 | from sklearn.metrics import roc_auc_score 10 | from sklearn.metrics import roc_curve 11 | from sklearn.metrics import precision_recall_curve 12 | from sklearn.covariance import LedoitWolf 13 | from scipy.spatial.distance import mahalanobis 14 | from scipy.ndimage import gaussian_filter 15 | from skimage import morphology 16 | from skimage.segmentation import mark_boundaries 17 | import matplotlib.pyplot as plt 18 | import matplotlib 19 | 20 | import torch 21 | import torch.nn.functional as F 22 | from torch.utils.data import DataLoader 23 | from torchvision.models import wide_resnet50_2, resnet18 24 | import mvtec as mvtec 25 | from mytime import get_current_time 26 | import boto3 27 | import shutil 28 | 29 | 30 | class Config(object): 31 | def __init__(self): 32 | self.arch = "wide_resnet50_2" 33 | self.data_path = "../dataset/mvtec_anomaly_detection/bottle" 34 | 35 | class DetectionSystem(object): 36 | def __init__(self): 37 | self.s3_client = 
boto3.client("s3") 38 | # extract train set features 39 | train_feature_filepath = 'train.pkl' 40 | print('load train set feature from: %s' % train_feature_filepath) 41 | with open(train_feature_filepath, 'rb') as f: 42 | self.train_outputs = pickle.load(f) 43 | 44 | def predict(self,data_path,upload_bucket,upload_path,threshold=0.673): 45 | args = Config() 46 | save_path = data_path+"save" 47 | 48 | use_cuda = torch.cuda.is_available() 49 | device = torch.device('cuda' if use_cuda else 'cpu') 50 | # load model 51 | if args.arch == 'resnet18': 52 | model = resnet18(pretrained=True, progress=True) 53 | t_d = 448 54 | d = 100 55 | elif args.arch == 'wide_resnet50_2': 56 | model = wide_resnet50_2(pretrained=True, progress=True) 57 | t_d = 1792 58 | d = 550 59 | model.to(device) 60 | model.eval() 61 | random.seed(1024) 62 | torch.manual_seed(1024) 63 | if use_cuda: 64 | torch.cuda.manual_seed_all(1024) 65 | 66 | idx = torch.tensor(sample(range(0, t_d), d)) 67 | 68 | # set model's intermediate outputs 69 | outputs = [] 70 | 71 | def hook(module, input, output): 72 | outputs.append(output) 73 | 74 | model.layer1[-1].register_forward_hook(hook) 75 | model.layer2[-1].register_forward_hook(hook) 76 | model.layer3[-1].register_forward_hook(hook) 77 | 78 | test_outputs = OrderedDict([('layer1', []), ('layer2', []), ('layer3', [])]) 79 | 80 | test_dataset = mvtec.MVTecDataset(data_path, is_train=False) 81 | test_dataloader = DataLoader(test_dataset, batch_size=32, pin_memory=True) 82 | test_imgs = [] 83 | 84 | # extract test set features 85 | for (x, y, mask) in test_dataloader: 86 | test_imgs.extend(x.cpu().detach().numpy()) 87 | # model prediction 88 | with torch.no_grad(): 89 | _ = model(x.to(device)) 90 | # get intermediate layer outputs 91 | for k, v in zip(test_outputs.keys(), outputs): 92 | test_outputs[k].append(v.cpu().detach()) 93 | # initialize hook outputs 94 | outputs = [] 95 | for k, v in test_outputs.items(): 96 | test_outputs[k] = torch.cat(v, 0) 97 | 98 | # Embedding concat 99 | embedding_vectors = test_outputs['layer1'] 100 | for layer_name in ['layer2', 'layer3']: 101 | embedding_vectors = self.embedding_concat(embedding_vectors, test_outputs[layer_name]) 102 | 103 | # randomly select d dimension 104 | embedding_vectors = torch.index_select(embedding_vectors, 1, idx) 105 | 106 | # calculate distance matrix 107 | B, C, H, W = embedding_vectors.size() 108 | embedding_vectors = embedding_vectors.view(B, C, H * W).numpy() 109 | dist_list = [] 110 | for i in range(H * W): 111 | mean = self.train_outputs[0][:, i] 112 | conv_inv = np.linalg.inv(self.train_outputs[1][:, :, i]) 113 | dist = [mahalanobis(sample[:, i], mean, conv_inv) for sample in embedding_vectors] 114 | dist_list.append(dist) 115 | 116 | dist_list = np.array(dist_list).transpose(1, 0).reshape(B, H, W) 117 | 118 | # upsample 119 | dist_list = torch.tensor(dist_list) 120 | score_map = F.interpolate(dist_list.unsqueeze(1), size=x.size(2), mode='bilinear', 121 | align_corners=False).squeeze().numpy() 122 | 123 | # apply gaussian smoothing on the score map 124 | for i in range(score_map.shape[0]): 125 | score_map[i] = gaussian_filter(score_map[i], sigma=4) 126 | 127 | # Normalization 128 | max_score = score_map.max() 129 | min_score = score_map.min() 130 | scores = (score_map - min_score) / (max_score - min_score) 131 | 132 | os.makedirs(save_path, exist_ok=True) 133 | self.plot_fig(test_imgs, scores, threshold, save_path) 134 | 135 | self.upload(save_path,upload_bucket,upload_path) 136 | shutil.rmtree(data_path) 137 | 
shutil.rmtree(save_path) 138 | print(data_path+"推理完毕") 139 | 140 | return "{'result':'OK'}" 141 | 142 | def upload(self,save_path,upload_bucket,upload_path): 143 | if not upload_path.endswith("/"): 144 | upload_path = upload_path + "/" 145 | for f in os.listdir(save_path): 146 | file_name = os.path.join(save_path,f) 147 | self.s3_client.upload_file(file_name,upload_bucket,upload_path+f) 148 | 149 | def plot_fig(self,test_img, scores, threshold, save_dir): 150 | num = len(scores) 151 | vmax = scores.max() * 255. 152 | vmin = scores.min() * 255. 153 | for i in range(num): 154 | img = test_img[i] 155 | img = self.denormalization(img) 156 | heat_map = scores[i] * 255 157 | mask = scores[i] 158 | mask[mask > threshold] = 1 159 | mask[mask <= threshold] = 0 160 | kernel = morphology.disk(4) 161 | mask = morphology.opening(mask, kernel) 162 | mask *= 255 163 | vis_img = mark_boundaries(img, mask, color=(1, 0, 0), mode='thick') 164 | fig_img, ax_img = plt.subplots(1, 4, figsize=(10, 3)) 165 | fig_img.subplots_adjust(right=0.9) 166 | norm = matplotlib.colors.Normalize(vmin=vmin, vmax=vmax) 167 | for ax_i in ax_img: 168 | ax_i.axes.xaxis.set_visible(False) 169 | ax_i.axes.yaxis.set_visible(False) 170 | ax_img[0].imshow(img) 171 | ax_img[0].title.set_text('Image') 172 | ax = ax_img[1].imshow(heat_map, cmap='jet', norm=norm) 173 | ax_img[1].imshow(img, cmap='gray', interpolation='none') 174 | ax_img[1].imshow(heat_map, cmap='jet', alpha=0.5, interpolation='none') 175 | ax_img[1].title.set_text('Predicted heat map') 176 | ax_img[2].imshow(mask, cmap='gray') 177 | ax_img[2].title.set_text('Predicted mask') 178 | ax_img[3].imshow(vis_img) 179 | ax_img[3].title.set_text('Segmentation result') 180 | left = 0.92 181 | bottom = 0.15 182 | width = 0.015 183 | height = 1 - 2 * bottom 184 | rect = [left, bottom, width, height] 185 | cbar_ax = fig_img.add_axes(rect) 186 | cb = plt.colorbar(ax, shrink=0.6, cax=cbar_ax, fraction=0.046) 187 | cb.ax.tick_params(labelsize=8) 188 | font = { 189 | 'family': 'serif', 190 | 'color': 'black', 191 | 'weight': 'normal', 192 | 'size': 8, 193 | } 194 | cb.set_label('Anomaly Score', fontdict=font) 195 | 196 | fig_img.savefig(os.path.join(save_dir, '{}'.format(i)), dpi=100) 197 | plt.close() 198 | 199 | 200 | def denormalization(self,x): 201 | mean = np.array([0.485, 0.456, 0.406]) 202 | std = np.array([0.229, 0.224, 0.225]) 203 | x = (((x.transpose(1, 2, 0) * std) + mean) * 255.).astype(np.uint8) 204 | 205 | return x 206 | 207 | 208 | def embedding_concat(self,x, y): 209 | B, C1, H1, W1 = x.size() 210 | _, C2, H2, W2 = y.size() 211 | s = int(H1 / H2) 212 | x = F.unfold(x, kernel_size=s, dilation=1, stride=s) 213 | x = x.view(B, C1, -1, H2, W2) 214 | z = torch.zeros(B, C1 + C2, x.size(2), H2, W2) 215 | for i in range(x.size(2)): 216 | z[:, :, i, :, :] = torch.cat((x[:, :, i, :, :], y), 1) 217 | z = z.view(B, -1, H2 * W2) 218 | z = F.fold(z, kernel_size=s, output_size=(H1, W1), stride=s) 219 | 220 | return z 221 | 222 | -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/2-inference/source/mvtec.py: -------------------------------------------------------------------------------- 1 | import os 2 | # import tarfile 3 | from PIL import Image 4 | from tqdm import tqdm 5 | # import urllib.request 6 | 7 | import torch 8 | from torch.utils.data import Dataset 9 | from torchvision import transforms as T 10 | 11 | 12 | # URL = 'ftp://guest:GU.205dldo@ftp.softronics.ch/mvtec_anomaly_detection/mvtec_anomaly_detection.tar.xz' 13 | 14 | 
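# Inference-time variant of the training MVTecDataset: instead of the
# train/test/ground_truth layout, it loads every .jpg/.png found directly in
# dataset_path, labels each image as anomalous (y=1) and returns an all-zero
# mask, since no ground-truth masks are available when serving requests.
# Typical use (as in inference.py):
#   DataLoader(MVTecDataset(data_path, is_train=False), batch_size=32, pin_memory=True)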
15 | class MVTecDataset(Dataset): 16 | def __init__(self, dataset_path, is_train=True, 17 | resize=256, cropsize=224): 18 | #assert class_name in CLASS_NAMES, 'class_name: {}, should be in {}'.format(class_name, CLASS_NAMES) 19 | self.dataset_path = dataset_path 20 | #self.class_name = class_name 21 | self.is_train = is_train 22 | self.resize = resize 23 | self.cropsize = cropsize 24 | # self.mvtec_folder_path = os.path.join(root_path, 'mvtec_anomaly_detection') 25 | 26 | # download dataset if not exist 27 | # self.download() 28 | 29 | # load dataset 30 | self.x, self.y, self.mask = self.load_dataset_folder() 31 | 32 | # set transforms 33 | self.transform_x = T.Compose([T.Resize(resize, Image.ANTIALIAS), 34 | T.CenterCrop(cropsize), 35 | T.ToTensor(), 36 | T.Normalize(mean=[0.485, 0.456, 0.406], 37 | std=[0.229, 0.224, 0.225])]) 38 | self.transform_mask = T.Compose([T.Resize(resize, Image.NEAREST), 39 | T.CenterCrop(cropsize), 40 | T.ToTensor()]) 41 | 42 | def __getitem__(self, idx): 43 | x, y, mask = self.x[idx], self.y[idx], self.mask[idx] 44 | 45 | x = Image.open(x).convert('RGB') 46 | x = self.transform_x(x) 47 | 48 | mask = torch.zeros([1, self.cropsize, self.cropsize]) 49 | 50 | return x, y, mask 51 | 52 | def __len__(self): 53 | return len(self.x) 54 | 55 | def load_dataset_folder(self): 56 | x, y, mask = [], [], [] 57 | 58 | img_fpath_list = sorted([os.path.join(self.dataset_path, f) 59 | for f in os.listdir(self.dataset_path) 60 | if f.endswith('.jpg') or f.endswith('.png')]) 61 | x.extend(img_fpath_list) 62 | y.extend([1] * len(img_fpath_list)) 63 | mask.extend([None] * len(img_fpath_list)) 64 | 65 | 66 | assert len(x) == len(y), 'number of x and y should be same' 67 | 68 | return list(x), list(y), list(mask) 69 | -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/2-inference/source/mytime.py: -------------------------------------------------------------------------------- 1 | import time 2 | def get_current_time(): 3 | ct = time.time() 4 | local_time = time.localtime(ct) 5 | data_head = time.strftime("%Y-%m-%d %H:%M:%S", local_time) 6 | data_secs = (ct - int(ct)) * 1000 7 | time_stamp = "%s.%03d" % (data_head, data_secs) 8 | return time_stamp -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/2-inference/source/nginx.conf: -------------------------------------------------------------------------------- 1 | worker_processes 1; 2 | daemon off; # Prevent forking 3 | 4 | 5 | pid /tmp/nginx.pid; 6 | error_log /var/log/nginx/error.log; 7 | 8 | events { 9 | # defaults 10 | } 11 | 12 | http { 13 | include /etc/nginx/mime.types; 14 | default_type application/octet-stream; 15 | access_log /var/log/nginx/access.log combined; 16 | 17 | upstream gunicorn { 18 | server unix:/tmp/gunicorn.sock; 19 | } 20 | 21 | server { 22 | listen 8080 deferred; 23 | client_max_body_size 5m; 24 | 25 | keepalive_timeout 5; 26 | proxy_read_timeout 1200s; 27 | 28 | location ~ ^/(ping|invocations) { 29 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 30 | proxy_set_header Host $http_host; 31 | proxy_redirect off; 32 | proxy_pass http://gunicorn; 33 | } 34 | 35 | location / { 36 | return 404 "{}"; 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/2-inference/source/predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import boto3 4 | 
import flask 5 | import json 6 | import shutil 7 | import time,datetime 8 | import random 9 | from inference import DetectionSystem 10 | import _thread 11 | 12 | DEBUG = False 13 | 14 | # The flask app for serving predictions 15 | app = flask.Flask(__name__) 16 | 17 | import logging 18 | logger = logging.getLogger(__name__) 19 | logger.setLevel(logging.DEBUG) 20 | logger.addHandler(logging.StreamHandler(sys.stdout)) 21 | 22 | @app.route('/ping', methods=['GET']) 23 | def ping(): 24 | """Determine if the container is working and healthy. In this sample container, we declare 25 | it healthy if we can load the model successfully.""" 26 | #health = boto3.client('s3') is not None # You can insert a health check here 27 | 28 | #status = 200 if health else 404 29 | status = 200 30 | return flask.Response(response='\n', status=status, mimetype='application/json') 31 | 32 | 33 | @app.route('/') 34 | def hello_world(): 35 | return 'PaDiM endpoint' 36 | 37 | 38 | @app.route('/invocations', methods=['POST']) 39 | def invocations(): 40 | content_type = flask.request.content_type 41 | if content_type != 'application/json' : 42 | return flask.Response(response='This predictor only supports JSON data', status=415, mimetype='text/plain') 43 | 44 | tt = time.strftime("%Y%m%d%H%M%S", time.localtime()) 45 | for i in range(0,5): 46 | randomstr = str(random.randint(1000,9999)) 47 | current_data_dir = os.path.join(init_data_dir,tt+randomstr) 48 | if not os.path.exists(current_data_dir): 49 | try: 50 | os.mkdir(current_data_dir) 51 | break 52 | except FileExistsError: 53 | logger.info("Dir Exist."+current_data_dir) 54 | else: 55 | return flask.Response(response='Make dir error', status=500, mimetype='text/plain') 56 | 57 | data = flask.request.data.decode('utf-8') 58 | logger.info("invocations params [{}]".format(data)) 59 | try: 60 | data = json.loads(data) 61 | except: 62 | return flask.Response(response='This predictor only supports JSON data', status=415, mimetype='text/plain') 63 | 64 | bucket = data['bucket'] 65 | for image_uri in data['image_uri']: 66 | download_file_name = image_uri.split('/')[-1] 67 | download_file_name = os.path.join(current_data_dir, download_file_name) 68 | s3_client.download_file(bucket, image_uri, download_file_name) 69 | upload_bucket = data['upload_bucket'] 70 | upload_path = data['upload_path'] 71 | 72 | #inference_result = detection.predict(current_data_dir) 73 | #shutil.rmtree(current_data_dir) 74 | _thread.start_new_thread( asyncPredict, (current_data_dir,upload_bucket,upload_path) ) 75 | 76 | _payload = json.dumps({'code': 1, 'msg': 'async Predict'}) 77 | return flask.Response(response=_payload, status=200, mimetype='application/json') 78 | 79 | def asyncPredict(current_data_dir,bucket,path): 80 | inference_result = detection.predict(current_data_dir,bucket,path) 81 | 82 | #--------------------------------------- 83 | init_data_dir = '/opt/ml/data_dir' 84 | 85 | if not os.path.exists(init_data_dir): 86 | try: 87 | os.mkdir(init_data_dir) 88 | except FileExistsError: 89 | logger.info("Dir Exist.") 90 | 91 | s3_client = boto3.client("s3") 92 | detection = DetectionSystem() 93 | #--------------------------------------- 94 | 95 | 96 | if __name__ == '__main__': 97 | app.run() -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/2-inference/source/requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | sklearn 3 | matplotlib 4 | scikit-image 
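predictor.py above accepts an asynchronous JSON request: it downloads the listed images from S3, scores them in a background thread, and uploads the rendered result figures to the given S3 prefix. A minimal client sketch for invoking the deployed endpoint could look like the following; the endpoint name and the S3 buckets/keys are placeholders, not values from this repository.

```python
import json
import boto3

# Placeholder names -- substitute your own endpoint, buckets and keys.
endpoint_name = "padim-endpoint"
payload = {
    "bucket": "my-input-bucket",           # bucket that holds the images to score
    "image_uri": ["data/bottle/000.png"],  # one or more keys inside that bucket
    "upload_bucket": "my-output-bucket",   # bucket for the rendered result figures
    "upload_path": "result/ad",            # key prefix for the uploaded figures
}

runtime = boto3.client("sagemaker-runtime")
response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="application/json",  # predictor.py returns HTTP 415 for other content types
    Body=json.dumps(payload),
)

# The endpoint answers immediately; scoring runs in a background thread and the
# result images later appear under s3://my-output-bucket/result/ad/.
print(response["Body"].read().decode("utf-8"))  # {"code": 1, "msg": "async Predict"}
```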
-------------------------------------------------------------------------------- /anomaly-detection/PaDiM/2-inference/source/serve: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # This file implements the scoring service shell. You don't necessarily need to modify it for various 4 | # algorithms. It starts nginx and gunicorn with the correct configurations and then simply waits until 5 | # gunicorn exits. 6 | # 7 | # The flask server is specified to be the app object in wsgi.py 8 | # 9 | # We set the following parameters: 10 | # 11 | # Parameter Environment Variable Default Value 12 | # --------- -------------------- ------------- 13 | # number of workers MODEL_SERVER_WORKERS the number of CPU cores 14 | # timeout MODEL_SERVER_TIMEOUT 60 seconds 15 | 16 | from __future__ import print_function 17 | import multiprocessing 18 | import os 19 | import signal 20 | import subprocess 21 | import sys 22 | 23 | 24 | cpu_count = multiprocessing.cpu_count() 25 | 26 | model_server_timeout = os.environ.get('MODEL_SERVER_TIMEOUT', 120) 27 | #model_server_workers = int(os.environ.get('MODEL_SERVER_WORKERS', cpu_count)) 28 | model_server_workers = int(os.environ.get('MODEL_SERVER_WORKERS', 1)) 29 | 30 | def sigterm_handler(nginx_pid, gunicorn_pid): 31 | try: 32 | os.kill(nginx_pid, signal.SIGQUIT) 33 | except OSError: 34 | pass 35 | try: 36 | os.kill(gunicorn_pid, signal.SIGTERM) 37 | except OSError: 38 | pass 39 | 40 | sys.exit(0) 41 | 42 | def start_server(): 43 | print('Starting the inference server with {} workers.'.format(model_server_workers)) 44 | 45 | 46 | # link the log streams to stdout/err so they will be logged to the container logs 47 | subprocess.check_call(['ln', '-sf', '/dev/stdout', '/var/log/nginx/access.log']) 48 | subprocess.check_call(['ln', '-sf', '/dev/stderr', '/var/log/nginx/error.log']) 49 | 50 | nginx = subprocess.Popen(['nginx', '-c', '/opt/ml/code/nginx.conf']) 51 | gunicorn = subprocess.Popen(['gunicorn', 52 | '--timeout', str(model_server_timeout), 53 | '-k', 'gevent', 54 | '-b', 'unix:/tmp/gunicorn.sock', 55 | '-w', str(model_server_workers), 56 | 'wsgi:app']) 57 | 58 | signal.signal(signal.SIGTERM, lambda a, b: sigterm_handler(nginx.pid, gunicorn.pid)) 59 | 60 | # If either subprocess exits, so do we. 61 | pids = set([nginx.pid, gunicorn.pid]) 62 | while True: 63 | pid, _ = os.wait() 64 | if pid in pids: 65 | break 66 | 67 | sigterm_handler(nginx.pid, gunicorn.pid) 68 | print('Inference server exiting') 69 | 70 | # The main routine just invokes the start function. 
71 | 72 | if __name__ == '__main__': 73 | start_server() 74 | -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/2-inference/source/test.py: -------------------------------------------------------------------------------- 1 | import _thread 2 | from inference import DetectionSystem 3 | 4 | detection = DetectionSystem() 5 | def asyncPredict(current_data_dir,bucket,path): 6 | inference_result = detection.predict(current_data_dir,bucket,path) 7 | 8 | current_data_dir="/opt/ml/data_dir/202103180920572600" 9 | _thread.start_new_thread( asyncPredict, (current_data_dir,"junzhong","result/ad") ) 10 | while 1: 11 | pass -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/2-inference/source/wsgi.py: -------------------------------------------------------------------------------- 1 | import predictor as myapp 2 | 3 | # This is just a simple wrapper for gunicorn to find your app. 4 | # If you want to change the algorithm file, simply change "predictor" above to the 5 | # new file. 6 | 7 | app = myapp.app 8 | -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/README.md: -------------------------------------------------------------------------------- 1 | # PaDiM on SageMaker 2 | This workshop demonstrates how to train and run inference with PaDiM on SageMaker. 3 | PaDiM performs anomaly detection. 4 | ![Detection example](images/detection_example.png) 5 | ## Data preparation 6 | [0-preparation](0-preparation) explains how to prepare the data in the required format and upload it to S3. 7 | ## Training 8 | [1-training](1-training) demonstrates training on SageMaker. 9 | ## Inference 10 | [2-inference](2-inference) demonstrates deploying an Endpoint on SageMaker and invoking it for inference. -------------------------------------------------------------------------------- /anomaly-detection/PaDiM/images/detection_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nwcdheap/sagemaker-workshop/9bcbc4c787da346fcf10aa6fd97a8a100328719a/anomaly-detection/PaDiM/images/detection_example.png -------------------------------------------------------------------------------- /distributed-training/PyTorch/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch distributed training on SageMaker 2 | This workshop demonstrates distributed training with PyTorch on SageMaker. 3 | Original source: https://github.com/aws/amazon-sagemaker-examples/tree/master/sagemaker-python-sdk/pytorch_mnist 4 | This workshop uses PyTorch's built-in DistributedDataParallel rather than SageMaker's smdistributed. -------------------------------------------------------------------------------- /distributed-training/PyTorch/code/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nwcdheap/sagemaker-workshop/9bcbc4c787da346fcf10aa6fd97a8a100328719a/distributed-training/PyTorch/code/requirements.txt -------------------------------------------------------------------------------- /distributed-training/TensorFlow/data-parallel/README.md: -------------------------------------------------------------------------------- 1 | # TensorFlow distributed training on SageMaker 2 | This workshop demonstrates distributed training with TensorFlow on SageMaker using smdistributed. -------------------------------------------------------------------------------- /distributed-training/TensorFlow/data-parallel/code/requirements.txt: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/nwcdheap/sagemaker-workshop/9bcbc4c787da346fcf10aa6fd97a8a100328719a/distributed-training/TensorFlow/data-parallel/code/requirements.txt -------------------------------------------------------------------------------- /distributed-training/TensorFlow/data-parallel/code/train_tensorflow_smdataparallel_mnist.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | import os 19 | 20 | # Import SMDataParallel TensorFlow2 Modules 21 | import smdistributed.dataparallel.tensorflow as dist 22 | import tensorflow as tf 23 | 24 | import argparse 25 | if __name__ == '__main__': 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument('--batch_size', type=int, default=128) 28 | 29 | #不能使用 args = parser.parse_args() 30 | args, _ = parser.parse_known_args() 31 | print("args.batch_size="+str(args.batch_size)) 32 | 33 | tf.random.set_seed(42) 34 | 35 | # SMDataParallel: Initialize 36 | dist.init() 37 | 38 | print("dist.size()="+str(dist.size())) 39 | print("dist.rank()="+str(dist.rank())) 40 | 41 | gpus = tf.config.experimental.list_physical_devices("GPU") 42 | for gpu in gpus: 43 | tf.config.experimental.set_memory_growth(gpu, True) 44 | if gpus: 45 | # SMDataParallel: Pin GPUs to a single SMDataParallel process [use SMDataParallel local_rank() API] 46 | tf.config.experimental.set_visible_devices(gpus[dist.local_rank()], "GPU") 47 | 48 | #(mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data(path="mnist-%d.npz" % dist.rank()) 49 | (mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data(path="/opt/ml/input/data/training/mnist.npz") 50 | 51 | dataset = tf.data.Dataset.from_tensor_slices( 52 | (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), tf.cast(mnist_labels, tf.int64)) 53 | ) 54 | dataset = dataset.repeat().shuffle(10000).batch(args.batch_size) 55 | 56 | mnist_model = tf.keras.Sequential( 57 | [ 58 | tf.keras.layers.Conv2D(32, [3, 3], activation="relu"), 59 | tf.keras.layers.Conv2D(64, [3, 3], activation="relu"), 60 | tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), 61 | tf.keras.layers.Dropout(0.25), 62 | tf.keras.layers.Flatten(), 63 | tf.keras.layers.Dense(128, activation="relu"), 64 | tf.keras.layers.Dropout(0.5), 65 | tf.keras.layers.Dense(10, activation="softmax"), 66 | ] 67 | ) 68 | loss = tf.losses.SparseCategoricalCrossentropy() 69 | 70 | # SMDataParallel: dist.size() 71 | # LR for 8 node run : 0.000125 72 | # LR for single node run : 0.001 73 | opt = tf.optimizers.Adam(0.000125 * dist.size()) 74 | 75 | checkpoint_dir = os.environ["SM_MODEL_DIR"] 76 | 77 | checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt) 78 | 79 | 80 | 
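# A note on the learning-rate choice above: dist.size() is the total number of
# data-parallel workers (one process per GPU across all instances). The accompanying
# notebook launches this script on 2 x ml.p3.16xlarge instances (8 GPUs each), so
# dist.size() is 16 and the effective learning rate is 0.000125 * 16 = 0.002 -- the
# usual linear-scaling heuristic for keeping convergence behaviour comparable as the
# global batch size (128 per worker) grows with the number of workers.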
@tf.function 81 | def training_step(images, labels, first_batch): 82 | with tf.GradientTape() as tape: 83 | probs = mnist_model(images, training=True) 84 | loss_value = loss(labels, probs) 85 | 86 | # SMDataParallel: Wrap tf.GradientTape with SMDataParallel's DistributedGradientTape 87 | tape = dist.DistributedGradientTape(tape) 88 | 89 | grads = tape.gradient(loss_value, mnist_model.trainable_variables) 90 | opt.apply_gradients(zip(grads, mnist_model.trainable_variables)) 91 | 92 | if first_batch: 93 | # SMDataParallel: Broadcast model and optimizer variables 94 | dist.broadcast_variables(mnist_model.variables, root_rank=0) 95 | dist.broadcast_variables(opt.variables(), root_rank=0) 96 | 97 | # SMDataParallel: all_reduce call 98 | loss_value = dist.oob_allreduce(loss_value) # Average the loss across workers 99 | return loss_value 100 | 101 | 102 | for batch, (images, labels) in enumerate(dataset.take(10000 // dist.size())): 103 | loss_value = training_step(images, labels, batch == 0) 104 | 105 | if batch % 50 == 0 and dist.rank() == 0: 106 | print("Step #%d\tLoss: %.6f" % (batch, loss_value)) 107 | 108 | # SMDataParallel: Save checkpoints only from master node. 109 | if dist.rank() == 0: 110 | mnist_model.save(os.path.join(checkpoint_dir, "1")) 111 | -------------------------------------------------------------------------------- /distributed-training/TensorFlow/data-parallel/tensorflow2_smdataparallel_mnist_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# TensorFlow distributed training on SageMaker" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 1 说明\n", 15 | "本章内容为演示TensorFlow在SageMaker上使用smdistributed进行分布式训练。 " 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## 2 运行环境\n", 23 | "Kernel 选择tensorflow2_p36。 \n", 24 | "本文在boto3 1.17.109和sagemaker 2.48.1下测试通过。" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "import boto3,sagemaker\n", 34 | "print(boto3.__version__)\n", 35 | "print(sagemaker.__version__)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "如果版本较低,请执行以下命令,重启kernal后再检查版本" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "!pip install -U boto3 -i https://opentuna.cn/pypi/web/simple/" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!pip install -U sagemaker -i https://opentuna.cn/pypi/web/simple/" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## 3 设置/获取相关参数" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "import boto3\n", 77 | "import sagemaker\n", 78 | "from sagemaker.image_uris import retrieve\n", 79 | "\n", 80 | "sagemaker_session = sagemaker.Session()\n", 81 | "iam = boto3.client('iam')\n", 82 | "roles = iam.list_roles(PathPrefix='/service-role')\n", 83 | "role=\"\"\n", 84 | "for current_role in roles[\"Roles\"]:\n", 85 | " if current_role[\"RoleName\"].startswith(\"AmazonSageMaker-ExecutionRole-\"):\n", 86 | " role=current_role[\"Arn\"]\n", 87 | " break\n", 88 | 
"#如果role为空表示有问题,需要先打开https://cn-northwest-1.console.amazonaws.cn/sagemaker/home?region=cn-northwest-1#/notebook-instances/create以创建IAM Role\n", 89 | "print(role)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "data_input=\"s3://junzhong/data/mnist.npz\"" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "use_spot = True" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "## 4 训练" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "from sagemaker.tensorflow import TensorFlow\n", 124 | "\n", 125 | "estimator = TensorFlow(\n", 126 | " base_job_name=\"tensorflow2-smdataparallel-mnist\",\n", 127 | " source_dir=\"code\",\n", 128 | " entry_point=\"train_tensorflow_smdataparallel_mnist.py\",\n", 129 | " role=role,\n", 130 | " py_version=\"py37\",\n", 131 | " framework_version=\"2.3.1\",\n", 132 | " instance_count=2,\n", 133 | " instance_type=\"ml.p3.16xlarge\",\n", 134 | " sagemaker_session=sagemaker_session,\n", 135 | " hyperparameters={'batch_size':128},\n", 136 | " use_spot_instances=use_spot,\n", 137 | " max_wait=7200 if use_spot else None,\n", 138 | " max_run=7200,\n", 139 | " # Training using SMDataParallel Distributed Training Framework\n", 140 | " distribution={\"smdistributed\": {\"dataparallel\": {\"enabled\": True}}},\n", 141 | ")\n", 142 | "#日志都会输出到第1个node上\n", 143 | "estimator.fit(data_input)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "model_data = estimator.model_data\n", 153 | "model_data" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "## 5 部署" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "from sagemaker.tensorflow.model import TensorFlowModel\n", 170 | "model = TensorFlowModel(\n", 171 | " model_data=model_data, \n", 172 | " role=role,\n", 173 | " framework_version='2.3.1')\n", 174 | "predictor = model.deploy(initial_instance_count=1, instance_type=\"ml.m5.large\",endpoint_name=\"tensorflowmnist\")" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "## 6 推理" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "import tensorflow as tf\n", 191 | "(mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data()" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "import numpy as np\n", 201 | "import random\n", 202 | "image_size = 3\n", 203 | "mask1 = random.sample(range(len(mnist_images)), image_size)\n", 204 | "mask2 = np.array(mask1, dtype=np.int)\n", 205 | "data = mnist_images[mask2]" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "from matplotlib import pyplot as plt\n", 215 | "plt.figure(figsize=(2,2))\n", 216 | "for index, mask in enumerate(mask1):\n", 217 | " plt.subplot(1,image_size,index+1)\n", 218 | " 
plt.imshow(mnist_images[mask])\n", 219 | " plt.axis('off')\n", 220 | "plt.show()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "from sagemaker.tensorflow.model import TensorFlowPredictor\n", 230 | "endpoint_name = \"tensorflowmnist\"\n", 231 | "predictor = TensorFlowPredictor(\n", 232 | " endpoint_name=endpoint_name,\n", 233 | " sagemaker_session=sagemaker.Session())" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "predict输入和输出数据的格式直接对应Predict于TensorFlow Serving REST API 中方法的请求和响应格式 \n", 241 | "除此外,还支持简化的 json 格式、行分隔的 json 对象(“jsons”或“jsonlines”)和 CSV 数据" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "response = predictor.predict(np.expand_dims(data, axis=3))\n", 251 | "for i in range(0,image_size):\n", 252 | " print(\"Most likely answer: {}\".format(np.argmax(response[\"predictions\"][i])))" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "## 7 清理" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "predictor.delete_endpoint()" 269 | ] 270 | } 271 | ], 272 | "metadata": { 273 | "kernelspec": { 274 | "display_name": "Environment (conda_tensorflow2_p36)", 275 | "language": "python", 276 | "name": "conda_tensorflow2_p36" 277 | }, 278 | "language_info": { 279 | "codemirror_mode": { 280 | "name": "ipython", 281 | "version": 3 282 | }, 283 | "file_extension": ".py", 284 | "mimetype": "text/x-python", 285 | "name": "python", 286 | "nbconvert_exporter": "python", 287 | "pygments_lexer": "ipython3", 288 | "version": "3.6.13" 289 | } 290 | }, 291 | "nbformat": 4, 292 | "nbformat_minor": 4 293 | } 294 | -------------------------------------------------------------------------------- /encapsulation/Dockerfile: -------------------------------------------------------------------------------- 1 | # Using the official tensorflow serving image from docker hub as base image 2 | FROM tensorflow/serving 3 | 4 | # Installing NGINX, used to rever proxy the predictions from SageMaker to TF Serving 5 | RUN apt-get update && apt-get install -y --no-install-recommends nginx git 6 | 7 | # Copy our model folder to the container 8 | COPY ./output/tf_server /model 9 | 10 | # Copy NGINX configuration to the container 11 | COPY nginx.conf /etc/nginx/nginx.conf 12 | 13 | # starts NGINX and TF serving pointing to our model 14 | ENTRYPOINT service nginx start | tensorflow_model_server --rest_api_port=8501 \ 15 | --model_name=sagemaker-demo \ 16 | --model_base_path=/model -------------------------------------------------------------------------------- /encapsulation/README.md: -------------------------------------------------------------------------------- 1 | # 说明 2 | 3 | 演示使用Sagemaker 封装图片分类算法, 使用TensorFlow-server 部署模型,并在客户端进行调用。 4 | 5 | 6 | ## 使用sagemaker 训练 7 | 8 | 打开 [train.ipynb](train.ipynb) 进行训练,然后打开[inference-custom-image.ipynb](inference-custom-image.ipynb)或[inference-default-image.ipynb](inference-default-image.ipynb)进行部署和使用 9 | 10 | ## 超级参数优化 11 | 打开 [hyperparameter-tuning.ipynb](hyperparameter-tuning.ipynb) 进行参数优化 -------------------------------------------------------------------------------- /encapsulation/inference-default-image.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 使用TensorFlow默认Image进行推理" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "需要用到Tensorflow 和Keras , kernel 选择conda_tensorflow_p36\n", 15 | "\n", 16 | "## 把训练好的模型存放S3上\n", 17 | "model.tar.gz内目录结构如下\n", 18 | "```\n", 19 | "model.tar.gz\n", 20 | "└── tf_server\n", 21 | " └── 1\n", 22 | " ├── saved_model.pb\n", 23 | " └── variables\n", 24 | " ├── variables.data-00000-of-00001\n", 25 | " └── variables.index\n", 26 | "```" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## 部署模型到SageMaker" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "from sagemaker.tensorflow.model import TensorFlowModel\n", 43 | "from sagemaker import get_execution_role\n", 44 | "\n", 45 | "#role = get_execution_role()\n", 46 | "role=\"arn:aws-cn:iam::315505707008:role/service-role/AmazonSageMaker-ExecutionRole-20200430T124235\" \n", 47 | "\n", 48 | "model_uri = \"s3://nowfox/data/cat-vs-dog-output/tensorflow-training-2020-09-16-06-27-15-538/output/model.tar.gz\"\n", 49 | "endpoint_name = \"sagemaker-cat-vs-dog-2\"\n", 50 | "my_model = TensorFlowModel(\n", 51 | " model_data=model_uri, \n", 52 | " role=role,\n", 53 | " framework_version='1.15.2')\n", 54 | "\n", 55 | "#该步骤大概需要10分钟\n", 56 | "my_model.deploy(initial_instance_count=1,\n", 57 | " endpoint_name=endpoint_name,\n", 58 | " instance_type='ml.t2.medium'\n", 59 | " )" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## 推理\n", 67 | "### 读取数据" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "from keras.preprocessing import image\n", 77 | "import json\n", 78 | "import numpy as np\n", 79 | "\n", 80 | "\n", 81 | "IMAGE_WIDTH = 150\n", 82 | "IMAGE_HEIGHT = 150\n", 83 | "# 修改测试图片地址\n", 84 | "image_paths = 'test/cat.681.jpg'\n", 85 | "#image_paths = 'test/dog.592.jpg'\n", 86 | "images = image.load_img(image_paths, target_size=(IMAGE_WIDTH, IMAGE_HEIGHT))\n", 87 | "input_image = image.img_to_array(images)\n", 88 | "input_image = np.expand_dims(input_image, axis=0)\n", 89 | "input_image /= 255.\n", 90 | "\n", 91 | "input_images = input_image.tolist()\n", 92 | "\n", 93 | "data = {\"name\": 'tensorflow/serving/predict',\"signature_name\":'predict',\"inputs\":input_images}" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "import sagemaker\n", 103 | "from sagemaker.tensorflow.model import TensorFlowPredictor\n", 104 | "\n", 105 | "endpoint_name = \"sagemaker-cat-vs-dog-2\"\n", 106 | "predictor = TensorFlowPredictor(\n", 107 | " endpoint_name=endpoint_name,\n", 108 | " sagemaker_session=sagemaker.Session())" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "result = predictor.predict(data)\n", 118 | "print(result)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "## 删除Endpoint" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "import boto3\n", 
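"# Note: delete_endpoint removes only the endpoint itself; the endpoint configuration and model created by deploy() remain and can be deleted separately (delete_endpoint_config / delete_model) if no longer needed\n",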
135 | "sage = boto3.Session().client(service_name='sagemaker') \n", 136 | "sage.delete_endpoint(EndpointName=endpoint_name)" 137 | ] 138 | } 139 | ], 140 | "metadata": { 141 | "kernelspec": { 142 | "display_name": "Environment (conda_tensorflow_p37)", 143 | "language": "python", 144 | "name": "conda_tensorflow_p37" 145 | }, 146 | "language_info": { 147 | "codemirror_mode": { 148 | "name": "ipython", 149 | "version": 3 150 | }, 151 | "file_extension": ".py", 152 | "mimetype": "text/x-python", 153 | "name": "python", 154 | "nbconvert_exporter": "python", 155 | "pygments_lexer": "ipython3", 156 | "version": "3.7.10" 157 | } 158 | }, 159 | "nbformat": 4, 160 | "nbformat_minor": 4 161 | } 162 | -------------------------------------------------------------------------------- /encapsulation/nginx.conf: -------------------------------------------------------------------------------- 1 | events { 2 | # determines how many requests can simultaneously be served 3 | # https://www.digitalocean.com/community/tutorials/how-to-optimize-nginx-configuration 4 | # for more information 5 | worker_connections 2048; 6 | } 7 | 8 | http { 9 | client_max_body_size 100m; 10 | client_body_buffer_size 128k; 11 | server { 12 | # configures the server to listen to the port 8080 13 | # Amazon SageMaker sends inference requests to port 8080. 14 | # For more information: https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-code-container-response 15 | listen 8080 deferred; 16 | 17 | # redirects requests from SageMaker to TF Serving 18 | location /invocations { 19 | proxy_pass http://localhost:8501/v1/models/sagemaker-demo:predict; 20 | } 21 | 22 | # Used by SageMaker to confirm if server is alive. 23 | # https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-algo-ping-requests 24 | location /ping { 25 | return 200 "OK"; 26 | } 27 | } 28 | } -------------------------------------------------------------------------------- /encapsulation/source/export_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import os 3 | import tensorflow.keras.backend as K 4 | from tensorflow.keras.losses import categorical_crossentropy 5 | from tensorflow.keras.optimizers import Adadelta 6 | 7 | 8 | def export_model(model, 9 | export_model_dir, 10 | model_version 11 | ): 12 | """ 13 | :param export_model_dir: type string, save dir for exported model url 14 | :param model_version: type int best 15 | :return:no return 16 | """ 17 | with tf.get_default_graph().as_default(): 18 | # prediction_signature 19 | tensor_info_input = tf.saved_model.utils.build_tensor_info(model.input) 20 | tensor_info_output = tf.saved_model.utils.build_tensor_info(model.output) 21 | print(model.output.shape, '**', tensor_info_output) 22 | prediction_signature = ( 23 | tf.saved_model.signature_def_utils.build_signature_def( 24 | inputs={'images': tensor_info_input}, # Tensorflow.TensorInfo 25 | outputs={'result': tensor_info_output}, 26 | #method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME) 27 | method_name= "tensorflow/serving/predict") 28 | 29 | ) 30 | print('step1 => prediction_signature created successfully') 31 | # set-up a builder 32 | 33 | export_path_base = export_model_dir 34 | export_path = os.path.join( 35 | tf.compat.as_bytes(export_path_base), 36 | tf.compat.as_bytes(str(model_version))) 37 | builder = tf.saved_model.builder.SavedModelBuilder(export_path) 38 | 
builder.add_meta_graph_and_variables( 39 | # tags:SERVING,TRAINING,EVAL,GPU,TPU 40 | sess=K.get_session(), 41 | tags=[tf.saved_model.tag_constants.SERVING], 42 | signature_def_map={ 43 | 'predict': 44 | prediction_signature, 45 | tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: 46 | prediction_signature, 47 | 48 | }, 49 | ) 50 | print('step2 => Export path(%s) ready to export trained model' % export_path, '\n starting to export model...') 51 | #builder.save(as_text=True) 52 | builder.save() 53 | print('Done exporting!') 54 | -------------------------------------------------------------------------------- /encapsulation/source/processing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import numpy as np 4 | import glob 5 | import random 6 | 7 | 8 | def processing_data(training_dir, validation_dir, testing_dir, validation_rate=0.1, testing_rate=0.1): 9 | class_equal,class_list = check_class(training_dir, validation_dir) 10 | if not class_equal: 11 | files = os.listdir(training_dir) 12 | for file in files : 13 | dir_path = os.path.join(os.path.join(training_dir, file)) 14 | if os.path.isdir(dir_path): 15 | print('处理 :', dir_path) 16 | get_filelist(dir_path, file, validation_dir, validation_rate, testing_dir,testing_rate) 17 | 18 | 19 | training_count = get_all_count(training_dir) 20 | validation_count = get_all_count(validation_dir) 21 | testing_count = get_all_count(testing_dir) 22 | 23 | print("==================================================") 24 | print('training count : ', training_count) 25 | print('validation count : ', validation_count) 26 | print('testing count : ', testing_count) 27 | print("class: ", class_list) 28 | print("==================================================") 29 | return class_list,training_count 30 | 31 | 32 | def check_class(training_dir, validation_dir): 33 | class_list = [] 34 | for file in os.listdir(training_dir): 35 | if os.path.isdir(os.path.join(training_dir , file)): 36 | class_list.append(file) 37 | class_list.sort() 38 | 39 | if not os.path.exists(validation_dir): 40 | return False,class_list 41 | 42 | class_list_validation = [] 43 | for file in os.listdir(validation_dir): 44 | if os.path.isdir(os.path.join(validation_dir , file)): 45 | class_list_validation.append(file) 46 | class_list_validation.sort() 47 | 48 | class_equal = (class_list == class_list_validation) 49 | return class_equal,class_list 50 | 51 | 52 | def get_all_count(dir_path): 53 | files = os.listdir(dir_path) 54 | count = 0 55 | for file in files : 56 | if os.path.isdir(dir_path): 57 | label_dir = os.path.join(dir_path, file) 58 | images = os.listdir(label_dir) 59 | tmp_count = len(images) 60 | #print('{} {}'.format(label_dir, tmp_count)) 61 | count += tmp_count 62 | return count 63 | 64 | 65 | 66 | def move_file(file_path, target_path, class_name, item ): 67 | target_dir = os.path.join(target_path, class_name) 68 | if not os.path.exists(target_dir): 69 | os.makedirs(target_dir) 70 | shutil.move(file_path, os.path.join(target_dir, item)) 71 | 72 | def get_filelist(dir_path, class_name, validation_dir, validation_rate, testing_dir,testing_rate): 73 | files = os.listdir(dir_path) 74 | random.shuffle(files) 75 | count = len(files) 76 | 77 | validation_count = int(count * validation_rate) 78 | testing_count = int(count * testing_rate) 79 | 80 | validation_list = files[0:validation_count] 81 | testing_list = files[validation_count: validation_count + testing_count] 82 | training_list = 
files[validation_count + testing_count: ] 83 | 84 | for item in validation_list: 85 | move_file(os.path.join(os.path.join(dir_path, item)) , validation_dir, class_name, item ) 86 | for item in testing_list: 87 | move_file(os.path.join(os.path.join(dir_path, item)) , testing_dir,class_name, item) 88 | 89 | 90 | 91 | 92 | if __name__ == '__main__': 93 | processing_data("/opt/ml/input/data/training","/opt/ml/input/data/validation","/opt/ml/input/data/testing") -------------------------------------------------------------------------------- /encapsulation/source/requirements.txt: -------------------------------------------------------------------------------- 1 | Keras==2.3.1 2 | Keras-Applications==1.0.8 3 | Keras-Preprocessing==1.1.2 4 | tensorboard==1.15.0 5 | tensorflow==1.15.2 6 | tensorflow-estimator==1.15.1 7 | tensorflow-gpu==1.15.2 8 | 9 | -------------------------------------------------------------------------------- /encapsulation/source/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import argparse 4 | import keras 5 | from keras import models 6 | from keras import layers 7 | from keras import optimizers 8 | from keras.applications import VGG16 9 | import tensorflow as tf 10 | from processing import processing_data 11 | from export_model import export_model 12 | from keras.preprocessing.image import ImageDataGenerator 13 | from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping 14 | from keras.preprocessing.image import ImageDataGenerator 15 | import sys 16 | import logging 17 | logger = logging.getLogger(__name__) 18 | logger.setLevel(logging.DEBUG) 19 | logger.addHandler(logging.StreamHandler(sys.stdout)) 20 | 21 | 22 | IMAGE_WIDTH = 150 23 | IMAGE_HEIGHT = 150 24 | 25 | 26 | logger.info('tensorflow version:{}'.format(tf.__version__)) 27 | logger.info('keras version:{}'.format(keras.__version__)) 28 | logger.info("gpu_device_name:{}".format(tf.test.gpu_device_name())) 29 | logger.info("tf.test.is_gpu_available():{}".format(str(tf.test.is_gpu_available()))) 30 | 31 | 32 | 33 | def train(train_dir, validation_dir, test_dir, log_dir, model_dir, tf_server_dir, checkpoint_dir, class_list ,training_count, args): 34 | class_count = len(class_list) 35 | print("class_count:"+str(class_count)) 36 | 37 | # 创建模型 38 | conv_base = VGG16(weights='imagenet', 39 | include_top=False, 40 | input_shape=(IMAGE_WIDTH, IMAGE_HEIGHT, 3)) 41 | model = models.Sequential() 42 | model.add(conv_base) 43 | model.add(layers.Flatten()) 44 | model.add(layers.Dense(256, activation='relu')) 45 | model.add(layers.Dense(class_count, activation='softmax')) 46 | 47 | # 查看模型结构 48 | logger.info('This is the number of trainable weights ' 49 | 'before freezing the conv base:'+ str(len(model.trainable_weights))) 50 | conv_base.trainable = False 51 | logger.info('This is the number of trainable weights ' 52 | 'after freezing the conv base:'+ str(len(model.trainable_weights))) 53 | #conv_base.summary() 54 | 55 | # 准备训练参数 56 | RUN = RUN + 1 if 'RUN' in locals() else 1 57 | EPOCHS = args.epoch_count 58 | batch_size = args.batch_size 59 | lr = args.lr 60 | steps_per_epoch = training_count // batch_size 61 | logger.info("RUN:"+str(RUN)) 62 | logger.info("steps_per_epoch:"+str(steps_per_epoch)) 63 | logger.info("learning rate:"+str(lr)) 64 | 65 | # 载入图片数据 66 | train_datagen = ImageDataGenerator( 67 | rescale=1./255, 68 | rotation_range=20, 69 | width_shift_range=0.30, 70 | height_shift_range=0.30, 71 | shear_range=0.20, 72 | 
zoom_range=0.40, 73 | horizontal_flip=True, 74 | fill_mode='nearest') 75 | 76 | # Note that the validation data should not be augmented! 77 | test_datagen = ImageDataGenerator(rescale=1./255) 78 | 79 | train_generator = train_datagen.flow_from_directory( 80 | # This is the target directory 81 | train_dir, 82 | classes=class_list, 83 | # All images will be resized to 150x150 84 | target_size=(IMAGE_WIDTH, IMAGE_HEIGHT), 85 | batch_size=batch_size, 86 | # Since we use binary_crossentropy loss, we need binary labels 87 | class_mode='categorical') 88 | 89 | validation_generator = test_datagen.flow_from_directory( 90 | validation_dir, 91 | classes=class_list, 92 | target_size=(IMAGE_WIDTH, IMAGE_HEIGHT), 93 | batch_size=batch_size, 94 | class_mode='categorical') 95 | 96 | # 第一次训练 97 | LOG_DIR_1 = os.path.join(log_dir, 'run{}-1'.format(RUN)) 98 | LOG_FILE_PATH_1 = os.path.join(checkpoint_dir, 'checkpoint-1-{epoch:02d}-{val_acc:.4f}.hdf5') 99 | 100 | model.compile(loss='categorical_crossentropy', 101 | optimizer=optimizers.Adam(lr=2e-5) , 102 | metrics=['acc']) 103 | tensorboard = TensorBoard(log_dir=LOG_DIR_1, write_images=True) 104 | checkpoint = ModelCheckpoint(filepath=LOG_FILE_PATH_1, monitor='val_acc', verbose=1, save_best_only=True) 105 | early_stopping = EarlyStopping(monitor='val_acc', patience=5, verbose=1) 106 | 107 | history = model.fit_generator( 108 | train_generator, 109 | steps_per_epoch=steps_per_epoch, 110 | epochs=EPOCHS, 111 | validation_data=validation_generator, 112 | validation_steps=50, 113 | verbose=args.verbose, 114 | callbacks=[tensorboard, checkpoint, early_stopping]) 115 | 116 | # 微调模型 117 | conv_base.trainable = True 118 | 119 | set_trainable = False 120 | for layer in conv_base.layers: 121 | if layer.name == 'block5_conv1': 122 | set_trainable = True 123 | layer.trainable = set_trainable 124 | #conv_base.summary() 125 | 126 | # 第二次训练 127 | LOG_DIR_2 = os.path.join(log_dir, 'run{}-2'.format(RUN)) 128 | LOG_FILE_PATH_2 = os.path.join(checkpoint_dir, 'checkpoint-2-{epoch:02d}-{val_acc:.4f}.hdf5') 129 | model.compile(loss='categorical_crossentropy', 130 | optimizer=optimizers.RMSprop(lr=lr) , 131 | metrics=['acc']) 132 | tensorboard = TensorBoard(log_dir=LOG_DIR_2, write_images=True) 133 | checkpoint = ModelCheckpoint(filepath=LOG_FILE_PATH_2, monitor='val_acc', verbose=1, save_best_only=True) 134 | early_stopping = EarlyStopping(monitor='val_acc', patience=5, verbose=1) 135 | 136 | history = model.fit_generator( 137 | train_generator, 138 | steps_per_epoch=steps_per_epoch, 139 | epochs=EPOCHS, 140 | validation_data=validation_generator, 141 | validation_steps=50, 142 | verbose=args.verbose, 143 | callbacks=[tensorboard, checkpoint, early_stopping]) 144 | 145 | # 测试 146 | test_generator = test_datagen.flow_from_directory( 147 | test_dir, 148 | target_size=(IMAGE_WIDTH, IMAGE_HEIGHT), 149 | batch_size=32, 150 | class_mode='categorical') 151 | 152 | test_loss, test_acc = model.evaluate_generator(test_generator, steps=50) 153 | logger.info('test acc:'+str(test_acc)) 154 | 155 | 156 | 157 | logger.info("保存模型") 158 | model.save(os.path.join(model_dir, 'model.h5')) 159 | 160 | #model = keras.models.load_model(os.path.join(model_dir, 'model.h5')) 161 | export_model( 162 | model, 163 | tf_server_dir, 164 | 1 165 | ) 166 | 167 | 168 | 169 | 170 | def main(args): 171 | os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_list 172 | 173 | input_dir = args.input_dir 174 | training_dir = os.path.join(input_dir, "data","training") 175 | validation_dir = os.path.join(input_dir, 
"data","validation") 176 | testing_dir = os.path.join(input_dir, "data","testing") 177 | log_dir = os.path.join(args.output_dir, "log") 178 | model_dir = args.model_dir 179 | tf_server_dir = os.path.join(model_dir, "tf_server") 180 | checkpoint_dir = os.path.join(args.output_dir, "log/checkpoint") 181 | 182 | if not os.path.exists(validation_dir): 183 | os.makedirs(validation_dir) 184 | if not os.path.exists(testing_dir): 185 | os.makedirs(testing_dir) 186 | if not os.path.exists(log_dir): 187 | os.makedirs(log_dir) 188 | if not os.path.exists(model_dir): 189 | os.makedirs(model_dir) 190 | if not os.path.exists(tf_server_dir): 191 | os.makedirs(tf_server_dir) 192 | if not os.path.exists(checkpoint_dir): 193 | os.makedirs(checkpoint_dir) 194 | 195 | 196 | logger.info("epoch_count:"+str(args.epoch_count)) 197 | logger.info("batch_size:"+str(args.batch_size)) 198 | #logger.info("--------------processing data ----------------- ") 199 | class_list,training_count = processing_data(training_dir, validation_dir, testing_dir) 200 | #logger.info("--------------start train --------------------- ") 201 | train(training_dir, validation_dir, testing_dir, log_dir, model_dir, tf_server_dir, checkpoint_dir, class_list ,training_count, args) 202 | logger.info(" 训练完成 ") 203 | 204 | 205 | 206 | 207 | if __name__ == '__main__': 208 | parser = argparse.ArgumentParser(description='Train model.') 209 | 210 | parser.add_argument( 211 | "-e", 212 | "--epoch_count", 213 | type=int, 214 | nargs="?", 215 | help="Epoch count", 216 | default=30, 217 | ) 218 | parser.add_argument( 219 | "-r", 220 | "--lr", 221 | type=float, 222 | nargs="?", 223 | help="learning rate (default: 1e-5)", 224 | default=1e-5, 225 | ) 226 | parser.add_argument( 227 | "-b", 228 | "--batch_size", 229 | type=int, 230 | nargs="?", 231 | help="Batch size (default: 32)", 232 | default=32, 233 | ) 234 | parser.add_argument( 235 | "-m", 236 | "--model_dir", 237 | type=str, 238 | help="Model保存路径.", 239 | default="/opt/ml/model/", 240 | ) 241 | parser.add_argument( 242 | "-i", 243 | "--input_dir", 244 | type=str, 245 | help="input dir", 246 | default="/opt/ml/input/", 247 | ) 248 | parser.add_argument( 249 | "-o", 250 | "--output_dir", 251 | type=str, 252 | help="outpudif", 253 | default="/opt/ml/output/", 254 | ) 255 | parser.add_argument( 256 | "-g", 257 | "--gpu_list", 258 | type=str, 259 | help="gpu list", 260 | default="0", 261 | ) 262 | parser.add_argument( 263 | "-v", 264 | "--verbose", 265 | type=int, 266 | help="log level", 267 | default=2, 268 | ) 269 | args = parser.parse_args() 270 | main(args) 271 | 272 | -------------------------------------------------------------------------------- /encapsulation/test/cat.681.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nwcdheap/sagemaker-workshop/9bcbc4c787da346fcf10aa6fd97a8a100328719a/encapsulation/test/cat.681.jpg -------------------------------------------------------------------------------- /encapsulation/test/dog.592.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nwcdheap/sagemaker-workshop/9bcbc4c787da346fcf10aa6fd97a8a100328719a/encapsulation/test/dog.592.jpg -------------------------------------------------------------------------------- /encapsulation/train.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | 
"outputs": [], 8 | "source": [ 9 | "import boto3\n", 10 | "import sagemaker\n", 11 | "import os\n", 12 | "from sagemaker import get_execution_role\n", 13 | "\n", 14 | "region = boto3.session.Session().region_name\n", 15 | "\n", 16 | "#如果使用SageMaker的笔记本实例使用下一行\n", 17 | "role = get_execution_role()\n", 18 | "#如果使用自建的笔记本实例请自行获取Role,可从IAM控制台获取到\n", 19 | "#role = \"arn:aws-cn:iam::315505707008:role/service-role/AmazonSageMaker-ExecutionRole-20200430T124235\"" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "#确保sagemaker版本为2.4.0及以上\n", 29 | "print(sagemaker.__version__)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "#修改bucket\n", 39 | "input_data = 's3://<>/data/cat-vs-dog-1000/'\n", 40 | "output_data = 's3://<>/data/cat-vs-dog-output/'" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "## 准备图片\n", 48 | "\n", 49 | "原始数据按不同分类上传到input_data目录\n", 50 | "```\n", 51 | "input_data\n", 52 | "├── class1\n", 53 | "│ ├── image001.jpg\n", 54 | "│ ├── image002.jpg\n", 55 | "│ └── ...\n", 56 | "├── class2\n", 57 | "│ ├── image001.jpg\n", 58 | "│ ├── image002.jpg\n", 59 | "│ └── ...\n", 60 | "└── classn\n", 61 | " ├── image001.jpg\n", 62 | " ├── image002.jpg\n", 63 | " └── ...\n", 64 | "```\n", 65 | "可从Kaggle获取[猫狗图片](https://www.kaggle.com/c/dogs-vs-cats/data),然后按目录存放图片。" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "from sagemaker.tensorflow import TensorFlow\n", 75 | "\n", 76 | "# 建议使用gpu类型的实例\n", 77 | "instance_type='ml.p3.2xlarge'\n", 78 | "#instance_type='local'\n", 79 | "model_dir = '/opt/ml/model'\n", 80 | "\n", 81 | "# 可以修改epoch_count,batch_size\n", 82 | "estimator = TensorFlow(entry_point='train.py',\n", 83 | " source_dir='./source',\n", 84 | " role=role,\n", 85 | " output_path=output_data,\n", 86 | " model_dir=model_dir,\n", 87 | " framework_version='1.15.2',\n", 88 | " hyperparameters={'epoch_count':30, 'batch_size':32}, \n", 89 | " py_version='py3',\n", 90 | " instance_count=1,\n", 91 | " instance_type=instance_type,\n", 92 | "# train_volume_size=50,\n", 93 | "# train_max_run=432000,\n", 94 | " use_spot_instances=True,\n", 95 | " max_wait=432000,\n", 96 | "# metric_definitions=[{'Name': 'loss', 'Regex': 'loss = (.*?),'},\n", 97 | "# {'Name':'epoch','Regex': 'Step_Train = (.*?),'}]\n", 98 | " )\n" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "result = estimator.fit(input_data)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "### 打印 model_data 路径, 下载并且解压" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "print(estimator.model_data)\n", 124 | "os.environ['S3_URL']=str(estimator.model_data) #environ的键值必须是字符串" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "%%sh\n", 134 | "echo ${S3_URL}" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "%%sh\n", 144 | "\n", 145 | "if [ ! 
-d \"output\" ];then\n", 146 | "mkdir output\n", 147 | "fi\n", 148 | "\n", 149 | "cd output\n", 150 | "aws s3 cp ${S3_URL} ./model.tar.gz\n", 151 | "\n", 152 | "# aws s3 cp {sli_estimator.model_data} ./model.tar.gz\n", 153 | "\n", 154 | "tar -xvzf ./model.tar.gz " 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "确保模型文件保存到以下目录\n", 162 | "\n", 163 | "`inference.ipynb` 里面会用到\n", 164 | "\n", 165 | "```\n", 166 | "output\n", 167 | "└── tf_server\n", 168 | " └── 1\n", 169 | " ├── saved_model.pb\n", 170 | " └── variables\n", 171 | " ├── variables.data-00000-of-00001\n", 172 | " └── variables.index\n", 173 | "```\n", 174 | "\n", 175 | "```" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [] 184 | } 185 | ], 186 | "metadata": { 187 | "kernelspec": { 188 | "display_name": "Environment (conda_tensorflow_p37)", 189 | "language": "python", 190 | "name": "conda_tensorflow_p37" 191 | }, 192 | "language_info": { 193 | "codemirror_mode": { 194 | "name": "ipython", 195 | "version": 3 196 | }, 197 | "file_extension": ".py", 198 | "mimetype": "text/x-python", 199 | "name": "python", 200 | "nbconvert_exporter": "python", 201 | "pygments_lexer": "ipython3", 202 | "version": "3.7.10" 203 | } 204 | }, 205 | "nbformat": 4, 206 | "nbformat_minor": 4 207 | } 208 | -------------------------------------------------------------------------------- /hyperspectral/DeepHyperX/1-preparation/explore_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import scipy.io as sio\n", 10 | "import os\n", 11 | "import scipy.io\n", 12 | "import scipy.ndimage\n", 13 | "import spectral\n", 14 | "import spectral.io.envi as envi\n", 15 | "import pandas as pd\n", 16 | "import numpy as np\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "import matplotlib.colors as colors\n", 19 | "import matplotlib.cm as cmx\n", 20 | "from random import shuffle\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## 分类\n", 28 | "0. 其他\n", 29 | "1. 较低油分\n", 30 | "2. 低油分\n", 31 | "3. 中油分\n", 32 | "4. 
高油分\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "def loadData(flieName, dataIndex, temp_split=4):\n", 42 | " \n", 43 | " print(\"------------ loadData \", dataIndex)\n", 44 | " # 原始数据路径\n", 45 | " DATA_PATH = os.path.join(os.getcwd(), flieName)\n", 46 | "\n", 47 | " index = str(dataIndex)\n", 48 | " data = envi.open( os.path.join(DATA_PATH, \"{}.hdr\".format(index)) ,os.path.join(DATA_PATH, \"{}.dat\".format(index)))\n", 49 | " mask_data = envi.open( os.path.join(DATA_PATH, \"mask_{}.hdr\".format(index)) ,os.path.join(DATA_PATH, \"mask_{}.tiff\".format(index)))\n", 50 | "\n", 51 | " HEIGHT = data.shape[0] //temp_split\n", 52 | " WIDTH = data.shape[1] //temp_split\n", 53 | " BAND = data.shape[2]\n", 54 | "# BAND = BAND_SIZE\n", 55 | " new_shape=(BAND,HEIGHT,WIDTH)\n", 56 | " new_data = np.zeros(new_shape, dtype = float)\n", 57 | " label = np.zeros((HEIGHT, WIDTH), dtype = int)\n", 58 | " \n", 59 | "\n", 60 | " sample_count = 0\n", 61 | " for h in range(HEIGHT): \n", 62 | " for w in range(WIDTH):\n", 63 | " x = h*temp_split\n", 64 | " y = w*temp_split\n", 65 | " for b in range(BAND):\n", 66 | " new_data[b][h][w] = data[x,y][b]\n", 67 | "\n", 68 | " if(sum(mask_data[x, y]) > 0.01 ):\n", 69 | " label[h][w] = dataIndex \n", 70 | " sample_count += 1\n", 71 | " else:\n", 72 | " label[h][w] = 0\n", 73 | " \n", 74 | " \n", 75 | " new_data = np.transpose(new_data, (1, 2, 0)) # 将通道数提前,便于数组处理操作\n", 76 | " print(\"sample_count = {} \".format(sample_count))\n", 77 | " print(\"data shape : \", new_data.shape)\n", 78 | " print(\"label shape : \", label.shape)\n", 79 | " return new_data, label" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "PATCH 样本数量 * 通道 * 高 * 宽" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "def create_sample_data(index):\n", 96 | " \n", 97 | " data, label = loadData(\"dataset\", index)\n", 98 | "\n", 99 | " height = data.shape[0]\n", 100 | " width = data.shape[1]\n", 101 | " band = data.shape[2]\n", 102 | " \n", 103 | " \n", 104 | " print(\"band : \", band)\n", 105 | " print(\"height : \", height)\n", 106 | " print(\"width : \", width)\n", 107 | " sample_count = 0\n", 108 | " for h in range(height):\n", 109 | " for w in range(width):\n", 110 | " if label[h][w] == index:\n", 111 | " sample_count += 1\n", 112 | "\n", 113 | " print(\"count : \", sample_count)\n", 114 | " new_shape= (sample_count, band)\n", 115 | " temp_data = np.zeros(new_shape, dtype = float) \n", 116 | " \n", 117 | " count = 0 \n", 118 | " for h in range(height):\n", 119 | " for w in range(width):\n", 120 | " if label[h][w] == index:\n", 121 | " for b in range(band):\n", 122 | " temp_data[count][b] = data[h][w][b]\n", 123 | " count += 1\n", 124 | " \n", 125 | " return temp_data\n", 126 | " \n", 127 | " " 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "%%time\n", 137 | "\n", 138 | "\n", 139 | "new_data1 = create_sample_data(1)\n", 140 | "new_data2 = create_sample_data(2)\n", 141 | "new_data3 = create_sample_data(3)\n", 142 | "new_data4 = create_sample_data(4)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "def drawLine(new_data):\n", 152 | "\n", 153 | " plt.figure(figsize=(22, 
3))\n", 154 | " new_data1.shape\n", 155 | " size = new_data1.shape[0]\n", 156 | " split = size // 4\n", 157 | " \n", 158 | "\n", 159 | " #第一行第一列图形\n", 160 | " ax1 = plt.subplot(1,4,1)\n", 161 | " ax2 = plt.subplot(1,4,2)\n", 162 | " ax3 = plt.subplot(1,4,3)\n", 163 | " ax4 = plt.subplot(1,4,4)\n", 164 | " x= np.linspace(0, 100 ,new_data.shape[1])\n", 165 | "\n", 166 | " \n", 167 | " plt.sca(ax1)\n", 168 | " plt.plot(x,new_data[split * 0])\n", 169 | " \n", 170 | " \n", 171 | " plt.sca(ax2)\n", 172 | " plt.plot(x,new_data[split * 1])\n", 173 | " \n", 174 | " \n", 175 | " plt.sca(ax3)\n", 176 | " plt.plot(x,new_data[split * 2])\n", 177 | " \n", 178 | " \n", 179 | " plt.sca(ax4)\n", 180 | " plt.plot(x,new_data[split * 3])\n", 181 | "\n", 182 | " plt.show()\n", 183 | " \n" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "drawLine(new_data1)\n", 193 | "drawLine(new_data2)\n", 194 | "drawLine(new_data3)\n", 195 | "drawLine(new_data4)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "def drawLine(d1, d2):\n", 205 | "\n", 206 | " plt.figure(figsize=(22, 5))\n", 207 | " size_1 = d1.shape[0]\n", 208 | " split_1 = size_1 // 4\n", 209 | " \n", 210 | " size_2 = d2.shape[0]\n", 211 | " split_2 = size_2 // 4\n", 212 | " \n", 213 | "\n", 214 | " #第一行第一列图形\n", 215 | " ax1 = plt.subplot(1,4,1)\n", 216 | " ax2 = plt.subplot(1,4,2)\n", 217 | " ax3 = plt.subplot(1,4,3)\n", 218 | " ax4 = plt.subplot(1,4,4)\n", 219 | " x= np.linspace(0, d1.shape[1] ,d1.shape[1])\n", 220 | "\n", 221 | " \n", 222 | " plt.sca(ax1)\n", 223 | " plt.plot(x,d1[split_1 * 0])\n", 224 | " plt.plot(x,d2[split_2 * 0], color='red')\n", 225 | " \n", 226 | " \n", 227 | " plt.sca(ax2)\n", 228 | " plt.plot(x,d1[split_1 * 1])\n", 229 | " plt.plot(x,d2[split_2 * 1], color='red')\n", 230 | " \n", 231 | " \n", 232 | " plt.sca(ax3)\n", 233 | " plt.plot(x,d1[split_1 * 2])\n", 234 | " plt.plot(x,d2[split_2 * 2], color='red')\n", 235 | " \n", 236 | " \n", 237 | " plt.sca(ax4)\n", 238 | " plt.plot(x,d1[split_1 * 3])\n", 239 | " plt.plot(x,d2[split_2 * 3], color='red')\n", 240 | " \n", 241 | " plt.show()\n" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "### 中油分和高油分" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "drawLine(new_data3, new_data4)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "### 低油分和较低油分" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "drawLine(new_data1, new_data2)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [] 282 | } 283 | ], 284 | "metadata": { 285 | "kernelspec": { 286 | "display_name": "Environment (conda_pytorch_latest_p37)", 287 | "language": "python", 288 | "name": "conda_pytorch_latest_p37" 289 | }, 290 | "language_info": { 291 | "codemirror_mode": { 292 | "name": "ipython", 293 | "version": 3 294 | }, 295 | "file_extension": ".py", 296 | "mimetype": "text/x-python", 297 | "name": "python", 298 | "nbconvert_exporter": "python", 299 | "pygments_lexer": "ipython3", 300 | "version": "3.7.10" 301 | } 302 | }, 303 | 
"nbformat": 4, 304 | "nbformat_minor": 4 305 | } 306 | -------------------------------------------------------------------------------- /hyperspectral/DeepHyperX/1-preparation/preparation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "exposed-selection", 6 | "metadata": {}, 7 | "source": [ 8 | "# DeepHyperX on SageMaker--数据准备" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "interstate-routine", 14 | "metadata": {}, 15 | "source": [ 16 | "## 1 说明\n", 17 | "本章内容主要是把原始数据格式转化为mat格式。\n", 18 | "## 2 运行环境\n", 19 | "Kernel 选择pytorch_latest_p36。 \n", 20 | "## 3 已有mat格式数据\n", 21 | "如果已有YOLOv5格式的数据,可跳过数据准备,把数据放入S3即可。 \n", 22 | "### 3.1 S3目录存放格式\n", 23 | "```\n", 24 | "deephyper\n", 25 | "├── class1\n", 26 | "│ ├── class1_gt.mat\n", 27 | "│ └── class1.mat\n", 28 | "├── class2\n", 29 | "│ ├── class2_gt.mat\n", 30 | "│ └── class2.mat\n", 31 | "...\n", 32 | "└── classn\n", 33 | " ├── classn_gt.mat\n", 34 | " └── classn.mat\n", 35 | "```\n", 36 | "### 3.2 SageMaker输入数据根目录\n", 37 | "运行SageMaker时,SageMaker会从S3拷贝数据放到到运行容器的`/opt/ml/input/data/training/`下。即`deephyper/class1/class1.mat`对应全路径为`/opt/ml/input/data/training/class1/class1.mat`" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "polyphonic-estimate", 43 | "metadata": {}, 44 | "source": [ 45 | "## 4 没有mat格式数据" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "id": "multiple-archives", 51 | "metadata": {}, 52 | "source": [ 53 | "### 4.1 拷贝数据到本地" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "opponent-first", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "import os\n", 64 | "if not os.path.exists(\"dataset\"):\n", 65 | " os.mkdir(\"dataset\")" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "strange-medication", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "!aws s3 sync s3://junzhong/data/hyper_leaf/ ./dataset" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "id": "looking-triumph", 81 | "metadata": {}, 82 | "source": [ 83 | "### 4.2 转化为mat格式" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "id": "theoretical-sheriff", 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "!pip install spectral -i https://opentuna.cn/pypi/web/simple/" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "id": "rising-lunch", 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "!python preprocess.py" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "id": "facial-stable", 109 | "metadata": {}, 110 | "source": [ 111 | "### 4.3 拷贝到S3" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "id": "considered-museum", 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "# 设置数据存放S3 bucket\n", 122 | "bucket = 'junzhong'" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "id": "fitting-opinion", 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "!aws s3 sync Datasets/ s3://{bucket}/data/deephyper/" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "id": "pressed-composite", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [] 142 | } 143 | ], 144 | "metadata": { 145 | "kernelspec": { 146 | "display_name": "Environment (conda_pytorch_latest_p37)", 147 | 
"language": "python", 148 | "name": "conda_pytorch_latest_p37" 149 | }, 150 | "language_info": { 151 | "codemirror_mode": { 152 | "name": "ipython", 153 | "version": 3 154 | }, 155 | "file_extension": ".py", 156 | "mimetype": "text/x-python", 157 | "name": "python", 158 | "nbconvert_exporter": "python", 159 | "pygments_lexer": "ipython3", 160 | "version": "3.7.10" 161 | } 162 | }, 163 | "nbformat": 4, 164 | "nbformat_minor": 5 165 | } 166 | -------------------------------------------------------------------------------- /hyperspectral/DeepHyperX/1-preparation/preprocess.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import scipy.io 4 | import spectral.io.envi as envi 5 | 6 | 7 | DATASET_NAME = 'leaf' 8 | NEW_DATA_PATH = os.path.join(os.getcwd(), "Datasets/"+DATASET_NAME) # 存放数据路径 patch是文件夹名称 9 | 10 | 11 | """ 12 | temp_split: 对数据进行拆分 13 | """ 14 | def loadData(flieName, dataIndex, temp_split=4): 15 | 16 | print("------------ loadData ", dataIndex) 17 | # 原始数据路径 18 | DATA_PATH = os.path.join(os.getcwd(), flieName) 19 | 20 | index = str(dataIndex) 21 | data = envi.open( os.path.join(DATA_PATH, "{}.hdr".format(index)) ,os.path.join(DATA_PATH, "{}.dat".format(index))) 22 | mask_data = envi.open( os.path.join(DATA_PATH, "mask_{}.hdr".format(index)) ,os.path.join(DATA_PATH, "mask_{}.tiff".format(index))) 23 | 24 | HEIGHT = data.shape[0] //temp_split 25 | WIDTH = data.shape[1] //temp_split 26 | BAND = data.shape[2] 27 | # BAND = BAND_SIZE 28 | new_shape=(BAND,HEIGHT,WIDTH) 29 | new_data = np.zeros(new_shape, dtype = float) 30 | label = np.zeros((HEIGHT, WIDTH), dtype = int) 31 | 32 | 33 | sample_count = 0 34 | for h in range(HEIGHT): 35 | for w in range(WIDTH): 36 | x = h*temp_split 37 | y = w*temp_split 38 | for b in range(BAND): 39 | new_data[b][h][w] = data[x,y][b] 40 | 41 | if(sum(mask_data[x, y]) > 0.01 ): 42 | label[h][w] = dataIndex 43 | sample_count += 1 44 | else: 45 | label[h][w] = 0 46 | 47 | 48 | new_data = np.transpose(new_data, (1, 2, 0)) # 将通道数提前,便于数组处理操作 49 | print("sample_count = {} ".format(sample_count)) 50 | print("data shape : ", new_data.shape) 51 | print("label shape : ", label.shape) 52 | return new_data, label 53 | 54 | if not os.path.exists(NEW_DATA_PATH): 55 | print(" ", NEW_DATA_PATH) 56 | os.makedirs(NEW_DATA_PATH) 57 | print("create dataset dir success.") 58 | 59 | data1, label1 = loadData("dataset", 1) 60 | data2, label2 = loadData("dataset", 2) 61 | data3, label3 = loadData("dataset", 3) 62 | data4, label4 = loadData("dataset", 4) 63 | 64 | 65 | 66 | X1 = np.hstack((data1, data2)) 67 | X2 = np.hstack((data3, data4)) 68 | 69 | gt1 = np.hstack((label1, label2)) 70 | gt2 = np.hstack((label3, label4)) 71 | 72 | X = np.vstack((X1, X2)) 73 | 74 | gt = np.vstack((gt1, gt2)) 75 | 76 | 77 | 78 | 79 | 80 | 81 | train_dict, test_dict = {}, {} 82 | train_dict[DATASET_NAME] = X 83 | file_name = "{}.mat".format(DATASET_NAME) 84 | scipy.io.savemat(os.path.join(NEW_DATA_PATH, file_name), train_dict) 85 | test_dict["{}_gt".format(DATASET_NAME)] = gt 86 | file_name = "{}_gt.mat".format(DATASET_NAME) 87 | scipy.io.savemat(os.path.join(NEW_DATA_PATH, file_name), test_dict) 88 | print("Save target data success ---------------------------------\n") -------------------------------------------------------------------------------- /hyperspectral/DeepHyperX/2-training/source/custom_datasets.py: -------------------------------------------------------------------------------- 1 | from utils import open_file 2 | 
import numpy as np 3 | 4 | CUSTOM_DATASETS_CONFIG = { 5 | "leaf": { 6 | "img": "2018_IEEE_GRSS_DFC_HSI_TR.HDR", 7 | "gt": "2018_IEEE_GRSS_DFC_GT_TR.tif", 8 | "download": False, 9 | "loader": lambda folder: leaf_loader(folder), 10 | } 11 | } 12 | 13 | 14 | def leaf_loader(folder): 15 | img = open_file(folder + "leaf.mat") 16 | img = img["leaf"] 17 | 18 | rgb_bands = (43, 21, 11) # AVIRIS sensor 19 | 20 | gt = open_file(folder + "leaf_gt.mat")["leaf_gt"] 21 | label_values = [ 22 | "Undefined", 23 | "lowest", 24 | "lower", 25 | "middle", 26 | "high", 27 | ] 28 | 29 | ignored_labels = [0] 30 | # ignored_labels = [] 31 | palette = None 32 | return img, gt, rgb_bands, ignored_labels, label_values, palette 33 | -------------------------------------------------------------------------------- /hyperspectral/DeepHyperX/2-training/source/inference.py: -------------------------------------------------------------------------------- 1 | # Python 2/3 compatiblity 2 | from __future__ import print_function 3 | from __future__ import division 4 | import joblib 5 | import os 6 | from utils import convert_to_color_, convert_from_color_, get_device 7 | from datasets import open_file 8 | from models import get_model, test 9 | import numpy as np 10 | import seaborn as sns 11 | from skimage import io 12 | import argparse 13 | import torch 14 | 15 | # Test options 16 | parser = argparse.ArgumentParser( 17 | description="Run deep learning experiments on" " various hyperspectral datasets" 18 | ) 19 | parser.add_argument( 20 | "--model", 21 | type=str, 22 | default=None, 23 | help="Model to train. Available:\n" 24 | "SVM (linear), " 25 | "SVM_grid (grid search on linear, poly and RBF kernels), " 26 | "baseline (fully connected NN), " 27 | "hu (1D CNN), " 28 | "hamida (3D CNN + 1D classifier), " 29 | "lee (3D FCN), " 30 | "chen (3D CNN), " 31 | "li (3D CNN), " 32 | "he (3D CNN), " 33 | "luo (3D CNN), " 34 | "sharma (2D CNN), " 35 | "boulch (1D semi-supervised CNN), " 36 | "liu (3D semi-supervised CNN), " 37 | "mou (1D RNN)", 38 | ) 39 | parser.add_argument( 40 | "--cuda", 41 | type=int, 42 | default=-1, 43 | help="Specify CUDA device (defaults to -1, which learns on CPU)", 44 | ) 45 | parser.add_argument( 46 | "--checkpoint", 47 | type=str, 48 | default=None, 49 | help="Weights to use for initialization, e.g. 
a checkpoint", 50 | ) 51 | 52 | group_test = parser.add_argument_group("Test") 53 | group_test.add_argument( 54 | "--test_stride", 55 | type=int, 56 | default=1, 57 | help="Sliding window step stride during inference (default = 1)", 58 | ) 59 | group_test.add_argument( 60 | "--image", 61 | type=str, 62 | default=None, 63 | nargs="?", 64 | help="Path to an image on which to run inference.", 65 | ) 66 | group_test.add_argument( 67 | "--only_test", 68 | type=str, 69 | default=None, 70 | nargs="?", 71 | help="Choose the data on which to test the trained algorithm ", 72 | ) 73 | group_test.add_argument( 74 | "--mat", 75 | type=str, 76 | default=None, 77 | nargs="?", 78 | help="In case of a .mat file, define the variable to call inside the file", 79 | ) 80 | group_test.add_argument( 81 | "--n_classes", 82 | type=int, 83 | default=None, 84 | nargs="?", 85 | help="When using a trained algorithm, specified the number of classes of this algorithm", 86 | ) 87 | # Training options 88 | group_train = parser.add_argument_group("Model") 89 | group_train.add_argument( 90 | "--patch_size", 91 | type=int, 92 | help="Size of the spatial neighbourhood (optional, if " 93 | "absent will be set by the model)", 94 | ) 95 | group_train.add_argument( 96 | "--batch_size", 97 | type=int, 98 | help="Batch size (optional, if absent will be set by the model", 99 | ) 100 | 101 | args = parser.parse_args() 102 | CUDA_DEVICE = get_device(args.cuda) 103 | MODEL = args.model 104 | # Testing file 105 | MAT = args.mat 106 | N_CLASSES = args.n_classes 107 | INFERENCE = args.image 108 | TEST_STRIDE = args.test_stride 109 | CHECKPOINT = args.checkpoint 110 | 111 | img_filename = os.path.basename(INFERENCE) 112 | basename = MODEL + img_filename 113 | dirname = os.path.dirname(INFERENCE) 114 | 115 | img = open_file(INFERENCE) 116 | if MAT is not None: 117 | img = img[MAT] 118 | # Normalization 119 | img = np.asarray(img, dtype="float32") 120 | img = (img - np.min(img)) / (np.max(img) - np.min(img)) 121 | N_BANDS = img.shape[-1] 122 | hyperparams = vars(args) 123 | hyperparams.update( 124 | { 125 | "n_classes": N_CLASSES, 126 | "n_bands": N_BANDS, 127 | "device": CUDA_DEVICE, 128 | "ignored_labels": [0], 129 | } 130 | ) 131 | hyperparams = dict((k, v) for k, v in hyperparams.items() if v is not None) 132 | 133 | palette = {0: (0, 0, 0)} 134 | for k, color in enumerate(sns.color_palette("hls", N_CLASSES)): 135 | palette[k + 1] = tuple(np.asarray(255 * np.array(color), dtype="uint8")) 136 | invert_palette = {v: k for k, v in palette.items()} 137 | 138 | 139 | def convert_to_color(x): 140 | return convert_to_color_(x, palette=palette) 141 | 142 | 143 | def convert_from_color(x): 144 | return convert_from_color_(x, palette=invert_palette) 145 | 146 | 147 | if MODEL in ["SVM", "SVM_grid", "SGD", "nearest"]: 148 | model = joblib.load(CHECKPOINT) 149 | w, h = img.shape[:2] 150 | X = img.reshape((w * h, N_BANDS)) 151 | prediction = model.predict(X) 152 | prediction = prediction.reshape(img.shape[:2]) 153 | else: 154 | model, _, _, hyperparams = get_model(MODEL, **hyperparams) 155 | 156 | if CUDA_DEVICE == -1: 157 | model.load_state_dict(torch.load(CHECKPOINT)) 158 | else: 159 | model.load_state_dict(torch.load(CHECKPOINT,map_location='cpu')) 160 | 161 | 162 | 163 | probabilities = test(model, img, hyperparams) 164 | prediction = np.argmax(probabilities, axis=-1) 165 | 166 | filename = dirname + "/" + basename + ".tif" 167 | io.imsave(filename, prediction) 168 | basename = "color_" + basename 169 | filename = dirname + "/" + basename + 
".tif" 170 | io.imsave(filename, convert_to_color(prediction)) 171 | -------------------------------------------------------------------------------- /hyperspectral/DeepHyperX/2-training/source/requirements2.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.10.0 2 | spectral==0.19 3 | scipy>=0.19.0 4 | tqdm>=4.15.0 5 | visdom>=0.1.5 6 | seaborn>=0.8 7 | scikit-learn>=0.19.0 8 | scikit-image>=0.13.1 9 | torch>=0.4.0 10 | matplotlib>=2.0.2 11 | torchsummary>=1.5 12 | joblib==0.14.1 13 | -------------------------------------------------------------------------------- /hyperspectral/DeepHyperX/2-training/training.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "fb63bd02", 6 | "metadata": {}, 7 | "source": [ 8 | "# DeepHyperX on SageMaker--训练" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "45ac89f3", 14 | "metadata": {}, 15 | "source": [ 16 | "## 1 说明\n", 17 | "本章内容为用SageMaker进行训练,数据来自S3。" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "id": "79225937", 23 | "metadata": {}, 24 | "source": [ 25 | "## 2 运行环境\n", 26 | "Kernel 选择pytorch_latest_p36。 \n", 27 | "本文在boto3 1.17.84和sagemaker 2.43.0下测试通过。" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "f580feb0", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "import boto3,sagemaker\n", 38 | "print(boto3.__version__)\n", 39 | "print(sagemaker.__version__)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "a797946f", 45 | "metadata": {}, 46 | "source": [ 47 | "## 3 在SageMaker上训练" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "id": "7cb5b9a4", 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "# 设置数据存放S3 bucket\n", 58 | "bucket = 'junzhong'" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "8645c318", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "input_path='s3://{}/data/deephyper/'.format(bucket)\n", 69 | "output_path='s3://{}/result/deephyper/'.format(bucket)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "id": "a806601d", 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "import boto3\n", 80 | "iam = boto3.client('iam')\n", 81 | "roles = iam.list_roles(PathPrefix='/service-role')\n", 82 | "role=\"\"\n", 83 | "for current_role in roles[\"Roles\"]:\n", 84 | " if current_role[\"RoleName\"].startswith(\"AmazonSageMaker-ExecutionRole-\"):\n", 85 | " role=current_role[\"Arn\"]\n", 86 | " break\n", 87 | "#如果role为空表示有问题,需要先打开https://cn-northwest-1.console.amazonaws.cn/sagemaker/home?region=cn-northwest-1#/notebook-instances/create以创建IAM Role\n", 88 | "print(role)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "id": "bb2fb92a", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "from sagemaker.pytorch import PyTorch\n", 99 | "\n", 100 | "#根据需要修改训练实例,和是否使用Spot实例\n", 101 | "instance_type=\"ml.p3.2xlarge\"\n", 102 | "use_spot_instances=False\n", 103 | "\n", 104 | "estimator = PyTorch(entry_point=\"main.py\",\n", 105 | " source_dir=\"./source\",\n", 106 | " role=role,\n", 107 | " output_path=output_path,\n", 108 | " framework_version='1.6.0',\n", 109 | " hyperparameters={\"folder\":\"/opt/ml/input/data/training/\",\n", 110 | " \"model\":\"he\",\n", 111 | " \"dataset\":\"leaf\",\n", 112 | " \"cuda\":\"0\",\n", 113 | 
" \"training_sample\":0.7,\n", 114 | " \"patch_size\":17,\n", 115 | " \"epoch\":20,\n", 116 | " \"batch_size\":32}, \n", 117 | " py_version=\"py3\",\n", 118 | " instance_count=1,\n", 119 | " instance_type=instance_type,\n", 120 | " use_spot_instances=use_spot_instances,\n", 121 | " max_wait=432000 if use_spot_instances else None,\n", 122 | " )" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "id": "b8b2da8b", 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "estimator.fit(input_path)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "id": "9bdc5e9e", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "import os\n", 143 | "os.makedirs(\"result\", exist_ok=True)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "id": "c951ca17", 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "!aws s3 cp $estimator.model_data ./result" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "id": "00cf390e", 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "%%sh\n", 164 | "cd result\n", 165 | "tar zxvf model.tar.gz" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "id": "760a50c0", 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [] 175 | } 176 | ], 177 | "metadata": { 178 | "kernelspec": { 179 | "display_name": "conda_pytorch_latest_p36", 180 | "language": "python", 181 | "name": "conda_pytorch_latest_p36" 182 | }, 183 | "language_info": { 184 | "codemirror_mode": { 185 | "name": "ipython", 186 | "version": 3 187 | }, 188 | "file_extension": ".py", 189 | "mimetype": "text/x-python", 190 | "name": "python", 191 | "nbconvert_exporter": "python", 192 | "pygments_lexer": "ipython3", 193 | "version": "3.6.13" 194 | } 195 | }, 196 | "nbformat": 4, 197 | "nbformat_minor": 5 198 | } 199 | -------------------------------------------------------------------------------- /hyperspectral/DeepHyperX/3-inference/inference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "c8c0ca15", 6 | "metadata": {}, 7 | "source": [ 8 | "# 推理" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "d2c11e6c", 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "/home/ec2-user/SageMaker/github/sagemaker-workshop/hyperspectral/DeepHyperX/3-inference\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "!pwd" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "f8c5cb10", 32 | "metadata": {}, 33 | "source": [ 34 | "### 进入工作目录" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "826aebf6", 40 | "metadata": {}, 41 | "source": [ 42 | "`cd sagemaker-workshop/hyperspectral/DeepHyperX/2-training`" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "id": "a007e347", 48 | "metadata": {}, 49 | "source": [ 50 | "### 创建环境" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "id": "1e81f2b7", 56 | "metadata": {}, 57 | "source": [ 58 | "```\n", 59 | "conda create -n hyper python=3.6 pip scipy numpy \n", 60 | "source activate hyper\n", 61 | "pip install -r requirements2.txt -i https://mirrors.163.com/pypi/simple/\n", 62 | " \n", 63 | "```" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "id": "71155594", 69 | "metadata": {}, 70 | "source": [ 71 | 
"### 推理" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "id": "f23e4b6a", 77 | "metadata": {}, 78 | "source": [ 79 | "`python3 inference.py --model he --checkpoint '../result/checkpoint/he_et_al/leaf/2021_06_06_08_18_30_epoch11_1.00.pth' --image 'Datasets/leaf/leaf.mat' --mat leaf --n_classes=5 --patch_size=17 `" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "id": "3618c1c3", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [] 89 | } 90 | ], 91 | "metadata": { 92 | "kernelspec": { 93 | "display_name": "conda_pytorch_latest_p36", 94 | "language": "python", 95 | "name": "conda_pytorch_latest_p36" 96 | }, 97 | "language_info": { 98 | "codemirror_mode": { 99 | "name": "ipython", 100 | "version": 3 101 | }, 102 | "file_extension": ".py", 103 | "mimetype": "text/x-python", 104 | "name": "python", 105 | "nbconvert_exporter": "python", 106 | "pygments_lexer": "ipython3", 107 | "version": "3.6.13" 108 | } 109 | }, 110 | "nbformat": 4, 111 | "nbformat_minor": 5 112 | } 113 | -------------------------------------------------------------------------------- /hyperspectral/DeepHyperX/LICENSE: -------------------------------------------------------------------------------- 1 | # License information 2 | 3 | Code for the DeepHyperX toolbox is dual licensed depending on applications, research or commercial. 4 | 5 | --- 6 | 7 | ## COMMERCIAL PURPOSES 8 | 9 | Please contact the ONERA [www.onera.fr/en/contact-us](www.onera.fr/en/contact-us) for additional information or directly the authors Nicolas Audebert or Bertrand Le Saux. 10 | 11 | --- 12 | 13 | ## RESEARCH AND NON COMMERCIAL PURPOSES 14 | 15 | #### Code license 16 | 17 | For research and non commercial purposes, all the code and documentation is released under the GPLv3 license: 18 | 19 | Copyright (c) 2018 ONERA and IRISA, Nicolas Audebert, Bertrand Le Saux, Sébastien Lefèvre. 20 | 21 | This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 22 | 23 | PLEASE ACKNOWLEDGE THE ORIGINAL AUTHORS AND PUBLICATION ACCORDING TO THE REPOSITORY github.com/nshaud/DeepHyperx OR IF NOT AVAILABLE: 24 | Nicolas Audebert, Bertrand Le Saux and Sébastien Lefèvre 25 | "Deep Learning for Classification of Hyperspectral Data: A comparative review", 26 | IEEE Geosciences and Remote Sensing Magazine, 2019. 
-------------------------------------------------------------------------------- /hyperspectral/DeepHyperX/README.md: -------------------------------------------------------------------------------- 1 | # DeepHyperX on SageMaker 2 | 本workshop演示如何使用DeepHyperX在SageMaker上进行训练。 3 | 原地址:https://github.com/nshaud/DeepHyperX 4 | DeepHyperX是在各种高光谱数据集上执行深度学习实验的 Python 工具。 5 | ## 数据准备 6 | [1-preparation](1-preparation)演示把原始数据格式转化为mat格式,如果已有mat格式的数据,可跳过数据准备,把数据按要求放入到S3即可。 7 | ## 训练 8 | [2-training](2-training)演示在SageMaker上进行训练。 -------------------------------------------------------------------------------- /image-classification/README.md: -------------------------------------------------------------------------------- 1 | # 利用Amazon SageMaker内置算法进行图片分类 2 | Image-classification-lst-format.ipynb演示了利用Amazon SageMaker内置算法进行图片分类模型的训练和部署。 3 | 4 | ## 启动Amazon SageMaker笔记本实例 5 | 通过以下步骤启动Amazon SageMaker的笔记本实例 6 | * 访问SageMaker主页,点击左边栏目笔记本实例链接 7 | * 创建笔记本实例 8 | * 当笔记本实例处于InService状态时,可以通过点击JupyterLab链接进入到实例中 9 | 10 | ## 上传源文件到笔记本实例 11 | 点击左上角上传按钮,将Image-classification-lst-format.ipynb文件上传到笔记本实例中。 12 | 13 | ## 升级相应Kernel中sagemaker版本 14 | ``` 15 | source activate mxnet_p36 16 | pip install sagemaker --upgrade 17 | ``` 18 | 执行完以上命令后重启kernel 19 | 20 | 21 | ## 运行笔记本实例中的每个Cell 22 | 阅读每个Cell运行相关程序进行模型训练和推理 -------------------------------------------------------------------------------- /images/sagemaker_notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nwcdheap/sagemaker-workshop/9bcbc4c787da346fcf10aa6fd97a8a100328719a/images/sagemaker_notebook.png -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/1-training/container/Dockerfile: -------------------------------------------------------------------------------- 1 | #ARG BASE_IMG=763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.6.0-gpu-py36-cu101-ubuntu16.04 2 | ARG BASE_IMG=${BASE_IMG} 3 | FROM ${BASE_IMG} 4 | 5 | ENV PATH="/opt/code:${PATH}" 6 | 7 | COPY sources.list /etc/apt/ 8 | 9 | #RUN wget -qO - https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub | apt-key add - 10 | 11 | RUN apt-get update \ 12 | && apt-get install -y --no-install-recommends --allow-unauthenticated \ 13 | jq 14 | 15 | ## fix /usr/local/cuda-10.0/compat/libcuda.so 16 | ## RUN bash -c 'echo "/usr/local/cuda-10.0/compat" > /etc/ld.so.conf.d/cuda.conf' 17 | RUN ldconfig -v 18 | 19 | 20 | WORKDIR /opt/code 21 | COPY dockersource ./ 22 | RUN pip install -r /opt/code/requirements.txt -i https://opentuna.cn/pypi/web/simple/ 23 | ## https://github.com/aws/sagemaker-pytorch-training-toolkit/issues/143#issuecomment-566776288 24 | ## https://github.com/aws/sagemaker-pytorch-training-toolkit/blob/upgrade-training-toolkit/docker/build_artifacts/start_with_right_hostname.sh 25 | ## https://github.com/aws/deep-learning-containers/blob/v2.0-pt-1.5.1-py36/pytorch/training/docker/1.5.1/py3/Dockerfile.gpu#L181 26 | COPY changehostname.c /opt/code 27 | COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh 28 | COPY train /opt/code -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/1-training/container/changehostname.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <string.h> 3 | 4 | /* 5 | * Modifies gethostname to return algo-1, algo-2, etc. when running on SageMaker. 
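 *   (Wiring, per start_with_right_hostname.sh in this folder: at container start, sed replaces PLACEHOLDER_HOSTNAME with the current_host value, gcc compiles this file into libchangehostname.so, and the library is injected via LD_PRELOAD before train runs.)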
6 | * 7 | * Without this gethostname() on SageMaker returns 'aws', leading NCCL/MPI to think there is only one host, 8 | * not realizing that it needs to use NET/Socket. 9 | * 10 | * When docker container starts we read 'current_host' value from /opt/ml/input/config/resourceconfig.json 11 | * and replace PLACEHOLDER_HOSTNAME with it before compiling this code into a shared library. 12 | */ 13 | int gethostname(char *name, size_t len) 14 | { 15 | const char *val = PLACEHOLDER_HOSTNAME; 16 | strncpy(name, val, len); 17 | return 0; 18 | } 19 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/1-training/container/local_test/input/config/hyperparameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": "/opt/ml/input/data/training/cfg/data.yaml", 3 | "cfg": "/opt/ml/input/data/training/cfg/yolov5s.yaml", 4 | "hyp": "/opt/ml/input/data/training/cfg/hyp.yaml", 5 | "weights": "/opt/ml/input/data/training/weights/yolov5s.pt", 6 | "img": "640", 7 | "epochs": "2", 8 | "batch": "16" 9 | } 10 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/1-training/container/local_test/input/config/resourceconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "current_host": "algo-1", 3 | "hosts": ["algo-1","algo-2","algo-3"], 4 | "network_interface_name":"eth1" 5 | } -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/1-training/container/local_test/input/data/training/cfg/hyp.yaml: -------------------------------------------------------------------------------- 1 | lr0: 0.01 2 | lrf: 0.2 3 | momentum: 0.937 4 | weight_decay: 0.0005 5 | warmup_epochs: 3.0 6 | warmup_momentum: 0.8 7 | warmup_bias_lr: 0.1 8 | box: 0.05 9 | cls: 0.5 10 | cls_pw: 1.0 11 | obj: 1.0 12 | obj_pw: 1.0 13 | iou_t: 0.2 14 | anchor_t: 4.0 15 | fl_gamma: 0.0 16 | hsv_h: 0.015 17 | hsv_s: 0.7 18 | hsv_v: 0.4 19 | degrees: 0.0 20 | translate: 0.1 21 | scale: 0.5 22 | shear: 0.0 23 | perspective: 0.0 24 | flipud: 0.0 25 | fliplr: 0.5 26 | mosaic: 1.0 27 | mixup: 0.0 28 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/1-training/container/local_test/input/data/training/cfg/yolov5s.yaml: -------------------------------------------------------------------------------- 1 | # parameters 2 | nc: 80 # number of classes 3 | depth_multiple: 0.33 # model depth multiple 4 | width_multiple: 0.50 # layer channel multiple 5 | 6 | # anchors 7 | anchors: 8 | - [10,13, 16,30, 33,23] # P3/8 9 | - [30,61, 62,45, 59,119] # P4/16 10 | - [116,90, 156,198, 373,326] # P5/32 11 | 12 | # YOLOv5 backbone 13 | backbone: 14 | # [from, number, module, args] 15 | [[-1, 1, Focus, [64, 3]], # 0-P1/2 16 | [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 17 | [-1, 3, BottleneckCSP, [128]], 18 | [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 19 | [-1, 9, BottleneckCSP, [256]], 20 | [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 21 | [-1, 9, BottleneckCSP, [512]], 22 | [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 23 | [-1, 1, SPP, [1024, [5, 9, 13]]], 24 | [-1, 3, BottleneckCSP, [1024, False]], # 9 25 | ] 26 | 27 | # YOLOv5 head 28 | head: 29 | [[-1, 1, Conv, [512, 1, 1]], 30 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 31 | [[-1, 6], 1, Concat, [1]], # cat backbone P4 32 | [-1, 3, BottleneckCSP, [512, False]], # 13 33 | 34 | [-1, 1, Conv, 
[256, 1, 1]], 35 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 36 | [[-1, 4], 1, Concat, [1]], # cat backbone P3 37 | [-1, 3, BottleneckCSP, [256, False]], # 17 (P3/8-small) 38 | 39 | [-1, 1, Conv, [256, 3, 2]], 40 | [[-1, 14], 1, Concat, [1]], # cat head P4 41 | [-1, 3, BottleneckCSP, [512, False]], # 20 (P4/16-medium) 42 | 43 | [-1, 1, Conv, [512, 3, 2]], 44 | [[-1, 10], 1, Concat, [1]], # cat head P5 45 | [-1, 3, BottleneckCSP, [1024, False]], # 23 (P5/32-large) 46 | 47 | [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) 48 | ] 49 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/1-training/container/local_test/input/data/training/weights/yolov5s.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nwcdheap/sagemaker-workshop/9bcbc4c787da346fcf10aa6fd97a8a100328719a/object-detection/yolov5-on-sagemaker/1-training/container/local_test/input/data/training/weights/yolov5s.pt -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/1-training/container/sources.list: -------------------------------------------------------------------------------- 1 | deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ xenial main restricted universe multiverse 2 | # deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ xenial main restricted universe multiverse 3 | deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ xenial-updates main restricted universe multiverse 4 | # deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ xenial-updates main restricted universe multiverse 5 | deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ xenial-backports main restricted universe multiverse 6 | # deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ xenial-backports main restricted universe multiverse 7 | deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ xenial-security main restricted universe multiverse 8 | # deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ xenial-security main restricted universe multiverse 9 | 10 | # 预发布软件源,不建议启用 11 | # deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ xenial-proposed main restricted universe multiverse 12 | # deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ xenial-proposed main restricted universe multiverse -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/1-training/container/start_with_right_hostname.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [[ "$1" = "train" ]]; then 4 | CURRENT_HOST=$(jq .current_host /opt/ml/input/config/resourceconfig.json) 5 | sed -ie "s/PLACEHOLDER_HOSTNAME/$CURRENT_HOST/g" changehostname.c 6 | gcc -o changehostname.o -c -fPIC -Wall changehostname.c 7 | gcc -o libchangehostname.so -shared -export-dynamic changehostname.o -ldl 8 | LD_PRELOAD=/opt/code/libchangehostname.so train 9 | else 10 | eval "$@" 11 | fi 12 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/1-training/container/train: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | hpfile=/opt/ml/input/config/hyperparameters.json 3 | echo "========hyperparameters=======" 4 | cat $hpfile 5 | hp=$(cat $hpfile |jq -r -c 'to_entries | .[] |"--"+ .key + " " + .value ' | tr '\n' ' ') 6 | echo "=============" 7 | echo python /opt/code/train.py $hp 8 | echo "=============" 9 | python /opt/code/train.py $hp 10 | cp -r /opt/code/runs /opt/ml/model/ 11 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/1-training/training-build.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# YOLOv5 on SageMaker--Build 训练镜像" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 1 说明\n", 15 | "本章内容为build训练镜像,推送到AWS ECR,用户可直接使用build完毕的image,不用自己build。" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## 2 运行环境\n", 23 | "Kernel 选择pytorch_latest_p36。 \n", 24 | "本文在boto3 1.17.12和sagemaker 2.26.0下测试通过。" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "import boto3,sagemaker\n", 34 | "print(boto3.__version__)\n", 35 | "print(sagemaker.__version__)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## 3 Amazon 深度学习容器\n", 43 | "\n", 44 | "* [容器镜像清单](https://github.com/aws/deep-learning-containers/blob/master/available_images.md)\n", 45 | "* 本文基于pytorch training: `727897471807.dkr.ecr.cn-northwest-1.amazonaws.com.cn/pytorch-training:1.6.0-gpu-py36-cu101-ubuntu16.04`" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 4 下载YOLOv5" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "!git clone https://github.com/ultralytics/yolov5 container/dockersource" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "## 5 设置相关名称" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "ecr_repository = 'yolov5-training'\n", 78 | "tag = 'latest'" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "## 6 Build image" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "#国内pytorch training基础镜像地址,不要修改\n", 95 | "base_img='727897471807.dkr.ecr.cn-northwest-1.amazonaws.com.cn/pytorch-training:1.6.0-gpu-py36-cu101-ubuntu16.04'\n", 96 | "#登录基础镜像ECR,不要修改\n", 97 | "!aws ecr get-login-password --region cn-northwest-1 | docker login --username AWS --password-stdin 727897471807.dkr.ecr.cn-northwest-1.amazonaws.com.cn" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "%%time\n", 107 | "%cd container\n", 108 | "!docker build -t $ecr_repository -f Dockerfile --build-arg BASE_IMG=$base_img .\n", 109 | "%cd ../" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "## 7 在本地使用容器进行训练(可选)\n", 117 | "本地机器如果带GPU,使用`nvidia-docker run`;如果不带GPU,使用`docker run`,建议使用2xlarge以上机型,否则可能不足以分配内存。 \n", 118 | "训练模型结果存放在`container/local_test/model/runs/train/exp/weights`" 119 | 
] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "!nvidia-docker run -v $(pwd)/container/local_test/:/opt/ml/ --shm-size=8g --rm $ecr_repository train\n", 128 | "# !docker run -v $(pwd)/container/local_test/:/opt/ml/ --shm-size=8g --rm $ecr_repository train" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "## 8 推送到ECR" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "!aws ecr create-repository --repository-name $ecr_repository" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "import boto3\n", 154 | "region = boto3.session.Session().region_name\n", 155 | "account_id = boto3.client('sts').get_caller_identity().get('Account')\n", 156 | "image_uri = '{}.dkr.ecr.{}.amazonaws.com.cn/{}'.format(account_id, region, ecr_repository + \":\" + tag)\n", 157 | "!docker tag $ecr_repository:$tag $image_uri\n", 158 | "!$(aws ecr get-login --no-include-email)\n", 159 | "!docker push $image_uri" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [] 168 | } 169 | ], 170 | "metadata": { 171 | "kernelspec": { 172 | "display_name": "Environment (conda_pytorch_latest_p37)", 173 | "language": "python", 174 | "name": "conda_pytorch_latest_p37" 175 | }, 176 | "language_info": { 177 | "codemirror_mode": { 178 | "name": "ipython", 179 | "version": 3 180 | }, 181 | "file_extension": ".py", 182 | "mimetype": "text/x-python", 183 | "name": "python", 184 | "nbconvert_exporter": "python", 185 | "pygments_lexer": "ipython3", 186 | "version": "3.7.10" 187 | } 188 | }, 189 | "nbformat": 4, 190 | "nbformat_minor": 4 191 | } 192 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/1-training/training.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# YOLOv5 on SageMaker--训练" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 1 说明\n", 15 | "本章内容为用SageMaker进行训练,数据来自S3。" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## 2 运行环境\n", 23 | "Kernel 选择pytorch_latest_p36。 \n", 24 | "本文在boto3 1.17.12和sagemaker 2.26.0下测试通过。" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "import boto3,sagemaker\n", 34 | "print(boto3.__version__)\n", 35 | "print(sagemaker.__version__)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## 3 获取image" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "本项目已build完毕image,存放到ECR中,可直接部署到SageMaker。请选择选择合适版本。" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "tag = \"v3.1\"" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "import boto3\n", 68 | "region = boto3.session.Session().region_name\n", 69 | "image_uri = 
'048912060910.dkr.ecr.{}.amazonaws.com.cn/nwcd/yolov5-training:{}'.format(region,tag)\n", 70 | "image_uri" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "## 4 在SageMaker上训练" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "# 设置数据存放S3 bucket和前缀\n", 87 | "bucket = 'junzhong'\n", 88 | "pre_key = 'yolov5'" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "training_uri='s3://{}/{}/training/'.format(bucket, pre_key)\n", 98 | "outpath='s3://{}/{}/results/'.format(bucket, pre_key)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "import sagemaker,boto3\n", 108 | "\n", 109 | "iam = boto3.client('iam')\n", 110 | "roles = iam.list_roles(PathPrefix='/service-role')\n", 111 | "role=\"\"\n", 112 | "for current_role in roles[\"Roles\"]:\n", 113 | " if current_role[\"RoleName\"].startswith(\"AmazonSageMaker-ExecutionRole-\"):\n", 114 | " role=current_role[\"Arn\"]\n", 115 | " break\n", 116 | "#如果role为空表示有问题\n", 117 | "print(role)\n", 118 | "sm = boto3.client('sagemaker')" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "#设置是否使用spot实例进行训练\n", 128 | "use_spot = True" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "YOLOv5相关参数在`contariner/local_test/input/data/training/cfg/`目录下的`hyp.yaml`中,如需修改,请先修改。每次修改完后需要再同步。" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "!aws s3 sync container/local_test/input/data/training/ s3://{bucket}/{pre_key}/training/" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "from datetime import datetime\n", 154 | "now = datetime.now()\n", 155 | "job_name = 'yolov5-' + now.strftime(\"%Y-%m-%d-%H-%M-%S\")\n", 156 | "job_name" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "response = sm.create_training_job(\n", 166 | " TrainingJobName=job_name,\n", 167 | " HyperParameters={\n", 168 | " 'img':\"640\",\n", 169 | " 'batch':\"16\",\n", 170 | " 'epochs':\"15\",\n", 171 | " 'hyp':\"/opt/ml/input/data/training/cfg/hyp.yaml\",\n", 172 | " 'data':\"/opt/ml/input/data/training/cfg/data.yaml\",\n", 173 | " 'cfg':\"/opt/ml/input/data/training/cfg/yolov5s.yaml\",\n", 174 | " 'weights':\"/opt/ml/input/data/training/weights/yolov5s.pt\"\n", 175 | " },\n", 176 | " AlgorithmSpecification={\n", 177 | " 'TrainingImage': image_uri,\n", 178 | " 'TrainingInputMode': 'File',\n", 179 | " },\n", 180 | " RoleArn=role,\n", 181 | " InputDataConfig=[\n", 182 | " {\n", 183 | " 'ChannelName': 'training',\n", 184 | " 'DataSource': {\n", 185 | " 'S3DataSource': {\n", 186 | " 'S3DataType': 'S3Prefix',\n", 187 | " 'S3Uri': training_uri,\n", 188 | " 'S3DataDistributionType': 'FullyReplicated',\n", 189 | " },\n", 190 | " },\n", 191 | " 'InputMode': 'File'\n", 192 | " }\n", 193 | " ],\n", 194 | " OutputDataConfig={\n", 195 | " 'S3OutputPath': outpath\n", 196 | " },\n", 197 | " ResourceConfig={\n", 198 | " 'InstanceType': 'ml.p3.2xlarge',\n", 
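"        # 注:ml.p3.2xlarge 为单卡 V100 GPU 实例,可根据数据量与预算改用其他 GPU 实例类型\n",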
199 | " 'InstanceCount': 1,\n", 200 | " 'VolumeSizeInGB': 100,\n", 201 | " },\n", 202 | " EnableManagedSpotTraining=use_spot,\n", 203 | " StoppingCondition={\"MaxWaitTimeInSeconds\": 3600,\"MaxRuntimeInSeconds\": 3600} if use_spot else {\"MaxRuntimeInSeconds\": 3600}\n", 204 | " )\n", 205 | "response" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "查看状态,也可到SageMaker控制台查看。使用本Workshop提供的数据,大概需要15分钟。 \n", 213 | "每120秒获取一次状态,因此最多可能有2分钟的延迟。" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "status = sm.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']\n", 223 | "print('Training job current status: {}'.format(status))\n", 224 | "\n", 225 | "try:\n", 226 | " sm.get_waiter('training_job_completed_or_stopped').wait(TrainingJobName=job_name)\n", 227 | " training_info = sm.describe_training_job(TrainingJobName=job_name)\n", 228 | " status = training_info['TrainingJobStatus']\n", 229 | " print(\"Training job ended with status: \" + status)\n", 230 | "except:\n", 231 | " print('Training failed to start')\n", 232 | " message = sm.describe_training_job(TrainingJobName=job_name)['FailureReason']\n", 233 | " print('Training failed with the following error: {}'.format(message))" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "如果看到,\n", 241 | "\n", 242 | "> `Training job ended with status: Completed`\n", 243 | "\n", 244 | "这意味着训练成功完成。" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "## 5 下载训练结果" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "复制下面代码输出的`model_data`,在推理中要使用" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "respone = sm.describe_training_job(TrainingJobName=job_name)\n", 268 | "model_data = respone['ModelArtifacts']['S3ModelArtifacts']\n", 269 | "!echo -n $model_data > model_data.txt" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "!aws s3 cp {model_data} model.tar.gz" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "!tar -xvf model.tar.gz" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [] 296 | } 297 | ], 298 | "metadata": { 299 | "kernelspec": { 300 | "display_name": "Environment (conda_pytorch_latest_p37)", 301 | "language": "python", 302 | "name": "conda_pytorch_latest_p37" 303 | }, 304 | "language_info": { 305 | "codemirror_mode": { 306 | "name": "ipython", 307 | "version": 3 308 | }, 309 | "file_extension": ".py", 310 | "mimetype": "text/x-python", 311 | "name": "python", 312 | "nbconvert_exporter": "python", 313 | "pygments_lexer": "ipython3", 314 | "version": "3.7.10" 315 | } 316 | }, 317 | "nbformat": 4, 318 | "nbformat_minor": 4 319 | } 320 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/Dockerfile: -------------------------------------------------------------------------------- 1 | #ARG 
BASE_IMG=763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference:1.6.0-gpu-py36-cu101-ubuntu16.04 2 | ARG BASE_IMG=${BASE_IMG} 3 | FROM ${BASE_IMG} 4 | 5 | RUN apt-get update 6 | RUN apt-get install -y --no-install-recommends nginx net-tools\ 7 | && rm -rf /var/lib/apt/lists/* 8 | 9 | RUN pip install flask gevent gunicorn boto3 -i https://opentuna.cn/pypi/web/simple/ && \ 10 | rm -rf /root/.cache 11 | 12 | COPY aws /root/.aws 13 | # RUN mkdir /opt/ml/code 14 | WORKDIR /opt/ml/code 15 | COPY source ./ 16 | 17 | RUN pip install -r requirements.txt -i https://opentuna.cn/pypi/web/simple/ 18 | 19 | # Set some environment variables. PYTHONUNBUFFERED keeps Python from buffering our standard 20 | # output stream, which means that logs can be delivered to the user quickly. PYTHONDONTWRITEBYTECODE 21 | # keeps Python from writing the .pyc files which are unnecessary in this case. We also update 22 | # PATH so that the train and serve programs are found when the container is invoked. 23 | 24 | ENV PYTHONUNBUFFERED=TRUE 25 | ENV PYTHONDONTWRITEBYTECODE=TRUE 26 | ENV PATH="/opt/ml/code/:${PATH}" 27 | 28 | ENTRYPOINT ["python3"] -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/aws/config: -------------------------------------------------------------------------------- 1 | [default] 2 | region = cn-northwest-1 3 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/inference-build.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# YOLOv5 on SageMaker--Build 推理镜像" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 1 说明\n", 15 | "本章内容为build推理镜像,推送到AWS ECR,用户可直接使用build完毕的镜像,不用自己build。" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## 2 运行环境\n", 23 | "Kernel 选择pytorch_latest_p36。 \n", 24 | "本文在boto3 1.17.12和sagemaker 2.26.0下测试通过。" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "import boto3,sagemaker\n", 34 | "print(boto3.__version__)\n", 35 | "print(sagemaker.__version__)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## 3 本地推理(可选)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 34, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "mkdir: cannot create directory ‘/opt/ml’: File exists\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "!sudo mkdir /opt/ml\n", 60 | "!sudo chmod 777 /opt/ml" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "import os\n", 70 | "if not os.path.exists(\"/opt/ml/model\"):\n", 71 | " os.mkdir(\"/opt/ml/model\")" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "!cp -r ../1-training/runs/ /opt/ml/model/" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "新启动一个shell窗口,运行`conda activate pytorch_latest_p36`,然后必须cd到`2-inference/source`目录,再运行`python predictor.py`,正常启动会输出以下内容:\n", 88 | "```\n", 89 | "-------------init_output_dir 
/opt/ml/output_dir\n", 90 | " * Serving Flask app \"predictor\" (lazy loading)\n", 91 | " * Environment: production\n", 92 | " WARNING: This is a development server. Do not use it in a production deployment.\n", 93 | " Use a production WSGI server instead.\n", 94 | " * Debug mode: off\n", 95 | " * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)\n", 96 | "```" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "#修改请求图片\n", 106 | "!curl -H \"Content-Type: application/json\" -X POST --data '{\"bucket\":\"junzhong\",\"image_uri\":\"yolov5/training/images/val/000729.jpeg\"}' http://127.0.0.1:5000/invocations" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "#删除model文件,实际运行时,通过S3动态传入model\n", 116 | "import os\n", 117 | "model_file = \"source/yolov5s.pt\"\n", 118 | "if os.path.isfile(model_file):\n", 119 | " os.remove(model_file)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "## 4 Amazon 深度学习容器\n", 127 | "\n", 128 | "* [容器镜像清单](https://github.com/aws/deep-learning-containers/blob/master/available_images.md)\n", 129 | "* 本文基于pytorch inference: `727897471807.dkr.ecr.cn-northwest-1.amazonaws.com.cn/pytorch-inference:1.6.0-gpu-py36-cu101-ubuntu16.04`" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "## 5 设置相关名称" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "ecr_repository = 'yolov5-inference'\n", 146 | "tag = 'latest'" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "## 6 Build image" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "#国内pytorch inference基础镜像地址,不要修改\n", 163 | "base_img='727897471807.dkr.ecr.cn-northwest-1.amazonaws.com.cn/pytorch-inference:1.6.0-gpu-py36-cu101-ubuntu16.04'\n", 164 | "#登录基础镜像ECR,不要修改\n", 165 | "!aws ecr get-login-password --region cn-northwest-1 | docker login --username AWS --password-stdin 727897471807.dkr.ecr.cn-northwest-1.amazonaws.com.cn" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "!docker build -t $ecr_repository:$tag -f Dockerfile --build-arg BASE_IMG=$base_img ." 
175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "## 7 在本地使用容器进行推理(可选)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "import os\n", 191 | "if not os.path.exists(\"model\"):\n", 192 | " os.mkdir(\"model\")" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "!cp -r ../1-training/runs/ model/" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "本地机器如果带GPU,使用`nvidia-docker run`;如果不带GPU,使用`docker run`。" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "!docker run -v $(pwd)/model/:/opt/ml/model/ -p 8080:8080 -d --rm $ecr_repository:$tag serve" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "#修改请求图片\n", 227 | "!curl -H \"Content-Type: application/json\" -X POST --data '{\"bucket\":\"junzhong\",\"image_uri\":\"yolov5/training/images/val/000729.jpeg\"}' http://127.0.0.1:8080/invocations" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "## 8 推送到ECR" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "!aws ecr create-repository --repository-name $ecr_repository" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "import boto3\n", 253 | "region = boto3.session.Session().region_name\n", 254 | "account_id = boto3.client('sts').get_caller_identity().get('Account')\n", 255 | "image_uri = '{}.dkr.ecr.{}.amazonaws.com.cn/{}'.format(account_id, region, ecr_repository + \":\" + tag)\n", 256 | "!docker tag $ecr_repository:$tag $image_uri\n", 257 | "!$(aws ecr get-login --no-include-email)\n", 258 | "!docker push $image_uri" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [] 267 | } 268 | ], 269 | "metadata": { 270 | "kernelspec": { 271 | "display_name": "Environment (conda_pytorch_latest_p37)", 272 | "language": "python", 273 | "name": "conda_pytorch_latest_p37" 274 | }, 275 | "language_info": { 276 | "codemirror_mode": { 277 | "name": "ipython", 278 | "version": 3 279 | }, 280 | "file_extension": ".py", 281 | "mimetype": "text/x-python", 282 | "name": "python", 283 | "nbconvert_exporter": "python", 284 | "pygments_lexer": "ipython3", 285 | "version": "3.7.10" 286 | } 287 | }, 288 | "nbformat": 4, 289 | "nbformat_minor": 4 290 | } 291 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/detect.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import platform 4 | import shutil 5 | import time 6 | from pathlib import Path 7 | 8 | import cv2 9 | import torch 10 | import torch.backends.cudnn as cudnn 11 | from numpy import random 12 | 13 | from models.experimental import attempt_load 14 | from utils.datasets import LoadStreams, LoadImages 15 | from utils.general import ( 16 | check_img_size, non_max_suppression, 
apply_classifier, scale_coords, 17 | xyxy2xywh, plot_one_box, strip_optimizer, set_logging) 18 | from utils.torch_utils import select_device, load_classifier, time_synchronized 19 | 20 | 21 | def detect(source, img_size): 22 | webcam = source.isnumeric() or source.startswith(('rtsp://', 'rtmp://', 'http://')) or source.endswith('.txt') 23 | 24 | weights = "yolov5s.pt" 25 | 26 | # Initialize 27 | set_logging() 28 | device = select_device("") 29 | half = device.type != 'cpu' # half precision only supported on CUDA 30 | 31 | # Load model 32 | model = attempt_load(weights, map_location=device) # load FP32 model 33 | imgsz = check_img_size(img_size, s=model.stride.max()) # check img_size 34 | if half: 35 | model.half() # to FP16 36 | 37 | # Set Dataloader 38 | vid_path, vid_writer = None, None 39 | if webcam: 40 | view_img = True 41 | cudnn.benchmark = True # set True to speed up constant image size inference 42 | dataset = LoadStreams(source, img_size=imgsz) 43 | else: 44 | dataset = LoadImages(source, img_size=imgsz) 45 | 46 | # Get names and colors 47 | names = model.module.names if hasattr(model, 'module') else model.names 48 | colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(names))] 49 | 50 | # Run inference 51 | t0 = time.time() 52 | img = torch.zeros((1, 3, imgsz, imgsz), device=device) # init img 53 | _ = model(img.half() if half else img) if device.type != 'cpu' else None # run once 54 | 55 | result=[] 56 | 57 | for path, img, im0s, vid_cap in dataset: 58 | img = torch.from_numpy(img).to(device) 59 | img = img.half() if half else img.float() # uint8 to fp16/32 60 | img /= 255.0 # 0 - 255 to 0.0 - 1.0 61 | if img.ndimension() == 3: 62 | img = img.unsqueeze(0) 63 | 64 | # Inference 65 | t1 = time_synchronized() 66 | pred = model(img, augment=False)[0] 67 | 68 | # Apply NMS 69 | pred = non_max_suppression(pred, 0.4, 0.5) 70 | t2 = time_synchronized() 71 | 72 | # Process detections 73 | for i, det in enumerate(pred): # detections per image 74 | if webcam: # batch_size >= 1 75 | p, s, im0 = path[i], '%g: ' % i, im0s[i].copy() 76 | else: 77 | p, s, im0 = path, '', im0s 78 | 79 | gn = torch.tensor(im0.shape)[[1, 0, 1, 0]] # normalization gain whwh 80 | if det is not None and len(det): 81 | # Rescale boxes from img_size to im0 size 82 | det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round() 83 | 84 | # Write results 85 | for *xyxy, conf, cls in reversed(det): 86 | xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist() # normalized xywh 87 | item = {} 88 | item["class_name"] = names[int(cls)] 89 | item["class"] = int(cls) 90 | item["confidence"] = float(conf) 91 | item["xywh"] = xywh 92 | result.append(item) 93 | return result 94 | 95 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nwcdheap/sagemaker-workshop/9bcbc4c787da346fcf10aa6fd97a8a100328719a/object-detection/yolov5-on-sagemaker/2-inference/source/models/__init__.py -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/models/common.py: -------------------------------------------------------------------------------- 1 | # This file contains modules common to various models 2 | import math 3 | 4 | import torch 5 | import torch.nn as nn 6 | from utils.general import 
non_max_suppression 7 | 8 | 9 | def autopad(k, p=None): # kernel, padding 10 | # Pad to 'same' 11 | if p is None: 12 | p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad 13 | return p 14 | 15 | 16 | def DWConv(c1, c2, k=1, s=1, act=True): 17 | # Depthwise convolution 18 | return Conv(c1, c2, k, s, g=math.gcd(c1, c2), act=act) 19 | 20 | 21 | class Conv(nn.Module): 22 | # Standard convolution 23 | def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups 24 | super(Conv, self).__init__() 25 | self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False) 26 | self.bn = nn.BatchNorm2d(c2) 27 | self.act = nn.Hardswish() if act else nn.Identity() 28 | 29 | def forward(self, x): 30 | return self.act(self.bn(self.conv(x))) 31 | 32 | def fuseforward(self, x): 33 | return self.act(self.conv(x)) 34 | 35 | 36 | class Bottleneck(nn.Module): 37 | # Standard bottleneck 38 | def __init__(self, c1, c2, shortcut=True, g=1, e=0.5): # ch_in, ch_out, shortcut, groups, expansion 39 | super(Bottleneck, self).__init__() 40 | c_ = int(c2 * e) # hidden channels 41 | self.cv1 = Conv(c1, c_, 1, 1) 42 | self.cv2 = Conv(c_, c2, 3, 1, g=g) 43 | self.add = shortcut and c1 == c2 44 | 45 | def forward(self, x): 46 | return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) 47 | 48 | 49 | class BottleneckCSP(nn.Module): 50 | # CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks 51 | def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion 52 | super(BottleneckCSP, self).__init__() 53 | c_ = int(c2 * e) # hidden channels 54 | self.cv1 = Conv(c1, c_, 1, 1) 55 | self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False) 56 | self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False) 57 | self.cv4 = Conv(2 * c_, c2, 1, 1) 58 | self.bn = nn.BatchNorm2d(2 * c_) # applied to cat(cv2, cv3) 59 | self.act = nn.LeakyReLU(0.1, inplace=True) 60 | self.m = nn.Sequential(*[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)]) 61 | 62 | def forward(self, x): 63 | y1 = self.cv3(self.m(self.cv1(x))) 64 | y2 = self.cv2(x) 65 | return self.cv4(self.act(self.bn(torch.cat((y1, y2), dim=1)))) 66 | 67 | 68 | class SPP(nn.Module): 69 | # Spatial pyramid pooling layer used in YOLOv3-SPP 70 | def __init__(self, c1, c2, k=(5, 9, 13)): 71 | super(SPP, self).__init__() 72 | c_ = c1 // 2 # hidden channels 73 | self.cv1 = Conv(c1, c_, 1, 1) 74 | self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1) 75 | self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k]) 76 | 77 | def forward(self, x): 78 | x = self.cv1(x) 79 | return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1)) 80 | 81 | 82 | class Focus(nn.Module): 83 | # Focus wh information into c-space 84 | def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups 85 | super(Focus, self).__init__() 86 | self.conv = Conv(c1 * 4, c2, k, s, p, g, act) 87 | 88 | def forward(self, x): # x(b,c,w,h) -> y(b,4c,w/2,h/2) 89 | return self.conv(torch.cat([x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]], 1)) 90 | 91 | 92 | class Concat(nn.Module): 93 | # Concatenate a list of tensors along dimension 94 | def __init__(self, dimension=1): 95 | super(Concat, self).__init__() 96 | self.d = dimension 97 | 98 | def forward(self, x): 99 | return torch.cat(x, self.d) 100 | 101 | 102 | class NMS(nn.Module): 103 | # Non-Maximum Suppression (NMS) module 104 | conf = 0.3 # 
confidence threshold 105 | iou = 0.6 # IoU threshold 106 | classes = None # (optional list) filter by class 107 | 108 | def __init__(self, dimension=1): 109 | super(NMS, self).__init__() 110 | 111 | def forward(self, x): 112 | return non_max_suppression(x[0], conf_thres=self.conf, iou_thres=self.iou, classes=self.classes) 113 | 114 | 115 | class Flatten(nn.Module): 116 | # Use after nn.AdaptiveAvgPool2d(1) to remove last 2 dimensions 117 | @staticmethod 118 | def forward(x): 119 | return x.view(x.size(0), -1) 120 | 121 | 122 | class Classify(nn.Module): 123 | # Classification head, i.e. x(b,c1,20,20) to x(b,c2) 124 | def __init__(self, c1, c2, k=1, s=1, p=None, g=1): # ch_in, ch_out, kernel, stride, padding, groups 125 | super(Classify, self).__init__() 126 | self.aap = nn.AdaptiveAvgPool2d(1) # to x(b,c1,1,1) 127 | self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False) # to x(b,c2,1,1) 128 | self.flat = Flatten() 129 | 130 | def forward(self, x): 131 | z = torch.cat([self.aap(y) for y in (x if isinstance(x, list) else [x])], 1) # cat if list 132 | return self.flat(self.conv(z)) # flatten to x(b,c2) 133 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/models/experimental.py: -------------------------------------------------------------------------------- 1 | # This file contains experimental modules 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | 7 | from models.common import Conv, DWConv 8 | from utils.google_utils import attempt_download 9 | 10 | 11 | class CrossConv(nn.Module): 12 | # Cross Convolution Downsample 13 | def __init__(self, c1, c2, k=3, s=1, g=1, e=1.0, shortcut=False): 14 | # ch_in, ch_out, kernel, stride, groups, expansion, shortcut 15 | super(CrossConv, self).__init__() 16 | c_ = int(c2 * e) # hidden channels 17 | self.cv1 = Conv(c1, c_, (1, k), (1, s)) 18 | self.cv2 = Conv(c_, c2, (k, 1), (s, 1), g=g) 19 | self.add = shortcut and c1 == c2 20 | 21 | def forward(self, x): 22 | return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) 23 | 24 | 25 | class C3(nn.Module): 26 | # Cross Convolution CSP 27 | def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion 28 | super(C3, self).__init__() 29 | c_ = int(c2 * e) # hidden channels 30 | self.cv1 = Conv(c1, c_, 1, 1) 31 | self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False) 32 | self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False) 33 | self.cv4 = Conv(2 * c_, c2, 1, 1) 34 | self.bn = nn.BatchNorm2d(2 * c_) # applied to cat(cv2, cv3) 35 | self.act = nn.LeakyReLU(0.1, inplace=True) 36 | self.m = nn.Sequential(*[CrossConv(c_, c_, 3, 1, g, 1.0, shortcut) for _ in range(n)]) 37 | 38 | def forward(self, x): 39 | y1 = self.cv3(self.m(self.cv1(x))) 40 | y2 = self.cv2(x) 41 | return self.cv4(self.act(self.bn(torch.cat((y1, y2), dim=1)))) 42 | 43 | 44 | class Sum(nn.Module): 45 | # Weighted sum of 2 or more layers https://arxiv.org/abs/1911.09070 46 | def __init__(self, n, weight=False): # n: number of inputs 47 | super(Sum, self).__init__() 48 | self.weight = weight # apply weights boolean 49 | self.iter = range(n - 1) # iter object 50 | if weight: 51 | self.w = nn.Parameter(-torch.arange(1., n) / 2, requires_grad=True) # layer weights 52 | 53 | def forward(self, x): 54 | y = x[0] # no weight 55 | if self.weight: 56 | w = torch.sigmoid(self.w) * 2 57 | for i in self.iter: 58 | y = y + x[i + 1] * w[i] 59 | else: 60 | for i in self.iter: 61 | y = 
y + x[i + 1] 62 | return y 63 | 64 | 65 | class GhostConv(nn.Module): 66 | # Ghost Convolution https://github.com/huawei-noah/ghostnet 67 | def __init__(self, c1, c2, k=1, s=1, g=1, act=True): # ch_in, ch_out, kernel, stride, groups 68 | super(GhostConv, self).__init__() 69 | c_ = c2 // 2 # hidden channels 70 | self.cv1 = Conv(c1, c_, k, s, g, act) 71 | self.cv2 = Conv(c_, c_, 5, 1, c_, act) 72 | 73 | def forward(self, x): 74 | y = self.cv1(x) 75 | return torch.cat([y, self.cv2(y)], 1) 76 | 77 | 78 | class GhostBottleneck(nn.Module): 79 | # Ghost Bottleneck https://github.com/huawei-noah/ghostnet 80 | def __init__(self, c1, c2, k, s): 81 | super(GhostBottleneck, self).__init__() 82 | c_ = c2 // 2 83 | self.conv = nn.Sequential(GhostConv(c1, c_, 1, 1), # pw 84 | DWConv(c_, c_, k, s, act=False) if s == 2 else nn.Identity(), # dw 85 | GhostConv(c_, c2, 1, 1, act=False)) # pw-linear 86 | self.shortcut = nn.Sequential(DWConv(c1, c1, k, s, act=False), 87 | Conv(c1, c2, 1, 1, act=False)) if s == 2 else nn.Identity() 88 | 89 | def forward(self, x): 90 | return self.conv(x) + self.shortcut(x) 91 | 92 | 93 | class MixConv2d(nn.Module): 94 | # Mixed Depthwise Conv https://arxiv.org/abs/1907.09595 95 | def __init__(self, c1, c2, k=(1, 3), s=1, equal_ch=True): 96 | super(MixConv2d, self).__init__() 97 | groups = len(k) 98 | if equal_ch: # equal c_ per group 99 | i = torch.linspace(0, groups - 1E-6, c2).floor() # c2 indices 100 | c_ = [(i == g).sum() for g in range(groups)] # intermediate channels 101 | else: # equal weight.numel() per group 102 | b = [c2] + [0] * groups 103 | a = np.eye(groups + 1, groups, k=-1) 104 | a -= np.roll(a, 1, axis=1) 105 | a *= np.array(k) ** 2 106 | a[0] = 1 107 | c_ = np.linalg.lstsq(a, b, rcond=None)[0].round() # solve for equal weight indices, ax = b 108 | 109 | self.m = nn.ModuleList([nn.Conv2d(c1, int(c_[g]), k[g], s, k[g] // 2, bias=False) for g in range(groups)]) 110 | self.bn = nn.BatchNorm2d(c2) 111 | self.act = nn.LeakyReLU(0.1, inplace=True) 112 | 113 | def forward(self, x): 114 | return x + self.act(self.bn(torch.cat([m(x) for m in self.m], 1))) 115 | 116 | 117 | class Ensemble(nn.ModuleList): 118 | # Ensemble of models 119 | def __init__(self): 120 | super(Ensemble, self).__init__() 121 | 122 | def forward(self, x, augment=False): 123 | y = [] 124 | for module in self: 125 | y.append(module(x, augment)[0]) 126 | # y = torch.stack(y).max(0)[0] # max ensemble 127 | # y = torch.cat(y, 1) # nms ensemble 128 | y = torch.stack(y).mean(0) # mean ensemble 129 | return y, None # inference, train output 130 | 131 | 132 | def attempt_load(weights, map_location=None): 133 | # Loads an ensemble of models weights=[a,b,c] or a single model weights=[a] or weights=a 134 | model = Ensemble() 135 | for w in weights if isinstance(weights, list) else [weights]: 136 | attempt_download(w) 137 | model.append(torch.load(w, map_location=map_location)['model'].float().fuse().eval()) # load FP32 model 138 | 139 | if len(model) == 1: 140 | return model[-1] # return model 141 | else: 142 | print('Ensemble created with %s\n' % weights) 143 | for k in ['names', 'stride']: 144 | setattr(model, k, getattr(model[-1], k)) 145 | return model # return ensemble 146 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/models/export.py: -------------------------------------------------------------------------------- 1 | """Exports a YOLOv5 *.pt model to ONNX and TorchScript formats 2 | 3 | Usage: 4 | $ export 
PYTHONPATH="$PWD" && python models/export.py --weights ./weights/yolov5s.pt --img 640 --batch 1 5 | """ 6 | 7 | import argparse 8 | 9 | import torch 10 | import torch.nn as nn 11 | 12 | import models 13 | from models.experimental import attempt_load 14 | from utils.activations import Hardswish 15 | from utils.general import set_logging 16 | 17 | if __name__ == '__main__': 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--weights', type=str, default='./yolov5s.pt', help='weights path') # from yolov5/models/ 20 | parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='image size') # height, width 21 | parser.add_argument('--batch-size', type=int, default=1, help='batch size') 22 | opt = parser.parse_args() 23 | opt.img_size *= 2 if len(opt.img_size) == 1 else 1 # expand 24 | print(opt) 25 | set_logging() 26 | 27 | # Input 28 | img = torch.zeros((opt.batch_size, 3, *opt.img_size)) # image size(1,3,320,192) iDetection 29 | 30 | # Load PyTorch model 31 | model = attempt_load(opt.weights, map_location=torch.device('cpu')) # load FP32 model 32 | 33 | # Update model 34 | for k, m in model.named_modules(): 35 | m._non_persistent_buffers_set = set() # pytorch 1.6.0 compatability 36 | if isinstance(m, models.common.Conv) and isinstance(m.act, nn.Hardswish): 37 | m.act = Hardswish() # assign activation 38 | # if isinstance(m, models.yolo.Detect): 39 | # m.forward = m.forward_export # assign forward (optional) 40 | model.model[-1].export = True # set Detect() layer export=True 41 | y = model(img) # dry run 42 | 43 | # TorchScript export 44 | try: 45 | print('\nStarting TorchScript export with torch %s...' % torch.__version__) 46 | f = opt.weights.replace('.pt', '.torchscript.pt') # filename 47 | ts = torch.jit.trace(model, img) 48 | ts.save(f) 49 | print('TorchScript export success, saved as %s' % f) 50 | except Exception as e: 51 | print('TorchScript export failure: %s' % e) 52 | 53 | # ONNX export 54 | try: 55 | import onnx 56 | 57 | print('\nStarting ONNX export with onnx %s...' % onnx.__version__) 58 | f = opt.weights.replace('.pt', '.onnx') # filename 59 | torch.onnx.export(model, img, f, verbose=False, opset_version=12, input_names=['images'], 60 | output_names=['classes', 'boxes'] if y is None else ['output']) 61 | 62 | # Checks 63 | onnx_model = onnx.load(f) # load onnx model 64 | onnx.checker.check_model(onnx_model) # check onnx model 65 | # print(onnx.helper.printable_graph(onnx_model.graph)) # print a human readable model 66 | print('ONNX export success, saved as %s' % f) 67 | except Exception as e: 68 | print('ONNX export failure: %s' % e) 69 | 70 | # CoreML export 71 | try: 72 | import coremltools as ct 73 | 74 | print('\nStarting CoreML export with coremltools %s...' % ct.__version__) 75 | # convert model from torchscript and apply pixel scaling as per detect.py 76 | model = ct.convert(ts, inputs=[ct.ImageType(name='images', shape=img.shape, scale=1 / 255.0, bias=[0, 0, 0])]) 77 | f = opt.weights.replace('.pt', '.mlmodel') # filename 78 | model.save(f) 79 | print('CoreML export success, saved as %s' % f) 80 | except Exception as e: 81 | print('CoreML export failure: %s' % e) 82 | 83 | # Finish 84 | print('\nExport complete. 
Visualize with https://github.com/lutzroeder/netron.') 85 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/models/hub/yolov3-spp.yaml: -------------------------------------------------------------------------------- 1 | # parameters 2 | nc: 80 # number of classes 3 | depth_multiple: 1.0 # model depth multiple 4 | width_multiple: 1.0 # layer channel multiple 5 | 6 | # anchors 7 | anchors: 8 | - [10,13, 16,30, 33,23] # P3/8 9 | - [30,61, 62,45, 59,119] # P4/16 10 | - [116,90, 156,198, 373,326] # P5/32 11 | 12 | # darknet53 backbone 13 | backbone: 14 | # [from, number, module, args] 15 | [[-1, 1, Conv, [32, 3, 1]], # 0 16 | [-1, 1, Conv, [64, 3, 2]], # 1-P1/2 17 | [-1, 1, Bottleneck, [64]], 18 | [-1, 1, Conv, [128, 3, 2]], # 3-P2/4 19 | [-1, 2, Bottleneck, [128]], 20 | [-1, 1, Conv, [256, 3, 2]], # 5-P3/8 21 | [-1, 8, Bottleneck, [256]], 22 | [-1, 1, Conv, [512, 3, 2]], # 7-P4/16 23 | [-1, 8, Bottleneck, [512]], 24 | [-1, 1, Conv, [1024, 3, 2]], # 9-P5/32 25 | [-1, 4, Bottleneck, [1024]], # 10 26 | ] 27 | 28 | # YOLOv3-SPP head 29 | head: 30 | [[-1, 1, Bottleneck, [1024, False]], 31 | [-1, 1, SPP, [512, [5, 9, 13]]], 32 | [-1, 1, Conv, [1024, 3, 1]], 33 | [-1, 1, Conv, [512, 1, 1]], 34 | [-1, 1, Conv, [1024, 3, 1]], # 15 (P5/32-large) 35 | 36 | [-2, 1, Conv, [256, 1, 1]], 37 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 38 | [[-1, 8], 1, Concat, [1]], # cat backbone P4 39 | [-1, 1, Bottleneck, [512, False]], 40 | [-1, 1, Bottleneck, [512, False]], 41 | [-1, 1, Conv, [256, 1, 1]], 42 | [-1, 1, Conv, [512, 3, 1]], # 22 (P4/16-medium) 43 | 44 | [-2, 1, Conv, [128, 1, 1]], 45 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 46 | [[-1, 6], 1, Concat, [1]], # cat backbone P3 47 | [-1, 1, Bottleneck, [256, False]], 48 | [-1, 2, Bottleneck, [256, False]], # 27 (P3/8-small) 49 | 50 | [[27, 22, 15], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) 51 | ] 52 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/models/hub/yolov5-fpn.yaml: -------------------------------------------------------------------------------- 1 | # parameters 2 | nc: 80 # number of classes 3 | depth_multiple: 1.0 # model depth multiple 4 | width_multiple: 1.0 # layer channel multiple 5 | 6 | # anchors 7 | anchors: 8 | - [10,13, 16,30, 33,23] # P3/8 9 | - [30,61, 62,45, 59,119] # P4/16 10 | - [116,90, 156,198, 373,326] # P5/32 11 | 12 | # YOLOv5 backbone 13 | backbone: 14 | # [from, number, module, args] 15 | [[-1, 1, Focus, [64, 3]], # 0-P1/2 16 | [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 17 | [-1, 3, Bottleneck, [128]], 18 | [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 19 | [-1, 9, BottleneckCSP, [256]], 20 | [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 21 | [-1, 9, BottleneckCSP, [512]], 22 | [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 23 | [-1, 1, SPP, [1024, [5, 9, 13]]], 24 | [-1, 6, BottleneckCSP, [1024]], # 9 25 | ] 26 | 27 | # YOLOv5 FPN head 28 | head: 29 | [[-1, 3, BottleneckCSP, [1024, False]], # 10 (P5/32-large) 30 | 31 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 32 | [[-1, 6], 1, Concat, [1]], # cat backbone P4 33 | [-1, 1, Conv, [512, 1, 1]], 34 | [-1, 3, BottleneckCSP, [512, False]], # 14 (P4/16-medium) 35 | 36 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 37 | [[-1, 4], 1, Concat, [1]], # cat backbone P3 38 | [-1, 1, Conv, [256, 1, 1]], 39 | [-1, 3, BottleneckCSP, [256, False]], # 18 (P3/8-small) 40 | 41 | [[18, 14, 10], 1, Detect, [nc, anchors]], # Detect(P3, P4, 
P5) 42 | ] 43 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/models/hub/yolov5-panet.yaml: -------------------------------------------------------------------------------- 1 | # parameters 2 | nc: 80 # number of classes 3 | depth_multiple: 1.0 # model depth multiple 4 | width_multiple: 1.0 # layer channel multiple 5 | 6 | # anchors 7 | anchors: 8 | - [116,90, 156,198, 373,326] # P5/32 9 | - [30,61, 62,45, 59,119] # P4/16 10 | - [10,13, 16,30, 33,23] # P3/8 11 | 12 | # YOLOv5 backbone 13 | backbone: 14 | # [from, number, module, args] 15 | [[-1, 1, Focus, [64, 3]], # 0-P1/2 16 | [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 17 | [-1, 3, BottleneckCSP, [128]], 18 | [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 19 | [-1, 9, BottleneckCSP, [256]], 20 | [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 21 | [-1, 9, BottleneckCSP, [512]], 22 | [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 23 | [-1, 1, SPP, [1024, [5, 9, 13]]], 24 | [-1, 3, BottleneckCSP, [1024, False]], # 9 25 | ] 26 | 27 | # YOLOv5 PANet head 28 | head: 29 | [[-1, 1, Conv, [512, 1, 1]], 30 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 31 | [[-1, 6], 1, Concat, [1]], # cat backbone P4 32 | [-1, 3, BottleneckCSP, [512, False]], # 13 33 | 34 | [-1, 1, Conv, [256, 1, 1]], 35 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 36 | [[-1, 4], 1, Concat, [1]], # cat backbone P3 37 | [-1, 3, BottleneckCSP, [256, False]], # 17 (P3/8-small) 38 | 39 | [-1, 1, Conv, [256, 3, 2]], 40 | [[-1, 14], 1, Concat, [1]], # cat head P4 41 | [-1, 3, BottleneckCSP, [512, False]], # 20 (P4/16-medium) 42 | 43 | [-1, 1, Conv, [512, 3, 2]], 44 | [[-1, 10], 1, Concat, [1]], # cat head P5 45 | [-1, 3, BottleneckCSP, [1024, False]], # 23 (P5/32-large) 46 | 47 | [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P5, P4, P3) 48 | ] 49 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/models/yolov5l.yaml: -------------------------------------------------------------------------------- 1 | # parameters 2 | nc: 80 # number of classes 3 | depth_multiple: 1.0 # model depth multiple 4 | width_multiple: 1.0 # layer channel multiple 5 | 6 | # anchors 7 | anchors: 8 | - [10,13, 16,30, 33,23] # P3/8 9 | - [30,61, 62,45, 59,119] # P4/16 10 | - [116,90, 156,198, 373,326] # P5/32 11 | 12 | # YOLOv5 backbone 13 | backbone: 14 | # [from, number, module, args] 15 | [[-1, 1, Focus, [64, 3]], # 0-P1/2 16 | [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 17 | [-1, 3, BottleneckCSP, [128]], 18 | [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 19 | [-1, 9, BottleneckCSP, [256]], 20 | [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 21 | [-1, 9, BottleneckCSP, [512]], 22 | [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 23 | [-1, 1, SPP, [1024, [5, 9, 13]]], 24 | [-1, 3, BottleneckCSP, [1024, False]], # 9 25 | ] 26 | 27 | # YOLOv5 head 28 | head: 29 | [[-1, 1, Conv, [512, 1, 1]], 30 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 31 | [[-1, 6], 1, Concat, [1]], # cat backbone P4 32 | [-1, 3, BottleneckCSP, [512, False]], # 13 33 | 34 | [-1, 1, Conv, [256, 1, 1]], 35 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 36 | [[-1, 4], 1, Concat, [1]], # cat backbone P3 37 | [-1, 3, BottleneckCSP, [256, False]], # 17 (P3/8-small) 38 | 39 | [-1, 1, Conv, [256, 3, 2]], 40 | [[-1, 14], 1, Concat, [1]], # cat head P4 41 | [-1, 3, BottleneckCSP, [512, False]], # 20 (P4/16-medium) 42 | 43 | [-1, 1, Conv, [512, 3, 2]], 44 | [[-1, 10], 1, Concat, [1]], # cat head P5 45 | [-1, 3, BottleneckCSP, [1024, 
False]], # 23 (P5/32-large) 46 | 47 | [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) 48 | ] 49 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/models/yolov5m.yaml: -------------------------------------------------------------------------------- 1 | # parameters 2 | nc: 80 # number of classes 3 | depth_multiple: 0.67 # model depth multiple 4 | width_multiple: 0.75 # layer channel multiple 5 | 6 | # anchors 7 | anchors: 8 | - [10,13, 16,30, 33,23] # P3/8 9 | - [30,61, 62,45, 59,119] # P4/16 10 | - [116,90, 156,198, 373,326] # P5/32 11 | 12 | # YOLOv5 backbone 13 | backbone: 14 | # [from, number, module, args] 15 | [[-1, 1, Focus, [64, 3]], # 0-P1/2 16 | [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 17 | [-1, 3, BottleneckCSP, [128]], 18 | [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 19 | [-1, 9, BottleneckCSP, [256]], 20 | [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 21 | [-1, 9, BottleneckCSP, [512]], 22 | [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 23 | [-1, 1, SPP, [1024, [5, 9, 13]]], 24 | [-1, 3, BottleneckCSP, [1024, False]], # 9 25 | ] 26 | 27 | # YOLOv5 head 28 | head: 29 | [[-1, 1, Conv, [512, 1, 1]], 30 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 31 | [[-1, 6], 1, Concat, [1]], # cat backbone P4 32 | [-1, 3, BottleneckCSP, [512, False]], # 13 33 | 34 | [-1, 1, Conv, [256, 1, 1]], 35 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 36 | [[-1, 4], 1, Concat, [1]], # cat backbone P3 37 | [-1, 3, BottleneckCSP, [256, False]], # 17 (P3/8-small) 38 | 39 | [-1, 1, Conv, [256, 3, 2]], 40 | [[-1, 14], 1, Concat, [1]], # cat head P4 41 | [-1, 3, BottleneckCSP, [512, False]], # 20 (P4/16-medium) 42 | 43 | [-1, 1, Conv, [512, 3, 2]], 44 | [[-1, 10], 1, Concat, [1]], # cat head P5 45 | [-1, 3, BottleneckCSP, [1024, False]], # 23 (P5/32-large) 46 | 47 | [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) 48 | ] 49 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/models/yolov5s.yaml: -------------------------------------------------------------------------------- 1 | # parameters 2 | nc: 80 # number of classes 3 | depth_multiple: 0.33 # model depth multiple 4 | width_multiple: 0.50 # layer channel multiple 5 | 6 | # anchors 7 | anchors: 8 | - [10,13, 16,30, 33,23] # P3/8 9 | - [30,61, 62,45, 59,119] # P4/16 10 | - [116,90, 156,198, 373,326] # P5/32 11 | 12 | # YOLOv5 backbone 13 | backbone: 14 | # [from, number, module, args] 15 | [[-1, 1, Focus, [64, 3]], # 0-P1/2 16 | [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 17 | [-1, 3, BottleneckCSP, [128]], 18 | [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 19 | [-1, 9, BottleneckCSP, [256]], 20 | [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 21 | [-1, 9, BottleneckCSP, [512]], 22 | [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 23 | [-1, 1, SPP, [1024, [5, 9, 13]]], 24 | [-1, 3, BottleneckCSP, [1024, False]], # 9 25 | ] 26 | 27 | # YOLOv5 head 28 | head: 29 | [[-1, 1, Conv, [512, 1, 1]], 30 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 31 | [[-1, 6], 1, Concat, [1]], # cat backbone P4 32 | [-1, 3, BottleneckCSP, [512, False]], # 13 33 | 34 | [-1, 1, Conv, [256, 1, 1]], 35 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 36 | [[-1, 4], 1, Concat, [1]], # cat backbone P3 37 | [-1, 3, BottleneckCSP, [256, False]], # 17 (P3/8-small) 38 | 39 | [-1, 1, Conv, [256, 3, 2]], 40 | [[-1, 14], 1, Concat, [1]], # cat head P4 41 | [-1, 3, BottleneckCSP, [512, False]], # 20 (P4/16-medium) 42 | 43 | [-1, 1, Conv, [512, 3, 2]], 
44 | [[-1, 10], 1, Concat, [1]], # cat head P5 45 | [-1, 3, BottleneckCSP, [1024, False]], # 23 (P5/32-large) 46 | 47 | [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) 48 | ] 49 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/models/yolov5x.yaml: -------------------------------------------------------------------------------- 1 | # parameters 2 | nc: 80 # number of classes 3 | depth_multiple: 1.33 # model depth multiple 4 | width_multiple: 1.25 # layer channel multiple 5 | 6 | # anchors 7 | anchors: 8 | - [10,13, 16,30, 33,23] # P3/8 9 | - [30,61, 62,45, 59,119] # P4/16 10 | - [116,90, 156,198, 373,326] # P5/32 11 | 12 | # YOLOv5 backbone 13 | backbone: 14 | # [from, number, module, args] 15 | [[-1, 1, Focus, [64, 3]], # 0-P1/2 16 | [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 17 | [-1, 3, BottleneckCSP, [128]], 18 | [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 19 | [-1, 9, BottleneckCSP, [256]], 20 | [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 21 | [-1, 9, BottleneckCSP, [512]], 22 | [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 23 | [-1, 1, SPP, [1024, [5, 9, 13]]], 24 | [-1, 3, BottleneckCSP, [1024, False]], # 9 25 | ] 26 | 27 | # YOLOv5 head 28 | head: 29 | [[-1, 1, Conv, [512, 1, 1]], 30 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 31 | [[-1, 6], 1, Concat, [1]], # cat backbone P4 32 | [-1, 3, BottleneckCSP, [512, False]], # 13 33 | 34 | [-1, 1, Conv, [256, 1, 1]], 35 | [-1, 1, nn.Upsample, [None, 2, 'nearest']], 36 | [[-1, 4], 1, Concat, [1]], # cat backbone P3 37 | [-1, 3, BottleneckCSP, [256, False]], # 17 (P3/8-small) 38 | 39 | [-1, 1, Conv, [256, 3, 2]], 40 | [[-1, 14], 1, Concat, [1]], # cat head P4 41 | [-1, 3, BottleneckCSP, [512, False]], # 20 (P4/16-medium) 42 | 43 | [-1, 1, Conv, [512, 3, 2]], 44 | [[-1, 10], 1, Concat, [1]], # cat head P5 45 | [-1, 3, BottleneckCSP, [1024, False]], # 23 (P5/32-large) 46 | 47 | [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) 48 | ] 49 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/nginx.conf: -------------------------------------------------------------------------------- 1 | worker_processes 1; 2 | daemon off; # Prevent forking 3 | 4 | 5 | pid /tmp/nginx.pid; 6 | error_log /var/log/nginx/error.log; 7 | 8 | events { 9 | # defaults 10 | } 11 | 12 | http { 13 | include /etc/nginx/mime.types; 14 | default_type application/octet-stream; 15 | access_log /var/log/nginx/access.log combined; 16 | 17 | upstream gunicorn { 18 | server unix:/tmp/gunicorn.sock; 19 | } 20 | 21 | server { 22 | listen 8080 deferred; 23 | client_max_body_size 5m; 24 | 25 | keepalive_timeout 5; 26 | proxy_read_timeout 1200s; 27 | 28 | location ~ ^/(ping|invocations) { 29 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 30 | proxy_set_header Host $http_host; 31 | proxy_redirect off; 32 | proxy_pass http://gunicorn; 33 | } 34 | 35 | location / { 36 | return 404 "{}"; 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import boto3 4 | import flask 5 | import json 6 | import shutil 7 | import time 8 | import random 9 | from detect import detect 10 | 11 | DEBUG = False 12 | 13 | app = flask.Flask(__name__) 14 | 15 | import logging 16 | logger = 
logging.getLogger(__name__) 17 | logger.setLevel(logging.DEBUG) 18 | logger.addHandler(logging.StreamHandler(sys.stdout)) 19 | 20 | @app.route('/ping', methods=['GET']) 21 | def ping(): 22 | """Determine if the container is working and healthy. In this sample container, we declare 23 | it healthy if we can load the model successfully.""" 24 | #health = boto3.client('s3') is not None # You can insert a health check here 25 | 26 | #status = 200 if health else 404 27 | status = 200 28 | return flask.Response(response='\n', status=status, mimetype='application/json') 29 | 30 | 31 | @app.route('/') 32 | def hello_world(): 33 | return 'YOLOv5 endpoint' 34 | 35 | 36 | @app.route('/invocations', methods=['POST']) 37 | def invocations(): 38 | data = None 39 | #解析json, 40 | if flask.request.content_type == 'application/json': 41 | data = flask.request.data.decode('utf-8') 42 | data = json.loads(data) 43 | logger.info("invocations params [{}]".format(data)) 44 | bucket = data['bucket'] 45 | image_uri = data['image_uri'] 46 | else: 47 | return flask.Response(response='This predictor only supports JSON data', status=415, mimetype='text/plain') 48 | 49 | tt = time.strftime("%Y%m%d%H%M%S", time.localtime()) 50 | for i in range(0,5): 51 | current_output_dir = os.path.join(init_output_dir,tt+str(random.randint(1000,9999))) 52 | if not os.path.exists(current_output_dir): 53 | try: 54 | os.mkdir(current_output_dir) 55 | break 56 | except FileExistsError: 57 | logger.info("Dir Exist."+current_output_dir) 58 | else: 59 | return flask.Response(response='Make dir error', status=500, mimetype='text/plain') 60 | 61 | download_file_name = image_uri.split('/')[-1] 62 | download_file_name = os.path.join(current_output_dir, download_file_name) 63 | s3_client.download_file(bucket, image_uri, download_file_name) 64 | 65 | img_size = 640 66 | if "img_size" in data: 67 | img_size = data["img_size"] 68 | inference_result = detect(download_file_name, img_size) 69 | 70 | 71 | _payload = json.dumps({'status': 500, 'message': 'YOLOv5 failed!'}) 72 | if inference_result: 73 | _payload = json.dumps(inference_result) 74 | 75 | 76 | shutil.rmtree(current_output_dir) 77 | 78 | return flask.Response(response=_payload, status=200, mimetype='application/json') 79 | 80 | 81 | #--------------------------------------- 82 | init_output_dir = '/opt/ml/output_dir' 83 | if not os.path.exists(init_output_dir): 84 | try: 85 | os.mkdir(init_output_dir) 86 | except FileExistsError: 87 | logger.info("Dir Exist.") 88 | 89 | #load model 90 | source_file = '/opt/ml/model/runs/train/exp/weights/best.pt' 91 | destination_file = "yolov5s.pt" 92 | if os.path.isfile(source_file) and not os.path.isfile(destination_file): 93 | shutil.copy(source_file,destination_file) 94 | logger.info("Model file copied.") 95 | else: 96 | logger.info("Model file not copy.") 97 | 98 | s3_client = boto3.client('s3') 99 | 100 | if __name__ == '__main__': 101 | app.run() -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/requirements.txt: -------------------------------------------------------------------------------- 1 | # pip install -r requirements.txt 2 | 3 | # base ---------------------------------------- 4 | Cython 5 | matplotlib>=3.2.2 6 | numpy>=1.18.5 7 | opencv-python>=4.1.2 8 | Pillow 9 | PyYAML>=5.3.1 10 | scipy>=1.4.1 11 | tensorboard>=2.2 12 | torch>=1.7.0 13 | torchvision>=0.8.1 14 | tqdm>=4.41.0 15 | 16 | # logging ------------------------------------- 17 | # wandb 18 | 19 | 
# plotting ------------------------------------ 20 | seaborn>=0.11.0 21 | pandas 22 | 23 | # export -------------------------------------- 24 | # coremltools>=4.1 25 | # onnx>=1.8.1 26 | # scikit-learn==0.19.2 # for coreml quantization 27 | 28 | # extras -------------------------------------- 29 | thop # FLOPS computation 30 | pycocotools>=2.0 # COCO mAP 31 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/serve: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # This file implements the scoring service shell. You don't necessarily need to modify it for various 4 | # algorithms. It starts nginx and gunicorn with the correct configurations and then simply waits until 5 | # gunicorn exits. 6 | # 7 | # The flask server is specified to be the app object in wsgi.py 8 | # 9 | # We set the following parameters: 10 | # 11 | # Parameter Environment Variable Default Value 12 | # --------- -------------------- ------------- 13 | # number of workers MODEL_SERVER_WORKERS the number of CPU cores 14 | # timeout MODEL_SERVER_TIMEOUT 60 seconds 15 | 16 | from __future__ import print_function 17 | import multiprocessing 18 | import os 19 | import signal 20 | import subprocess 21 | import sys 22 | 23 | 24 | cpu_count = multiprocessing.cpu_count() 25 | 26 | model_server_timeout = os.environ.get('MODEL_SERVER_TIMEOUT', 60) 27 | model_server_workers = int(os.environ.get('MODEL_SERVER_WORKERS', cpu_count)) 28 | # # shishuai comment, this is for g4dn.12xlarge 29 | #model_server_workers = int(os.environ.get('MODEL_SERVER_WORKERS', 4)) 30 | 31 | def sigterm_handler(nginx_pid, gunicorn_pid): 32 | try: 33 | os.kill(nginx_pid, signal.SIGQUIT) 34 | except OSError: 35 | pass 36 | try: 37 | os.kill(gunicorn_pid, signal.SIGTERM) 38 | except OSError: 39 | pass 40 | 41 | sys.exit(0) 42 | 43 | def start_server(): 44 | print('Starting the inference server with {} workers.'.format(model_server_workers)) 45 | 46 | 47 | # link the log streams to stdout/err so they will be logged to the container logs 48 | subprocess.check_call(['ln', '-sf', '/dev/stdout', '/var/log/nginx/access.log']) 49 | subprocess.check_call(['ln', '-sf', '/dev/stderr', '/var/log/nginx/error.log']) 50 | 51 | nginx = subprocess.Popen(['nginx', '-c', '/opt/ml/code/nginx.conf']) 52 | gunicorn = subprocess.Popen(['gunicorn', 53 | '--timeout', str(model_server_timeout), 54 | '-k', 'gevent', 55 | '-b', 'unix:/tmp/gunicorn.sock', 56 | '-w', str(model_server_workers), 57 | 'wsgi:app']) 58 | 59 | signal.signal(signal.SIGTERM, lambda a, b: sigterm_handler(nginx.pid, gunicorn.pid)) 60 | 61 | # If either subprocess exits, so do we. 62 | pids = set([nginx.pid, gunicorn.pid]) 63 | while True: 64 | pid, _ = os.wait() 65 | if pid in pids: 66 | break 67 | 68 | sigterm_handler(nginx.pid, gunicorn.pid) 69 | print('Inference server exiting') 70 | 71 | # The main routine just invokes the start function. 
72 | 73 | if __name__ == '__main__': 74 | start_server() 75 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nwcdheap/sagemaker-workshop/9bcbc4c787da346fcf10aa6fd97a8a100328719a/object-detection/yolov5-on-sagemaker/2-inference/source/utils/__init__.py -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/utils/activations.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | # Swish https://arxiv.org/pdf/1905.02244.pdf --------------------------------------------------------------------------- 7 | class Swish(nn.Module): # 8 | @staticmethod 9 | def forward(x): 10 | return x * torch.sigmoid(x) 11 | 12 | 13 | class Hardswish(nn.Module): # export-friendly version of nn.Hardswish() 14 | @staticmethod 15 | def forward(x): 16 | # return x * F.hardsigmoid(x) # for torchscript and CoreML 17 | return x * F.hardtanh(x + 3, 0., 6.) / 6. # for torchscript, CoreML and ONNX 18 | 19 | 20 | class MemoryEfficientSwish(nn.Module): 21 | class F(torch.autograd.Function): 22 | @staticmethod 23 | def forward(ctx, x): 24 | ctx.save_for_backward(x) 25 | return x * torch.sigmoid(x) 26 | 27 | @staticmethod 28 | def backward(ctx, grad_output): 29 | x = ctx.saved_tensors[0] 30 | sx = torch.sigmoid(x) 31 | return grad_output * (sx * (1 + x * (1 - sx))) 32 | 33 | def forward(self, x): 34 | return self.F.apply(x) 35 | 36 | 37 | # Mish https://github.com/digantamisra98/Mish -------------------------------------------------------------------------- 38 | class Mish(nn.Module): 39 | @staticmethod 40 | def forward(x): 41 | return x * F.softplus(x).tanh() 42 | 43 | 44 | class MemoryEfficientMish(nn.Module): 45 | class F(torch.autograd.Function): 46 | @staticmethod 47 | def forward(ctx, x): 48 | ctx.save_for_backward(x) 49 | return x.mul(torch.tanh(F.softplus(x))) # x * tanh(ln(1 + exp(x))) 50 | 51 | @staticmethod 52 | def backward(ctx, grad_output): 53 | x = ctx.saved_tensors[0] 54 | sx = torch.sigmoid(x) 55 | fx = F.softplus(x).tanh() 56 | return grad_output * (fx + x * sx * (1 - fx * fx)) 57 | 58 | def forward(self, x): 59 | return self.F.apply(x) 60 | 61 | 62 | # FReLU https://arxiv.org/abs/2007.11824 ------------------------------------------------------------------------------- 63 | class FReLU(nn.Module): 64 | def __init__(self, c1, k=3): # ch_in, kernel 65 | super().__init__() 66 | self.conv = nn.Conv2d(c1, c1, k, 1, 1, groups=c1) 67 | self.bn = nn.BatchNorm2d(c1) 68 | 69 | def forward(self, x): 70 | return torch.max(x, self.bn(self.conv(x))) 71 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/utils/evolve.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Hyperparameter evolution commands (avoids CUDA memory leakage issues) 3 | # Replaces train.py python generations 'for' loop with a bash 'for' loop 4 | 5 | # Start on 4-GPU machine 6 | #for i in 0 1 2 3; do 7 | # t=ultralytics/yolov5:evolve && sudo docker pull $t && sudo docker run -d --ipc=host --gpus all -v "$(pwd)"/VOC:/usr/src/VOC $t bash utils/evolve.sh $i 8 | # sleep 60 # avoid simultaneous 
evolve.txt read/write 9 | #done 10 | 11 | # Hyperparameter evolution commands 12 | while true; do 13 | # python train.py --batch 64 --weights yolov5m.pt --data voc.yaml --img 512 --epochs 50 --evolve --bucket ult/evolve/voc --device $1 14 | python train.py --batch 40 --weights yolov5m.pt --data coco.yaml --img 640 --epochs 30 --evolve --bucket ult/evolve/coco --device $1 15 | done 16 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/utils/google_app_engine/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/google-appengine/python 2 | 3 | # Create a virtualenv for dependencies. This isolates these packages from 4 | # system-level packages. 5 | # Use -p python3 or -p python3.7 to select python version. Default is version 2. 6 | RUN virtualenv /env -p python3 7 | 8 | # Setting these environment variables are the same as running 9 | # source /env/bin/activate. 10 | ENV VIRTUAL_ENV /env 11 | ENV PATH /env/bin:$PATH 12 | 13 | RUN apt-get update && apt-get install -y python-opencv 14 | 15 | # Copy the application's requirements.txt and run pip to install all 16 | # dependencies into the virtualenv. 17 | ADD requirements.txt /app/requirements.txt 18 | RUN pip install -r /app/requirements.txt 19 | 20 | # Add the application source code. 21 | ADD . /app 22 | 23 | # Run a WSGI server to serve the application. gunicorn must be declared as 24 | # a dependency in requirements.txt. 25 | CMD gunicorn -b :$PORT main:app 26 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/utils/google_app_engine/additional_requirements.txt: -------------------------------------------------------------------------------- 1 | # add these requirements in your app on top of the existing ones 2 | pip==18.1 3 | Flask==1.0.2 4 | gunicorn==19.9.0 5 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/utils/google_app_engine/app.yaml: -------------------------------------------------------------------------------- 1 | runtime: custom 2 | env: flex 3 | 4 | service: yolov5app 5 | 6 | liveness_check: 7 | initial_delay_sec: 600 8 | 9 | manual_scaling: 10 | instances: 1 11 | resources: 12 | cpu: 1 13 | memory_gb: 4 14 | disk_size_gb: 20 -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/utils/google_utils.py: -------------------------------------------------------------------------------- 1 | # This file contains google utils: https://cloud.google.com/storage/docs/reference/libraries 2 | # pip install --upgrade google-cloud-storage 3 | # from google.cloud import storage 4 | 5 | import os 6 | import platform 7 | import subprocess 8 | import time 9 | from pathlib import Path 10 | 11 | import torch 12 | 13 | 14 | def gsutil_getsize(url=''): 15 | # gs://bucket/file size https://cloud.google.com/storage/docs/gsutil/commands/du 16 | s = subprocess.check_output('gsutil du %s' % url, shell=True).decode('utf-8') 17 | return eval(s.split(' ')[0]) if len(s) else 0 # bytes 18 | 19 | 20 | def attempt_download(weights): 21 | # Attempt to download pretrained weights if not found locally 22 | weights = weights.strip().replace("'", '') 23 | file = Path(weights).name 24 | 25 | msg = weights + ' missing, try downloading from 
https://github.com/ultralytics/yolov5/releases/' 26 | models = ['yolov5s.pt', 'yolov5m.pt', 'yolov5l.pt', 'yolov5x.pt'] # available models 27 | 28 | if file in models and not os.path.isfile(weights): 29 | # Google Drive 30 | # d = {'yolov5s.pt': '1R5T6rIyy3lLwgFXNms8whc-387H0tMQO', 31 | # 'yolov5m.pt': '1vobuEExpWQVpXExsJ2w-Mbf3HJjWkQJr', 32 | # 'yolov5l.pt': '1hrlqD1Wdei7UT4OgT785BEk1JwnSvNEV', 33 | # 'yolov5x.pt': '1mM8aZJlWTxOg7BZJvNUMrTnA2AbeCVzS'} 34 | # r = gdrive_download(id=d[file], name=weights) if file in d else 1 35 | # if r == 0 and os.path.exists(weights) and os.path.getsize(weights) > 1E6: # check 36 | # return 37 | 38 | try: # GitHub 39 | url = 'https://github.com/ultralytics/yolov5/releases/download/v3.0/' + file 40 | print('Downloading %s to %s...' % (url, weights)) 41 | torch.hub.download_url_to_file(url, weights) 42 | assert os.path.exists(weights) and os.path.getsize(weights) > 1E6 # check 43 | except Exception as e: # GCP 44 | print('Download error: %s' % e) 45 | url = 'https://storage.googleapis.com/ultralytics/yolov5/ckpt/' + file 46 | print('Downloading %s to %s...' % (url, weights)) 47 | r = os.system('curl -L %s -o %s' % (url, weights)) # torch.hub.download_url_to_file(url, weights) 48 | finally: 49 | if not (os.path.exists(weights) and os.path.getsize(weights) > 1E6): # check 50 | os.remove(weights) if os.path.exists(weights) else None # remove partial downloads 51 | print('ERROR: Download failure: %s' % msg) 52 | print('') 53 | return 54 | 55 | 56 | def gdrive_download(id='1n_oKgR81BJtqk75b00eAjdv03qVCQn2f', name='coco128.zip'): 57 | # Downloads a file from Google Drive. from utils.google_utils import *; gdrive_download() 58 | t = time.time() 59 | 60 | print('Downloading https://drive.google.com/uc?export=download&id=%s as %s... ' % (id, name), end='') 61 | os.remove(name) if os.path.exists(name) else None # remove existing 62 | os.remove('cookie') if os.path.exists('cookie') else None 63 | 64 | # Attempt file download 65 | out = "NUL" if platform.system() == "Windows" else "/dev/null" 66 | os.system('curl -c ./cookie -s -L "drive.google.com/uc?export=download&id=%s" > %s ' % (id, out)) 67 | if os.path.exists('cookie'): # large file 68 | s = 'curl -Lb ./cookie "drive.google.com/uc?export=download&confirm=%s&id=%s" -o %s' % (get_token(), id, name) 69 | else: # small file 70 | s = 'curl -s -L -o %s "drive.google.com/uc?export=download&id=%s"' % (name, id) 71 | r = os.system(s) # execute, capture return 72 | os.remove('cookie') if os.path.exists('cookie') else None 73 | 74 | # Error check 75 | if r != 0: 76 | os.remove(name) if os.path.exists(name) else None # remove partial 77 | print('Download error ') # raise Exception('Download error') 78 | return r 79 | 80 | # Unzip if archive 81 | if name.endswith('.zip'): 82 | print('unzipping... 
', end='') 83 | os.system('unzip -q %s' % name) # unzip 84 | os.remove(name) # remove zip to free space 85 | 86 | print('Done (%.1fs)' % (time.time() - t)) 87 | return r 88 | 89 | 90 | def get_token(cookie="./cookie"): 91 | with open(cookie) as f: 92 | for line in f: 93 | if "download" in line: 94 | return line.split()[-1] 95 | return "" 96 | 97 | # def upload_blob(bucket_name, source_file_name, destination_blob_name): 98 | # # Uploads a file to a bucket 99 | # # https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python 100 | # 101 | # storage_client = storage.Client() 102 | # bucket = storage_client.get_bucket(bucket_name) 103 | # blob = bucket.blob(destination_blob_name) 104 | # 105 | # blob.upload_from_filename(source_file_name) 106 | # 107 | # print('File {} uploaded to {}.'.format( 108 | # source_file_name, 109 | # destination_blob_name)) 110 | # 111 | # 112 | # def download_blob(bucket_name, source_blob_name, destination_file_name): 113 | # # Uploads a blob from a bucket 114 | # storage_client = storage.Client() 115 | # bucket = storage_client.get_bucket(bucket_name) 116 | # blob = bucket.blob(source_blob_name) 117 | # 118 | # blob.download_to_filename(destination_file_name) 119 | # 120 | # print('Blob {} downloaded to {}.'.format( 121 | # source_blob_name, 122 | # destination_file_name)) 123 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/2-inference/source/wsgi.py: -------------------------------------------------------------------------------- 1 | import predictor as myapp 2 | 3 | # This is just a simple wrapper for gunicorn to find your app. 4 | # If you want to change the algorithm file, simply change "predictor" above to the 5 | # new file. 
6 | 7 | app = myapp.app 8 | -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/README.md: -------------------------------------------------------------------------------- 1 | # YOLOv5 on SageMaker 2 | This workshop demonstrates how to train and run inference with YOLOv5 on SageMaker. 3 | Official YOLOv5 repository: https://github.com/ultralytics/yolov5 4 | YOLOv5 is an object detection algorithm that recognizes objects in images and videos. 5 | ![Detection example](images/detection_example.jpg) 6 | ## Data preparation 7 | [0-preparation](0-preparation) shows how to convert labelme-format annotations to the YOLOv5 format; if your data is already in YOLOv5 format, you can skip this step and simply upload the data to S3 in the required layout. 8 | ## Training 9 | [1-training](1-training) shows how to train on SageMaker. 10 | ## Inference 11 | [2-inference](2-inference) shows how to deploy an endpoint on SageMaker and invoke it for inference. -------------------------------------------------------------------------------- /object-detection/yolov5-on-sagemaker/images/detection_example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nwcdheap/sagemaker-workshop/9bcbc4c787da346fcf10aa6fd97a8a100328719a/object-detection/yolov5-on-sagemaker/images/detection_example.jpg -------------------------------------------------------------------------------- /runtime/Java/Inference.java: -------------------------------------------------------------------------------- 1 | import java.nio.ByteBuffer; 2 | 3 | import com.amazonaws.services.sagemakerruntime.AmazonSageMakerRuntime; 4 | import com.amazonaws.services.sagemakerruntime.AmazonSageMakerRuntimeClientBuilder; 5 | import com.amazonaws.services.sagemakerruntime.model.InvokeEndpointRequest; 6 | import com.amazonaws.services.sagemakerruntime.model.InvokeEndpointResult; 7 | 8 | public class Inference { 9 | public static void main(String[] args) { 10 | String request = "{\"bucket\":\"nowfox\",\"image_uri\":\"data/zidane.jpg\",\"img_size\":416}"; 11 | InvokeEndpointRequest invokeEndpointRequest = new InvokeEndpointRequest(); 12 | invokeEndpointRequest.setContentType("application/json"); 13 | ByteBuffer buf = ByteBuffer.wrap(request.getBytes()); 14 | 15 | invokeEndpointRequest.setBody(buf); 16 | invokeEndpointRequest.setEndpointName("yolov5"); 17 | invokeEndpointRequest.setAccept("application/json"); 18 | 19 | AmazonSageMakerRuntime amazonSageMaker = AmazonSageMakerRuntimeClientBuilder.defaultClient(); 20 | InvokeEndpointResult invokeEndpointResult = amazonSageMaker.invokeEndpoint(invokeEndpointRequest); 21 | byte[] response = invokeEndpointResult.getBody().array(); 22 | String result = new String(response); 23 | System.out.print(result); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /runtime/Java/pom.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 3 | <modelVersion>4.0.0</modelVersion> 4 | 5 | <groupId>cn.nwcdcloud.samples</groupId> 6 | <artifactId>sagemaker</artifactId> 7 | <version>0.0.1-SNAPSHOT</version> 8 | <packaging>jar</packaging> 9 | 10 | <name>sagemaker</name> 11 | <url>http://maven.apache.org</url> 12 | 13 | <properties> 14 | <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> 15 | </properties> 16 | 17 | <dependencies> 18 | <dependency> 19 | <groupId>com.amazonaws</groupId> 20 | <artifactId>aws-java-sdk-sagemakerruntime</artifactId> 21 | <version>1.11.879</version> 22 | </dependency> 23 | </dependencies> 24 | </project> 25 | -------------------------------------------------------------------------------- /runtime/Java2/Inference.java: -------------------------------------------------------------------------------- 1 | import software.amazon.awssdk.core.SdkBytes; 2 | import software.amazon.awssdk.services.sagemakerruntime.SageMakerRuntimeClient; 3 | import software.amazon.awssdk.services.sagemakerruntime.model.InvokeEndpointRequest; 4 | import software.amazon.awssdk.services.sagemakerruntime.model.InvokeEndpointResponse; 5 | 6 | public class Inference { 7 | public static void 
main(String[] args) { 8 | String requestBody = "{\"bucket\":\"nowfox\",\"image_uri\":\"data/zidane.jpg\",\"img_size\":416}"; 9 | SdkBytes body = SdkBytes.fromUtf8String(requestBody); 10 | InvokeEndpointRequest request = InvokeEndpointRequest.builder().endpointName("yolov5") 11 | .contentType("application/json").body(body).build(); 12 | SageMakerRuntimeClient client = SageMakerRuntimeClient.create(); 13 | InvokeEndpointResponse response = client.invokeEndpoint(request); 14 | System.out.print(response.body().asUtf8String()); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /runtime/Java2/pom.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 3 | <modelVersion>4.0.0</modelVersion> 4 | 5 | <groupId>cn.nwcdcloud.samples</groupId> 6 | <artifactId>sagemakerruntime</artifactId> 7 | <version>0.0.1-SNAPSHOT</version> 8 | <packaging>jar</packaging> 9 | 10 | <name>sagemakerruntime</name> 11 | <url>http://maven.apache.org</url> 12 | 13 | <properties> 14 | <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> 15 | </properties> 16 | 17 | <dependencies> 18 | <dependency> 19 | <groupId>software.amazon.awssdk</groupId> 20 | <artifactId>sagemakerruntime</artifactId> 21 | <version>2.15.7</version> 22 | </dependency> 23 | </dependencies> 24 | </project> 25 | -------------------------------------------------------------------------------- /training-data-input/EFS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Using EFS as the training data input for SageMaker" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 1 Overview\n", 15 | "This notebook uses EFS as the training data input for SageMaker. \n", 16 | "Note: this feature is not yet available in the China regions." 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## 2 Runtime environment\n", 24 | "Either the tensorflow2_p36 or the pytorch_p36 kernel works. \n", 25 | "Tested with boto3 1.17.99 and sagemaker 2.45.0. " 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import boto3,sagemaker\n", 35 | "print(boto3.__version__)\n", 36 | "print(sagemaker.__version__)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## 3 Configure EFS\n", 44 | "Refer to https://docs.aws.amazon.com/zh_cn/efs/latest/ug/gs-step-two-create-efs-resources.html to configure EFS, and make sure to set up an appropriate security group." 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## 4 Get/set parameters" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "import boto3\n", 61 | "import sagemaker\n", 62 | "from sagemaker.image_uris import retrieve\n", 63 | "\n", 64 | "sagemaker_session = sagemaker.Session()\n", 65 | "iam = boto3.client('iam')\n", 66 | "roles = iam.list_roles(PathPrefix='/service-role')\n", 67 | "role=\"\"\n", 68 | "for current_role in roles[\"Roles\"]:\n", 69 | " if current_role[\"RoleName\"].startswith(\"AmazonSageMaker-ExecutionRole-\"):\n", 70 | " role=current_role[\"Arn\"]\n", 71 | " break\n", 72 | "print(role)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "Notes:\n", 80 | "- 1. SageMaker must have permission to use EFS\n", 81 | "- 2. Make sure the EFS security group allows access from SageMaker" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "subnets = [\"subnet-0eecdb20\"] # Should be same as the subnet used for EFS. Example: subnet-0f9XXXX\n", 91 | "security_group_ids = [\"sg-6478f13a\"] # Should be same as the security group used for EFS. sg-03ZZZZZZ\n", 92 | "file_system_id = \"fs-8eafd93a\" # EFS file system ID with your training dataset. 
Example: 'fs-0bYYYYYY'\",\n", 93 | "efs_dir=\"/test\" #EFS directory" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "from sagemaker.inputs import FileSystemInput\n", 103 | "file_system_directory_path = efs_dir\n", 104 | "file_system_access_mode = \"ro\"#read only\n", 105 | "file_system_type = \"EFS\"\n", 106 | "train_fs = FileSystemInput(\n", 107 | " file_system_id=file_system_id,\n", 108 | " file_system_type=file_system_type,\n", 109 | " directory_path=file_system_directory_path,\n", 110 | " file_system_access_mode=file_system_access_mode,\n", 111 | ")" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "## 5 Training" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "This notebook only lists the first 100 files in the training directory and does not actually train; it mainly demonstrates reading data from EFS." 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "### 5.1 TensorFlow" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "from sagemaker.tensorflow import TensorFlow\n", 142 | "\n", 143 | "estimator = TensorFlow(\n", 144 | " base_job_name=\"tensorflow2-fsx-big\",\n", 145 | " entry_point=\"ListFile.py\",\n", 146 | " role=role,\n", 147 | " py_version=\"py37\",\n", 148 | " framework_version=\"2.4.1\",\n", 149 | " instance_count=1,\n", 150 | " instance_type=\"ml.m5.large\",\n", 151 | " sagemaker_session=sagemaker_session,\n", 152 | " hyperparameters={\"path\":\"/opt/ml/input/data/training\"},\n", 153 | " subnets=subnets,\n", 154 | " security_group_ids=security_group_ids,\n", 155 | ")\n", 156 | "estimator.fit(train_fs)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "### 5.2 PyTorch" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "from sagemaker.pytorch import PyTorch\n", 173 | "\n", 174 | "estimator = PyTorch(\n", 175 | " base_job_name=\"big-data-input\",\n", 176 | " entry_point=\"ListFile.py\",\n", 177 | " role=role,\n", 178 | " py_version=\"py36\",\n", 179 | " framework_version=\"1.6.0\",\n", 180 | " instance_count=1,\n", 181 | " instance_type=\"ml.m5.large\",\n", 182 | " sagemaker_session=sagemaker_session,\n", 183 | " hyperparameters={\"path\":\"/opt/ml/input/data/training\"},\n", 184 | " subnets=subnets,\n", 185 | " security_group_ids=security_group_ids,\n", 186 | ")\n", 187 | "estimator.fit(train_fs)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [] 196 | } 197 | ], 198 | "metadata": { 199 | "kernelspec": { 200 | "display_name": "Environment (conda_pytorch_p36)", 201 | "language": "python", 202 | "name": "conda_pytorch_p36" 203 | }, 204 | "language_info": { 205 | "codemirror_mode": { 206 | "name": "ipython", 207 | "version": 3 208 | }, 209 | "file_extension": ".py", 210 | "mimetype": "text/x-python", 211 | "name": "python", 212 | "nbconvert_exporter": "python", 213 | "pygments_lexer": "ipython3", 214 | "version": "3.6.13" 215 | } 216 | }, 217 | "nbformat": 4, 218 | "nbformat_minor": 4 219 | } 220 | -------------------------------------------------------------------------------- /training-data-input/FSx.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | 
"cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 使用FSx for Lustre作为SageMaker的训练数据输入" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 1 说明\n", 15 | "本文为通过FSx for Lustre把S3数据作为SageMaker的训练数据输入,以解决直接从S3上下载训练数据耗时过长问题。 \n", 16 | "注意:该功能暂不能在中国区使用。" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## 2 运行环境\n", 24 | "Kernel 选择tensorflow2_p36或pytorch_p36均可。 \n", 25 | "本文在boto3 1.17.99和sagemaker 2.45.0下测试通过。 " 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import boto3,sagemaker\n", 35 | "print(boto3.__version__)\n", 36 | "print(sagemaker.__version__)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## 3 配置FSx\n", 44 | "参考 https://docs.aws.amazon.com/zh_cn/fsx/latest/LustreGuide/create-fs-linked-data-repo.html 进行配置,将您的文件系统链接到S3存储桶。 \n", 45 | "配置导入S3数据时,不要输入prefix。" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 4 在VPC中创建S3终端节点\n", 53 | "打开VPC web控制台,在左边导航栏点击`终端节点`,再点击`创建终端节点`,在服务名称搜索框中输入`S3`,搜索结果选择类型为`Gateway`的记录,配置路由表中,勾选上主路由表的记录,再点击`创建终端节点`。 \n", 54 | "不配置这步会报 Failed. Reason: InternalServerError: We encountered an internal error. Please try again." 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## 5 获取/设置相关参数" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "import boto3\n", 71 | "import sagemaker\n", 72 | "from sagemaker.image_uris import retrieve\n", 73 | "\n", 74 | "sagemaker_session = sagemaker.Session()\n", 75 | "iam = boto3.client('iam')\n", 76 | "roles = iam.list_roles(PathPrefix='/service-role')\n", 77 | "role=\"\"\n", 78 | "for current_role in roles[\"Roles\"]:\n", 79 | " if current_role[\"RoleName\"].startswith(\"AmazonSageMaker-ExecutionRole-\"):\n", 80 | " role=current_role[\"Arn\"]\n", 81 | " break\n", 82 | "print(role)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "注意事项:\n", 90 | "- 1.SageMaker Role必须要有使用FSx的权限\n", 91 | "- 2.确认FSx的安全组,允许SageMaker访问" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "subnets = [\"subnet-0eecdb20\"] # Should be same as Subnet used for FSx. Example: subnet-0f9XXXX\n", 101 | "security_group_ids = [\"sg-6478f13a\"] # Should be same as Security group used for FSx. sg-03ZZZZZZ\n", 102 | "file_system_id = \"fs-011671baa391568ab\" # FSx file system ID with your training dataset. 
Example: 'fs-0bYYYYYY'\",\n", 103 | "mount_name=\"cm26jbmv\" #Mount name shown on the FSx console page\n", 104 | "s3_prefix=\"test\" #S3 prefix/directory" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "from sagemaker.inputs import FileSystemInput\n", 114 | "file_system_directory_path = \"/{}/{}\".format(mount_name,s3_prefix)\n", 115 | "file_system_access_mode = \"ro\"#read only\n", 116 | "file_system_type = \"FSxLustre\"\n", 117 | "train_fs = FileSystemInput(\n", 118 | " file_system_id=file_system_id,\n", 119 | " file_system_type=file_system_type,\n", 120 | " directory_path=file_system_directory_path,\n", 121 | " file_system_access_mode=file_system_access_mode,\n", 122 | ")" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "## 6 Training\n", 130 | "This notebook only lists the first 100 files in the training directory and does not actually train; it mainly demonstrates reading S3 data through FSx." 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "### 6.1 TensorFlow" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "from sagemaker.tensorflow import TensorFlow\n", 147 | "\n", 148 | "estimator = TensorFlow(\n", 149 | " base_job_name=\"big-data-input\",\n", 150 | " entry_point=\"ListFile.py\",\n", 151 | " role=role,\n", 152 | " py_version=\"py37\",\n", 153 | " framework_version=\"2.4.1\",\n", 154 | " instance_count=1,\n", 155 | " instance_type=\"ml.m5.large\",\n", 156 | " sagemaker_session=sagemaker_session,\n", 157 | " hyperparameters={\"path\":\"/opt/ml/input/data/training\"},\n", 158 | " subnets=subnets,\n", 159 | " security_group_ids=security_group_ids,\n", 160 | ")\n", 161 | "estimator.fit(train_fs)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "### 6.2 PyTorch" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "from sagemaker.pytorch import PyTorch\n", 178 | "\n", 179 | "estimator = PyTorch(\n", 180 | " base_job_name=\"big-data-input\",\n", 181 | " entry_point=\"ListFile.py\",\n", 182 | " role=role,\n", 183 | " py_version=\"py36\",\n", 184 | " framework_version=\"1.6.0\",\n", 185 | " instance_count=1,\n", 186 | " instance_type=\"ml.m5.large\",\n", 187 | " sagemaker_session=sagemaker_session,\n", 188 | " hyperparameters={\"path\":\"/opt/ml/input/data/training\"},\n", 189 | " subnets=subnets,\n", 190 | " security_group_ids=security_group_ids,\n", 191 | ")\n", 192 | "estimator.fit(train_fs)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [] 201 | } 202 | ], 203 | "metadata": { 204 | "kernelspec": { 205 | "display_name": "Environment (conda_pytorch_p36)", 206 | "language": "python", 207 | "name": "conda_pytorch_p36" 208 | }, 209 | "language_info": { 210 | "codemirror_mode": { 211 | "name": "ipython", 212 | "version": 3 213 | }, 214 | "file_extension": ".py", 215 | "mimetype": "text/x-python", 216 | "name": "python", 217 | "nbconvert_exporter": "python", 218 | "pygments_lexer": "ipython3", 219 | "version": "3.6.13" 220 | } 221 | }, 222 | "nbformat": 4, 223 | "nbformat_minor": 4 224 | } 225 | -------------------------------------------------------------------------------- /training-data-input/ListFile.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | if __name__ == '__main__': 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--path", type=str, default="./") 7 | # Do not use args = parser.parse_args(); parse_known_args tolerates the extra arguments SageMaker passes in 8 | args, _ = parser.parse_known_args() 9 | files = os.listdir(args.path) 10 | print("=============list file begin") 11 | count = 0 12 | for file in files: 13 | print(file) 14 | count = count + 1 15 | if count == 100: 16 | break 17 | print("=============list file end") 18 | -------------------------------------------------------------------------------- /training-data-input/README.md: -------------------------------------------------------------------------------- 1 | # Training data input 2 | When a large amount of data has to be trained on, having SageMaker download the data directly from S3 takes a long time. You can instead expose the S3 data to SageMaker through FSx for Lustre as the training data input, which avoids the long download from S3. Loading data from EFS is also supported. 3 | [FSx](FSx.ipynb) 4 | [EFS](EFS.ipynb) --------------------------------------------------------------------------------
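The two notebooks above share one pattern: build a `FileSystemInput` that points at the file system, then pass it to `estimator.fit()` in place of an S3 URI. Below is a minimal sketch condensing that pattern; every resource ID in it (role ARN, file system ID, mount name, subnet, security group) is a placeholder to replace with your own values, and for EFS you would use `file_system_type="EFS"` with a plain directory path.

```python
# Minimal sketch of the FSx-for-Lustre training-input pattern from FSx.ipynb.
# All IDs below are placeholders; replace them with your own resources.
from sagemaker.inputs import FileSystemInput
from sagemaker.pytorch import PyTorch

role = "arn:aws:iam::111122223333:role/service-role/AmazonSageMaker-ExecutionRole-xxxx"  # placeholder

# Mount the Lustre file system (linked to your S3 bucket) read-only for training.
# directory_path is /<mount_name>/<s3_prefix> for FSx; for EFS it is simply the EFS path.
train_fs = FileSystemInput(
    file_system_id="fs-0bYYYYYY",        # placeholder file system ID
    file_system_type="FSxLustre",        # or "EFS"
    directory_path="/cm26jbmv/test",
    file_system_access_mode="ro",
)

estimator = PyTorch(
    base_job_name="big-data-input",
    entry_point="ListFile.py",
    role=role,
    framework_version="1.6.0",
    py_version="py36",
    instance_count=1,
    instance_type="ml.m5.large",
    hyperparameters={"path": "/opt/ml/input/data/training"},
    subnets=["subnet-0f9XXXXX"],          # same subnet as the file system
    security_group_ids=["sg-03ZZZZZZ"],   # must allow traffic between SageMaker and the file system
)
estimator.fit(train_fs)  # the data appears under /opt/ml/input/data/training
```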
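For completeness, here is a hypothetical Python counterpart to the Java samples under runtime/. It assumes the same `yolov5` endpoint name and the JSON request format that predictor.py expects (`bucket`, `image_uri`, `img_size`); the bucket and image key are example values from this workshop, so adjust them to your deployment.

```python
# Hypothetical boto3 equivalent of runtime/Java/Inference.java.
import json
import boto3

runtime = boto3.client("sagemaker-runtime")
payload = {"bucket": "nowfox", "image_uri": "data/zidane.jpg", "img_size": 416}

response = runtime.invoke_endpoint(
    EndpointName="yolov5",
    ContentType="application/json",
    Accept="application/json",
    Body=json.dumps(payload),
)
# The endpoint returns a JSON document with the detection results.
print(response["Body"].read().decode("utf-8"))
```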