├── .eslintrc.yml ├── .gitignore ├── LICENSE ├── README.md ├── config ├── .env.sample ├── cloudformation.template.yml ├── config.example.yml └── kes.js ├── db ├── knexfile.js ├── migrations │ └── 20190122223855_init.js └── setup.sh ├── diagram.png ├── lambda ├── Dockerfile ├── Makefile ├── download_and_predict │ ├── __init__.py │ ├── base.py │ ├── custom_types.py │ ├── handler.py │ └── mercantile.pyi ├── mypy.ini ├── package.zip ├── setup.py └── tests │ ├── __init__.py │ ├── handler.py │ └── test_base.py ├── lambda_examples ├── README.md ├── ml_enabler.py ├── s3_images.py ├── save_image.py ├── sentinel_hub.py └── super_tiles.py ├── package.json ├── scripts ├── csv_to_geojson.py ├── download.js ├── gpu-util.js ├── model.js ├── run-sqs-push.js ├── sqs-push.js ├── tag-cloudwatch-logs.js └── verify.js ├── test └── test_sqs-push.js └── yarn.lock /.eslintrc.yml: -------------------------------------------------------------------------------- 1 | extends: standard 2 | plugins: 3 | - standard 4 | - promise 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | dist 3 | cloudformation.yml 4 | node_modules 5 | .env 6 | __pycache__ 7 | .mypy_cache 8 | .vscode -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Development Seed 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Chip 'n Scale: Queue Arranger 2 | 3 | `chip-n-scale-queue-arranger` helps you run machine learning models over satellite imagery at scale. It is a collection of [AWS CloudFormation](https://aws.amazon.com/cloudformation/) templates deployed by [`kes`](http://devseed.com/kes/), lambda functions, and utility scripts for monitoring and managing the project. 4 | 5 | ![AWS components diagram](diagram.png) 6 | 7 | ## Status 8 | 9 | Currently this is only deployed internally to Development Seed and we are [refactoring a bit](https://github.com/developmentseed/chip-n-scale-queue-arranger/pull/5) for easier reuse, modification, and deployment. Please excuse the dust and feel free to open an issue if you have any questions. 
The current build process for the lambda looks like: 10 | 11 | ```sh 12 | cd lambda 13 | make build 14 | ``` 15 | which produces a package.zip file. This can eventually be built into another script. 16 | 17 | ## Requirements 18 | 19 | - [`python 3.7.x`](https://www.python.org/) 20 | - [`node`](https://nodejs.org/en/) 21 | - [`yarn`](https://yarnpkg.com/en/) (or [`npm`](https://www.npmjs.com/)) 22 | - A [TensorFlow Serving Docker Image](https://www.tensorflow.org/tfx/serving/serving_basic) which accepts base64 encoded images. 23 | - For a walkthrough of this process, check out [this post](https://medium.com/devseed/technical-walkthrough-packaging-ml-models-for-inference-with-tf-serving-2a50f73ce6f8). 24 | - Or if you just have a model, build an image with the included `yarn model` tool 25 | - An [XYZ raster tile endpoint](https://docs.mapbox.com/api/maps/#maps) 26 | - A corresponding list of tiles over the area you'd like to predict on. If you know the extent of your prediction area as [`GeoJSON`](http://geojson.org/), you can use [`geodex`](https://github.com/developmentseed/geodex), [`mercantile`](https://github.com/mapbox/mercantile), or [`tile-cover`](https://github.com/mapbox/tile-cover) 27 | - An [AWS account](https://aws.amazon.com/) with sufficient privileges to deploy `config/cloudformation.template.yml` 28 | 29 | ## Deploying 30 | 31 | To create your own project, first install the `node` dependencies: 32 | 33 | ```sh 34 | yarn install 35 | ``` 36 | 37 | Then add values to `config/.env` and to `config/config.yml` to configure your project. Samples for each are provided and you can find more information on the [`kes` documentation page](http://devseed.com/kes/). 38 | 39 | Once these values are filled in, you can deploy the project to AWS (takes ~10 minutes): 40 | 41 | ```sh 42 | yarn deploy 43 | ... 44 | CF operation is in state of CREATE_COMPLETE 45 | 46 | The stack test-stack is deployed or updated. 47 | - The database is available at: postgres://your-db-string 48 | - The queue is available at https://your-queue-url 49 | 50 | Is this the first time setting up this stack? Run the following command to set up the database: 51 | 52 | $ yarn setup postgres://your-db-string 53 | 54 | ✨ Done in 424.62s. 55 | ``` 56 | 57 | This will return a database string to run a migration: 58 | 59 | ```sh 60 | yarn setup [DB_STRING] 61 | ``` 62 | 63 | If `yarn deploy` fails on the first attempt, you'll need to run `yarn delete` to remove the stack and start again. Otherwise the project will fail on newer updates indicating that it is in the state `ROLLBACK_COMPLETE`. If the first deploy succeeds, you can make future updates by rerunning `yarn deploy`. 64 | 65 | By default, the cloudwatch logs are not tagged for resource tracking. To add `Project` tags to the cloudwatch logs, run the following: 66 | 67 | ```sh 68 | yarn tag-logs 69 | ``` 70 | 71 | If you'd like to confirm the everything is deployed correctly (recommended), run: 72 | 73 | ```sh 74 | yarn verify 75 | ``` 76 | 77 | This will test a few portions of the deployed stack to ensure that it will function correctly. Once you're ready, begin pushing tile messages to the SQS queue. 78 | 79 | ## Running 80 | 81 | Once the stack is deployed, you can kick off the prediction by adding messages to the SQS queue. Each individual message will look like: 82 | 83 | ```json 84 | { "x": 1, "y": 2, "z": 3} 85 | ``` 86 | 87 | where `x`, `y`, and `z` specify [an individual map tile](https://wiki.openstreetmap.org/wiki/Slippy_map_tilenames). 
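If you want to test the pipeline before bulk-loading tiles, a single message can be pushed by hand with `boto3`. This is a minimal sketch rather than part of the repository's tooling; the queue URL below is a placeholder for the `queueURL` output printed by `yarn deploy`:

```python
import json

import boto3

# Placeholder: use the queueURL output printed by `yarn deploy`
QUEUE_URL = "https://sqs.us-east-1.amazonaws.com/123456789012/YourStackTileQueue"

sqs = boto3.client("sqs")

# The message body is exactly the JSON tile index shown above
sqs.send_message(
    QueueUrl=QUEUE_URL,
    MessageBody=json.dumps({"x": 1, "y": 2, "z": 3}),
)
```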
Because pushing these messages into the queue quickly is important to running the prediction at scale, we've included a utility script to assist this process: 88 | 89 | ```sh 90 | yarn sqs-push [tiles.txt] [https://your-queue-url] 91 | ``` 92 | 93 | The first argument, `tiles.txt`, is a line-delimited file containing your tile indices in the format `x-y-z` and the second argument is the URL of your SQS Queue. If you have a lot of tiles to push to the queue, it's best to run this script in the background or on a separate computer. The maximum number of simultaneous inflight SQS requests can be set with the `PROMISE_THRESHOLD` environment variable. 94 | 95 | ## Post processing 96 | 97 | Once the processing is complete, you can pull down the stored results as a simple CSV file. 98 | 99 | ```sh 100 | DATABASE_URL='postgres://myusername:mypassword@your-db-string.rds.amazonaws.com:5432/ResultsDB' yarn download my_csv_filename.csv 101 | ``` 102 | 103 | You can then convert that CSV file to a geojson while thresholding on per-class ML confidence. For example, if you have a binary prediction and only want to keep tiles where confidence in class index 1 was 95% or greater, use something like: 104 | 105 | ```sh 106 | yarn convert-geojson my_csv_filename.csv my_thresholded_features.geojson --thresh_ind 1 --thresh 0.95 107 | ``` 108 | 109 | ## Completion 110 | 111 | After the prediction is complete, you should download the data from the AWS RDS database. Then it's okay to delete the stack: 112 | 113 | ```sh 114 | yarn delete 115 | ``` 116 | 117 | ## Speed, Cost, and GPU Utilization 118 | 119 | The primary costs of running this stack come from Lambda Functions and GPU instances. The Lambdas parallelize the image downloading and database writing; The GPU instances provide the prediction capacity. To run the inference optimally, from a speed and cost perspective, these two resources need to be scaled in tandem. Roughly four scenarios can occur: 120 | - **Lambda concurrency is much higher than GPU prediction capacity**. When too many Lambdas call the prediction endpoint at once, many of them will timeout and fail. The GPU instances will be fully utilized (good) but Lambda costs will be very high running longer and for more times than necessary. This will also hit the satellite imagery tile endpoint more times than needed. If this is happening, Lambda errors will be high, Lambda run time will be high, GPU utilization will be high, and SQS messages will show up in the dead letter queue. To fix it, lower the maximum Lambda concurrency or increase GPU capacity. 121 | - **Lambda concurrency is slightly higher than GPU prediction capacity**. Similar to the above case, if the Lambda concurrency is slightly too high compared to GPU prediction throughput, the Lambdas will run for longer than necessary but not timeout. If this is happening, Lambda errors will be low, Lambda run time will be high, and GPU utilization will be high. To fix it, lower the maximum Lambda concurrency or increase GPU capacity. 122 | - **Lambda concurrency is lower than GPU prediction capacity**. In this case, the Lambda monitoring metrics will look normal (low errors and low run time) but the GPU prediction instances have the capacity to predict many more images. 
To see this, run `yarn gpu-util [ssh key]` which will show the GPU utilization of each instance/GPU in the cluster: 123 | 124 | ```bash 125 | $ yarn gpu-util ~/.ssh/my-key.pem 126 | yarn run v1.3.2 127 | $ node scripts/gpu-util.js ~/.ssh/my-key.pem 128 | ┌────────────────────────┬────────────────────────┬────────────────────────┐ 129 | │ IP Address │ Instance Type │ GPU Utilization │ 130 | ├────────────────────────┼────────────────────────┼────────────────────────┤ 131 | │ 3.89.130.180 │ p3.2xlarge │ 5 % │ 132 | ├────────────────────────┼────────────────────────┼────────────────────────┤ 133 | │ 23.20.130.19 │ p3.2xlarge │ 2 % │ 134 | ├────────────────────────┼────────────────────────┼────────────────────────┤ 135 | │ 54.224.113.60 │ p3.2xlarge │ 3 % │ 136 | ├────────────────────────┼────────────────────────┼────────────────────────┤ 137 | │ 34.204.40.177 │ p3.2xlarge │ 12 % │ 138 | └────────────────────────┴────────────────────────┴────────────────────────┘ 139 | ✨ Done in 3.30s. 140 | ``` 141 | 142 | To fix this, increase the number of concurrent Lambdas or decrease the GPU capacity. (Note that by default, the security group on the instances won't accept SSH connections. To use `gpu-util`, add a new rule to your EC2 security group) 143 | 144 | - **Optimal :tada:** 145 | 146 | High GPU utilization, low Lambda errors, and low Lambda run time. :ship: 147 | 148 | ## Motivation 149 | 150 | Running machine learning inference at scale can be challenging. One bottleneck is that it's often hard to ingest/download images fast enough to keep a GPU fully utilized. This seeks to solve that bottleneck by parallelizing the imagery acquisition on AWS Lambda functions and running that separate from the machine learning predictions. 151 | 152 | ## Acknowledgements 153 | 154 | - [The World Bank](https://www.worldbank.org/), [Nethope](https://nethope.org/), and [UNICEF](https://www.unicef.org/) partnered with us on machine learning projects that provided opportunities to test these capabilities. 155 | - [Digital Globe](https://www.digitalglobe.com/) assisted in using their services to access ultra high-resolution satellite imagery at scale. 156 | - [Azavea's](https://www.azavea.com/) [raster-vision-aws](https://github.com/azavea/raster-vision-aws) repo provides the base AMI for these EC2 instances (`nvidia-docker` + ECS enabled). 
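## Generating a tile list

The Requirements section mentions building a list of tiles over your prediction area. As a minimal sketch (the bounding box and zoom level below are placeholders), [`mercantile`](https://github.com/mapbox/mercantile) can write the `x-y-z` lines that `yarn sqs-push` expects:

```python
import mercantile

# Placeholder area of interest (west, south, east, north) and zoom level
west, south, east, north = -61.8, 17.0, -61.6, 17.2
zoom = 16

with open("tiles.txt", "w") as f:
    for tile in mercantile.tiles(west, south, east, north, zooms=[zoom]):
        # sqs-push expects one x-y-z index per line
        f.write(f"{tile.x}-{tile.y}-{tile.z}\n")
```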
157 | -------------------------------------------------------------------------------- /config/.env.sample: -------------------------------------------------------------------------------- 1 | TILE_ACCESS_TOKEN='string' 2 | RDS_USERNAME='string' 3 | RDS_PASSWORD='string' 4 | -------------------------------------------------------------------------------- /config/cloudformation.template.yml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Description: 'stack: {{stackName}} | stage: {{stage}} | deployed by Kes' 3 | Resources: 4 | 5 | ################################################# 6 | # Lambda config BEGIN 7 | ################################################# 8 | {{#each lambdas}} 9 | {{name}}LambdaFunction: 10 | Type: AWS::Lambda::Function 11 | Properties: 12 | Code: 13 | S3Bucket: {{bucket}} 14 | S3Key: {{remote}} 15 | FunctionName: {{../stackName}}-{{name}} 16 | Environment: 17 | Variables: 18 | stage: {{../stage}} 19 | stackName: {{../stackName}} 20 | PREDICTION_ENDPOINT: 21 | Fn::Join: 22 | - '' 23 | - - 'http://' 24 | - !GetAtt {{../stackNoDash}}LoadBalancer.DNSName 25 | - {{{../predictionPath}}} 26 | - ':predict' 27 | DATABASE_URL: 28 | Fn::Join: 29 | - '' 30 | - - 'postgres://{{../rds.username}}:{{../rds.password}}@' 31 | - !GetAtt {{../stackNoDash}}ResultsDB.Endpoint.Address 32 | - ':' 33 | - !GetAtt {{../stackNoDash}}ResultsDB.Endpoint.Port 34 | - '/' 35 | - ResultsDB 36 | {{#each envs}} 37 | {{@key}}: {{{this}}} 38 | {{/each}} 39 | Handler: {{handler}} 40 | MemorySize: {{memory}} 41 | Role: !GetAtt LambdaProcessingRole.Arn 42 | Runtime: {{runtime}} 43 | {{# if concurrent}} 44 | ReservedConcurrentExecutions: {{concurrent}} 45 | {{/if}} 46 | Timeout: {{timeout}} 47 | Tags: 48 | - Key: Project 49 | Value: {{../projectTag}} 50 | - Key: Stack 51 | Value: {{../stackName}} 52 | 53 | {{name}}LambdaFunctionLogGroup: 54 | Type: AWS::Logs::LogGroup 55 | Properties: 56 | LogGroupName: 57 | Fn::Join: 58 | - '' 59 | - - '/aws/lambda/' 60 | - {{../stackName}} 61 | - '-' 62 | - {{name}} 63 | 64 | {{#if queueTrigger}} 65 | {{../stackNoDash}}{{name}}LambdaEventSourceMapping: 66 | Type: AWS::Lambda::EventSourceMapping 67 | Properties: 68 | Enabled: True 69 | EventSourceArn: !GetAtt {{../stackNoDash}}TileQueue.Arn 70 | FunctionName: !Ref {{name}}LambdaFunction 71 | {{/if}} 72 | {{/each}} 73 | 74 | ################################################# 75 | # Lambda config END 76 | ################################################# 77 | 78 | ################################################# 79 | # SQS config BEGIN 80 | ################################################# 81 | 82 | {{stackNoDash}}TileQueue: 83 | Type: AWS::SQS::Queue 84 | Properties: 85 | QueueName: {{stackNoDash}}TileQueue 86 | VisibilityTimeout: {{sqs.visibilityTimeout}} 87 | RedrivePolicy: 88 | deadLetterTargetArn: !GetAtt {{stackNoDash}}DeadLetterQueue.Arn 89 | maxReceiveCount: {{sqs.maxReceiveCount}} 90 | Tags: 91 | - Key: Project 92 | Value: {{projectTag}} 93 | - Key: Stack 94 | Value: {{stackName}} 95 | 96 | {{stackNoDash}}DeadLetterQueue: 97 | Type: AWS::SQS::Queue 98 | Properties: 99 | QueueName: {{stackNoDash}}DeadLetterQueue 100 | Tags: 101 | - Key: Project 102 | Value: {{projectTag}} 103 | - Key: Stack 104 | Value: {{stackName}} 105 | 106 | ################################################# 107 | # SQS config END 108 | ################################################# 109 | 110 | ################################################# 111 | # ECS config BEGIN 
112 | ################################################# 113 | 114 | {{stackNoDash}}InstanceProfile: 115 | Type: AWS::IAM::InstanceProfile 116 | Properties: 117 | Path: "/" 118 | Roles: 119 | - !Ref ECSRole 120 | 121 | {{stackNoDash}}TaskDefinition: 122 | Type: AWS::ECS::TaskDefinition 123 | Properties: 124 | Family: {{stackName}}-TaskDefinition 125 | ContainerDefinitions: 126 | - Name: {{stackNoDash}}TaskDefinition 127 | Essential: true 128 | Image: {{ecs.image}} 129 | MemoryReservation: {{ecs.memory}} 130 | PortMappings: 131 | - ContainerPort: 8501 132 | LogConfiguration: 133 | LogDriver: awslogs 134 | Options: 135 | awslogs-group: !Ref {{stackNoDash}}DockerLogs 136 | awslogs-region: !Sub ${AWS::Region} 137 | 138 | {{stackNoDash}}DockerLogs: 139 | Type: AWS::Logs::LogGroup 140 | Properties: 141 | LogGroupName: {{stackName}}-ecs-docker 142 | 143 | {{stackNoDash}}ECSService: 144 | Type: AWS::ECS::Service 145 | DependsOn: 146 | - {{stackNoDash}}ECSAutoScalingGroup 147 | Properties: 148 | Cluster: !Ref {{stackNoDash}}ECSCluster 149 | DesiredCount: 1 150 | TaskDefinition: !Ref {{stackNoDash}}TaskDefinition 151 | DeploymentConfiguration: 152 | MaximumPercent: 100 153 | MinimumHealthyPercent: 0 154 | LoadBalancers: 155 | - ContainerName: {{stackNoDash}}TaskDefinition 156 | ContainerPort: 8501 157 | TargetGroupArn: !Ref {{stackNoDash}}TargetGroup 158 | 159 | {{stackNoDash}}ECSCluster: 160 | Type: AWS::ECS::Cluster 161 | 162 | {{stackNoDash}}ContainerInstanceLaunch: 163 | Type: AWS::AutoScaling::LaunchConfiguration 164 | Metadata: 165 | AWS::CloudFormation::Init: 166 | config: 167 | commands: 168 | 01_add_instance_to_cluster: 169 | command: !Sub | 170 | #!/bin/bash 171 | echo ECS_CLUSTER=${ {{stackNoDash}}ECSCluster} >> /etc/ecs/ecs.config 172 | echo ECS_ENGINE_TASK_CLEANUP_WAIT_DURATION=1m >> /etc/ecs/ecs.config 173 | files: 174 | "/etc/cfn/cfn-hup.conf": 175 | content: !Sub | 176 | [main] 177 | stack=${AWS::StackId} 178 | region=${AWS::Region} 179 | mode: '000400' 180 | owner: root 181 | group: root 182 | "/etc/cfn/hooks.d/cfn-auto-reloader.conf": 183 | content: !Sub | 184 | [cfn-auto-reloader-hook] 185 | triggers=post.update 186 | path=Resources.{{stackNoDash}}ContainerInstanceLaunch.Metadata.AWS::CloudFormation::Init 187 | action=/opt/aws/bin/cfn-init -v --stack ${AWS::StackName} --resource {{stackNoDash}}ContainerInstanceLaunch --region ${AWS::Region} 188 | runas=root 189 | services: 190 | sysvinit: 191 | cfn-hup: 192 | enabled: 'true' 193 | ensureRunning: 'true' 194 | files: 195 | - "/etc/cfn/cfn-hup.conf" 196 | - "/etc/cfn/hooks.d/cfn-auto-reloader.conf" 197 | Properties: 198 | SecurityGroups: 199 | - !Ref {{stackNoDash}}ECSHostSecurityGroup 200 | ImageId: !FindInMap [AWSRegionToAMI, !Ref "AWS::Region", AMIID] 201 | InstanceType: {{ecs.instanceType}} 202 | IamInstanceProfile: !Ref {{stackNoDash}}InstanceProfile 203 | BlockDeviceMappings: 204 | - DeviceName: "/dev/xvdcz" 205 | Ebs: 206 | DeleteOnTermination: true 207 | VolumeSize: 100 208 | VolumeType: gp2 209 | KeyName: {{ ecs.keyPairName }} 210 | UserData: 211 | "Fn::Base64": !Join 212 | - "" 213 | - - "#cloud-config\n" 214 | - "\nruncmd:\n" 215 | - " - yum install -y aws-cfn-bootstrap\n" 216 | - !Sub " - /opt/aws/bin/cfn-init -v --stack ${AWS::StackName} --resource {{stackNoDash}}ContainerInstanceLaunch --region ${AWS::Region}\n" 217 | - !Sub " - /opt/aws/bin/cfn-signal -e $? 
--stack ${AWS::StackName} --resource {{stackNoDash}}ECSAutoScalingGroup --region ${AWS::Region}\n" 218 | DependsOn: 219 | - {{stackNoDash}}ECSHostSecurityGroup 220 | 221 | {{stackNoDash}}ECSAutoScalingGroup: 222 | Type: AWS::AutoScaling::AutoScalingGroup 223 | UpdatePolicy: 224 | AutoScalingRollingUpdate: 225 | MinInstancesInService: 0 226 | Properties: 227 | AvailabilityZones: 228 | - {{ecs.availabilityZone}} 229 | LaunchConfigurationName: !Ref {{stackNoDash}}ContainerInstanceLaunch 230 | MinSize: 1 231 | MaxSize: {{ ecs.maxInstances }} 232 | DesiredCapacity: {{ ecs.desiredInstances }} 233 | Tags: 234 | - Key: Name 235 | Value: "{{stackName}} ECS" 236 | PropagateAtLaunch: true 237 | - Key: Project 238 | Value: {{projectTag}} 239 | PropagateAtLaunch: true 240 | - Key: Stack 241 | Value: {{stackName}} 242 | PropagateAtLaunch: true 243 | 244 | {{stackNoDash}}ECSHostSecurityGroup: 245 | Type: AWS::EC2::SecurityGroup 246 | Properties: 247 | VpcId: {{vpc}} 248 | GroupDescription: Access to the ECS hosts and the tasks/containers that run on them 249 | SecurityGroupIngress: 250 | # Only allow inbound access to ECS from the ELB 251 | - SourceSecurityGroupId: !Ref {{stackNoDash}}LoadBalancerSecurityGroup 252 | IpProtocol: -1 253 | 254 | ################################################# 255 | # ECS config END 256 | ################################################# 257 | 258 | ################################################# 259 | # RDS config BEGIN 260 | ################################################# 261 | 262 | {{stackNoDash}}ResultsDB: 263 | Type: AWS::RDS::DBInstance 264 | Properties: 265 | DBName: ResultsDB 266 | AllocatedStorage: {{rds.storage}} 267 | DBInstanceClass: {{rds.instanceType}} 268 | Engine: postgres 269 | EngineVersion: 9.6.2 270 | MasterUsername: {{rds.username}} 271 | MasterUserPassword: {{rds.password}} 272 | Tags: 273 | - Key: Project 274 | Value: {{projectTag}} 275 | - Key: Stack 276 | Value: {{stackName}} 277 | 278 | ################################################# 279 | # RDS config END 280 | ################################################# 281 | 282 | ################################################# 283 | # Load Balancer BEGIN 284 | ################################################# 285 | 286 | {{stackNoDash}}LoadBalancer: 287 | Type: AWS::ElasticLoadBalancingV2::LoadBalancer 288 | Properties: 289 | SecurityGroups: 290 | - !Ref {{stackNoDash}}LoadBalancerSecurityGroup 291 | Subnets: 292 | {{#each subnets}} 293 | - {{{this}}} 294 | {{/each}} 295 | Tags: 296 | - Key: Project 297 | Value: {{projectTag}} 298 | - Key: Stack 299 | Value: {{stackName}} 300 | 301 | {{stackNoDash}}TargetGroup: 302 | Type: AWS::ElasticLoadBalancingV2::TargetGroup 303 | Properties: 304 | VpcId: {{vpc}} 305 | Port: 80 306 | Protocol: HTTP 307 | Matcher: 308 | HttpCode: 200-299 309 | HealthCheckIntervalSeconds: 30 310 | HealthCheckPath: {{predictionPath}} 311 | HealthCheckProtocol: HTTP 312 | HealthCheckTimeoutSeconds: 5 313 | HealthyThresholdCount: 5 314 | DependsOn: 315 | - {{stackNoDash}}LoadBalancer 316 | 317 | {{stackNoDash}}LoadBalancerListener: 318 | Type: AWS::ElasticLoadBalancingV2::Listener 319 | Properties: 320 | LoadBalancerArn: !Ref {{stackNoDash}}LoadBalancer 321 | Port: 80 322 | Protocol: HTTP 323 | DefaultActions: 324 | - Type: forward 325 | TargetGroupArn: !Ref {{stackNoDash}}TargetGroup 326 | 327 | {{stackNoDash}}ListenerRule: 328 | Type: AWS::ElasticLoadBalancingV2::ListenerRule 329 | Properties: 330 | ListenerArn: !Ref {{stackNoDash}}LoadBalancerListener 331 | Priority: 2 332 | 
Conditions: 333 | - Field: path-pattern 334 | Values: 335 | - {{{predictionPath}}} 336 | Actions: 337 | - TargetGroupArn: !Ref {{stackNoDash}}TargetGroup 338 | Type: forward 339 | 340 | {{stackNoDash}}LoadBalancerSecurityGroup: 341 | Type: AWS::EC2::SecurityGroup 342 | Properties: 343 | VpcId: {{vpc}} 344 | GroupDescription: Access to the load balancer that sits in front of ECS 345 | SecurityGroupIngress: 346 | # Allow access from anywhere to our ECS services 347 | - CidrIp: 0.0.0.0/0 348 | IpProtocol: -1 349 | 350 | ################################################# 351 | # Load Balancer END 352 | ################################################# 353 | 354 | 355 | ################################################# 356 | # IAM config BEGIN 357 | ################################################# 358 | 359 | LambdaProcessingRole: 360 | Type: AWS::IAM::Role 361 | Properties: 362 | RoleName: "{{stackName}}-lambda-processing" 363 | AssumeRolePolicyDocument: 364 | Version: '2012-10-17' 365 | Statement: 366 | - Effect: Allow 367 | Principal: 368 | Service: lambda.amazonaws.com 369 | Action: sts:AssumeRole 370 | Path: "/" 371 | Policies: 372 | - PolicyName: ProcessingLambda 373 | PolicyDocument: 374 | Version: '2012-10-17' 375 | Statement: 376 | - Effect: Allow 377 | Action: 378 | - lambda:GetFunction 379 | - lambda:invokeFunction 380 | - logs:CreateLogGroup 381 | - logs:CreateLogStream 382 | - logs:DescribeLogStreams 383 | - logs:PutLogEvents 384 | Resource: "*" 385 | # Allow writing to ingest buckets 386 | - Effect: Allow 387 | Action: 388 | - s3:AbortMultipartUpload 389 | - s3:Get* 390 | - s3:Put* 391 | - s3:List* 392 | - s3:DeleteObject 393 | - s3:DeleteObjectVersion 394 | Resource: 395 | - !Sub "arn:aws:s3:::{{buckets.internal}}" 396 | - !Sub "arn:aws:s3:::{{buckets.internal}}/*" 397 | # Allow access to SQS 398 | - Effect: Allow 399 | Action: 400 | - sqs:SendMessage 401 | - sqs:ReceiveMessage 402 | - sqs:ChangeMessageVisibility 403 | - sqs:DeleteMessage 404 | - sqs:GetQueueUrl 405 | - sqs:GetQueueAttributes 406 | Resource: !Sub "arn:aws:sqs:${AWS::Region}:${AWS::AccountId}:{{stackNoDash}}TileQueue" 407 | 408 | ECSRole: 409 | Type: AWS::IAM::Role 410 | Properties: 411 | RoleName: !Sub "{{stackName}}-ecs-role" 412 | AssumeRolePolicyDocument: 413 | Version: '2012-10-17' 414 | Statement: 415 | - Effect: Allow 416 | Principal: 417 | Service: 418 | - ec2.amazonaws.com 419 | - ecs.amazonaws.com 420 | Action: sts:AssumeRole 421 | Path: "/" 422 | Policies: 423 | - PolicyName: ECSRole 424 | PolicyDocument: 425 | Version: '2012-10-17' 426 | Statement: 427 | - Effect: Allow 428 | Action: 429 | - cloudwatch:GetMetricStatistics 430 | - ecr:BatchCheckLayerAvailability 431 | - ecr:BatchGetImage 432 | - ecr:GetAuthorizationToken 433 | - ecr:GetDownloadUrlForLayer 434 | - ec2:AuthorizeSecurityGroupIngress 435 | - ec2:Describe* 436 | - ecs:DeregisterContainerInstance 437 | - ecs:DescribeClusters 438 | - ecs:DescribeContainerInstances 439 | - ecs:DescribeServices 440 | - ecs:DiscoverPollEndpoint 441 | - ecs:ListContainerInstances 442 | - ecs:ListServices 443 | - ecs:ListTaskDefinitions 444 | - ecs:ListTasks 445 | - ecs:Poll 446 | - ecs:RegisterContainerInstance 447 | - ecs:RunTask 448 | - ecs:StartTelemetrySession 449 | - ecs:Submit* 450 | - lambda:GetFunction 451 | - lambda:invokeFunction 452 | - logs:CreateLogGroup 453 | - logs:CreateLogStream 454 | - logs:DescribeLogStreams 455 | - logs:PutLogEvents 456 | Resource: "*" 457 | 458 | # Allow interaction with internal buckets 459 | - Effect: Allow 460 | Action: 461 
| - s3:AbortMultipartUpload 462 | - s3:Get* 463 | - s3:Put* 464 | - s3:List* 465 | - s3:DeleteObject 466 | - s3:DeleteObjectVersion 467 | Resource: 468 | - !Sub "arn:aws:s3:::{{buckets.internal}}" 469 | - !Sub "arn:aws:s3:::{{buckets.internal}}/*" 470 | 471 | # Allow interaction with the load balancer 472 | - Effect: Allow 473 | Action: 474 | - elasticloadbalancing:* 475 | Resource: "*" 476 | 477 | 478 | 479 | ################################################# 480 | # IAM config END 481 | ################################################# 482 | 483 | Mappings: 484 | AWSRegionToAMI: 485 | DOCS: 486 | LIST: http://docs.aws.amazon.com/AmazonECS/latest/developerguide/ecs-optimized_AMI.html 487 | us-east-1: 488 | AMIID: ami-07eb64b216d4d3522 # hardcoded, built via https://github.com/azavea/raster-vision-aws#create-the-custom-ami 489 | 490 | Outputs: 491 | dbConnectionString: 492 | Value: 493 | Fn::Join: 494 | - '' 495 | - - 'postgres://{{rds.username}}:{{rds.password}}@' 496 | - !GetAtt {{stackNoDash}}ResultsDB.Endpoint.Address 497 | - ':' 498 | - !GetAtt {{stackNoDash}}ResultsDB.Endpoint.Port 499 | - '/' 500 | - ResultsDB 501 | queueURL: 502 | Value: !Ref {{stackNoDash}}TileQueue 503 | modelEndpoint: 504 | Value: 505 | Fn::Join: 506 | - '' 507 | - - 'http://' 508 | - !GetAtt {{stackNoDash}}LoadBalancer.DNSName 509 | - {{{predictionPath}}} 510 | -------------------------------------------------------------------------------- /config/config.example.yml: -------------------------------------------------------------------------------- 1 | default: 2 | stage: dev 3 | stackName: your-stack 4 | stackNoDash: YourStack 5 | projectTag: project 6 | capabilities: 7 | - CAPABILITY_NAMED_IAM 8 | buckets: 9 | internal: your-bucket # existing s3 bucket to store deployment artifacts 10 | 11 | lambdas: 12 | DownloadAndPredict: 13 | handler: download_and_predict.handler.handler 14 | timeout: 60 15 | memory: 512 16 | runtime: python3.7 17 | source: lambda/package.zip 18 | queueTrigger: true 19 | concurrent: 5 20 | envs: 21 | TILE_ACCESS_TOKEN: '{{TILE_ACCESS_TOKEN}}' 22 | TILE_ENDPOINT: 'https://example.com/{}/{}/{}.jpg?access_token={}' 23 | 24 | rds: 25 | username: '{{RDS_USERNAME}}' 26 | password: '{{RDS_PASSWORD}}' 27 | storage: 20 28 | instanceType: 'db.t2.medium' 29 | 30 | vpc: your-vpc # existing VPC containing the two subnets below 31 | subnets: 32 | - subnet 1 33 | - subnet 2 34 | 35 | ecs: 36 | availabilityZone: us-east-1a 37 | maxInstances: 1 38 | desiredInstances: 1 39 | keyPairName: your-key-pair 40 | instanceType: t2.nano # replace with a GPU instance for faster predictions (and higher costs) 41 | image: tensorflow/serving:latest # docker image containing your inference model built with TF Serving 42 | memory: 1000 # replace with the memory required by your TF Serving docker image 43 | 44 | sqs: 45 | visibilityTimeout: 60 46 | maxReceiveCount: 5 47 | 48 | predictionPath: '/v1/models/your_model' # path to your model on the TF Serving docker image; don't include :predict 49 | -------------------------------------------------------------------------------- /config/kes.js: -------------------------------------------------------------------------------- 1 | const { Kes } = require('kes') 2 | 3 | // Override the KES class to include useful post-deploy helpers 4 | class UpdatedKes extends Kes { 5 | opsStack () { 6 | return super.opsStack() 7 | .then(() => this.describeCF()) 8 | .then((r) => { 9 | let output = r.Stacks[0].Outputs 10 | let dbConnection = output.find(o => o.OutputKey === 
'dbConnectionString')['OutputValue'] 11 | let queueURL = output.find(o => o.OutputKey === 'queueURL')['OutputValue'] 12 | return console.log(` 13 | The stack ${r.Stacks[0].StackName} is deployed or updated. 14 | - The database is available at: ${dbConnection} 15 | - The queue is available at ${queueURL} 16 | 17 | Is this the first time setting up this stack? Run the following command to set up the database: 18 | 19 | $ yarn setup ${dbConnection} 20 | ` 21 | ) 22 | }) 23 | } 24 | } 25 | 26 | module.exports = UpdatedKes 27 | -------------------------------------------------------------------------------- /db/knexfile.js: -------------------------------------------------------------------------------- 1 | var path = require('path') 2 | module.exports = { 3 | remote: { 4 | client: 'pg', 5 | debug: process.env.KNEX_DEBUG || false, 6 | connection: process.env.DATABASE_URL, 7 | migrations: { 8 | directory: path.join(__dirname, 'migrations') 9 | } 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /db/migrations/20190122223855_init.js: -------------------------------------------------------------------------------- 1 | exports.up = async function (knex) { 2 | try { 3 | return knex.schema.createTable('results', t => { 4 | t.string('tile').primary() 5 | t.jsonb('output') 6 | }) 7 | } catch (e) { 8 | console.error(e) 9 | } 10 | } 11 | 12 | exports.down = async function (knex) { 13 | return knex.schema.dropTable('results') 14 | } 15 | -------------------------------------------------------------------------------- /db/setup.sh: -------------------------------------------------------------------------------- 1 | DATABASE_URL=$1 2 | DATABASE_URL=$DATABASE_URL knex migrate:latest --env remote --knexfile db/knexfile.js 3 | -------------------------------------------------------------------------------- /diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/developmentseed/chip-n-scale-queue-arranger/541bc104ab895e1751d2f38e6c40868c12209360/diagram.png -------------------------------------------------------------------------------- /lambda/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM lambci/lambda:build-python3.7 2 | 3 | WORKDIR /tmp 4 | 5 | ENV PACKAGE_PREFIX /tmp/python 6 | 7 | ################################################################################ 8 | # CREATE PACKAGE # 9 | ################################################################################ 10 | COPY download_and_predict download_and_predict 11 | COPY setup.py setup.py 12 | 13 | ENV \ 14 | LANG=en_US.UTF-8 \ 15 | LC_ALL=en_US.UTF-8 \ 16 | CFLAGS="--std=c99" 17 | 18 | RUN pip3 install . 
--no-binary numpy -t $PACKAGE_PREFIX -U 19 | 20 | ################################################################################ 21 | # REDUCE PACKAGE SIZE # 22 | ################################################################################ 23 | RUN rm -rdf $PACKAGE_PREFIX/boto3/ \ 24 | && rm -rdf $PACKAGE_PREFIX/botocore/ \ 25 | && rm -rdf $PACKAGE_PREFIX/docutils/ \ 26 | && rm -rdf $PACKAGE_PREFIX/dateutil/ \ 27 | && rm -rdf $PACKAGE_PREFIX/jmespath/ \ 28 | && rm -rdf $PACKAGE_PREFIX/s3transfer/ \ 29 | && rm -rdf $PACKAGE_PREFIX/numpy/doc/ 30 | 31 | # Leave module precompiles for faster Lambda startup 32 | RUN find $PACKAGE_PREFIX -type f -name '*.pyc' | while read f; do n=$(echo $f | sed 's/__pycache__\///' | sed 's/.cpython-[2-3][0-9]//'); cp $f $n; done; 33 | RUN find $PACKAGE_PREFIX -type d -a -name '__pycache__' -print0 | xargs -0 rm -rf 34 | RUN find $PACKAGE_PREFIX -type f -a -name '*.py' -print0 | xargs -0 rm -f 35 | 36 | ################################################################################ 37 | # CREATE ARCHIVE # 38 | ################################################################################ 39 | RUN cd $PACKAGE_PREFIX && zip -r9q /tmp/package.zip * 40 | 41 | # Cleanup 42 | RUN rm -rf $PACKAGE_PREFIX 43 | -------------------------------------------------------------------------------- /lambda/Makefile: -------------------------------------------------------------------------------- 1 | 2 | SHELL = /bin/bash 3 | 4 | build: 5 | docker build --tag lambda:latest . 6 | docker run --name lambda -itd lambda:latest /bin/bash 7 | docker cp lambda:/tmp/package.zip package.zip 8 | docker stop lambda 9 | docker rm lambda -------------------------------------------------------------------------------- /lambda/download_and_predict/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/developmentseed/chip-n-scale-queue-arranger/541bc104ab895e1751d2f38e6c40868c12209360/lambda/download_and_predict/__init__.py -------------------------------------------------------------------------------- /lambda/download_and_predict/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Lambda for downloading images, packaging them for prediction, sending them 3 | to a remote ML serving image, and saving them 4 | @author:Development Seed 5 | """ 6 | 7 | import json 8 | from functools import reduce 9 | from io import BytesIO 10 | from base64 import b64encode 11 | from urllib.parse import urlparse 12 | from typing import Dict, List, NamedTuple, Callable, Optional, Tuple, Any, Iterator 13 | 14 | from mercantile import Tile 15 | import requests 16 | import pg8000 17 | 18 | from download_and_predict.custom_types import SQSEvent 19 | 20 | class DownloadAndPredict(object): 21 | """ 22 | base object DownloadAndPredict implementing all necessary methods to 23 | make machine learning predictions 24 | """ 25 | 26 | def __init__(self, imagery: str, db: str, prediction_endpoint: str): 27 | super(DownloadAndPredict, self).__init__() 28 | self.imagery = imagery 29 | self.db = db 30 | self.prediction_endpoint = prediction_endpoint 31 | 32 | @staticmethod 33 | def get_tiles(event: SQSEvent) -> List[Tile]: 34 | """ 35 | Return the body of our incoming SQS messages as an array of mercantile Tiles 36 | Expects events of the following format: 37 | 38 | { 'Records': [ { "body": '{ "x": 4, "y": 5, "z":3 }' }] } 39 | 40 | """ 41 | return [ 42 | Tile(*json.loads(record['body']).values()) 43 | 
for record 44 | in event['Records'] 45 | ] 46 | 47 | 48 | @staticmethod 49 | def b64encode_image(image_binary:bytes) -> str: 50 | return b64encode(image_binary).decode('utf-8') 51 | 52 | 53 | def get_images(self, tiles: List[Tile]) -> Iterator[Tuple[Tile, bytes]]: 54 | for tile in tiles: 55 | url = self.imagery.format(x=tile.x, y=tile.y, z=tile.z) 56 | r = requests.get(url) 57 | yield (tile, r.content) 58 | 59 | 60 | def get_prediction_payload(self, tiles:List[Tile]) -> Tuple[List[Tile], str]: 61 | """ 62 | tiles: list mercantile Tiles 63 | imagery: str an imagery API endpoint with three variables {z}/{x}/{y} to replace 64 | 65 | Return: 66 | - an array of b64 encoded images to send to our prediction endpoint 67 | - a corresponding array of tile indices 68 | 69 | These arrays are returned together because they are parallel operations: we 70 | need to match up the tile indicies with their corresponding images 71 | """ 72 | tiles_and_images = self.get_images(tiles) 73 | tile_indices, images = zip(*tiles_and_images) 74 | 75 | instances = [dict(image_bytes=dict(b64=self.b64encode_image(img))) for img in images] 76 | payload = json.dumps(dict(instances=instances)) 77 | 78 | return (list(tile_indices), payload) 79 | 80 | def post_prediction(self, payload:str) -> Dict[str, Any]: 81 | r = requests.post(self.prediction_endpoint, data=payload) 82 | r.raise_for_status() 83 | return r.json() 84 | 85 | def save_to_db(self, tiles:List[Tile], results:List[Any], result_wrapper:Optional[Callable]=None) -> None: 86 | """ 87 | Save our prediction results to the provided database 88 | tiles: list mercantile Tiles 89 | results: list of predictions 90 | db: str database connection string 91 | 92 | """ 93 | db = urlparse(self.db) 94 | 95 | conn = pg8000.connect( 96 | user=db.username, 97 | password=db.password, 98 | host=db.hostname, 99 | database=db.path[1:], 100 | port=db.port 101 | ) 102 | cursor = conn.cursor() 103 | 104 | for i, output in enumerate(results): 105 | result = result_wrapper(output) if result_wrapper else output 106 | cursor.execute("INSERT INTO results VALUES (%s, %s) ON CONFLICT (tile) DO UPDATE SET output = %s", (tiles[i], result, result)) 107 | 108 | conn.commit() 109 | conn.close() 110 | -------------------------------------------------------------------------------- /lambda/download_and_predict/custom_types.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Any 2 | 3 | SQSEvent = Dict[str, List[Dict[str, Any]]] 4 | -------------------------------------------------------------------------------- /lambda/download_and_predict/handler.py: -------------------------------------------------------------------------------- 1 | """Example AWS Lambda function for chip-n-scale""" 2 | 3 | import os 4 | import pg8000 5 | from typing import Dict, Any 6 | 7 | from download_and_predict.base import DownloadAndPredict 8 | from download_and_predict.custom_types import SQSEvent 9 | 10 | def handler(event: SQSEvent, context: Dict[str, Any]) -> None: 11 | # read all our environment variables to throw errors early 12 | imagery = os.getenv('TILE_ENDPOINT') 13 | db = os.getenv('DATABASE_URL') 14 | prediction_endpoint=os.getenv('PREDICTION_ENDPOINT') 15 | 16 | assert(imagery) 17 | assert(db) 18 | assert(prediction_endpoint) 19 | 20 | # instantiate our DownloadAndPredict class 21 | dap = DownloadAndPredict( 22 | imagery=imagery, 23 | db=db, 24 | prediction_endpoint=prediction_endpoint 25 | ) 26 | 27 | # get tiles from our SQS event 28 | tiles = 
dap.get_tiles(event) 29 | 30 | # construct a payload for our prediction endpoint 31 | tile_indices, payload = dap.get_prediction_payload(tiles) 32 | 33 | # send prediction request 34 | content = dap.post_prediction(payload) 35 | 36 | # save prediction request to db 37 | dap.save_to_db( 38 | tile_indices, 39 | content['predictions'], 40 | result_wrapper=lambda x: pg8000.PGJsonb(x) 41 | ) 42 | -------------------------------------------------------------------------------- /lambda/download_and_predict/mercantile.pyi: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Any, Callable 2 | 3 | class Tile(): 4 | @property 5 | def x(self) -> int: ... 6 | @property 7 | def y(self) -> int: ... 8 | @property 9 | def z(self) -> int: ... 10 | 11 | quadkey = Callable[[Any], str] 12 | -------------------------------------------------------------------------------- /lambda/mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | 3 | scripts_are_modules = True 4 | show_traceback = True 5 | ignore_missing_imports = True 6 | mypy_path = lambdas/download_and_predict 7 | namespace_packages = True 8 | -------------------------------------------------------------------------------- /lambda/package.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/developmentseed/chip-n-scale-queue-arranger/541bc104ab895e1751d2f38e6c40868c12209360/lambda/package.zip -------------------------------------------------------------------------------- /lambda/setup.py: -------------------------------------------------------------------------------- 1 | """Setup.""" 2 | 3 | from setuptools import setup, find_packages 4 | 5 | inst_reqs = [ 6 | "mercantile", 7 | "pg8000==1.16.4", 8 | "requests", 9 | "pillow", 10 | "numpy" 11 | ] 12 | extra_reqs = {"test": ["pytest", "pytest-cov"]} 13 | 14 | setup( 15 | name="app", 16 | version="0.0.1", 17 | description=u"Lambda Download and Predict", 18 | python_requires=">=3", 19 | keywords="AWS-Lambda Python", 20 | packages=find_packages(exclude=["ez_setup", "examples", "tests"]), 21 | include_package_data=True, 22 | zip_safe=False, 23 | install_requires=inst_reqs, 24 | extras_require=extra_reqs, 25 | ) 26 | -------------------------------------------------------------------------------- /lambda/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/developmentseed/chip-n-scale-queue-arranger/541bc104ab895e1751d2f38e6c40868c12209360/lambda/tests/__init__.py -------------------------------------------------------------------------------- /lambda/tests/handler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/developmentseed/chip-n-scale-queue-arranger/541bc104ab895e1751d2f38e6c40868c12209360/lambda/tests/handler.py -------------------------------------------------------------------------------- /lambda/tests/test_base.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from mercantile import Tile 3 | 4 | from download_and_predict.base import DownloadAndPredict 5 | 6 | def test_get_tiles(): 7 | # create a class with fake environment variables 8 | dap = DownloadAndPredict( 9 | imagery='https://example.com/{z}/{x}/{y}.png', 10 | db='postgres://usr:pw@host:port/database', 11 | 
prediction_endpoint='https://myloadbalancer.com/v1/models/ml:predict' 12 | ) 13 | 14 | # create an example SQS event which invokes a lambda 15 | event = { 'Records': [ { "body": '{ "x": 4, "y": 5, "z":3 }' }] } 16 | 17 | tiles = dap.get_tiles(event) 18 | fixture_tiles = [Tile(x=4, y=5, z=3)] 19 | 20 | assert(tiles == fixture_tiles) 21 | -------------------------------------------------------------------------------- /lambda_examples/README.md: -------------------------------------------------------------------------------- 1 | ## Lambda Examples 2 | *(how to customize this repo for running your ML inference task)* 3 | 4 | The primary functionality of this repository is contained in the lambda function located at `lambda/download_and_predict/handler.py`. It is intentionally very little code to allow for easy configuration: with the imports and assertions removed, there is one class instantiation and four method calls. The class `DownloadAndPredict` provides the base functionality required to run machine learning inference: 5 | - Creates a list of Mercator tiles based on an input SQS event. 6 | - Downloads those tiles from a TMS/XYS tile endpoint and puts them in the proper format for sending them to Tensorflow Serving or an equivalent Docker image. 7 | - Sends the payload to the prediction endpoint. 8 | - Saves the result into a database. 9 | 10 | There are two primary options to customize this workflow: 11 | - Add new code to `handler.py` to manipulate the returned values (`tiles`, `payload`, `content`, etc.) 12 | - Subclass `DownloadAndPredict` to provide alternative methods for the operations listed above. 13 | 14 | Any additional third-party libraries should be added to `lambda/setup.py` for inclusion in the lambda function deployment. 15 | 16 | Examples of customization are listed in this library to show how `chip-n-scale-queue-arranger` can be used with a variety of different tools. 17 | 18 | - [Download imagery from Sentinel Hub](sentinel_hub.py). For more information, check out the [`sentinelhub-py` docs](https://sentinelhub-py.readthedocs.io/en/latest/). 19 | - [Download larger tiles and create smaller tiles for inference](super_tiles.py). This is useful for reducing the load on the imagery/tile endpoint. 20 | - [Save results to `ml-enabler`](ml_enabler.py). For more information, check out the [`ml-enabler` repo](https://github.com/hotosm/ml-enabler). 
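As a minimal sketch of the subclassing approach described above, a custom class that only overrides `get_images` could look like the following. The header-based auth scheme and the `api_key` parameter are hypothetical, included only to show the shape of an override; everything else in the handler stays identical to the base example.

```python
from typing import Iterator, List, Tuple

import requests
from mercantile import Tile

from download_and_predict.base import DownloadAndPredict


class HeaderAuthDownloader(DownloadAndPredict):
    """Hypothetical example: fetch tiles from an endpoint that expects an auth header."""

    def __init__(self, imagery: str, db: str, prediction_endpoint: str, api_key: str):
        super().__init__(imagery=imagery, db=db, prediction_endpoint=prediction_endpoint)
        self.api_key = api_key

    def get_images(self, tiles: List[Tile]) -> Iterator[Tuple[Tile, bytes]]:
        # Same contract as the base class: yield (tile, image bytes) pairs
        for tile in tiles:
            url = self.imagery.format(x=tile.x, y=tile.y, z=tile.z)
            r = requests.get(url, headers={"Authorization": f"Bearer {self.api_key}"})
            yield (tile, r.content)
```

The handler then reads the extra configuration (here, a hypothetical `API_KEY` environment variable) alongside the usual ones and instantiates `HeaderAuthDownloader` instead of `DownloadAndPredict`; the `get_tiles`, `get_prediction_payload`, `post_prediction`, and `save_to_db` calls are unchanged.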
21 | -------------------------------------------------------------------------------- /lambda_examples/ml_enabler.py: -------------------------------------------------------------------------------- 1 | """Example AWS Lambda function for chip-n-scale with ml_enabler""" 2 | 3 | import os 4 | import datetime 5 | from typing import Dict, Any, List, Optional, Callable 6 | from io import BytesIO 7 | from urllib.parse import urlparse 8 | 9 | from download_and_predict.base import DownloadAndPredict 10 | from download_and_predict.custom_types import SQSEvent 11 | 12 | import pg8000 13 | from mercantile import Tile, quadkey 14 | 15 | class MLEnablerSave(DownloadAndPredict): 16 | def __init__(self, imagery: str, db: str, prediction_endpoint: str, prediction_id: str): 17 | # type annotatation error ignored, re: https://github.com/python/mypy/issues/5887 18 | super(DownloadAndPredict, self).__init__(dict( # type: ignore 19 | imagery=imagery, 20 | db=db, 21 | prediction_endpoint=prediction_endpoint 22 | )) # 23 | self.prediction_id = prediction_id 24 | 25 | def save_to_db(self, tiles:List[Tile], results:List[Any], result_wrapper:Optional[Callable]=None) -> None: 26 | db = urlparse(self.db) 27 | 28 | conn = pg8000.connect( 29 | user=db.username, 30 | password=db.password, 31 | host=db.hostname, 32 | database=db.path[1:], 33 | port=db.port 34 | ) 35 | cursor = conn.cursor() 36 | 37 | for i, output in enumerate(results): 38 | quadkey = quadkey(tiles[i]) 39 | # centroid = db.Column(Geometry('POINT', srid=4326)) 40 | predictions = pg8000.PGJsonb(output) 41 | cursor.execute("INSERT INTO mlenabler VALUES (null, %s, %s, %s) ON CONFLICT (id) DO UPDATE SET output = %s", (self.prediction_id, quadkey, predictions, predictions)) 42 | 43 | conn.commit() 44 | conn.close() 45 | 46 | 47 | def handler(event: SQSEvent, context: Dict[str, Any]) -> None: 48 | # read all our environment variables to throw errors early 49 | imagery = os.getenv('TILE_ENDPOINT') 50 | db = os.getenv('DATABASE_URL') 51 | prediction_endpoint=os.getenv('PREDICTION_ENDPOINT') 52 | prediction_id = os.getenv('PREDICTION_ID') 53 | 54 | assert(imagery) 55 | assert(db) 56 | assert(prediction_endpoint) 57 | assert(prediction_id) 58 | 59 | # instantiate our custom DownloadAndPredict class 60 | dap = MLEnablerSave( 61 | imagery=imagery, 62 | db=db, 63 | prediction_endpoint=prediction_endpoint, 64 | prediction_id=prediction_id 65 | ) 66 | 67 | # now that we've defined the behavior of our custom class, all the below 68 | # methods are identical to those in the base example (without the db 69 | # results wrapper) 70 | 71 | # get tiles from our SQS event 72 | tiles = dap.get_tiles(event) 73 | 74 | # construct a payload for our prediction endpoint 75 | tile_indices, payload = dap.get_prediction_payload(tiles) 76 | 77 | # send prediction request 78 | content = dap.post_prediction(payload) 79 | 80 | # save prediction request to db 81 | dap.save_to_db( 82 | tile_indices, 83 | content['predictions'] 84 | ) 85 | -------------------------------------------------------------------------------- /lambda_examples/s3_images.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example AWS Lambda function for chip-n-scale 3 | To read images directly from S3 bucket. 
4 | """ 5 | 6 | import os 7 | from os import path as op 8 | import pg8000 9 | from typing import List, Dict, Any 10 | import boto3 11 | import json 12 | 13 | 14 | from download_and_predict.base import DownloadAndPredict 15 | from download_and_predict.custom_types import SQSEvent 16 | 17 | class S3_DownloadAndPredict(DownloadAndPredict): 18 | """ 19 | base object DownloadAndPredict implementing all necessary methods to 20 | make machine learning predictions 21 | """ 22 | 23 | def __init__(self, bucket: str, db: str, prediction_endpoint: str): 24 | super(DownloadAndPredict, self).__init__() 25 | self.bucket = bucket 26 | self.db = db 27 | self.prediction_endpoint = prediction_endpoint 28 | 29 | 30 | def get_images(self, s3_keys: List): 31 | s3_client=boto3.client('s3') 32 | for s3_file in s3_keys: 33 | key = json.loads(s3_file)['image'] 34 | response = s3_client.get_object(Bucket =self.bucket, Key = key) 35 | yield(key, response["Body"].read()) 36 | 37 | 38 | def handler(event: SQSEvent, context: Dict[str, Any]) -> None: 39 | # read all our environment variables to throw errors early 40 | bucket =os.getenv('BUCKET') 41 | db = os.getenv('DATABASE_URL') 42 | prediction_endpoint=os.getenv('PREDICTION_ENDPOINT') 43 | 44 | assert(bucket) 45 | assert(db) 46 | assert(prediction_endpoint) 47 | 48 | # instantiate our DownloadAndPredict class 49 | dap = S3_DownloadAndPredict( 50 | bucket=bucket, 51 | db=db, 52 | prediction_endpoint=prediction_endpoint 53 | ) 54 | 55 | # construct a payload for our prediction endpoint 56 | s3_keys =[record['body'] for record in event['Records']] 57 | 58 | # sent images fron s3 bucket for inference 59 | tile_indices, payload = dap.get_prediction_payload(s3_keys) 60 | 61 | # send prediction request 62 | content = dap.post_prediction(payload) 63 | 64 | # save prediction request to db 65 | dap.save_to_db( 66 | tile_indices, 67 | content['predictions'], 68 | result_wrapper=lambda x: pg8000.PGJsonb(x) 69 | ) 70 | -------------------------------------------------------------------------------- /lambda_examples/save_image.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example AWS Lambda function for chip-n-scale for saving get_images 3 | 4 | Note that this requires three corresponding changes: 5 | - a prediction docker image which returns image results as a list/array 6 | - an update to the database: column "output" needs type `bytea` 7 | - pillow and numpy need to be added to the lambda requirements 8 | """ 9 | 10 | import os 11 | import pg8000 12 | from typing import Dict, Any, List 13 | from io import BytesIO 14 | 15 | from PIL import Image 16 | import numpy as np 17 | 18 | from download_and_predict.base import DownloadAndPredict 19 | from download_and_predict.custom_types import SQSEvent 20 | 21 | def prediction_to_image(pred: List) -> bytes: 22 | img = Image.fromarray(np.array(pred).astype(np.uint8)) 23 | byts = BytesIO() 24 | img.save(byts, format='png') 25 | return byts.getvalue() 26 | 27 | def handler(event: SQSEvent, context: Dict[str, Any]) -> None: 28 | # read all our environment variables to throw errors early 29 | imagery = os.getenv('TILE_ENDPOINT') 30 | db = os.getenv('DATABASE_URL') 31 | prediction_endpoint=os.getenv('PREDICTION_ENDPOINT') 32 | 33 | assert(imagery) 34 | assert(db) 35 | assert(prediction_endpoint) 36 | 37 | # instantiate our DownloadAndPredict class 38 | dap = DownloadAndPredict( 39 | imagery=imagery, 40 | db=db, 41 | prediction_endpoint=prediction_endpoint 42 | ) 43 | 44 | # get tiles from our 
SQS event 45 | tiles = dap.get_tiles(event) 46 | 47 | # construct a payload for our prediction endpoint 48 | tile_indices, payload = dap.get_prediction_payload(tiles) 49 | 50 | # send prediction request 51 | content = dap.post_prediction(payload) 52 | 53 | # save prediction request to db 54 | dap.save_to_db( 55 | tile_indices, 56 | content['predictions'], 57 | result_wrapper=prediction_to_image 58 | ) 59 | -------------------------------------------------------------------------------- /lambda_examples/sentinel_hub.py: -------------------------------------------------------------------------------- 1 | """Example AWS Lambda function for chip-n-scale with Sentinel Hub""" 2 | 3 | import os 4 | import datetime 5 | from typing import Dict, Any, Tuple, List, Iterator 6 | from io import BytesIO 7 | 8 | from download_and_predict.base import DownloadAndPredict 9 | from download_and_predict.custom_types import SQSEvent 10 | 11 | import pg8000 12 | from sentinelhub import BBox, CRS, WmsRequest, MimeType, DataSource 13 | from PIL import Image 14 | from mercantile import bounds, Tile 15 | 16 | class SentinelHubDownloader(DownloadAndPredict): 17 | def __init__(self, imagery: str, db: str, prediction_endpoint: str, sentinel_wms_kwargs: Dict[str, Any]): 18 | # type annotatation error ignored, re: https://github.com/python/mypy/issues/5887 19 | super(DownloadAndPredict, self).__init__(dict( # type: ignore 20 | imagery=imagery, 21 | db=db, 22 | prediction_endpoint=prediction_endpoint 23 | )) 24 | self.sentinel_wms_kwargs = sentinel_wms_kwargs 25 | 26 | def get_images(self, tiles: List[Tile]) -> Iterator[Tuple[Tile, bytes]]: 27 | for tile in tiles: 28 | # convert the tile index to a BBox with a buffer 29 | x, y, z = tile 30 | bbox = BBox(bounds((x, y, z)), crs=CRS.WGS84) 31 | 32 | # request the data from SentinelHub 33 | request = WmsRequest(**dict(bbox=bbox, **self.sentinel_wms_kwargs)) 34 | image_array = request.get_data(data_filter=[0])[0] 35 | img = Image.fromarray(image_array) 36 | img_bytes = BytesIO() 37 | img.save(img_bytes, format='png') 38 | yield (tile, img_bytes.getvalue()) 39 | 40 | def handler(event: SQSEvent, context: Dict[str, Any]) -> None: 41 | # read all our environment variables to throw errors early 42 | imagery = os.getenv('TILE_ENDPOINT') 43 | db = os.getenv('DATABASE_URL') 44 | prediction_endpoint=os.getenv('PREDICTION_ENDPOINT') 45 | sh_instance_id = os.getenv('SH_INSTANCE_ID') 46 | 47 | assert(imagery) 48 | assert(db) 49 | assert(prediction_endpoint) 50 | assert(sh_instance_id) 51 | 52 | # instantiate our custom DownloadAndPredict class 53 | dap = SentinelHubDownloader( 54 | imagery=imagery, 55 | db=db, 56 | prediction_endpoint=prediction_endpoint, 57 | sentinel_wms_kwargs=dict( 58 | layer='MY-SENTINEL-HUB-LAYER', 59 | width=256, height=256, 60 | maxcc=0.20, 61 | instance_id=sh_instance_id, 62 | time=(f'2019-04-01', f'2019-07-30'), 63 | time_difference=datetime.timedelta(days=21), 64 | ) 65 | ) 66 | 67 | # now that we've defined the behavior of our custom class, all the below 68 | # methods are identical to those in the base example 69 | 70 | # get tiles from our SQS event 71 | tiles = dap.get_tiles(event) 72 | 73 | # construct a payload for our prediction endpoint 74 | tile_indices, payload = dap.get_prediction_payload(tiles) 75 | 76 | # send prediction request 77 | content = dap.post_prediction(payload) 78 | 79 | # save prediction request to db 80 | dap.save_to_db( 81 | tile_indices, 82 | content['predictions'], 83 | result_wrapper=lambda x: pg8000.PGJsonb(x) 84 | ) 85 | 
-------------------------------------------------------------------------------- /lambda_examples/super_tiles.py: -------------------------------------------------------------------------------- 1 | """Example AWS Lambda function for chip-n-scale with super tiles""" 2 | 3 | import os 4 | import datetime 5 | from typing import Dict, Any, Tuple, List, Iterator 6 | from io import BytesIO 7 | 8 | from download_and_predict.base import DownloadAndPredict 9 | from download_and_predict.custom_types import SQSEvent 10 | 11 | import pg8000 12 | from mercantile import Tile, children 13 | from rasterio.io import MemoryFile 14 | from rasterio.windows import Window 15 | import requests 16 | 17 | class SuperTileDownloader(DownloadAndPredict): 18 | def __init__(self, imagery: str, db: str, prediction_endpoint: str, model_image_size: int): 19 | # type annotatation error ignored, re: https://github.com/python/mypy/issues/5887 20 | super(DownloadAndPredict, self).__init__(dict( # type: ignore 21 | imagery=imagery, 22 | db=db, 23 | prediction_endpoint=prediction_endpoint 24 | )) 25 | self.model_image_size = model_image_size 26 | 27 | def get_images(self, tiles: List[Tile]) -> Iterator[Tuple[Tile, bytes]]: 28 | """return images cropped to a given model_image_size from an imagery endpoint""" 29 | for tile in tiles: 30 | url = self.imagery.format(x=tile.x, y=tile.y, z=tile.z) 31 | r = requests.get(url) 32 | with MemoryFile(BytesIO(r.content)) as memfile: 33 | with memfile.open() as dataset: 34 | # because of the tile indexing, we assume all tiles are square 35 | sz = dataset.width 36 | zoom_offset = sz // self.model_image_size - 1 37 | 38 | tile_indices = children(tile, zoom=zoom_offset + tile.z) 39 | tile_indices.sort() 40 | 41 | for i in range (2 ** zoom_offset): 42 | for j in range(2 ** zoom_offset): 43 | window = Window(i * sz, j * sz, (i + 1) * sz, (j + 1) * sz) 44 | yield ( 45 | tile_indices[i + j], 46 | dataset.read(window=window) 47 | ) 48 | 49 | def handler(event: SQSEvent, context: Dict[str, Any]) -> None: 50 | # read all our environment variables to throw errors early 51 | imagery = os.getenv('TILE_ENDPOINT') 52 | db = os.getenv('DATABASE_URL') 53 | prediction_endpoint=os.getenv('PREDICTION_ENDPOINT') 54 | model_image_size = os.getenv('MODEL_IMAGE_SIZE') 55 | 56 | assert(imagery) 57 | assert(db) 58 | assert(prediction_endpoint) 59 | assert(model_image_size) 60 | 61 | # instantiate our custom DownloadAndPredict class 62 | dap = SuperTileDownloader( 63 | imagery=imagery, 64 | db=db, 65 | prediction_endpoint=prediction_endpoint, 66 | model_image_size=int(model_image_size) 67 | ) 68 | 69 | # now that we've defined the behavior of our custom class, all the below 70 | # methods are identical to those in the base example 71 | 72 | # get tiles from our SQS event 73 | tiles = dap.get_tiles(event) 74 | 75 | # construct a payload for our prediction endpoint 76 | tile_indices, payload = dap.get_prediction_payload(tiles) 77 | 78 | # send prediction request 79 | content = dap.post_prediction(payload) 80 | 81 | # save prediction request to db 82 | dap.save_to_db( 83 | tile_indices, 84 | content['predictions'], 85 | result_wrapper=lambda x: pg8000.PGJsonb(x) 86 | ) 87 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ml-inference-cfn", 3 | "version": "0.1.0", 4 | "description": "Kes Deployment for Machine Learning Inference", 5 | "main": "index.js", 6 | "scripts": { 7 | 
"model": "node ./scripts/model.js", 8 | "build": "node_modules/.bin/webpack", 9 | "watch": "node_modules/.bin/webpack -w", 10 | "deploy": "AWS_SDK_LOAD_CONFIG=true kes cf deploy --kes-folder config --kes-class config/kes.js", 11 | "setup": "db/setup.sh", 12 | "delete": "AWS_SDK_LOAD_CONFIG=true kes cf delete --kes-folder config --kes-class config/kes.js", 13 | "gpu-util": "AWS_SDK_LOAD_CONFIG=true node scripts/gpu-util.js", 14 | "verify": "AWS_SDK_LOAD_CONFIG=true node scripts/verify.js", 15 | "sqs-push": "AWS_SDK_LOAD_CONFIG=true node scripts/run-sqs-push.js", 16 | "download": "AWS_SDK_LOAD_CONFIG=true node scripts/download.js", 17 | "convert-geojson": "python scripts/csv_to_geojson.py", 18 | "test": "node test/test_sqs-push.js", 19 | "tag-logs": "AWS_SDK_LOAD_CONFIG=true node scripts/tag-cloudwatch-logs.js" 20 | }, 21 | "author": "Development Seed", 22 | "license": "ISC", 23 | "dependencies": { 24 | "@google-cloud/storage": "^4.2.0", 25 | "aws-sdk": "^2.395.0", 26 | "cli-table": "^0.3.1", 27 | "d3-queue": "^3.0.7", 28 | "js-yaml": "^3.12.1", 29 | "knex": "^0.16.3", 30 | "lodash.flatten": "^4.4.0", 31 | "log-update": "^2.3.0", 32 | "minimist": "^1.2.0", 33 | "mkdirp": "^0.5.1", 34 | "node-ssh": "^5.1.2", 35 | "pg": "^7.8.0", 36 | "split": "^1.0.1", 37 | "through2-batch": "^1.1.1", 38 | "uuid": "^3.3.2" 39 | }, 40 | "devDependencies": { 41 | "axios": "^0.18.0", 42 | "babel-core": "^6.26.0", 43 | "babel-loader": "^7.1.2", 44 | "babel-plugin-transform-async-to-generator": "^6.24.1", 45 | "babel-polyfill": "^6.26.0", 46 | "dotenv": "^6.2.0", 47 | "eslint": "^4.19.1", 48 | "eslint-config-standard": "^11.0.0", 49 | "eslint-plugin-import": "^2.10.0", 50 | "eslint-plugin-node": "^6.0.1", 51 | "eslint-plugin-promise": "^3.7.0", 52 | "eslint-plugin-standard": "^3.0.1", 53 | "kes": "^2.2.7", 54 | "memorystream": "^0.3.1", 55 | "prepend-loader": "^0.0.2", 56 | "proxyquire": "^2.1.3", 57 | "sinon": "^7.4.2", 58 | "tape": "^4.11.0", 59 | "webpack": "^3.5.6" 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /scripts/csv_to_geojson.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simply script to convert a csv file of prediction results into a geojson 3 | 4 | Specify the CSV filepath and your confidence threshold. The CSV file should 5 | contain tile indices in TMS coordinates. The confidence threshold is useful if 6 | you ran prediction over a large area -- it will keep the resulting geojson 7 | much smaller on disk. Set the threshold to 0.0 to include all tile predictions 8 | in the geojson output. 9 | 10 | Requires pygeotile, geojson, and mercantile 11 | """ 12 | 13 | import csv 14 | import json 15 | import argparse 16 | import os.path as op 17 | 18 | from mercantile import feature, Tile 19 | from geojson import Feature 20 | from pygeotile.tile import Tile as Pygeo_tile 21 | 22 | 23 | def convert_csv(fname_csv, fname_geojson, tile_format, thresh_ind, thresh): 24 | """Convert tile indices in CSV file to geojson""" 25 | 26 | if not op.exists(fname_csv): 27 | raise ValueError(f'Cannot find file {fname_csv}') 28 | 29 | # Error check tile format 30 | if tile_format == 'tms': 31 | tile_func = Pygeo_tile.from_tms 32 | elif tile_format == 'google': 33 | tile_func = Pygeo_tile.from_google 34 | else: 35 | raise ValueError(f'Tile format not understood. Got: {tile_format}') 36 | 37 | if not 0 <= thresh <= 1.: 38 | raise ValueError(f"'thresh' must be on interval [0, 1]. 
Got: {thresh}") 39 | 40 | with open(fname_csv, 'r') as csvfile: 41 | with open(fname_geojson, 'w') as results: 42 | reader = csv.reader(csvfile) 43 | first_line = True 44 | 45 | # Create a FeatureCollection 46 | results.write('{"type":"FeatureCollection","features":[') 47 | next(reader) # Skip header 48 | 49 | for row in reader: 50 | 51 | # Load as pygeotile using TMS coords 52 | geot = tile_func(*[int(t) for t in row[0].split('-')]) 53 | 54 | # Create feature with mercantile 55 | feat = feature(Tile(geot.google[0], geot.google[1], geot.zoom)) 56 | 57 | # Get class prediction confidences 58 | pred = json.loads(','.join(row[1:])) 59 | pred_red = list(map(lambda x: round(x, 2), pred)) 60 | if pred_red[thresh_ind] >= thresh: 61 | # Add commas prior to any feature that isn't the first one 62 | if first_line: 63 | first_line = False 64 | else: 65 | results.write(',') 66 | 67 | pred_obj = dict(zip(map(lambda x: 'p%s' % x, 68 | range(len(pred_red))), pred_red)) 69 | 70 | results.write(json.dumps(Feature(geometry=feat['geometry'], 71 | properties=pred_obj))) 72 | 73 | # Finalize the feature FeatureCollection 74 | results.write(']}') 75 | 76 | 77 | if __name__ == "__main__": 78 | 79 | parser = argparse.ArgumentParser(description='Convert CSV of tile predictions to GeoJSON.') 80 | parser.add_argument('fname_csv', type=str, 81 | help='Filepath to CSV file needing conversion.') 82 | parser.add_argument('fname_geojson', type=str, default='results.geojson', 83 | help='Filepath to save geojson file to.') 84 | parser.add_argument('--tile-format', type=str, default='tms', 85 | help='Format of tile indices in CSV file ("tms" or "google").') 86 | parser.add_argument('--thresh-ind', type=int, default=1, 87 | help='Optional threshold class index for including a prediction.') 88 | parser.add_argument('--thresh', type=float, default=0., 89 | help='Optional threshold for including a prediction.') 90 | 91 | args = parser.parse_args() 92 | convert_csv(args.fname_csv, args.fname_geojson, args.tile_format, 93 | args.thresh_ind, args.thresh) 94 | -------------------------------------------------------------------------------- /scripts/download.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const { promisify } = require('util') 3 | 4 | const dbConfig = require('../db/knexfile').remote 5 | const db = require('knex')(dbConfig) 6 | 7 | const writeFile = promisify(fs.writeFile) 8 | const outputFile = process.argv[2] 9 | 10 | db('results').then(results => { 11 | const csv = ['tile,output'].concat(results.map(result => `${result.tile},${JSON.stringify(result.output)}`)).join('\n') 12 | return writeFile(outputFile, csv) 13 | }).then(_ => process.exit(0)) 14 | -------------------------------------------------------------------------------- /scripts/gpu-util.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const yaml = require('js-yaml') 3 | const AWS = require('aws-sdk') 4 | const NodeSSH = require('node-ssh') 5 | const flatten = require('lodash.flatten') 6 | const Table = require('cli-table') 7 | const logUpdate = require('log-update') 8 | 9 | // setup 10 | const ssh = new NodeSSH() 11 | const GPU_UTIL_QUERY = 'nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader' 12 | const getConnectParams = host => { 13 | return { 14 | host, username: 'ec2-user', privateKey: process.argv[2] 15 | } 16 | } 17 | const tableParams = { 18 | head: ['IP Address', 'Instance Type', 'GPU Utilization'], 
19 | colWidths: [24, 24, 24] 20 | } 21 | 22 | // get stackName from our config file 23 | const config = yaml.safeLoad(fs.readFileSync('config/config.yml').toString()) 24 | const stackName = config.default.stackName 25 | 26 | // find all our project EC2s and get their IP 27 | const ec2 = new AWS.EC2() 28 | ec2.describeInstances({ Filters: [{ Name: 'tag:Project', Values: [`${stackName}`] }] }) 29 | .promise() 30 | .then(resp => flatten(resp.Reservations.map(r => r.Instances))) 31 | .then(instances => { 32 | setInterval(() => { 33 | // run our promises in serial so we don't mix up our ssh connection 34 | // from: https://decembersoft.com/posts/promises-in-serial-with-array-reduce/ 35 | instances.reduce((promiseChain, instance) => { 36 | return promiseChain.then(chainResults => { 37 | return ssh.connect(getConnectParams(instance.PublicIpAddress)).then(() => { 38 | return ssh.execCommand(GPU_UTIL_QUERY).then(result => { 39 | ssh.dispose() 40 | return [ ...chainResults, [ 41 | instance.PublicIpAddress, 42 | instance.InstanceType, 43 | result.stdout 44 | ] 45 | ] 46 | }) 47 | }) 48 | }) 49 | }, Promise.resolve([])).then(results => { 50 | let table = new Table(tableParams) 51 | results.forEach(r => table.push(r)) 52 | logUpdate(table.toString()) 53 | }) 54 | }, 5000) 55 | }) 56 | -------------------------------------------------------------------------------- /scripts/model.js: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env node 2 | 3 | const Q = require('d3-queue').queue; 4 | const mkdir = require('mkdirp').sync; 5 | const pipeline = require('stream').pipeline; 6 | const fs = require('fs'); 7 | const os = require('os'); 8 | const CP = require('child_process'); 9 | const tmp = os.tmpdir() + '/' + Math.random().toString(36).substring(2, 15) + Math.random().toString(36).substring(2, 15) 10 | const path = require('path'); 11 | const argv = require('minimist')(process.argv, { 12 | boolean: ['use-gpu', 'help'], 13 | alias: {'use_gpu': 'use-gpu'} 14 | }); 15 | 16 | function help() { 17 | console.error(); 18 | console.error(' Build TFServing docker images for Chip-N-Scale given a GS model location'); 19 | console.error(); 20 | console.error('Usage:'); 21 | console.error(); 22 | console.error(' yarn model [--use_gpu]'); 23 | console.error(); 24 | console.error('Options:'); 25 | console.error(' --use_gpu builds and tags gpu tfserving image, without this flag cpu image will be used'); 26 | console.error(); 27 | } 28 | 29 | let model = argv._[2]; 30 | 31 | if (!model || argv.help) { 32 | return help(); 33 | } 34 | 35 | model = new URL(model); 36 | 37 | if (model.protocol === 's3:') { 38 | console.error('s3: models will be supported in the future'); 39 | process.exit(); 40 | } else if (model.protocol !== 'gs:') { 41 | console.error('Only gs:// protocols are supported'); 42 | process.exit(); 43 | } 44 | 45 | mkdir(tmp + '/001'); 46 | console.error(`ok - tmp dir: ${tmp}`); 47 | 48 | if (model.protocol === 'gs:') { 49 | return gs_get(model, docker); 50 | } 51 | 52 | /** 53 | * Given a Google Storage Folder containing a model, 54 | * fetch and save it to disk 55 | */ 56 | function gs_get(model, cb) { 57 | const gs = new (require('@google-cloud/storage').Storage)(); 58 | const bucket = gs.bucket(model.host); 59 | 60 | if (!process.env.GOOGLE_APPLICATION_CREDENTIALS) { 61 | console.error('GOOGLE_APPLICATION_CREDENTIALS environment var must be set'); 62 | console.error('See: https://cloud.google.com/docs/authentication/getting-started'); 63 | process.exit(); 64 | 
} 65 | 66 | const model_path = model.pathname.replace(/^\//, ''); 67 | 68 | bucket.getFiles({ 69 | prefix: model_path 70 | }, (err, files) => { 71 | if (err) return cb(err); 72 | 73 | const q = new Q(1); 74 | 75 | for (let file of files) { 76 | if (file.name[file.name.length - 1] === '/') continue; 77 | 78 | const name = path.parse(file.name.replace(model_path, '')); 79 | 80 | if (name.dir) { 81 | mkdir(path.resolve(tmp + '/001', name.dir)); 82 | } 83 | 84 | q.defer((file, name, done) => { 85 | console.error(`ok - fetching ${name.dir + '/' + name.base}`); 86 | pipeline( 87 | file.createReadStream(), 88 | fs.createWriteStream(path.resolve(tmp + '/001', name.dir, name.base)), 89 | done 90 | ); 91 | }, file, name); 92 | } 93 | 94 | q.awaitAll(cb); 95 | }); 96 | } 97 | 98 | function docker(err, res) { 99 | if (err) throw err; 100 | if (argv.use_gpu) { 101 | console.error('ok - pulling tensorflow/serving:latest-gpu docker image'); 102 | CP.execSync(` 103 | docker pull tensorflow/serving:latest-gpu 104 | `); 105 | } else { 106 | console.error('ok - pulling tensorflow/serving docker image'); 107 | CP.execSync(` 108 | docker pull tensorflow/serving 109 | `); 110 | } 111 | 112 | // Ignore errors, these are to ensure the next commands don't err 113 | try { 114 | CP.execSync(` 115 | docker kill serving_base 116 | `); 117 | } catch(err) { 118 | console.error('ok - no old task to stop'); 119 | } 120 | 121 | try { 122 | CP.execSync(` 123 | docker rm serving_base 124 | `); 125 | } catch(err) { 126 | console.error('ok - no old image to remove'); 127 | } 128 | 129 | CP.execSync(` 130 | docker run -d --name serving_base tensorflow/serving${argv.use_gpu ? ':latest-gpu' : ''} 131 | `); 132 | 133 | CP.execSync(` 134 | docker cp ${tmp}/ serving_base:/models/default/ \ 135 | `); 136 | 137 | const tag = `developmentseed/default:${Math.random().toString(36).substring(2, 15)}${argv.use_gpu ? 
'-gpu' : ''}`;
138 | 
139 |   CP.execSync(`
140 |     docker commit --change "ENV MODEL_NAME default" serving_base ${tag}
141 |   `);
142 | 
143 |   console.error(`ok - docker: ${tag}`);
144 | 
145 |   console.error();
146 |   console.error(`ok - Run with docker run -p 8501:8501 -t ${tag}`);
147 |   console.error();
148 | 
149 | }
150 | 
--------------------------------------------------------------------------------
/scripts/run-sqs-push.js:
--------------------------------------------------------------------------------
1 | const sqsPush = require('./sqs-push')
2 | sqsPush.run()
3 | 
--------------------------------------------------------------------------------
/scripts/sqs-push.js:
--------------------------------------------------------------------------------
 1 | const fs = require('fs')
 2 | const { Transform, Writable } = require('stream')
 3 | const split = require('split')
 4 | const through2Batch = require('through2-batch')
 5 | const logUpdate = require('log-update')
 6 | const { SQS } = require('aws-sdk')
 7 | const uuidv4 = require('uuid/v4')
 8 | 
 9 | const promiseThreshold = process.env.PROMISE_THRESHOLD || 500
10 | const queue = process.argv[3]
11 | const errors = []
12 | let count = 0
13 | 
14 | const transform = new Transform({
15 |   objectMode: true,
16 |   transform: (data, _, done) => {
17 |     if (!data.toString()) return done(null, null) // don't write empty lines
18 |     const [ x, y, z ] = data.toString().split('-').map(d => Number(d))
19 |     done(null, JSON.stringify({ x, y, z }))
20 |   }
21 | })
22 | 
23 | const counter = new Transform({
24 |   objectMode: true,
25 |   transform: (data, _, done) => {
26 |     logUpdate(`Sending ${++count} messages to queue: ${queue}`)
27 |     done(null, data)
28 |   }
29 | })
30 | 
31 | // simplified from https://github.com/danielyaa5/sqs-write-stream
32 | class SqsWriteStream extends Writable {
33 |   /**
34 |    * Must provide a url property
35 |    * @param {Object} queue - An object with a url property
36 |    */
37 |   constructor (queue, options) {
38 |     super({
39 |       objectMode: true
40 |     })
41 |     this.queueUrl = queue.url
42 |     this.sqs = new SQS()
43 |     this.activePromises = new Map()
44 |     this.decrementActivePromises = this.decrementActivePromises.bind(this)
45 |     this.sendMessages = this.sendMessages.bind(this)
46 |     this.paused = false
47 |     this.buffer = []
48 |   }
49 | 
50 |   decrementActivePromises (id) {
51 |     this.activePromises.delete(id)
52 |     if (this.paused && this.activePromises.size < promiseThreshold / 2) {
53 |       this.paused = false
54 |       this.cb()
55 |     }
56 |   }
57 | 
58 |   sendMessages (Entries) {
59 |     const Id = uuidv4()
60 |     const promise = this.sqs.sendMessageBatch({
61 |       Entries,
62 |       QueueUrl: this.queueUrl
63 |     })
64 |       .promise()
65 |       .then((data) => {
66 |         if (data.Failed && data.Failed.length > 0) {
67 |           data.Failed.forEach((error) => {
68 |             errors.push(error)
69 |           })
70 |         }
71 |         this.decrementActivePromises(Id)
72 |       })
73 |       .catch((error) => {
74 |         errors.push(`Error: ${error}`)
75 |         this.decrementActivePromises(Id)
76 |       })
77 |     this.activePromises.set(Id, promise)
78 |   }
79 | 
80 |   _write (obj, enc, cb) {
81 |     if (this.activePromises.size >= promiseThreshold) {
82 |       this.paused = true
83 |       this.cb = cb
84 |       this.buffer.push(obj)
85 |       return false
86 |     } else {
87 |       try {
88 |         if (this.buffer.length > 0) { // flush batches buffered while the stream was paused
89 |           this.buffer.forEach((bufferedObject) => {
90 |             const Entries = bufferedObject.map((object) => ({
91 |               MessageBody: object,
92 |               Id: uuidv4()
93 |             }))
94 |             this.sendMessages(Entries)
95 |           })
96 |           this.buffer = []
97 |         }
98 |         const Entries = obj.map((object) => ({
99 |           MessageBody: 
object, 100 | Id: uuidv4() 101 | })) 102 | this.sendMessages(Entries) 103 | return cb() 104 | } catch (err) { 105 | errors.push(`Error: ${err}`) 106 | return cb(err) 107 | } 108 | } 109 | } 110 | } 111 | 112 | function run () { 113 | const sqsStream = new SqsWriteStream({ url: queue }) 114 | fs.createReadStream(process.argv[2]) 115 | .pipe(split()) 116 | .pipe(counter) 117 | .pipe(transform) 118 | .pipe(through2Batch.obj({batchSize: 10})) 119 | .pipe(sqsStream) 120 | sqsStream.on('finish', () => { 121 | if (errors.length > 0) { 122 | logUpdate(errors) 123 | } 124 | }) 125 | } 126 | 127 | module.exports = { 128 | run 129 | } 130 | -------------------------------------------------------------------------------- /scripts/tag-cloudwatch-logs.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const AWS = require('aws-sdk') 3 | const yaml = require('js-yaml') 4 | 5 | // get stackName, projectTag from our config file 6 | const config = yaml.safeLoad(fs.readFileSync('config/config.yml').toString()) 7 | const stackName = config.default.stackName 8 | const projectTag = config.default.projectTag 9 | 10 | const cw = new AWS.CloudWatchLogs() 11 | 12 | // helper 13 | function tagWithProject (logGroup) { 14 | console.log(`tagging ${logGroup.logGroupName} with { Project: ${projectTag} }`) 15 | return cw.tagLogGroup({ 16 | logGroupName: logGroup.logGroupName, 17 | tags: { Project: projectTag } 18 | }).promise() 19 | } 20 | 21 | // tag lambda cloudwatch logs 22 | cw.describeLogGroups({ logGroupNamePrefix: `/aws/lambda/${stackName}` }) 23 | .promise() 24 | .then(resp => { 25 | return Promise.all(resp.logGroups.map(tagWithProject)) 26 | }) 27 | 28 | // tag ECS cloudwatch logs 29 | cw.describeLogGroups({ logGroupNamePrefix: stackName }) 30 | .promise() 31 | .then(resp => { 32 | return Promise.all(resp.logGroups.map(tagWithProject)) 33 | }) 34 | -------------------------------------------------------------------------------- /scripts/verify.js: -------------------------------------------------------------------------------- 1 | const assert = require('assert') 2 | const path = require('path') 3 | const fs = require('fs') 4 | const yaml = require('js-yaml') 5 | const AWS = require('aws-sdk') 6 | const dbConfig = require('../db/knexfile').remote 7 | const axios = require('axios') 8 | require('dotenv').config({ path: path.join(process.env.PWD, 'config', '.env') }) 9 | 10 | // get stackName from our config file 11 | const config = yaml.safeLoad(fs.readFileSync('config/config.yml').toString()) 12 | const stackName = config.default.stackName 13 | 14 | // fixtures 15 | const DB_TYPES = [ 16 | { column_name: 'tile', data_type: 'character varying' }, 17 | { column_name: 'output', data_type: 'jsonb' } 18 | ] 19 | 20 | // get output values from the cloudformation stack 21 | async function getStackOutputs (stackName) { 22 | const cf = new AWS.CloudFormation() 23 | return cf.describeStacks({ StackName: stackName }).promise() 24 | .then(resp => resp.Stacks[0].Outputs) 25 | } 26 | 27 | async function verify () { 28 | console.log(`Verifying stack ${stackName}`) 29 | const outputs = await getStackOutputs(stackName) 30 | dbConfig.connection = outputs.find(o => o.OutputKey === 'dbConnectionString').OutputValue 31 | const db = require('knex')(dbConfig) 32 | 33 | // check that our db has the correct columns 34 | await db.select(['column_name', 'data_type']) 35 | .table('information_schema.columns') 36 | .where({ 'table_name': 'results' }) 37 | .then(rows => 
assert.deepStrictEqual(rows, DB_TYPES))
38 |     .catch(err => console.error(err))
39 |     .then(_ => console.log('Database has the correct columns'))
40 | 
41 |   // check that our ALB/GPU endpoint is healthy
42 |   const endpoint = outputs.find(o => o.OutputKey === 'modelEndpoint').OutputValue
43 |   await axios.get(endpoint)
44 |     .then(resp => assert.deepStrictEqual(resp.status, 200))
45 |     .catch(err => console.error(err))
46 |     .then(_ => console.log('TF Serving returns a 200 status from the internal load balancer endpoint'))
47 | 
48 |   // download a tile
49 |   const tile = { x: 184260, y: 107656, z: 18 }
50 |   const url = config.default.lambdas.DownloadAndPredict.envs.TILE_ENDPOINT
51 |     .replace('{}', tile.z)
52 |     .replace('{}', tile.x)
53 |     .replace('{}', tile.y)
54 |     .replace('{}', process.env.TILE_ACCESS_TOKEN)
55 | 
56 |   const img = await axios.get(url, { responseType: 'arraybuffer' })
57 |     .then(resp => {
58 |       assert.deepStrictEqual(resp.status, 200)
59 |       console.log('Tile endpoint returns a 200 status')
60 |       return resp.data.toString('base64')
61 |     })
62 |     .catch(err => console.error(err))
63 | 
64 |   // confirm that we receive a prediction from the endpoint using the tile
65 |   const body = { instances: [{ 'image_bytes': { 'b64': img } }] }
66 |   await axios.post(`${endpoint}:predict`, body)
67 |     .then(resp => resp.data)
68 |     .then(data => assert(Array.isArray(data.predictions)))
69 |     .catch(err => console.error(err))
70 |     .then(_ => console.log('Prediction endpoint response has key "predictions" and it is an array'))
71 | 
72 |   return true
73 | }
74 | 
75 | verify()
76 |   .then(a => console.log('Stack verified'))
77 |   .catch(err => { console.error(err); process.exit(1) })
78 |   .then(_ => process.exit(0))
79 | 
--------------------------------------------------------------------------------
/test/test_sqs-push.js:
--------------------------------------------------------------------------------
 1 | const test = require('tape')
 2 | const sinon = require('sinon')
 3 | const proxyquire = require('proxyquire').noCallThru()
 4 | const MemoryStream = require('memorystream')
 5 | const fs = require('fs')
 6 | 
 7 | test('sqs-push', (t) => {
 8 |   const error = 'error'
 9 |   const sendMessageBatch = sinon.stub()
10 |   sendMessageBatch.onFirstCall().returns({ promise: () => (Promise.reject(error)) })
11 |   const SQS = function () {
12 |     return {
13 |       sendMessageBatch
14 |     }
15 |   }
16 |   const aws = { SQS }
17 | 
18 |   const memStream = new MemoryStream()
19 |   const stubFsCreateReadStream = sinon.stub(fs, 'createReadStream')
20 |   stubFsCreateReadStream.returns(memStream)
21 |   const logUpdate = sinon.stub()
22 | 
23 |   process.argv = [
24 |     'command',
25 |     'empty',
26 |     'file',
27 |     'queueurl'
28 |   ]
29 | 
30 |   const sqsPush = proxyquire(
31 |     '../scripts/sqs-push.js',
32 |     {
33 |       'aws-sdk': aws,
34 |       'log-update': logUpdate,
35 |       'fs': fs
36 |     }
37 |   )
38 |   sqsPush.run()
39 |   memStream.write('9-162-307\n9-161-307\n9-163-307')
40 |   memStream.end('')
41 |   setTimeout(() => {
42 |     t.equal(logUpdate.getCall(3).args[0][0], 'Error: error',
43 |       'Logs error when sqs message promise rejects')
44 |     t.end()
45 |   }, 1)
46 | })
47 | 
--------------------------------------------------------------------------------
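As a closing pointer, `scripts/sqs-push.js` above expects a plain-text file with one `x-y-z` tile index per line. If you have a GeoJSON of your prediction area (see the README requirements), a small helper along these lines can produce that file. This is a sketch only: `aoi.geojson`, `tiles.txt`, and `ZOOM` are placeholders, it assumes a single polygon feature, and it simply tiles the polygon's bounding box with `mercantile`, which over-covers compared to a true tile cover from `geodex` or `tile-cover`.

```python
# Sketch: write an "x-y-z" tile list for scripts/sqs-push.js from a polygon's bounding box.
# aoi.geojson, tiles.txt, and ZOOM are hypothetical -- adjust for your own area and model.
import json

import mercantile

ZOOM = 16  # hypothetical prediction zoom level

with open("aoi.geojson") as f:
    geom = json.load(f)["features"][0]["geometry"]

# bounding box over all rings of the polygon
lons = [pt[0] for ring in geom["coordinates"] for pt in ring]
lats = [pt[1] for ring in geom["coordinates"] for pt in ring]

with open("tiles.txt", "w") as out:
    for tile in mercantile.tiles(min(lons), min(lats), max(lons), max(lats), ZOOM):
        out.write(f"{tile.x}-{tile.y}-{tile.z}\n")
```

The resulting file can then be queued with `yarn sqs-push tiles.txt <your-queue-url>`, matching the `process.argv` positions that `sqs-push.js` reads.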