├── scripts ├── log-analysis │ ├── stop.sh │ └── start.sh ├── viewer │ ├── stop.sh │ └── start.sh ├── evaluation │ ├── stop.sh │ ├── start.sh │ └── prepare-config.py ├── training │ ├── stop.sh │ ├── increment.sh │ ├── start.sh │ └── prepare-config.py └── upload │ ├── upload-car.sh │ ├── import-model.py │ ├── increment.sh │ ├── download-model.sh │ ├── prepare-config.py │ └── upload-model.sh ├── utils ├── Dockerfile.sagemaker-gpu ├── cuda-check.sh ├── Dockerfile.gpu-detect ├── cuda-check-tf.py ├── start-xorg.sh ├── setup-xorg.sh ├── start-local-browser.sh ├── sample-createspot.sh └── submit-monitor.py ├── docker ├── docker-compose-mount.yml ├── docker-compose-robomaker-multi.yml ├── docker-compose-endpoint.yml ├── docker-compose-local-xorg.yml ├── docker-compose-webviewer-swarm.yml ├── docker-compose-keys.yml ├── docker-compose-webviewer.yml ├── docker-compose-eval-swarm.yml ├── docker-compose-training-swarm.yml ├── docker-compose-azure.yml ├── docker-compose-cwlog.yml ├── docker-compose-local.yml ├── docker-compose-eval.yml └── docker-compose-training.yml ├── defaults ├── dependencies.json ├── model_metadata_sac.json ├── hyperparameters.json ├── template-system.env ├── model_metadata.json ├── template-worker.env ├── reward_function.py ├── debug-reward_function.py └── template-run.env ├── docs ├── _config.yml ├── head-to-head.md ├── multi_run.md ├── video.md ├── multi_worker.md ├── docker.md ├── opengl.md ├── index.md ├── upload.md ├── windows.md ├── multi_gpu.md ├── installation.md └── reference.md ├── .gitignore ├── bin ├── detect.sh ├── autorun.sh ├── runonce.sh ├── prepare.sh ├── init.sh ├── activate.sh └── scripts_wrapper.sh └── README.md /scripts/log-analysis/stop.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | docker stop loganalysis 4 | -------------------------------------------------------------------------------- /utils/Dockerfile.sagemaker-gpu: 
-------------------------------------------------------------------------------- 1 | FROM awsdeepracercommunity/deepracer-sagemaker:5.0.0-gpu 2 | ENV CUDA_VISIBLE_DEVICES=0 3 | -------------------------------------------------------------------------------- /docker/docker-compose-mount.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | services: 4 | robomaker: 5 | volumes: 6 | - "${DR_MOUNT_DIR}:/root/.ros/log" 7 | -------------------------------------------------------------------------------- /docker/docker-compose-robomaker-multi.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | services: 4 | robomaker: 5 | volumes: 6 | - "${DR_DIR}/tmp/comms.${DR_RUN_ID}:/mnt/comms" 7 | -------------------------------------------------------------------------------- /defaults/dependencies.json: -------------------------------------------------------------------------------- 1 | { 2 | "master_version": "5.0", 3 | "containers": { 4 | "rl_coach": "5.0.0", 5 | "robomaker": "5.0.3", 6 | "sagemaker": "5.0.0" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | theme: jekyll-theme-slate 3 | markdown: GFM 4 | name: Deepracer-for-Cloud 5 | plugins: 6 | - jekyll-relative-links 7 | relative_links: 8 | enabled: true 9 | collections: false -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | custom_files/ 3 | logs/ 4 | docker/volumes/ 5 | recording/ 6 | recording 7 | /*.env 8 | /*.bak 9 | /*.json 10 | DONE 11 | data/ 12 | tmp/ 13 | autorun.s3url 14 | nohup.out 15 | start.sh -------------------------------------------------------------------------------- 
/docker/docker-compose-endpoint.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | services: 4 | rl_coach: 5 | environment: 6 | - S3_ENDPOINT_URL=${DR_MINIO_URL} 7 | robomaker: 8 | environment: 9 | - S3_ENDPOINT_URL=${DR_MINIO_URL} 10 | -------------------------------------------------------------------------------- /utils/cuda-check.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONTAINER_ID=$(docker create --rm -ti -e CUDA_VISIBLE_DEVICES --name cuda-check awsdeepracercommunity/deepracer-robomaker:$DR_ROBOMAKER_IMAGE "python3 cuda-check-tf.py") 4 | docker cp $DR_DIR/utils/cuda-check-tf.py $CONTAINER_ID:/opt/install/ 5 | docker start -a $CONTAINER_ID -------------------------------------------------------------------------------- /utils/Dockerfile.gpu-detect: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.4.2-base-ubuntu18.04 2 | RUN apt-get update && apt-get install -y --no-install-recommends wget python3 3 | RUN wget https://gist.githubusercontent.com/f0k/63a664160d016a491b2cbea15913d549/raw/f25b6b38932cfa489150966ee899e5cc899bf4a6/cuda_check.py 4 | CMD ["python3","cuda_check.py"] -------------------------------------------------------------------------------- /defaults/model_metadata_sac.json: -------------------------------------------------------------------------------- 1 | { 2 | "action_space": {"speed": {"high": 2, "low": 1}, "steering_angle": {"high": 30, "low": -30}}, 3 | "sensor": ["FRONT_FACING_CAMERA"], 4 | "neural_network": "DEEP_CONVOLUTIONAL_NETWORK_SHALLOW", 5 | "training_algorithm": "sac", 6 | "action_space_type": "continuous", 7 | "version": "4" 8 | } 9 | -------------------------------------------------------------------------------- /docker/docker-compose-local-xorg.yml: -------------------------------------------------------------------------------- 1 | 
version: '3.7' 2 | 3 | services: 4 | robomaker: 5 | environment: 6 | - DISPLAY 7 | - USE_EXTERNAL_X=${DR_HOST_X} 8 | - XAUTHORITY=/root/.Xauthority 9 | - QT_X11_NO_MITSHM=1 10 | volumes: 11 | - '/tmp/.X11-unix/:/tmp/.X11-unix' 12 | - '${XAUTHORITY}:/root/.Xauthority' -------------------------------------------------------------------------------- /docker/docker-compose-webviewer-swarm.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | networks: 4 | default: 5 | external: true 6 | name: sagemaker-local 7 | 8 | services: 9 | proxy: 10 | deploy: 11 | restart_policy: 12 | condition: none 13 | replicas: 1 14 | placement: 15 | constraints: [node.labels.Sagemaker == true ] 16 | -------------------------------------------------------------------------------- /docker/docker-compose-keys.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | services: 4 | rl_coach: 5 | environment: 6 | - AWS_ACCESS_KEY_ID=${DR_LOCAL_ACCESS_KEY_ID} 7 | - AWS_SECRET_ACCESS_KEY=${DR_LOCAL_SECRET_ACCESS_KEY} 8 | robomaker: 9 | environment: 10 | - AWS_ACCESS_KEY_ID=${DR_LOCAL_ACCESS_KEY_ID} 11 | - AWS_SECRET_ACCESS_KEY=${DR_LOCAL_SECRET_ACCESS_KEY} 12 | -------------------------------------------------------------------------------- /docker/docker-compose-webviewer.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | networks: 4 | default: 5 | external: true 6 | name: sagemaker-local 7 | 8 | services: 9 | proxy: 10 | image: nginx 11 | ports: 12 | - "${DR_WEBVIEWER_PORT}:80" 13 | volumes: 14 | - ${DR_VIEWER_HTML}:/usr/share/nginx/html/index.html 15 | - ${DR_NGINX_CONF}:/etc/nginx/conf.d/default.conf 16 | 17 | -------------------------------------------------------------------------------- /scripts/viewer/stop.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 
3 | STACK_NAME="deepracer-$DR_RUN_ID-viewer" 4 | COMPOSE_FILES=$DR_DIR/docker/docker-compose-webviewer.yml 5 | 6 | # Check if we will use Docker Swarm or Docker Compose 7 | if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; 8 | then 9 | docker stack rm $STACK_NAME 10 | else 11 | docker-compose -f $COMPOSE_FILES -p $STACK_NAME --log-level ERROR down 12 | fi -------------------------------------------------------------------------------- /scripts/log-analysis/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | docker run --rm -d -p "8888:8888" \ 4 | -v `pwd`/../../data/logs:/workspace/logs \ 5 | -v `pwd`/../../docker/volumes/.aws:/root/.aws \ 6 | -v `pwd`/../../data/analysis:/workspace/analysis \ 7 | -v `pwd`/../../data/minio:/workspace/minio \ 8 | --name loganalysis \ 9 | --network sagemaker-local \ 10 | awsdeepracercommunity/deepracer-analysis:$DR_ANALYSIS_IMAGE 11 | 12 | docker logs -f loganalysis -------------------------------------------------------------------------------- /utils/cuda-check-tf.py: -------------------------------------------------------------------------------- 1 | from tensorflow.python.client import device_lib 2 | import tensorflow as tf 3 | 4 | def get_available_gpus(): 5 | local_device_protos = device_lib.list_local_devices() 6 | return [x.name for x in local_device_protos if x.device_type == 'GPU'] 7 | 8 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.05) 9 | sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 10 | print(get_available_gpus()) 11 | -------------------------------------------------------------------------------- /defaults/hyperparameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_size": 64, 3 | "beta_entropy": 0.01, 4 | "discount_factor": 0.995, 5 | "e_greedy_value": 0.05, 6 | "epsilon_steps": 10000, 7 | "exploration_type": "categorical", 8 | "loss_type": "huber", 9 | 
"lr": 0.0003, 10 | "num_episodes_between_training": 20, 11 | "num_epochs": 10, 12 | "stack_size": 1, 13 | "term_cond_avg_score": 350.0, 14 | "term_cond_max_episodes": 1000, 15 | "sac_alpha": 0.2 16 | } -------------------------------------------------------------------------------- /docker/docker-compose-eval-swarm.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | services: 4 | rl_coach: 5 | deploy: 6 | restart_policy: 7 | condition: none 8 | placement: 9 | constraints: [node.labels.Sagemaker == true ] 10 | robomaker: 11 | deploy: 12 | restart_policy: 13 | condition: none 14 | replicas: 1 15 | placement: 16 | constraints: [node.labels.Robomaker == true ] 17 | environment: 18 | - DOCKER_REPLICA_SLOT={{.Task.Slot}} -------------------------------------------------------------------------------- /docker/docker-compose-training-swarm.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | services: 4 | rl_coach: 5 | deploy: 6 | restart_policy: 7 | condition: none 8 | placement: 9 | constraints: [node.labels.Sagemaker == true ] 10 | robomaker: 11 | deploy: 12 | restart_policy: 13 | condition: none 14 | replicas: ${DR_WORKERS} 15 | placement: 16 | constraints: [node.labels.Robomaker == true ] 17 | environment: 18 | - DOCKER_REPLICA_SLOT={{.Task.Slot}} -------------------------------------------------------------------------------- /docker/docker-compose-azure.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | networks: 4 | default: 5 | external: true 6 | name: sagemaker-local 7 | 8 | services: 9 | minio: 10 | image: minio/minio 11 | ports: 12 | - "9000:9000" 13 | command: gateway azure 14 | environment: 15 | - MINIO_ACCESS_KEY=${DR_LOCAL_ACCESS_KEY_ID} 16 | - MINIO_SECRET_KEY=${DR_LOCAL_SECRET_ACCESS_KEY} 17 | - AWS_ACCESS_KEY_ID=${DR_LOCAL_ACCESS_KEY_ID} 18 | - 
AWS_SECRET_ACCESS_KEY=${DR_LOCAL_SECRET_ACCESS_KEY} 19 | 20 | -------------------------------------------------------------------------------- /scripts/evaluation/stop.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | STACK_NAME="deepracer-eval-$DR_RUN_ID" 4 | RUN_NAME=${DR_LOCAL_S3_MODEL_PREFIX} 5 | 6 | # Check if we will use Docker Swarm or Docker Compose 7 | if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; 8 | then 9 | docker stack rm $STACK_NAME 10 | else 11 | COMPOSE_FILES=$(echo ${DR_EVAL_COMPOSE_FILE} | cut -f1-2 -d\ ) 12 | export DR_CURRENT_PARAMS_FILE="" 13 | export ROBOMAKER_COMMAND="" 14 | docker-compose $COMPOSE_FILES -p $STACK_NAME --log-level ERROR down 15 | fi -------------------------------------------------------------------------------- /docker/docker-compose-cwlog.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | services: 4 | rl_coach: 5 | logging: 6 | driver: awslogs 7 | options: 8 | awslogs-group: '/deepracer-for-cloud' 9 | awslogs-create-group: 'true' 10 | awslogs-region: ${DR_AWS_APP_REGION} 11 | tag: "{{.Name}}" 12 | robomaker: 13 | logging: 14 | driver: awslogs 15 | options: 16 | awslogs-group: '/deepracer-for-cloud' 17 | awslogs-create-group: 'true' 18 | awslogs-region: ${DR_AWS_APP_REGION} 19 | tag: "{{.Name}}" -------------------------------------------------------------------------------- /docker/docker-compose-local.yml: -------------------------------------------------------------------------------- 1 | 2 | version: '3.7' 3 | 4 | networks: 5 | default: 6 | external: true 7 | name: sagemaker-local 8 | 9 | services: 10 | minio: 11 | image: minio/minio 12 | ports: 13 | - "9000:9000" 14 | - "9001:9001" 15 | command: server /data --console-address ":9001" 16 | environment: 17 | - MINIO_ROOT_USER=${DR_LOCAL_ACCESS_KEY_ID} 18 | - MINIO_ROOT_PASSWORD=${DR_LOCAL_SECRET_ACCESS_KEY} 19 | - MINIO_UID 20 | - MINIO_GID 21 | 
- MINIO_USERNAME 22 | - MINIO_GROUPNAME 23 | volumes: 24 | - ${DR_DIR}/data/minio:/data 25 | -------------------------------------------------------------------------------- /utils/start-xorg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export DISPLAY=$DR_DISPLAY 3 | 4 | nohup sudo xinit /usr/bin/jwm -- /usr/lib/xorg/Xorg $DISPLAY -config $DR_DIR/tmp/xorg.conf > $DR_DIR/tmp/xorg.log 2>&1 & 5 | sleep 1 6 | 7 | if [[ "${DR_GUI_ENABLE,,}" == "true" ]]; then 8 | xrandr -s 1400x900 9 | x11vnc -bg -forever -no6 -nopw -rfbport 5901 -rfbportv6 -1 -loop -display WAIT$DISPLAY & 10 | sleep 1 11 | fi 12 | 13 | xauth generate $DISPLAY 14 | export XAUTHORITY=~/.Xauthority 15 | 16 | if timeout 1s xset q &>/dev/null; then 17 | echo "X Server started on display $DISPLAY" 18 | else 19 | echo "Server failed to start on display $DISPLAY" 20 | fi -------------------------------------------------------------------------------- /defaults/template-system.env: -------------------------------------------------------------------------------- 1 | DR_CLOUD= 2 | DR_AWS_APP_REGION= 3 | DR_UPLOAD_S3_PROFILE=default 4 | DR_UPLOAD_S3_BUCKET= 5 | DR_UPLOAD_S3_ROLE= 6 | DR_LOCAL_S3_BUCKET=bucket 7 | DR_LOCAL_S3_PROFILE= 8 | DR_GUI_ENABLE=False 9 | DR_KINESIS_STREAM_NAME= 10 | DR_KINESIS_STREAM_ENABLE=True 11 | DR_SAGEMAKER_IMAGE= 12 | DR_ROBOMAKER_IMAGE= 13 | DR_ANALYSIS_IMAGE=cpu 14 | DR_COACH_IMAGE= 15 | DR_WORKERS=1 16 | DR_ROBOMAKER_MOUNT_LOGS=False 17 | DR_CLOUD_WATCH_ENABLE=False 18 | DR_DOCKER_STYLE=swarm 19 | DR_HOST_X=False 20 | DR_WEBVIEWER_PORT=8100 21 | # DR_DISPLAY=:99 22 | # DR_REMOTE_MINIO_URL=http://mynas:9000 23 | # CUDA_VISIBLE_DEVICES=0 -------------------------------------------------------------------------------- /utils/setup-xorg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script to install basic X-Windows on a headless instance (e.g. 
in EC2) 4 | 5 | # Install additional packages 6 | sudo apt-get install xinit xserver-xorg-legacy x11-xserver-utils x11-utils \ 7 | menu mesa-utils xterm jwm x11vnc pkg-config -y --no-install-recommends 8 | 9 | # Configure 10 | sudo sed -i -e "s/console/anybody/" /etc/X11/Xwrapper.config 11 | BUS_ID=$(nvidia-xconfig --query-gpu-info | grep "PCI BusID" | cut -f2- -d: | sed -e 's/^[[:space:]]*//' | head -1) 12 | sudo nvidia-xconfig --busid=$BUS_ID -o $DR_DIR/tmp/xorg.conf 13 | 14 | touch ~/.Xauthority 15 | 16 | sudo tee -a $DR_DIR/tmp/xorg.conf << EOF 17 | 18 | Section "DRI" 19 | Mode 0666 20 | EndSection 21 | EOF 22 | -------------------------------------------------------------------------------- /defaults/model_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "action_space": [ 3 | { 4 | "steering_angle": -30, 5 | "speed": 0.6 6 | }, 7 | { 8 | "steering_angle": -15, 9 | "speed": 0.6 10 | }, 11 | { 12 | "steering_angle": 0, 13 | "speed": 0.6 14 | }, 15 | { 16 | "steering_angle": 15, 17 | "speed": 0.6 18 | }, 19 | { 20 | "steering_angle": 30, 21 | "speed": 0.6 22 | } 23 | ], 24 | "sensor": ["FRONT_FACING_CAMERA"], 25 | "neural_network": "DEEP_CONVOLUTIONAL_NETWORK_SHALLOW", 26 | "training_algorithm": "clipped_ppo", 27 | "action_space_type": "discrete", 28 | "version": "4" 29 | } 30 | -------------------------------------------------------------------------------- /bin/detect.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ## What am I? 4 | if [[ -f /var/run/cloud-init/instance-data.json ]]; 5 | then 6 | # We have a cloud-init environment (Azure or AWS). 
7 | CLOUD_NAME=$(jq -r '.v1."cloud-name"' /var/run/cloud-init/instance-data.json) 8 | if [[ "${CLOUD_NAME}" == "azure" ]]; 9 | then 10 | export CLOUD_NAME 11 | export CLOUD_INSTANCETYPE=$(jq -r '.ds."meta_data".imds.compute."vmSize"' /var/run/cloud-init/instance-data.json) 12 | elif [[ "${CLOUD_NAME}" == "aws" ]]; 13 | then 14 | export CLOUD_NAME 15 | export CLOUD_INSTANCETYPE=$(jq -r '.ds."meta-data"."instance-type"' /var/run/cloud-init/instance-data.json) 16 | else 17 | export CLOUD_NAME=local 18 | fi 19 | else 20 | export CLOUD_NAME=local 21 | fi -------------------------------------------------------------------------------- /defaults/template-worker.env: -------------------------------------------------------------------------------- 1 | DR_WORLD_NAME=reInvent2019_track 2 | DR_RACE_TYPE=TIME_TRIAL 3 | DR_CAR_COLOR=Blue 4 | DR_ENABLE_DOMAIN_RANDOMIZATION=False 5 | DR_TRAIN_CHANGE_START_POSITION=True 6 | DR_TRAIN_ALTERNATE_DRIVING_DIRECTION=False 7 | DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST=0.05 8 | DR_TRAIN_START_POSITION_OFFSET=0.0 9 | DR_OA_NUMBER_OF_OBSTACLES=6 10 | DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES=2.0 11 | DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS=False 12 | DR_OA_IS_OBSTACLE_BOT_CAR=False 13 | DR_OA_OBJECT_POSITIONS= 14 | DR_H2B_IS_LANE_CHANGE=False 15 | DR_H2B_LOWER_LANE_CHANGE_TIME=3.0 16 | DR_H2B_UPPER_LANE_CHANGE_TIME=5.0 17 | DR_H2B_LANE_CHANGE_DISTANCE=1.0 18 | DR_H2B_NUMBER_OF_BOT_CARS=3 19 | DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS=2.0 20 | DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS=False 21 | DR_H2B_BOT_CAR_SPEED=0.2 22 | -------------------------------------------------------------------------------- /docs/head-to-head.md: -------------------------------------------------------------------------------- 1 | # Head-to-Head Race (Beta) 2 | 3 | It is possible to run a head-to-head race, similar to the races in the brackets 4 | run by AWS in the Virtual Circuits to determine the winner of the head-to-bot races. 5 | 6 | This replaces the "Tournament Mode". 
7 | 8 | ## Introduction 9 | 10 | The concept is that you have two models racing each other, one Purple and one Orange Car. One car 11 | is powered by our primary configured model, and the second car is powered by the model in `DR_EVAL_OPP_S3_MODEL_PREFIX` 12 | 13 | ## Configuration 14 | 15 | ### run.env 16 | 17 | Configure `run.env` with the following parameters: 18 | * `DR_RACE_TYPE` should be `HEAD_TO_MODEL`. 19 | * `DR_EVAL_OPP_S3_MODEL_PREFIX` will be the S3 prefix for the secondary model. 20 | * `DR_EVAL_OPP_CAR_NAME` is the display name of this model. 21 | 22 | Metrics, Traces and Videos will be stored in each models' prefix. 23 | 24 | ## Run 25 | 26 | Run the race with `dr-start-evaluation`; one race will be run. -------------------------------------------------------------------------------- /docs/multi_run.md: -------------------------------------------------------------------------------- 1 | # Running Multiple Experiments 2 | 3 | It is possible to run multiple experiments on one computer in parallel. This is possible both in `swarm` and `compose` mode, and is controlled by `DR_RUN_ID` in `run.env`. 4 | 5 | The feature works by creating unique prefixes to the container names: 6 | * In Swarm mode this is done through defining a stack name (default: deepracer-0) 7 | * In Compose mode this is done through adding a project name. 8 | 9 | ## Suggested way to use the feature 10 | 11 | By default `run.env` is loaded when DRfC is activated - but it is possible to load a separate configuration through `source bin/activate.sh `. 12 | 13 | The best way to use this feature is to have a bash-shell per experiment, and to load a separate configuration per shell. 14 | 15 | After activating one can control each experiment independently through using the `dr-*` commands. 16 | 17 | If using local or Azure the S3 / Minio instance will be shared, and is running only once. 
-------------------------------------------------------------------------------- /docker/docker-compose-eval.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | networks: 4 | default: 5 | external: true 6 | name: sagemaker-local 7 | 8 | services: 9 | rl_coach: 10 | image: awsdeepracercommunity/deepracer-rlcoach:${DR_COACH_IMAGE} 11 | command: ["/bin/bash", "-c", "echo No work for coach in Evaluation Mode"] 12 | robomaker: 13 | image: awsdeepracercommunity/deepracer-robomaker:${DR_ROBOMAKER_IMAGE} 14 | command: ["${ROBOMAKER_COMMAND}"] 15 | ports: 16 | - "${DR_ROBOMAKER_EVAL_PORT}:8080" 17 | environment: 18 | - CUDA_VISIBLE_DEVICES 19 | - DEBUG_REWARD=${DR_EVAL_DEBUG_REWARD} 20 | - WORLD_NAME=${DR_WORLD_NAME} 21 | - NUMBER_OF_TRIALS=${DR_NUMBER_OF_EPISODES} 22 | - MODEL_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} 23 | - MODEL_S3_BUCKET=${DR_LOCAL_S3_BUCKET} 24 | - APP_REGION=${DR_AWS_APP_REGION} 25 | - S3_YAML_NAME=${DR_CURRENT_PARAMS_FILE} 26 | - KINESIS_VIDEO_STREAM_NAME=${DR_KINESIS_STREAM_NAME} 27 | - ENABLE_KINESIS=${DR_KINESIS_STREAM_ENABLE} 28 | - ENABLE_GUI=${DR_GUI_ENABLE} 29 | - ROLLOUT_IDX=0 30 | - RTF_OVERRIDE=${DR_EVAL_RTF} 31 | -------------------------------------------------------------------------------- /defaults/reward_function.py: -------------------------------------------------------------------------------- 1 | def reward_function(params): 2 | ''' 3 | Example of penalize steering, which helps mitigate zig-zag behaviors 4 | ''' 5 | 6 | # Read input parameters 7 | distance_from_center = params['distance_from_center'] 8 | track_width = params['track_width'] 9 | steering = abs(params['steering_angle']) # Only need the absolute steering angle 10 | 11 | # Calculate 3 marks that are farther and father away from the center line 12 | marker_1 = 0.1 * track_width 13 | marker_2 = 0.25 * track_width 14 | marker_3 = 0.5 * track_width 15 | 16 | # Give higher reward if the car is closer to center line 
and vice versa 17 | if distance_from_center <= marker_1: 18 | reward = 1 19 | elif distance_from_center <= marker_2: 20 | reward = 0.5 21 | elif distance_from_center <= marker_3: 22 | reward = 0.1 23 | else: 24 | reward = 1e-3 # likely crashed/ close to off track 25 | 26 | # Steering penality threshold, change the number based on your action space setting 27 | ABS_STEERING_THRESHOLD = 15 28 | 29 | # Penalize reward if the car is steering too much 30 | if steering > ABS_STEERING_THRESHOLD: 31 | reward *= 0.8 32 | 33 | return float(reward) 34 | -------------------------------------------------------------------------------- /bin/autorun.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ## this is the default autorun script 4 | ## file should run automatically after init.sh completes. 5 | ## this script downloads your configured run.env, system.env and any custom container requests 6 | 7 | INSTALL_DIR_TEMP="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." 
>/dev/null 2>&1 && pwd )" 8 | 9 | ## retrieve the s3_location name you sent the instance in user data launch 10 | ## assumed to first line of file 11 | S3_LOCATION=$(awk 'NR==1 {print; exit}' $INSTALL_DIR_TEMP/autorun.s3url) 12 | 13 | source $INSTALL_DIR_TEMP/bin/activate.sh 14 | 15 | ## get the updatated run.env and system.env files and any others you stashed in s3 16 | aws s3 sync s3://$S3_LOCATION $INSTALL_DIR_TEMP 17 | 18 | ## get the right docker containers, if needed 19 | SYSENV="$INSTALL_DIR_TEMP/system.env" 20 | SAGEMAKER_IMAGE=$(cat $SYSENV | grep DR_SAGEMAKER_IMAGE | sed 's/.*=//') 21 | ROBOMAKER_IMAGE=$(cat $SYSENV | grep DR_ROBOMAKER_IMAGE | sed 's/.*=//') 22 | 23 | docker pull awsdeepracercommunity/deepracer-sagemaker:$SAGEMAKER_IMAGE 24 | docker pull awsdeepracercommunity/deepracer-robomaker:$ROBOMAKER_IMAGE 25 | 26 | dr-reload 27 | 28 | date | tee $INSTALL_DIR_TEMP/DONE-AUTORUN 29 | 30 | ## start training 31 | cd $INSTALL_DIR_TEMP/scripts/training 32 | ./start.sh 33 | 34 | -------------------------------------------------------------------------------- /scripts/training/stop.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | STACK_NAME="deepracer-$DR_RUN_ID" 4 | RUN_NAME=${DR_LOCAL_S3_MODEL_PREFIX} 5 | 6 | SAGEMAKER_CONTAINERS=$(docker ps | awk ' /sagemaker/ { print $1 } '| xargs ) 7 | 8 | if [[ -n $SAGEMAKER_CONTAINERS ]]; 9 | then 10 | for CONTAINER in $SAGEMAKER_CONTAINERS; do 11 | CONTAINER_NAME=$(docker ps --format '{{.Names}}' --filter id=$CONTAINER) 12 | CONTAINER_PREFIX=$(echo $CONTAINER_NAME | perl -n -e'/(.*)_(algo(.*))_./; print $1') 13 | COMPOSE_SERVICE_NAME=$(echo $CONTAINER_NAME | perl -n -e'/(.*)_(algo(.*))_./; print $2') 14 | COMPOSE_FILE=$(sudo find /tmp/sagemaker -name docker-compose.yaml -exec grep -l "$RUN_NAME" {} + | grep $CONTAINER_PREFIX) 15 | if [[ -n $COMPOSE_FILE ]]; then 16 | sudo docker-compose -f $COMPOSE_FILE stop $COMPOSE_SERVICE_NAME 17 | docker 
container rm $CONTAINER 18 | fi 19 | done 20 | fi 21 | 22 | # Check if we will use Docker Swarm or Docker Compose 23 | if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; 24 | then 25 | docker stack rm $STACK_NAME 26 | else 27 | COMPOSE_FILES=$(echo ${DR_TRAIN_COMPOSE_FILE} | cut -f1-2 -d\ ) 28 | export DR_CURRENT_PARAMS_FILE="" 29 | export ROBOMAKER_COMMAND="" 30 | docker-compose $COMPOSE_FILES -p $STACK_NAME --log-level ERROR down 31 | fi -------------------------------------------------------------------------------- /scripts/upload/upload-car.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script creates the tar.gz file necessary to operate inside a deepracer physical car 4 | # The file is created directly from within the sagemaker container, using the most recent checkpoint 5 | 6 | # Find name of sagemaker container 7 | SAGEMAKER_CONTAINERS=$(docker ps | awk ' /sagemaker/ { print $1 } '| xargs ) 8 | if [[ -n $SAGEMAKER_CONTAINERS ]]; 9 | then 10 | for CONTAINER in $SAGEMAKER_CONTAINERS; do 11 | CONTAINER_NAME=$(docker ps --format '{{.Names}}' --filter id=$CONTAINER) 12 | CONTAINER_PREFIX=$(echo $CONTAINER_NAME | perl -n -e'/(.*)_(algo(.*))_./; print $1') 13 | echo $CONTAINER_NAME 14 | done 15 | fi 16 | 17 | #create tmp directory if it doesnt already exit 18 | mkdir -p $DR_DIR/tmp/car_upload 19 | cd $DR_DIR/tmp/car_upload 20 | #ensure directory is empty 21 | rm -r $DR_DIR/tmp/car_upload/* 22 | #The files we want are located inside the sagemaker container at /opt/ml/model. 
Copy them to the tmp directory 23 | docker cp $CONTAINER_NAME:/opt/ml/model $DR_DIR/tmp/car_upload 24 | cd $DR_DIR/tmp/car_upload/model 25 | #create a tar.gz file containing all of these files 26 | tar -czvf carfile.tar.gz * 27 | 28 | #upload to s3 29 | aws ${DR_UPLOAD_PROFILE} s3 cp carfile.tar.gz s3://${DR_UPLOAD_S3_BUCKET}/${DR_UPLOAD_S3_PREFIX}/carfile.tar.gz 30 | 31 | -------------------------------------------------------------------------------- /bin/runonce.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $# -eq 0 ]]; then 4 | echo "Schedules a command to be run after the next reboot." 5 | echo "Usage: $(basename $0) " 6 | echo " $(basename $0) -p " 7 | echo " $(basename $0) -r " 8 | else 9 | REMOVE=0 10 | COMMAND=${!#} 11 | SCRIPTPATH=$PATH 12 | 13 | while getopts ":r:p:" optionName; do 14 | case "$optionName" in 15 | r) REMOVE=1; COMMAND=$OPTARG;; 16 | p) SCRIPTPATH=$OPTARG;; 17 | esac 18 | done 19 | 20 | SCRIPT="${HOME}/.$(basename $0)_$(echo $COMMAND | sed 's/[^a-zA-Z0-9_]/_/g')" 21 | 22 | if [[ ! 
-f $SCRIPT ]]; then 23 | echo "PATH=$SCRIPTPATH" >> $SCRIPT 24 | echo "cd $(pwd)" >> $SCRIPT 25 | echo "logger -t $(basename $0) -p local3.info \"COMMAND=$COMMAND ; USER=\$(whoami) ($(logname)) ; PWD=$(pwd) ; PATH=\$PATH\"" >> $SCRIPT 26 | echo "$COMMAND | logger -t $(basename $0) -p local3.info" >> $SCRIPT 27 | echo "$0 -r \"$(echo $COMMAND | sed 's/\"/\\\"/g')\"" >> $SCRIPT 28 | chmod +x $SCRIPT 29 | fi 30 | 31 | CRONTAB="${HOME}/.$(basename $0)_temp_crontab_$RANDOM" 32 | ENTRY="@reboot $SCRIPT" 33 | 34 | echo "$(crontab -l 2>/dev/null)" | grep -v "$ENTRY" | grep -v "^# DO NOT EDIT THIS FILE - edit the master and reinstall.$" | grep -v "^# ([^ ]* installed on [^)]*)$" | grep -v "^# (Cron version [^$]*\$[^$]*\$)$" > $CRONTAB 35 | 36 | if [[ $REMOVE -eq 0 ]]; then 37 | echo "$ENTRY" >> $CRONTAB 38 | fi 39 | 40 | crontab $CRONTAB 41 | rm $CRONTAB 42 | 43 | if [[ $REMOVE -ne 0 ]]; then 44 | rm $SCRIPT 45 | fi 46 | fi 47 | -------------------------------------------------------------------------------- /docker/docker-compose-training.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | networks: 4 | default: 5 | external: true 6 | name: sagemaker-local 7 | 8 | services: 9 | rl_coach: 10 | image: awsdeepracercommunity/deepracer-rlcoach:${DR_COACH_IMAGE} 11 | environment: 12 | - SAGEMAKER_IMAGE=${DR_SAGEMAKER_IMAGE} 13 | - PRETRAINED=${DR_LOCAL_S3_PRETRAINED} 14 | - PRETRAINED_S3_PREFIX=${DR_LOCAL_S3_PRETRAINED_PREFIX} 15 | - PRETRAINED_S3_BUCKET=${DR_LOCAL_S3_BUCKET} 16 | - PRETRAINED_CHECKPOINT=${DR_LOCAL_S3_PRETRAINED_CHECKPOINT} 17 | - MODEL_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} 18 | - MODEL_S3_BUCKET=${DR_LOCAL_S3_BUCKET} 19 | - HYPERPARAMETER_FILE_S3_KEY=${DR_LOCAL_S3_HYPERPARAMETERS_KEY} 20 | - MODELMETADATA_FILE_S3_KEY=${DR_LOCAL_S3_MODEL_METADATA_KEY} 21 | volumes: 22 | - "/var/run/docker.sock:/var/run/docker.sock" 23 | - "/tmp/sagemaker:/tmp/sagemaker" 24 | robomaker: 25 | image: 
awsdeepracercommunity/deepracer-robomaker:${DR_ROBOMAKER_IMAGE} 26 | command: ["${ROBOMAKER_COMMAND}"] 27 | ports: 28 | - "${DR_ROBOMAKER_TRAIN_PORT}:8080" 29 | - "${DR_ROBOMAKER_GUI_PORT}:5900" 30 | environment: 31 | - WORLD_NAME=${DR_WORLD_NAME} 32 | - SAGEMAKER_SHARED_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} 33 | - SAGEMAKER_SHARED_S3_BUCKET=${DR_LOCAL_S3_BUCKET} 34 | - APP_REGION=${DR_AWS_APP_REGION} 35 | - S3_YAML_NAME=${DR_CURRENT_PARAMS_FILE} 36 | - KINESIS_VIDEO_STREAM_NAME=${DR_KINESIS_STREAM_NAME} 37 | - ENABLE_KINESIS=${DR_KINESIS_STREAM_ENABLE} 38 | - ENABLE_GUI=${DR_GUI_ENABLE} 39 | - CUDA_VISIBLE_DEVICES 40 | - MULTI_CONFIG 41 | - RTF_OVERRIDE=${DR_TRAIN_RTF} 42 | -------------------------------------------------------------------------------- /defaults/debug-reward_function.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy 3 | import rospy 4 | import time 5 | 6 | class Reward: 7 | 8 | ''' 9 | Debugging reward function to be used to track performance of local training. 10 | Will print out the Real-Time-Factor (RTF), as well as how many 11 | steps-per-second (sim-time) that the system is able to deliver. 
12 | ''' 13 | 14 | def __init__(self, verbose=False, track_time=False): 15 | self.verbose = verbose 16 | self.track_time = track_time 17 | 18 | if track_time: 19 | TIME_WINDOW=10 20 | self.time = numpy.zeros([TIME_WINDOW, 2]) 21 | 22 | if verbose: 23 | print("Initializing Reward Class") 24 | 25 | def get_time(self): 26 | 27 | wall_time_incr = numpy.max(self.time[:,0]) - numpy.min(self.time[:,0]) 28 | sim_time_incr = numpy.max(self.time[:,1]) - numpy.min(self.time[:,1]) 29 | 30 | rtf = sim_time_incr / wall_time_incr 31 | fps = (self.time.shape[0] - 1) / sim_time_incr 32 | 33 | return rtf, fps 34 | 35 | def record_time(self, steps): 36 | 37 | index = int(steps) % self.time.shape[0] 38 | self.time[index,0] = time.time() 39 | self.time[index,1] = rospy.get_time() 40 | 41 | def reward_function(self, params): 42 | 43 | # Read input parameters 44 | steps = params["steps"] 45 | 46 | if self.track_time: 47 | self.record_time(steps) 48 | 49 | if self.track_time: 50 | if steps >= self.time.shape[0]: 51 | rtf, fps = self.get_time() 52 | print("TIME: s: {}, rtf: {}, fps:{}".format(int(steps), round(rtf, 2), round(fps, 2) )) 53 | 54 | return 1.0 55 | 56 | 57 | reward_object = Reward(verbose=False, track_time=True) 58 | 59 | def reward_function(params): 60 | return reward_object.reward_function(params) 61 | -------------------------------------------------------------------------------- /scripts/upload/import-model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import boto3 4 | import sys 5 | import os 6 | import time 7 | import json 8 | import io 9 | import yaml 10 | from botocore.loaders import UnknownServiceError 11 | 12 | try: 13 | import pandas as pd 14 | import deepracer 15 | except ImportError: 16 | print("You need to install pandas and deepracer-utils to use the import function.") 17 | exit(1) 18 | 19 | # Read in command 20 | aws_profile = sys.argv[1] 21 | aws_s3_role = sys.argv[2] 22 | aws_s3_bucket 
= sys.argv[3] 23 | aws_s3_prefix = sys.argv[4] 24 | dr_model_name = sys.argv[5] 25 | 26 | if not aws_s3_role: 27 | print("You must configure an IAM role with access to the S3 bucket in variable DR_UPLOAD_S3_ROLE ") 28 | exit(1) 29 | 30 | session = boto3.session.Session(region_name='us-east-1', profile_name=aws_profile) 31 | 32 | try: 33 | dr = session.client('deepracer') 34 | except UnknownServiceError: 35 | print ("Boto3 service 'deepracer' is not installed. Cannot import model.") 36 | print ("Install with 'pip install deepracer-utils' and 'python -m deepracer install-cli --force'") 37 | exit(1) 38 | 39 | # Load model to check if it already exists 40 | a = dr.list_models(ModelType='REINFORCEMENT_LEARNING', MaxResults=25) 41 | model_dict = a['Models'] 42 | while "NextToken" in a: 43 | a = dr.list_models(ModelType='REINFORCEMENT_LEARNING', MaxResults=25, NextToken=a["NextToken"]) 44 | model_dict.extend(a['Models']) 45 | 46 | models = pd.DataFrame.from_dict(model_dict) 47 | 48 | if models[models['ModelName']==dr_model_name].size > 0: 49 | sys.exit('Model {} already exists.'.format(dr_model_name)) 50 | 51 | # Import from S3 52 | print('Importing from s3://{}/{}'.format(aws_s3_bucket,aws_s3_prefix)) 53 | response = dr.import_model(Name=dr_model_name, ModelArtifactsS3Path='s3://{}/{}'.format(aws_s3_bucket,aws_s3_prefix), RoleArn=aws_s3_role, Type='REINFORCEMENT_LEARNING') 54 | 55 | if response['ResponseMetadata']['HTTPStatusCode'] == 200: 56 | print('Model importing as {}'.format(response['ModelArn'])) 57 | else: 58 | sys.exit('Error occcured when uploading') -------------------------------------------------------------------------------- /docs/video.md: -------------------------------------------------------------------------------- 1 | # Watching the car 2 | 3 | There are multiple ways to watch the car during training and evaluation. The ports and 'features' depend on the docker mode (swarm vs. compose) as well as between training and evaluation. 
4 | 5 | ## Training using Viewer 6 | 7 | DRfC has a built in viewer that supports showing the video stream from up to 6 workers on one webpage. 8 | 9 | The view can be started with `dr-start-viewer` and is available on `http://localhost:8100` or `http://127.0.0.1:8100`. The viewer must be updated if training is restarted using `dr-update-viewer`, as it needs to connect to the new containers. 10 | 11 | It is also possible to automatically start/update the viewer using the `-v` flag to `dr-start-training`. 12 | 13 | ## ROS Stream Viewer 14 | 15 | The ROS Stream Viewer is a built in ROS feature that will stream any topic in ROS that publishing ROSImg messages. The viewer starts automatically. 16 | 17 | ### Ports 18 | 19 | | Docker Mode | Training | Evaluation | Comment 20 | | -------- | -------- | -------- | -------- | 21 | | swarm | 8080 + `DR_RUN_ID` | 8180 + `DR_RUN_ID` | Default 8080/8180. Multiple workers share one port, press F5 to cycle between them. 22 | | compose | 8080-8089 | 8080-8089 | Each worker gets a unique port. 23 | 24 | ### Topics 25 | 26 | | Topic | Description | 27 | | -------- | -------- | 28 | | `/racecar/camera/zed/rgb/image_rect_color` | In-car video stream. This is used for inference. | 29 | | `/racecar/main_camera/zed/rgb/image_rect_color` | Camera following the car. Stream without overlay | 30 | | `/sub_camera/zed/rgb/image_rect_color` | Top-view of the track | 31 | | `/racecar/deepracer/kvs_stream` | Camera following the car. Stream with overlay. Different overlay in Training and Evaluation | 32 | | `/racecar/deepracer/main_camera_stream` | Same as `kvs_stream`, topic used for MP4 production. Only active in Evaluation if `DR_EVAL_SAVE_MP4=True` | 33 | 34 | ## Saving Evaluation to File 35 | 36 | During evaluation (`dr-start-evaluation`), if `DR_EVAL_SAVE_MP4=True` then three MP4 files are created in the S3 bucket's MP4 folder. They contain the in-car camera, top-camera and the camera following the car. 
-------------------------------------------------------------------------------- /utils/start-local-browser.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | usage(){ 4 | echo "Usage: $0 [-t topic] [-w width] [-h height] [-q quality] -b [browser-command]" 5 | echo " -w Width of individual stream." 6 | echo " -h Heigth of individual stream." 7 | echo " -q Quality of the stream image." 8 | echo " -t Topic to follow - default /racecar/deepracer/kvs_stream" 9 | echo " -b Browser command (default: firefox --new-tab)" 10 | exit 1 11 | } 12 | 13 | trap ctrl_c INT 14 | 15 | function ctrl_c() { 16 | echo "Requested to stop." 17 | exit 1 18 | } 19 | 20 | # Stream definition 21 | TOPIC="/racecar/deepracer/kvs_stream" 22 | WIDTH=480 23 | HEIGHT=360 24 | QUALITY=75 25 | BROWSER="firefox --new-tab" 26 | 27 | while getopts ":w:h:q:t:b:" opt; do 28 | case $opt in 29 | w) WIDTH="$OPTARG" 30 | ;; 31 | h) HEIGHT="$OPTARG" 32 | ;; 33 | q) QUALITY="$OPTARG" 34 | ;; 35 | t) TOPIC="$OPTARG" 36 | ;; 37 | b) BROWSER="$OPTARG" 38 | ;; 39 | \?) echo "Invalid option -$OPTARG" >&2 40 | usage 41 | ;; 42 | esac 43 | done 44 | 45 | FILE=$DR_DIR/tmp/streams-$DR_RUN_ID.html 46 | 47 | # Check if we will use Docker Swarm or Docker Compose 48 | if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; 49 | then 50 | echo "This script does not support swarm mode. Use `dr-start-viewer`." 51 | exit 52 | fi 53 | 54 | echo "DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC

DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC

" > $FILE 55 | 56 | ROBOMAKER_CONTAINERS=$(docker ps --format "{{.ID}}" --filter name=deepracer-$DR_RUN_ID --filter "ancestor=awsdeepracercommunity/deepracer-robomaker:$DR_ROBOMAKER_IMAGE") 57 | if [ -z "$ROBOMAKER_CONTAINERS" ]; then 58 | echo "No running robomakers. Exiting." 59 | exit 60 | fi 61 | 62 | for c in $ROBOMAKER_CONTAINERS; do 63 | C_PORT=$(docker inspect $c | jq -r '.[0].NetworkSettings.Ports["8080/tcp"][0].HostPort') 64 | C_URL="http://localhost:${C_PORT}/stream?topic=${TOPIC}&quality=${QUALITY}&width=${WIDTH}&height=${HEIGHT}" 65 | C_IMG="" 66 | echo $C_IMG >> $FILE 67 | done 68 | 69 | echo "" >> $FILE 70 | echo "Starting browser '$BROWSER'." 71 | $BROWSER `readlink -f $FILE ` & -------------------------------------------------------------------------------- /docs/multi_worker.md: -------------------------------------------------------------------------------- 1 | # Using multiple Robomaker workers 2 | 3 | One way to accelerate training is to launch multiple Robomaker workers that feed into one Sagemaker instance. 4 | 5 | The number of workers is configured through setting `system.env` `DR_WORKERS` to the desired number of workers. The result is that the number of episodes (hyperparameter `num_episodes_between_training`) will be divivided over the number of workers. The theoretical maximum number of workers equals `num_episodes_between_training`. 6 | 7 | The training can be started as normal. 8 | 9 | ## How many workers do I need? 10 | 11 | One Robomaker worker requires 2-4 vCPUs. Tests show that a `c5.4xlarge` instance can run 3 workers and the Sagemaker without a drop in performance. Using OpenGL images reduces the number of vCPUs required per worker. 12 | 13 | To avoid issues with the position from which evaluations are run ensure that `( num_episodes_between_training / DR_WORKERS) * DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST = 1.0`. 14 | 15 | Example: With 3 workers set `num_episodes_between_training: 30` and `DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST=0.1`. 
16 | 17 | Note: Sagemaker will stop collecting experiences once you have reached 10,000 steps (3-layer CNN) in an iteration. For longer tracks with 600-1000 steps per completed episode this will define the upper bound for the number of workers and episodes per iteration. 18 | 19 | ## Training with different parameters for each worker 20 | 21 | It is also possible to use different configurations between workers, such as different tracks (WORLD_NAME). To enable, set DR_TRAIN_MULTI_CONFIG=True inside run.env, then make copies of defaults/template-worker.env in the main deepracer-for-cloud directory with format worker-2.env, worker-3.env, etc. (So alongside run.env, you should have worker-2.env, worker-3.env, etc. run.env is still used for worker 1) Modify the worker env files with your desired changes, which can be more than just the world_name. These additional worker env files are only used if you are training with multiple workers. 22 | 23 | ## Watching the streams 24 | 25 | If you want to watch the streams -- and are in `compose` mode -- you can use the script `utils/start-local-browser.sh` to dynamically create an HTML page that streams the KVS stream from ALL workers at a time. 
26 | -------------------------------------------------------------------------------- /defaults/template-run.env: -------------------------------------------------------------------------------- 1 | DR_RUN_ID=0 2 | DR_WORLD_NAME=reInvent2019_track 3 | DR_RACE_TYPE=TIME_TRIAL 4 | DR_CAR_NAME=FastCar 5 | DR_CAR_BODY_SHELL_TYPE=deepracer 6 | DR_CAR_COLOR=Red 7 | DR_DISPLAY_NAME=$DR_CAR_NAME 8 | DR_RACER_NAME=$DR_CAR_NAME 9 | DR_ENABLE_DOMAIN_RANDOMIZATION=False 10 | DR_EVAL_NUMBER_OF_TRIALS=3 11 | DR_EVAL_IS_CONTINUOUS=True 12 | DR_EVAL_MAX_RESETS=100 13 | DR_EVAL_OFF_TRACK_PENALTY=5.0 14 | DR_EVAL_COLLISION_PENALTY=5.0 15 | DR_EVAL_SAVE_MP4=False 16 | DR_EVAL_CHECKPOINT=last 17 | DR_EVAL_OPP_S3_MODEL_PREFIX=rl-deepracer-sagemaker 18 | DR_EVAL_OPP_CAR_BODY_SHELL_TYPE=deepracer 19 | DR_EVAL_OPP_CAR_NAME=FasterCar 20 | DR_EVAL_OPP_DISPLAY_NAME=$DR_EVAL_OPP_CAR_NAME 21 | DR_EVAL_OPP_RACER_NAME=$DR_EVAL_OPP_CAR_NAME 22 | DR_EVAL_DEBUG_REWARD=False 23 | DR_EVAL_RESET_BEHIND_DIST=1.0 24 | #DR_EVAL_RTF=1.0 25 | DR_TRAIN_CHANGE_START_POSITION=True 26 | DR_TRAIN_ALTERNATE_DRIVING_DIRECTION=False 27 | DR_TRAIN_START_POSITION_OFFSET=0.0 28 | DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST=0.05 29 | DR_TRAIN_MULTI_CONFIG=False 30 | DR_TRAIN_MIN_EVAL_TRIALS=5 31 | #DR_TRAIN_RTF=1.0 32 | DR_LOCAL_S3_MODEL_PREFIX=rl-deepracer-sagemaker 33 | DR_LOCAL_S3_PRETRAINED=False 34 | DR_LOCAL_S3_PRETRAINED_PREFIX=rl-sagemaker-pretrained 35 | DR_LOCAL_S3_PRETRAINED_CHECKPOINT=last 36 | DR_LOCAL_S3_CUSTOM_FILES_PREFIX=custom_files 37 | DR_LOCAL_S3_TRAINING_PARAMS_FILE=training_params.yaml 38 | DR_LOCAL_S3_EVAL_PARAMS_FILE=evaluation_params.yaml 39 | DR_LOCAL_S3_MODEL_METADATA_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/model_metadata.json 40 | DR_LOCAL_S3_HYPERPARAMETERS_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/hyperparameters.json 41 | DR_LOCAL_S3_REWARD_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/reward_function.py 42 | DR_LOCAL_S3_METRICS_PREFIX=$DR_LOCAL_S3_MODEL_PREFIX/metrics 43 | 
DR_UPLOAD_S3_PREFIX=$DR_LOCAL_S3_MODEL_PREFIX 44 | DR_OA_NUMBER_OF_OBSTACLES=6 45 | DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES=2.0 46 | DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS=False 47 | DR_OA_IS_OBSTACLE_BOT_CAR=False 48 | DR_OA_OBJECT_POSITIONS= 49 | DR_H2B_IS_LANE_CHANGE=False 50 | DR_H2B_LOWER_LANE_CHANGE_TIME=3.0 51 | DR_H2B_UPPER_LANE_CHANGE_TIME=5.0 52 | DR_H2B_LANE_CHANGE_DISTANCE=1.0 53 | DR_H2B_NUMBER_OF_BOT_CARS=3 54 | DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS=2.0 55 | DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS=False 56 | DR_H2B_BOT_CAR_SPEED=0.2 57 | DR_H2B_BOT_CAR_PENALTY=5.0 -------------------------------------------------------------------------------- /docs/docker.md: -------------------------------------------------------------------------------- 1 | # About the Docker setup 2 | 3 | DRfC supports running Docker in two modes `swarm` and `compose` - this behaviour is configured in `system.env` through `DR_DOCKER_STYLE`. 4 | 5 | ## Swarm Mode 6 | 7 | Docker Swarm mode is the default. Docker Swarm makes it possible to connect multiple hosts together to spread the load -- esp. useful if one wants to run multiple Robomaker workers, but can also be useful locally if one has two computers that each are not powerful enough to run DeepRacer. 8 | 9 | In Swarm mode DRfC creates Stacks, using `docker stack`. During operations one can check running stacks through `docker stack ls`, and running services through `docker service ls`. 10 | 11 | DRfC is installed only on the manager. (The first installed host.) Swarm workers are 'dumb' and do not need to have DRfC installed. 12 | 13 | ### Key features 14 | 15 | * Allows user to connect multiple computers on the same network. (In AWS the instances must be connected on same VPC, and instances must be allowed to communicate.) 
16 | * Supports [multiple Robomaker workers](multi_worker.md) 17 | * Supports [running multiple parallel experiments](multi_run.md) 18 | 19 | ### Limitations 20 | 21 | * The Sagemaker container can only be run on the manager. 22 | * Docker images are downloaded from Docker Hub. Locally built images are allowed only if they have a unique tag, not in Docker Hub. If you have multiple Docker nodes ensure that they all have the image available. 23 | 24 | ### Connecting Workers 25 | 26 | * On the manager run `docker swarm join-token worker`. 27 | * On the worker run the command that was displayed on the manager `docker swarm join --token <token> <manager-ip>:<port>`. 28 | 29 | ### Ports 30 | 31 | Docker Swarm will automatically put a load-balancer in front of all replicas in a service. This means that the ROS Web View, which provides a video stream of the DeepRacer during training, will be load balanced - sharing one port (`8080`). If you have multiple workers (even across multiple hosts) then press F5 to cycle through them. 32 | 33 | ## Compose Mode 34 | 35 | In Compose mode DRfC creates Services, using `docker-compose`. During operations one can check the running services through `docker-compose ps` or `docker ps`. 36 | 37 | ### Key features 38 | 39 | * Supports [multiple Robomaker workers](multi_worker.md) 40 | * Supports [running multiple parallel experiments](multi_run.md) 41 | * Supports [GPU Accelerated OpenGL for Robomaker](opengl.md) 42 | 43 | ### Limitations 44 | 45 | * Workload cannot be spread across multiple hosts. 46 | 47 | ### Ports 48 | 49 | In the case of using Docker Compose the different Robomaker workers will require unique ports for ROS Web View and VNC. Docker will assign these dynamically. Use `docker ps` to see which container has been assigned which ports. 
50 | -------------------------------------------------------------------------------- /docs/opengl.md: -------------------------------------------------------------------------------- 1 | # GPU Accelerated OpenGL for Robomaker 2 | 3 | One way to improve performance, especially of Robomaker, is to enable GPU-accelerated OpenGL. OpenGL can significantly improve Gazebo performance, even where the GPU does not have enough GPU RAM, or is too old, to support Tensorflow. 4 | 5 | ## Desktop 6 | 7 | On a Ubuntu desktop running Unity there are hardly any additional steps required. 8 | 9 | * Ensure that a recent Nvidia driver is installed and is running. 10 | * Ensure that nvidia-docker is installed; review `bin/prepare.sh` for steps if you do not want to directly run the script. 11 | * Configure DRfC using the following settings in `system.env`: 12 | * `DR_HOST_X=True`; uses the local X server rather than starting one within the docker container. 13 | * `DR_ROBOMAKER_IMAGE`; choose the tag for an OpenGL enabled image - e.g. `cpu-gl-avx` for an image where Tensorflow will use CPU or `gpu-gl` for an image where also Tensorflow will use the GPU. 14 | 15 | Before running `dr-start-training` ensure that environment variables `DISPLAY` and `XAUTHORITY` are defined. 16 | 17 | NOTE: Users have experienced issues to start training process from remote (SSH / RDP), as a local X session may not be running / you are not allowed to connect to it. Workaround is to start an additional X server following the steps for Headless Server. 18 | 19 | With recent Nvidia drivers you can confirm that the setup is working by running `nvidia-smi` on the host and see that `gzserver` is listed as running on the GPU. Older drivers (e.g. 390 for NVS 315) may not support showing which processes are running on the GPU. 20 | 21 | ## Headless Server 22 | 23 | Also a headless server with a GPU, e.g. an EC2 instance, or a local computer with a displayless GPU (e.g. Tesla K40, K80, M40). 
24 | 25 | * Ensure that a Nvidia driver and nvidia-docker is installed; review `bin/prepare.sh` for steps if you do not want to directly run the script. 26 | * Setup an X-server on the host. `utils/setup-xorg.sh` is a basic installation script. 27 | * Configure DRfC using the following settings in `system.env`: 28 | * `DR_HOST_X=True`; uses the local X server rather than starting one within the docker container. 29 | * `DR_ROBOMAKER_IMAGE`; choose the tag for an OpenGL enabled image - e.g. `cpu-gl-avx` for an image where Tensorflow will use CPU or `gpu-gl` for an image where also Tensorflow will use the GPU. 30 | * `DR_DISPLAY`; the X display that the headless X server will start on. (Default is `:99`.) 31 | 32 | Start up the X server with `utils/start-xorg.sh`. 33 | 34 | If `DR_GUI_ENABLE=True` then a VNC server will be started on port 5900 so that you can connect and interact with the Gazebo UI. 35 | 36 | With recent Nvidia drivers you can confirm that the setup is working by running `nvidia-smi` on the host and see that `gzserver` is listed as running on the GPU. 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepRacer-For-Cloud 2 | Provides a quick and easy way to get up and running with a DeepRacer training environment in Azure or AWS, using either the Azure [N-Series Virtual Machines](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu) or [AWS EC2 Accelerated Computing instances](https://aws.amazon.com/ec2/instance-types/?nc1=h_ls#Accelerated_Computing). 3 | 4 | ## Introduction 5 | 6 | DeepRacer-For-Cloud (DRfC) started as an extension of the work done by Alex (https://github.com/alexschultz/deepracer-for-dummies), which is again a wrapper around the amazing work done by Chris (https://github.com/crr0004/deepracer). 
With the introduction of the second generation Deepracer Console the repository has been split up. This repository contains the scripts needed to *run* the training, but depends on Docker Hub to provide pre-built docker images. All the under-the-hood building capabilities are in the [Deepracer Build](https://github.com/aws-deepracer-community/deepracer) repository. 7 | 8 | ## Main Features 9 | 10 | DRfC supports a wide set of features to ensure that you can focus on creating the best model: 11 | * User-friendly 12 | * Based on the continously updated community [Robomaker](https://github.com/aws-deepracer-community/deepracer-simapp) and [Sagemaker](https://github.com/aws-deepracer-community/deepracer-sagemaker-container) containers, supporting a wide range of CPU and GPU setups. 13 | * Wide set of scripts (`dr-*`) enables effortless training. 14 | * Detection of your AWS DeepRacer Console models; allows upload of a locally trained model to any of them. 15 | * Modes 16 | * Time Trial 17 | * Object Avoidance 18 | * Head-to-Bot 19 | * Training 20 | * Multiple Robomaker instances per Sagemaker (N:1) to improve training progress. 21 | * Multiple training sessions in parallel - each being (N:1) if hardware supports it - to test out things in parallel. 22 | * Connect multiple nodes together (Swarm-mode only) to combine the powers of multiple computers/instances. 23 | * Evaluation 24 | * Evaluate independently from training. 25 | * Save evaluation run to MP4 file in S3. 26 | * Logging 27 | * Training metrics and trace files are stored to S3. 28 | * Optional integration with AWS CloudWatch. 29 | * Optional exposure of Robomaker internal log-files. 30 | * Technology 31 | * Supports both Docker Swarm (used for connecting multiple nodes together) and Docker Compose (used to support OpenGL) 32 | 33 | ## Documentation 34 | 35 | Full documentation can be found on the [Deepracer-for-Cloud GitHub Pages](https://aws-deepracer-community.github.io/deepracer-for-cloud). 
36 | 37 | ## Support 38 | 39 | * For general support it is suggested to join the [AWS DeepRacing Community](https://deepracing.io/). The Community Slack has a channel #dr-drfc-setup where the community provides active support. 40 | * Create a GitHub issue if you find an actual code issue, or where updates to documentation would be required. 41 | -------------------------------------------------------------------------------- /scripts/upload/increment.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage(){ 4 | echo "Usage: $0 [-f] [-w] [-p ] [-d ]" 5 | echo "" 6 | echo "Command will increment a numerical suffix on the current upload model." 7 | echo "-p model Sets the to-be name to be rather than auto-incremeneting the previous model." 8 | echo "-d delim Delimiter in model-name (e.g. '-' in 'test-model-1')" 9 | echo "-f Force. Ask for no confirmations." 10 | echo "-w Wipe the S3 prefix to ensure that two models are not mixed." 11 | exit 1 12 | } 13 | 14 | trap ctrl_c INT 15 | 16 | function ctrl_c() { 17 | echo "Requested to stop." 18 | exit 1 19 | } 20 | 21 | OPT_DELIM='-' 22 | 23 | while getopts ":fwp:d:" opt; do 24 | case $opt in 25 | 26 | f) OPT_FORCE="True" 27 | ;; 28 | p) OPT_PREFIX="$OPTARG" 29 | ;; 30 | w) OPT_WIPE="--delete" 31 | ;; 32 | d) OPT_DELIM="$OPTARG" 33 | ;; 34 | h) usage 35 | ;; 36 | \?) echo "Invalid option -$OPTARG" >&2 37 | usage 38 | ;; 39 | esac 40 | done 41 | 42 | CONFIG_FILE=$DR_CONFIG 43 | echo "Configuration file $CONFIG_FILE will be updated." 
44 | 45 | ## Read in data 46 | CURRENT_UPLOAD_MODEL=$(grep -e "^DR_UPLOAD_S3_PREFIX" ${CONFIG_FILE} | awk '{split($0,a,"="); print a[2] }') 47 | CURRENT_UPLOAD_MODEL_NUM=$(echo "${CURRENT_UPLOAD_MODEL}" | \ 48 | awk -v DELIM="${OPT_DELIM}" '{ n=split($0,a,DELIM); if (a[n] ~ /[0-9]*/) print a[n]; else print ""; }') 49 | if [[ -z ${CURRENT_UPLOAD_MODEL_NUM} ]]; 50 | then 51 | NEW_UPLOAD_MODEL="${CURRENT_UPLOAD_MODEL}${OPT_DELIM}1" 52 | else 53 | NEW_UPLOAD_MODEL_NUM=$(echo "${CURRENT_UPLOAD_MODEL_NUM} + 1" | bc ) 54 | NEW_UPLOAD_MODEL=$(echo $CURRENT_UPLOAD_MODEL | sed "s/${CURRENT_UPLOAD_MODEL_NUM}\$/${NEW_UPLOAD_MODEL_NUM}/") 55 | fi 56 | 57 | if [[ -n "${NEW_UPLOAD_MODEL}" ]]; 58 | then 59 | echo "Incrementing model from ${CURRENT_UPLOAD_MODEL} to ${NEW_UPLOAD_MODEL}" 60 | if [[ -z "${OPT_FORCE}" ]]; 61 | then 62 | read -r -p "Are you sure? [y/N] " response 63 | if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]] 64 | then 65 | echo "Aborting." 66 | exit 1 67 | fi 68 | fi 69 | sed -i.bak -re "s/(DR_UPLOAD_S3_PREFIX=).*$/\1$NEW_UPLOAD_MODEL/g" "$CONFIG_FILE" && echo "Done." 70 | else 71 | echo "Error in determining new model. Aborting." 72 | exit 1 73 | fi 74 | 75 | export DR_UPLOAD_S3_PREFIX=$(eval echo "${NEW_UPLOAD_MODEL}") 76 | 77 | if [[ -n "${OPT_WIPE}" ]]; 78 | then 79 | MODEL_DIR_S3=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 ls s3://${DR_LOCAL_S3_BUCKET}/${NEW_UPLOAD_MODEL} ) 80 | if [[ -n "${MODEL_DIR_S3}" ]]; 81 | then 82 | echo "The new model's S3 prefix s3://${DR_LOCAL_S3_BUCKET}/${NEW_UPLOAD_MODEL} exists. Will wipe." 83 | fi 84 | if [[ -z "${OPT_FORCE}" ]]; 85 | then 86 | read -r -p "Are you sure? [y/N] " response 87 | if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]] 88 | then 89 | echo "Aborting." 
90 | exit 1 91 | fi 92 | fi 93 | aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 rm s3://${DR_LOCAL_S3_BUCKET}/${NEW_UPLOAD_MODEL} --recursive 94 | fi 95 | -------------------------------------------------------------------------------- /scripts/training/increment.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage(){ 4 | echo "Usage: $0 [-f] [-w] [-p ] [-d ]" 5 | echo "" 6 | echo "Command will set the current model to be the pre-trained model and increment a numerical suffix." 7 | echo "-p model Sets the to-be name to be rather than auto-incremeneting the previous model." 8 | echo "-d delim Delimiter in model-name (e.g. '-' in 'test-model-1')" 9 | echo "-f Force. Ask for no confirmations." 10 | echo "-w Wipe the S3 prefix to ensure that two models are not mixed." 11 | exit 1 12 | } 13 | 14 | trap ctrl_c INT 15 | 16 | function ctrl_c() { 17 | echo "Requested to stop." 18 | exit 1 19 | } 20 | 21 | OPT_DELIM='-' 22 | 23 | while getopts ":fwp:d:" opt; do 24 | case $opt in 25 | 26 | f) OPT_FORCE="True" 27 | ;; 28 | p) OPT_PREFIX="$OPTARG" 29 | ;; 30 | w) OPT_WIPE="--delete" 31 | ;; 32 | d) OPT_DELIM="$OPTARG" 33 | ;; 34 | h) usage 35 | ;; 36 | \?) echo "Invalid option -$OPTARG" >&2 37 | usage 38 | ;; 39 | esac 40 | done 41 | 42 | CONFIG_FILE=$DR_CONFIG 43 | echo "Configuration file $CONFIG_FILE will be updated." 
44 | 45 | ## Read in data 46 | CURRENT_RUN_MODEL=$(grep -e "^DR_LOCAL_S3_MODEL_PREFIX" ${CONFIG_FILE} | awk '{split($0,a,"="); print a[2] }') 47 | CURRENT_RUN_MODEL_NUM=$(echo "${CURRENT_RUN_MODEL}" | \ 48 | awk -v DELIM="${OPT_DELIM}" '{ n=split($0,a,DELIM); if (a[n] ~ /[0-9]*/) print a[n]; else print ""; }') 49 | if [[ -z ${CURRENT_RUN_MODEL_NUM} ]]; 50 | then 51 | NEW_RUN_MODEL="${CURRENT_RUN_MODEL}${OPT_DELIM}1" 52 | else 53 | NEW_RUN_MODEL_NUM=$(echo "${CURRENT_RUN_MODEL_NUM} + 1" | bc ) 54 | NEW_RUN_MODEL=$(echo $CURRENT_RUN_MODEL | sed "s/${CURRENT_RUN_MODEL_NUM}\$/${NEW_RUN_MODEL_NUM}/") 55 | fi 56 | 57 | if [[ -n "${NEW_RUN_MODEL}" ]]; 58 | then 59 | echo "Incrementing model from ${CURRENT_RUN_MODEL} to ${NEW_RUN_MODEL}" 60 | if [[ -z "${OPT_FORCE}" ]]; 61 | then 62 | read -r -p "Are you sure? [y/N] " response 63 | if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]] 64 | then 65 | echo "Aborting." 66 | exit 1 67 | fi 68 | fi 69 | sed -i.bak -re "s/(DR_LOCAL_S3_PRETRAINED_PREFIX=).*$/\1$CURRENT_RUN_MODEL/g; s/(DR_LOCAL_S3_PRETRAINED=).*$/\1True/g; ; s/(DR_LOCAL_S3_MODEL_PREFIX=).*$/\1$NEW_RUN_MODEL/g" "$CONFIG_FILE" && echo "Done." 70 | else 71 | echo "Error in determining new model. Aborting." 72 | exit 1 73 | fi 74 | 75 | if [[ -n "${OPT_WIPE}" ]]; 76 | then 77 | MODEL_DIR_S3=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 ls s3://${DR_LOCAL_S3_BUCKET}/${NEW_RUN_MODEL} ) 78 | if [[ -n "${MODEL_DIR_S3}" ]]; 79 | then 80 | echo "The new model's S3 prefix s3://${DR_LOCAL_S3_BUCKET}/${NEW_RUN_MODEL} exists. Will wipe." 81 | fi 82 | if [[ -z "${OPT_FORCE}" ]]; 83 | then 84 | read -r -p "Are you sure? [y/N] " response 85 | if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]] 86 | then 87 | echo "Aborting." 
88 | exit 1 89 | fi 90 | fi 91 | aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 rm s3://${DR_LOCAL_S3_BUCKET}/${NEW_RUN_MODEL} --recursive 92 | fi 93 | -------------------------------------------------------------------------------- /scripts/evaluation/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | source $DR_DIR/bin/scripts_wrapper.sh 4 | 5 | usage(){ 6 | echo "Usage: $0 [-q] [-c]" 7 | echo " -q Quiet - does not start log tracing." 8 | echo " -c Clone - copies model into new prefix before evaluating." 9 | exit 1 10 | } 11 | 12 | trap ctrl_c INT 13 | 14 | function ctrl_c() { 15 | echo "Requested to stop." 16 | exit 1 17 | } 18 | 19 | while getopts ":qc" opt; do 20 | case $opt in 21 | q) OPT_QUIET="QUIET" 22 | ;; 23 | c) OPT_CLONE="CLONE" 24 | ;; 25 | h) usage 26 | ;; 27 | \?) echo "Invalid option -$OPTARG" >&2 28 | usage 29 | ;; 30 | esac 31 | done 32 | 33 | # clone if required 34 | if [ -n "$OPT_CLONE" ]; then 35 | echo "Cloning model into s3://$DR_LOCAL_S3_BUCKET/${DR_LOCAL_S3_MODEL_PREFIX}-E" 36 | aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX/model s3://$DR_LOCAL_S3_BUCKET/${DR_LOCAL_S3_MODEL_PREFIX}-E/model 37 | aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX/ip s3://$DR_LOCAL_S3_BUCKET/${DR_LOCAL_S3_MODEL_PREFIX}-E/ip 38 | export DR_LOCAL_S3_MODEL_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX}-E 39 | fi 40 | 41 | # set evaluation specific environment variables 42 | S3_PATH="s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX" 43 | STACK_NAME="deepracer-eval-$DR_RUN_ID" 44 | 45 | export ROBOMAKER_COMMAND="./run.sh run evaluation.launch" 46 | export DR_CURRENT_PARAMS_FILE=${DR_LOCAL_S3_EVAL_PARAMS_FILE} 47 | 48 | if [ ${DR_ROBOMAKER_MOUNT_LOGS,,} = "true" ]; 49 | then 50 | COMPOSE_FILES="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DR_DIR/docker/docker-compose-mount.yml" 51 | export 
DR_MOUNT_DIR="$DR_DIR/data/logs/robomaker/$DR_LOCAL_S3_MODEL_PREFIX" 52 | mkdir -p $DR_MOUNT_DIR 53 | else 54 | COMPOSE_FILES="$DR_EVAL_COMPOSE_FILE" 55 | fi 56 | 57 | echo "Creating Robomaker configuration in $S3_PATH/$DR_CURRENT_PARAMS_FILE" 58 | python3 $DR_DIR/scripts/evaluation/prepare-config.py 59 | 60 | # Check if we are using Host X -- ensure variables are populated 61 | if [[ "${DR_HOST_X,,}" == "true" ]]; 62 | then 63 | if [[ -n "$DR_DISPLAY" ]]; then 64 | ROBO_DISPLAY=$DR_DISPLAY 65 | else 66 | ROBO_DISPLAY=$DISPLAY 67 | fi 68 | 69 | if ! DISPLAY=$ROBO_DISPLAY timeout 1s xset q &>/dev/null; then 70 | echo "No X Server running on display $ROBO_DISPLAY. Exiting" 71 | exit 0 72 | fi 73 | 74 | if [[ -z "$XAUTHORITY" ]]; then 75 | export XAUTHORITY=~/.Xauthority 76 | if [[ ! -f "$XAUTHORITY" ]]; then 77 | echo "No XAUTHORITY defined. .Xauthority does not exist. Stopping." 78 | exit 0 79 | fi 80 | fi 81 | fi 82 | 83 | 84 | # Check if we will use Docker Swarm or Docker Compose 85 | if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; 86 | then 87 | DISPLAY=$ROBO_DISPLAY docker stack deploy $COMPOSE_FILES $STACK_NAME 88 | else 89 | DISPLAY=$ROBO_DISPLAY docker-compose $COMPOSE_FILES --log-level ERROR -p $STACK_NAME up -d 90 | fi 91 | 92 | # Request to be quiet. Quitting here. 
93 | if [ -n "$OPT_QUIET" ]; then 94 | exit 0 95 | fi 96 | 97 | # Trigger requested log-file 98 | dr-logs-robomaker -w 15 -e 99 | 100 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | Provides a quick and easy way to get up and running with a DeepRacer training environment in AWS or Azure, using either the Azure [N-Series Virtual Machines](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu) or [AWS EC2 Accelerated Computing instances](https://aws.amazon.com/ec2/instance-types/?nc1=h_ls#Accelerated_Computing), or locally on your own desktop or server. 4 | 5 | DeepRacer-For-Cloud (DRfC) started as an extension of the work done by Alex (https://github.com/alexschultz/deepracer-for-dummies), which is again a wrapper around the amazing work done by Chris (https://github.com/crr0004/deepracer). With the introduction of the second generation Deepracer Console the repository has been split up. This repository contains the scripts needed to *run* the training, but depends on Docker Hub to provide pre-built docker images. All the under-the-hood building capabilities have been moved to my [Deepracer Build](https://github.com/aws-deepracer-community/deepracer) repository. 6 | 7 | # Main Features 8 | 9 | DRfC supports a wide set of features to ensure that you can focus on creating the best model: 10 | * User-friendly 11 | * Based on the continously updated community [Robomaker](https://github.com/aws-deepracer-community/deepracer-simapp) and [Sagemaker](https://github.com/aws-deepracer-community/deepracer-sagemaker-container) containers, supporting a wide range of CPU and GPU setups. 12 | * Wide set of scripts (`dr-*`) enables effortless training. 13 | * Detection of your AWS DeepRacer Console models; allows upload of a locally trained model to any of them. 
14 | * Modes 15 | * Time Trial 16 | * Object Avoidance 17 | * Head-to-Bot 18 | * Training 19 | * Multiple Robomaker instances per Sagemaker (N:1) to improve training progress. 20 | * Multiple training sessions in parallel - each being (N:1) if hardware supports it - to test out things in parallel. 21 | * Connect multiple nodes together (Swarm-mode only) to combine the powers of multiple computers/instances. 22 | * Evaluation 23 | * Evaluate independently from training. 24 | * Save evaluation run to MP4 file in S3. 25 | * Logging 26 | * Training metrics and trace files are stored to S3. 27 | * Optional integration with AWS CloudWatch. 28 | * Optional exposure of Robomaker internal log-files. 29 | * Technology 30 | * Supports both Docker Swarm (used for connecting multiple nodes together) and Docker Compose (used to support OpenGL) 31 | 32 | # Documentation 33 | 34 | * [Initial Installation](installation.md) 35 | * [Upload Model to Console](upload.md) 36 | * [Reference](reference.md) 37 | * [Using multiple Robomaker workers](multi_worker.md) 38 | * [Running multiple parallel experiments](multi_run.md) 39 | * [GPU Accelerated OpenGL for Robomaker](opengl.md) 40 | * [Having multiple GPUs in one Computer](multi_gpu.md) 41 | * [Installing on Windows](windows.md) 42 | * [Run a Head-to-Head Race](head-to-head.md) 43 | * [Watching the car](video.md) 44 | 45 | # Support 46 | 47 | * For general support it is suggested to join the [AWS DeepRacing Community](https://deepracing.io/). The Community Slack has a channel #dr-training-local where the community provides active support. 48 | * Create a GitHub issue if you find an actual code issue, or where updates to documentation would be required. 
49 | -------------------------------------------------------------------------------- /docs/upload.md: -------------------------------------------------------------------------------- 1 | # Upload Model to AWS Console 2 | 3 | Starting end July 2020 the AWS DeepRacer Console was re-designed which is now changing the way 4 | that models need to be uploaded to enable them to be evaluated or submitted to the AWS hosted Summit or Virtual League events. 5 | 6 | ## Create Upload Bucket 7 | 8 | The recommendation is to create a unique bucket in `us-east-1` which is used as 'transit' between your training bucket, local or in an AWS region close to your EC2 instances. 9 | 10 | The bucket needs to be defined so that 'Objects can be public'; AWS will create a specific IAM policy to access the data in your bucket as part of the import. 11 | 12 | ## Configure Upload Bucket 13 | 14 | In `system.env` set `DR_UPLOAD_S3_BUCKET` to the name of your created bucket. 15 | 16 | In `run.env` set the `DR_UPLOAD_S3_PREFIX` to any prefix of your choice. 17 | 18 | ## Upload Model 19 | 20 | After configuring the system you can run `dr-upload-model`; it will copy out the required parts of `s3://DR_LOCAL_S3_BUCKET/DR_LOCAL_S3_PREFIX` into `s3://DR_UPLOAD_S3_BUCKET/DR_UPLOAD_S3_PREFIX`. 21 | 22 | Once uploaded you can use the [Import model](https://console.aws.amazon.com/deepracer/home?region=us-east-1#models/importModel) feature of the AWS DeepRacer console to load the model into the model store. 
23 | 24 | ## Things to know 25 | 26 | ### Upload switches 27 | There are several useful switches to the upload command: 28 | * f - this will force upload, no confirmation question if you want to proceed with upload 29 | * w - wipes the target AWS DeepRacer model structure before upload in the designated bucket/prefix 30 | * d - dry-run mode, does not perform any write or delete operations on target 31 | * b - uploads best checkpoint instead of default which is last checkpoint 32 | * p prefix - uploads model into specified S3 prefix 33 | * i - imports model using the prefix as the model name 34 | * I name - import model with a specific model name 35 | 36 | ### Import 37 | If you want to use the import switches (`-i` or `-I`) there are a few pre-requisites. 38 | 39 | * Python packages to be installed with `pip install`: 40 | * pandas 41 | * deepracer-utils 42 | * Install boto3 service `deepracer` with `python -m deepracer install-cli --force`. 43 | * Create an IAM Role which the Deepracer service can use to access S3. Declare the ARN in `DR_UPLOAD_S3_ROLE` in `system.env`. 44 | 45 | ### Managing your models 46 | You should decide how you're going to manage your models. Upload to AWS does not preserve all the files created locally so if you delete your local files you will find it hard to go back to a previous model and resume training. 47 | 48 | ### Create file formatted for physical car, and upload to S3 49 | You can also create the file in the format necessary to run on the physical car directly from DRfC, without going through the AWS console. 50 | This is executed by running 'dr-upload-car-zip'; it will copy files out of the running sagemaker container, format them into the proper .tar.gz file, and upload that file to `s3://DR_LOCAL_S3_BUCKET/DR_LOCAL_S3_PREFIX`. One of the limitations of this approach is that it only uses the latest checkpoint, and does not have the option to use the "best" checkpoint, or an earlier checkpoint.
Another limitation is that the sagemaker container must be running at the time this command is executed. 51 | -------------------------------------------------------------------------------- /scripts/upload/download-model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage(){ 4 | echo "Usage: $0 [-f] [-w] [-d] -s -t &2 38 | usage 39 | ;; 40 | esac 41 | done 42 | 43 | if [[ -n "${OPT_DRYRUN}" ]]; 44 | then 45 | echo "*** DRYRUN MODE ***" 46 | fi 47 | 48 | SOURCE_S3_URL="${OPT_SOURCE}" 49 | 50 | if [[ -z "${SOURCE_S3_URL}" ]]; 51 | then 52 | echo "No source URL to download model from." 53 | exit 1 54 | fi 55 | 56 | TARGET_S3_BUCKET=${DR_LOCAL_S3_BUCKET} 57 | TARGET_S3_PREFIX=${OPT_TARGET} 58 | if [[ -z "${TARGET_S3_PREFIX}" ]]; 59 | then 60 | echo "No target prefix defined. Exiting." 61 | exit 1 62 | fi 63 | 64 | SOURCE_REWARD_FILE_S3_KEY="${SOURCE_S3_URL}/reward_function.py" 65 | SOURCE_HYPERPARAM_FILE_S3_KEY="${SOURCE_S3_URL}/ip/hyperparameters.json" 66 | SOURCE_METADATA_S3_KEY="${SOURCE_S3_URL}/model/model_metadata.json" 67 | 68 | WORK_DIR=${DR_DIR}/tmp/download 69 | mkdir -p ${WORK_DIR} && rm -rf ${WORK_DIR} && mkdir -p ${WORK_DIR}/config ${WORK_DIR}/full 70 | 71 | # Check if metadata-files are available 72 | REWARD_FILE=$(aws ${DR_UPLOAD_PROFILE} s3 cp "${SOURCE_REWARD_FILE_S3_KEY}" ${WORK_DIR}/config/ --no-progress | awk '/reward/ {print $4}'| xargs readlink -f 2> /dev/null) 73 | METADATA_FILE=$(aws ${DR_UPLOAD_PROFILE} s3 cp "${SOURCE_METADATA_S3_KEY}" ${WORK_DIR}/config/ --no-progress | awk '/model_metadata.json$/ {print $4}'| xargs readlink -f 2> /dev/null) 74 | HYPERPARAM_FILE=$(aws ${DR_UPLOAD_PROFILE} s3 cp "${SOURCE_HYPERPARAM_FILE_S3_KEY}" ${WORK_DIR}/config/ --no-progress | awk '/hyperparameters.json$/ {print $4}'| xargs readlink -f 2> /dev/null) 75 | 76 | if [ -n "$METADATA_FILE" ] && [ -n "$REWARD_FILE" ] && [ -n "$HYPERPARAM_FILE" ]; 77 | then 78 | echo "All meta-data files found. 
Source model ${SOURCE_S3_URL} valid." 79 | else 80 | echo "Meta-data files are not found. Source model ${SOURCE_S3_URL} not valid. Exiting." 81 | exit 1 82 | fi 83 | 84 | # Upload files 85 | if [[ -z "${OPT_FORCE}" ]]; 86 | then 87 | echo "Ready to download model ${SOURCE_S3_URL} to local ${TARGET_S3_PREFIX}" 88 | read -r -p "Are you sure? [y/N] " response 89 | if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]] 90 | then 91 | echo "Aborting." 92 | exit 1 93 | fi 94 | fi 95 | 96 | cd ${WORK_DIR} 97 | aws ${DR_UPLOAD_PROFILE} s3 sync "${SOURCE_S3_URL}" ${WORK_DIR}/full/ ${OPT_DRYRUN} 98 | aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 sync ${WORK_DIR}/full/ s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/ ${OPT_DRYRUN} ${OPT_WIPE} 99 | 100 | if [[ -n "${OPT_CONFIG}" ]]; 101 | then 102 | echo "Copy configuration to custom_files" 103 | cp ${WORK_DIR}/config/* ${DR_DIR}/custom_files/ 104 | fi 105 | 106 | echo "Done." 107 | -------------------------------------------------------------------------------- /docs/windows.md: -------------------------------------------------------------------------------- 1 | # Installing on Windows 2 | 3 | ## Prerequisites 4 | 5 | The basic installation steps to get a NVIDIA GPU / CUDA enabled Ubuntu subsystem on Windows can be found in the [Cuda on WSL User Guide](https://docs.nvidia.com/cuda/wsl-user-guide/index.html). Ensure your windows has an updated [nvidia cuda enabled driver](https://developer.nvidia.com/cuda/wsl/download) that will work with WSL. 6 | 7 | The further instructions assume that you have a basic working WSL using the default Ubuntu distribution. 8 | 9 | 10 | ## Additional steps 11 | 12 | The typical `bin/prepare.sh` script will not work for a Ubuntu WSL installation, hence alternate steps will be required. 
13 | 14 | ### Adding required packages 15 | 16 | Install additional packages with the following command: 17 | 18 | ``` 19 | sudo apt-get install jq awscli python3-boto3 docker-compose 20 | ``` 21 | 22 | ### Install and configure docker and nvidia-docker 23 | ``` 24 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - 25 | sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" 26 | sudo apt-get update && sudo apt-get install -y --no-install-recommends docker-ce docker-ce-cli containerd.io 27 | 28 | distribution=$(. /etc/os-release;echo $ID$VERSION_ID) 29 | curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - 30 | curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list 31 | 32 | cat /etc/docker/daemon.json | jq 'del(."default-runtime") + {"default-runtime": "nvidia"}' | sudo tee /etc/docker/daemon.json 33 | sudo usermod -a -G docker $(id -un) 34 | ``` 35 | 36 | 37 | ### Install DRfC 38 | 39 | You can now run `bin/init.sh -a gpu -c local` to setup DRfC, and follow the typical DRfC startup instructions 40 | 41 | ## Known Issues 42 | 43 | * `init.sh` is not able to detect the GPU given differences in the Nvidia drivers, and the WSL2 Linux Kernel. You need to manually set the GPU image in `system.env`. 44 | * Docker does not start automatically when you launch Ubuntu. Start it manually with `sudo service docker start` 45 | 46 | You can also configure the service to start automatically using the Windows Task Scheduler 47 | 48 | *1)* Create a new file at /etc/init-wsl (sudo vi /etc/init-wsl) with the following contents. 
49 | 50 | ``` 51 | #!/bin/sh 52 | service docker start 53 | ``` 54 | 55 | *2)* Make the script executable `sudo chmod +x /etc/init-wsl` 56 | 57 | *3)* Open Task Scheduler in Windows 10 58 | 59 | - On the left, click **Task Scheduler Library** option, and then on the right, click **Create Task** 60 | 61 | - In **General** Tab, Enter Name **WSL Startup**, and select **Run whether user is logged on or not** and **Run with highest privileges** options. 62 | 63 | - In **Trigger** tab, click New ... > Begin the task: **At startup** > OK 64 | 65 | - In **Actions** tab, click New ... > Action: **Start a program** 66 | 67 | program/script: **wsl** 68 | 69 | add arguments: **-u root /etc/init-wsl** 70 | 71 | - Click OK to exit 72 | 73 | *4)* You can run the task manually to confirm, or after Windows reboot docker should now automatically start. 74 | 75 | * Video streams may not load using the localhost address. To access the html video streams from your windows browser, you may need to use the IP address of the WSL VM. From a WSL terminal, determine your IP address by the command 'ip addr' and look for **eth0** then **inet** (e.g. ip = 172.29.38.21). Then from your windows browser (edge, chrome, etc) navigate to **ip:8080** (e.g.
172.29.38.21:8080) 76 | 77 | -------------------------------------------------------------------------------- /scripts/upload/prepare-config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import boto3 4 | import sys 5 | import os 6 | import time 7 | import json 8 | import io 9 | import yaml 10 | 11 | config = {} 12 | config['AWS_REGION'] = os.environ.get('DR_AWS_APP_REGION', 'us-east-1') 13 | config['JOB_TYPE'] = 'TRAINING' 14 | config['METRICS_S3_BUCKET'] = os.environ.get('TARGET_S3_BUCKET', 'bucket') 15 | config['METRICS_S3_OBJECT_KEY'] = "{}/TrainingMetrics.json".format(os.environ.get('TARGET_S3_PREFIX', 'bucket')) 16 | config['MODEL_METADATA_FILE_S3_KEY'] = "{}/model/model_metadata.json".format(os.environ.get('TARGET_S3_PREFIX', 'bucket')) 17 | config['REWARD_FILE_S3_KEY'] = "{}/reward_function.py".format(os.environ.get('TARGET_S3_PREFIX', 'bucket')) 18 | config['SAGEMAKER_SHARED_S3_BUCKET'] = os.environ.get('TARGET_S3_BUCKET', 'bucket') 19 | config['SAGEMAKER_SHARED_S3_PREFIX'] = os.environ.get('TARGET_S3_PREFIX', 'rl-deepracer-sagemaker') 20 | 21 | # Car and training 22 | config['BODY_SHELL_TYPE'] = os.environ.get('DR_CAR_BODY_SHELL_TYPE', 'deepracer') 23 | if config['BODY_SHELL_TYPE'] == 'deepracer': 24 | config['CAR_COLOR'] = os.environ.get('DR_CAR_COLOR', 'Red') 25 | config['CAR_NAME'] = os.environ.get('DR_CAR_NAME', 'MyCar') 26 | config['RACE_TYPE'] = os.environ.get('DR_RACE_TYPE', 'TIME_TRIAL') 27 | config['WORLD_NAME'] = os.environ.get('DR_WORLD_NAME', 'LGSWide') 28 | config['DISPLAY_NAME'] = os.environ.get('DR_DISPLAY_NAME', 'racer1') 29 | config['RACER_NAME'] = os.environ.get('DR_RACER_NAME', 'racer1') 30 | 31 | config['ALTERNATE_DRIVING_DIRECTION'] = os.environ.get('DR_TRAIN_ALTERNATE_DRIVING_DIRECTION', os.environ.get('DR_ALTERNATE_DRIVING_DIRECTION', 'false')) 32 | config['CHANGE_START_POSITION'] = os.environ.get('DR_TRAIN_CHANGE_START_POSITION', 
os.environ.get('DR_CHANGE_START_POSITION', 'true')) 33 | config['ROUND_ROBIN_ADVANCE_DIST'] = os.environ.get('DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST', '0.05') 34 | config['START_POSITION_OFFSET'] = os.environ.get('DR_TRAIN_START_POSITION_OFFSET', '0.00') 35 | config['ENABLE_DOMAIN_RANDOMIZATION'] = os.environ.get('DR_ENABLE_DOMAIN_RANDOMIZATION', 'false') 36 | config['MIN_EVAL_TRIALS'] = os.environ.get('DR_TRAIN_MIN_EVAL_TRIALS', '5') 37 | 38 | # Object Avoidance 39 | if config['RACE_TYPE'] == 'OBJECT_AVOIDANCE': 40 | config['NUMBER_OF_OBSTACLES'] = os.environ.get('DR_OA_NUMBER_OF_OBSTACLES', '6') 41 | config['MIN_DISTANCE_BETWEEN_OBSTACLES'] = os.environ.get('DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES', '2.0') 42 | config['RANDOMIZE_OBSTACLE_LOCATIONS'] = os.environ.get('DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS', 'True') 43 | config['IS_OBSTACLE_BOT_CAR'] = os.environ.get('DR_OA_IS_OBSTACLE_BOT_CAR', 'false') 44 | 45 | object_position_str = os.environ.get('DR_OA_OBJECT_POSITIONS', "") 46 | if object_position_str != "": 47 | object_positions = [] 48 | for o in object_position_str.split(";"): 49 | object_positions.append(o) 50 | config['OBJECT_POSITIONS'] = object_positions 51 | config['NUMBER_OF_OBSTACLES'] = str(len(object_positions)) 52 | 53 | # Head to Bot 54 | if config['RACE_TYPE'] == 'HEAD_TO_BOT': 55 | config['IS_LANE_CHANGE'] = os.environ.get('DR_H2B_IS_LANE_CHANGE', 'False') 56 | config['LOWER_LANE_CHANGE_TIME'] = os.environ.get('DR_H2B_LOWER_LANE_CHANGE_TIME', '3.0') 57 | config['UPPER_LANE_CHANGE_TIME'] = os.environ.get('DR_H2B_UPPER_LANE_CHANGE_TIME', '5.0') 58 | config['LANE_CHANGE_DISTANCE'] = os.environ.get('DR_H2B_LANE_CHANGE_DISTANCE', '1.0') 59 | config['NUMBER_OF_BOT_CARS'] = os.environ.get('DR_H2B_NUMBER_OF_BOT_CARS', '0') 60 | config['MIN_DISTANCE_BETWEEN_BOT_CARS'] = os.environ.get('DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS', '2.0') 61 | config['RANDOMIZE_BOT_CAR_LOCATIONS'] = os.environ.get('DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS', 'False') 62 | 
config['BOT_CAR_SPEED'] = os.environ.get('DR_H2B_BOT_CAR_SPEED', '0.2') 63 | 64 | local_yaml_path = os.path.abspath(os.path.join(os.environ.get('WORK_DIR'),'training_params.yaml')) 65 | print(local_yaml_path) 66 | with open(local_yaml_path, 'w') as yaml_file: 67 | yaml.dump(config, yaml_file, default_flow_style=False, default_style='\'', explicit_start=True) -------------------------------------------------------------------------------- /docs/multi_gpu.md: -------------------------------------------------------------------------------- 1 | # Training on a Computer with more than one GPU 2 | 3 | In some cases you might end up with having a computer with more than one GPU. This may be common on a workstation 4 | which may have one GPU for general graphics (e.g. GTX 10-series, RTX 20-series), as well as a data center GPU 5 | like a Tesla K40, K80 or M40. 6 | 7 | In this setting it can get a bit chaotic as DeepRacer will 'greedily' put any workload on any GPU - which will 8 | lead to Out-of-Memory somewhere down the road. 9 | 10 | ## Checking available GPUs 11 | 12 | You can use Tensorflow to give you an overview of available devices running `utils/cuda-check.sh`. 
13 | 14 | It will say something like: 15 | ``` 16 | 2020-07-04 12:25:55.179580: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA 17 | 2020-07-04 12:25:55.547206: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1411] Found device 0 with properties: 18 | name: GeForce GTX 1650 major: 7 minor: 5 memoryClockRate(GHz): 1.68 19 | pciBusID: 0000:04:00.0 20 | totalMemory: 3.82GiB freeMemory: 3.30GiB 21 | 2020-07-04 12:25:55.732066: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1411] Found device 1 with properties: 22 | name: Tesla M40 24GB major: 5 minor: 2 memoryClockRate(GHz): 1.112 23 | pciBusID: 0000:81:00.0 24 | totalMemory: 22.41GiB freeMemory: 22.30GiB 25 | 2020-07-04 12:25:55.732141: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1490] Adding visible gpu devices: 0, 1 26 | 2020-07-04 12:25:56.745647: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] Device interconnect StreamExecutor with strength 1 edge matrix: 27 | 2020-07-04 12:25:56.745719: I tensorflow/core/common_runtime/gpu/gpu_device.cc:977] 0 1 28 | 2020-07-04 12:25:56.745732: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990] 0: N N 29 | 2020-07-04 12:25:56.745743: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990] 1: N N 30 | 2020-07-04 12:25:56.745973: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1103] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 195 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1650, pci bus id: 0000:04:00.0, compute capability: 7.5) 31 | 2020-07-04 12:25:56.750352: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1103] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 1147 MB memory) -> physical GPU (device: 1, name: Tesla M40 24GB, pci bus id: 0000:81:00.0, compute capability: 5.2) 32 | 2020-07-04 12:25:56.774305: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1490] Adding visible 
gpu devices: 0, 1 33 | 2020-07-04 12:25:56.774408: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] Device interconnect StreamExecutor with strength 1 edge matrix: 34 | 2020-07-04 12:25:56.774425: I tensorflow/core/common_runtime/gpu/gpu_device.cc:977] 0 1 35 | 2020-07-04 12:25:56.774436: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990] 0: N N 36 | 2020-07-04 12:25:56.774446: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990] 1: N N 37 | 2020-07-04 12:25:56.774551: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1103] Created TensorFlow device (/device:GPU:0 with 195 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1650, pci bus id: 0000:04:00.0, compute capability: 7.5) 38 | 2020-07-04 12:25:56.774829: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1103] Created TensorFlow device (/device:GPU:1 with 1147 MB memory) -> physical GPU (device: 1, name: Tesla M40 24GB, pci bus id: 0000:81:00.0, compute capability: 5.2) 39 | ['/device:GPU:0', '/device:GPU:1'] 40 | ``` 41 | In this case the CUDA device #0 is the GTX 1650 and the CUDA device #1 is the Tesla M40. 42 | 43 | ### Selecting Device 44 | 45 | #### Robomaker 46 | To control the Robomaker then add the following to `system.env`: 47 | 48 | ``` 49 | CUDA_VISIBLE_DEVICES=1 50 | ``` 51 | The number is the CUDA number of the GPU you want the Robomakers to use. 52 | 53 | #### Sagemaker 54 | 55 | Sagemaker is more critical to place, but also more complicated, as you will have to build a new Docker image for it to work. 56 | 57 | A template is in `utils/Dockerfile.sagemaker-gpu`. Open it to alter the source image in `FROM`, and adapt `CUDA_VISIBLE_DEVICES`. 58 | 59 | Build the image with `docker build -t awsdeepracercommunity/deepracer-sagemaker:gpu-x -f utils/Dockerfile.sagemaker-gpu .` with x being anything you like. 60 | 61 | Update `system.env` to use the new image. 
62 | -------------------------------------------------------------------------------- /scripts/viewer/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | usage(){ 4 | echo "Usage: $0 [-t topic] [-w width] [-h height] [-q quality] -b [browser-command]" 5 | echo " -w Width of individual stream." 6 | echo " -h Heigth of individual stream." 7 | echo " -q Quality of the stream image." 8 | echo " -t Topic to follow - default /racecar/deepracer/kvs_stream" 9 | echo " -b Browser command (default: firefox --new-tab)" 10 | exit 1 11 | } 12 | 13 | trap ctrl_c INT 14 | 15 | function ctrl_c() { 16 | echo "Requested to stop." 17 | exit 1 18 | } 19 | 20 | # Stream definition 21 | TOPIC="/racecar/deepracer/kvs_stream" 22 | WIDTH=480 23 | HEIGHT=360 24 | QUALITY=75 25 | BROWSER="firefox --new-tab" 26 | 27 | while getopts ":w:h:q:t:b:" opt; do 28 | case $opt in 29 | w) WIDTH="$OPTARG" 30 | ;; 31 | h) HEIGHT="$OPTARG" 32 | ;; 33 | q) QUALITY="$OPTARG" 34 | ;; 35 | t) TOPIC="$OPTARG" 36 | ;; 37 | b) BROWSER="$OPTARG" 38 | ;; 39 | \?) echo "Invalid option -$OPTARG" >&2 40 | usage 41 | ;; 42 | esac 43 | done 44 | 45 | export DR_VIEWER_HTML=$DR_DIR/tmp/streams-$DR_RUN_ID.html 46 | export DR_NGINX_CONF=$DR_DIR/tmp/streams-$DR_RUN_ID.conf 47 | 48 | cat << EOF > $DR_NGINX_CONF 49 | server { 50 | listen 80; 51 | location / { 52 | root /usr/share/nginx/html; 53 | index index.html index.htm; 54 | } 55 | EOF 56 | echo "DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC
DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC
" > $DR_VIEWER_HTML 57 | 58 | if [[ "${DR_DOCKER_STYLE,,}" != "swarm" ]]; then 59 | ROBOMAKER_CONTAINERS=$(docker ps --format "{{.ID}} {{.Names}}" --filter name="deepracer-${DR_RUN_ID}" | grep robomaker | cut -f1 -d\ ) 60 | else 61 | ROBOMAKER_SERVICE_REPLICAS=$(docker service ps deepracer-${DR_RUN_ID}_robomaker | awk '/robomaker/ { print $1 }') 62 | for c in $ROBOMAKER_SERVICE_REPLICAS; do 63 | ROBOMAKER_CONTAINER_IP=$(docker inspect $c | jq -r '.[].NetworksAttachments[] | select (.Network.Spec.Name == "sagemaker-local") | .Addresses[0] ' | cut -f1 -d/) 64 | ROBOMAKER_CONTAINERS="${ROBOMAKER_CONTAINERS} ${ROBOMAKER_CONTAINER_IP}" 65 | done 66 | fi 67 | 68 | if [ -z "$ROBOMAKER_CONTAINERS" ]; then 69 | echo "No running robomakers. Exiting." 70 | exit 71 | fi 72 | 73 | 74 | for c in $ROBOMAKER_CONTAINERS; do 75 | C_URL="/$c/stream?topic=${TOPIC}&quality=${QUALITY}&width=${WIDTH}&height=${HEIGHT}" 76 | C_IMG="
" 77 | echo $C_IMG >> $DR_VIEWER_HTML 78 | echo " location /$c { proxy_pass http://$c:8080; rewrite /$c/(.*) /\$1 break; }" >> $DR_NGINX_CONF 79 | done 80 | 81 | echo "
" >> $DR_VIEWER_HTML 82 | echo "}" >> $DR_NGINX_CONF 83 | 84 | # Check if we will use Docker Swarm or Docker Compose 85 | STACK_NAME="deepracer-$DR_RUN_ID-viewer" 86 | COMPOSE_FILES=$DR_DIR/docker/docker-compose-webviewer.yml 87 | 88 | if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; 89 | then 90 | COMPOSE_FILES="$COMPOSE_FILES -c $DR_DIR/docker/docker-compose-webviewer-swarm.yml" 91 | docker stack deploy -c $COMPOSE_FILES $STACK_NAME 92 | else 93 | docker-compose -f $COMPOSE_FILES -p $STACK_NAME --log-level ERROR up -d 94 | fi 95 | 96 | # Starting browser if using local X and having display defined. 97 | if [[ -n "${DISPLAY}" && "${DR_HOST_X,,}" == "true" ]]; then 98 | echo "Starting browser '$BROWSER'." 99 | if [ "${DR_DOCKER_STYLE,,}" == "swarm" ]; 100 | then 101 | sleep 5 102 | fi 103 | $BROWSER "http://127.0.01:8100" & 104 | fi 105 | 106 | -------------------------------------------------------------------------------- /bin/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap ctrl_c INT 4 | 5 | function ctrl_c() { 6 | echo "Requested to stop." 7 | exit 1 8 | } 9 | 10 | 11 | 12 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 13 | 14 | ## Patch system 15 | sudo apt-get update && sudo apt-mark hold grub-pc && sudo DEBIAN_FRONTEND=noninteractive apt-get -y -o \ 16 | DPkg::options::="--force-confdef" -o DPkg::options::="--force-confold" -qq --force-yes upgrade && \ 17 | sudo apt-get install --no-install-recommends -y jq 18 | source $DIR/detect.sh 19 | echo "Detected cloud type ${CLOUD_NAME}" 20 | 21 | ## Do I have a GPU 22 | GPUS=$(lspci | awk '/NVIDIA/ && ( /VGA/ || /3D controller/ ) ' | wc -l ) 23 | if [ $? -ne 0 ] || [ $GPUS -eq 0 ]; 24 | then 25 | ARCH="cpu" 26 | echo "No NVIDIA GPU detected. Will not install drivers." 
27 | else 28 | ARCH="gpu" 29 | fi 30 | 31 | ## Do I have an additional disk for Docker images - looking for /dev/sdc (Azure) 32 | 33 | if [[ "${CLOUD_NAME}" == "azure" ]]; 34 | then 35 | ADDL_DISK=$(lsblk | awk '/^sdc/ {print $1}') 36 | ADDL_PART=$(lsblk -l | awk -v DISK="$ADDL_DISK" '($0 ~ DISK) && ($0 ~ /part/) {print $1}') 37 | 38 | if [ -n "$ADDL_DISK" ] && [ -z "$ADDL_PART" ]; 39 | then 40 | echo "Found $ADDL_DISK, preparing it for use" 41 | echo -e "g\nn\np\n1\n\n\nw\n" | sudo fdisk /dev/$ADDL_DISK 42 | sleep 1s 43 | ADDL_DEVICE=$(echo "/dev/"$ADDL_DISK"1") 44 | sudo mkfs.ext4 $ADDL_DEVICE 45 | sudo mkdir -p /var/lib/docker 46 | echo "$ADDL_DEVICE /var/lib/docker ext4 rw,user,auto 0 0" | sudo tee -a /etc/fstab 47 | mount /var/lib/docker 48 | if [ $? -ne 0 ] 49 | then 50 | echo "Error during preparing of additional disk. Exiting." 51 | exit 1 52 | fi 53 | elif [ -n "$ADDL_DISK" ] && [ -n "$ADDL_PART" ]; 54 | then 55 | echo "Found $ADDL_DISK - $ADDL_PART already mounted. Installing into present drive/directory structure." 56 | 57 | else 58 | echo "Did not find $ADDL_DISK. Installing into present drive/directory structure." 59 | fi 60 | fi 61 | 62 | ## Adding Nvidia Drivers 63 | if [[ "${ARCH}" == "gpu" ]]; 64 | then 65 | distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID | sed 's/\.//') 66 | sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64/3bf863cc.pub 67 | sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/$distribution/x86_64/7fa2af80.pub 68 | echo "deb http://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64 /" | sudo tee /etc/apt/sources.list.d/cuda.list 69 | echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/$distribution/x86_64 /" | sudo tee /etc/apt/sources.list.d/cuda_learn.list 70 | sudo apt update && sudo apt install -y nvidia-driver-470-server cuda-minimal-build-11-4 --no-install-recommends -o Dpkg::Options::="--force-overwrite" 71 | fi 72 | 73 | ## Adding AWSCli 74 | sudo apt-get install -y --no-install-recommends awscli python3-boto3 75 | 76 | ## Installing Docker 77 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - 78 | sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" 79 | sudo apt-get update && sudo apt-get install -y --no-install-recommends docker-ce docker-ce-cli containerd.io 80 | 81 | if [[ "${ARCH}" == "gpu" ]]; 82 | then 83 | distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) 84 | curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - 85 | curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list 86 | 87 | sudo apt-get update && sudo apt-get install -y --no-install-recommends nvidia-docker2 nvidia-container-toolkit nvidia-container-runtime 88 | cat /etc/docker/daemon.json | jq 'del(."default-runtime") + {"default-runtime": "nvidia"}' | sudo tee /etc/docker/daemon.json 89 | fi 90 | sudo systemctl enable docker 91 | sudo systemctl restart docker 92 | 93 | ## Ensure user can run docker 94 | sudo usermod -a -G docker $(id -un) 95 | 96 | ## Installing Docker Compose 97 | sudo curl -L https://github.com/docker/compose/releases/download/1.29.2/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose 98 | sudo chmod +x /usr/local/bin/docker-compose 99 | 100 | ## Reboot to load driver -- continue install if in cloud-init 101 | CLOUD_INIT=$(pstree -s $BASHPID | awk /cloud-init/ | wc -l) 102 | 103 | if [[ "$CLOUD_INIT" -ne 0 ]]; 104 | then 105 | echo "Rebooting in 5 seconds. Will continue with install." 106 | cd $DIR 107 | ./runonce.sh "./init.sh -c ${CLOUD_NAME} -a ${ARCH}" 108 | sleep 5s 109 | sudo reboot 110 | else 111 | echo "First stage done. Please reboot and run init.sh -c ${CLOUD_NAME} -a ${ARCH}" 112 | fi 113 | -------------------------------------------------------------------------------- /scripts/training/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | source $DR_DIR/bin/scripts_wrapper.sh 4 | 5 | usage(){ 6 | echo "Usage: $0 [-w] [-q | -s | -r [n] | -a ] [-v]" 7 | echo " -w Wipes the target AWS DeepRacer model structure before upload." 8 | echo " -q Do not output / follow a log when starting." 9 | echo " -a Follow all Sagemaker and Robomaker logs." 10 | echo " -s Follow Sagemaker logs (default)." 
11 | echo " -v Updates the viewer webpage." 12 | echo " -r [n] Follow Robomaker logs for worker n (default worker 0 / replica 1)." 13 | exit 1 14 | } 15 | 16 | trap ctrl_c INT 17 | 18 | function ctrl_c() { 19 | echo "Requested to stop." 20 | exit 1 21 | } 22 | 23 | OPT_DISPLAY="SAGEMAKER" 24 | 25 | while getopts ":whqsavr:" opt; do 26 | case $opt in 27 | w) OPT_WIPE="WIPE" 28 | ;; 29 | q) OPT_QUIET="QUIET" 30 | ;; 31 | s) OPT_DISPLAY="SAGEMAKER" 32 | ;; 33 | a) OPT_DISPLAY="ALL" 34 | ;; 35 | r) # Check if value is in numeric format. 36 | OPT_DISPLAY="ROBOMAKER" 37 | if [[ $OPTARG =~ ^[0-9]+$ ]]; then 38 | OPT_ROBOMAKER=$OPTARG 39 | else 40 | OPT_ROBOMAKER=0 41 | ((OPTIND--)) 42 | fi 43 | ;; 44 | v) OPT_VIEWER="VIEWER" 45 | ;; 46 | h) usage 47 | ;; 48 | \?) echo "Invalid option -$OPTARG" >&2 49 | usage 50 | ;; 51 | esac 52 | done 53 | 54 | # Ensure Sagemaker's folder is there 55 | if [ ! -d /tmp/sagemaker ]; then 56 | sudo mkdir -p /tmp/sagemaker 57 | sudo chmod -R g+w /tmp/sagemaker 58 | fi 59 | 60 | #Check if files are available 61 | S3_PATH="s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX" 62 | 63 | S3_FILES=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 ls ${S3_PATH} | wc -l) 64 | if [[ "$S3_FILES" -gt 0 ]]; 65 | then 66 | if [[ -z $OPT_WIPE ]]; 67 | then 68 | echo "Selected path $S3_PATH exists. Delete it, or use -w option. Exiting." 69 | exit 1 70 | else 71 | echo "Wiping path $S3_PATH." 
72 | aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 rm --recursive ${S3_PATH} 73 | fi 74 | fi 75 | 76 | # Base compose file 77 | if [ ${DR_ROBOMAKER_MOUNT_LOGS,,} = "true" ]; 78 | then 79 | COMPOSE_FILES="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DR_DIR/docker/docker-compose-mount.yml" 80 | export DR_MOUNT_DIR="$DR_DIR/data/logs/robomaker/$DR_LOCAL_S3_MODEL_PREFIX" 81 | mkdir -p $DR_MOUNT_DIR 82 | else 83 | COMPOSE_FILES="$DR_TRAIN_COMPOSE_FILE" 84 | fi 85 | 86 | # set evaluation specific environment variables 87 | STACK_NAME="deepracer-$DR_RUN_ID" 88 | 89 | export DR_CURRENT_PARAMS_FILE=${DR_LOCAL_S3_TRAINING_PARAMS_FILE} 90 | 91 | WORKER_CONFIG=$(python3 $DR_DIR/scripts/training/prepare-config.py) 92 | 93 | if [ "$DR_WORKERS" -gt 1 ]; then 94 | echo "Starting $DR_WORKERS workers" 95 | 96 | if [[ "${DR_DOCKER_STYLE,,}" != "swarm" ]]; 97 | then 98 | mkdir -p $DR_DIR/tmp/comms.$DR_RUN_ID 99 | rm -rf $DR_DIR/tmp/comms.$DR_RUN_ID/* 100 | COMPOSE_FILES="$COMPOSE_FILES $DR_DOCKER_FILE_SEP $DR_DIR/docker/docker-compose-robomaker-multi.yml" 101 | fi 102 | 103 | if [ "$DR_TRAIN_MULTI_CONFIG" == "True" ]; then 104 | export MULTI_CONFIG=$WORKER_CONFIG 105 | echo "Multi-config training, creating multiple Robomaker configurations in $S3_PATH" 106 | else 107 | echo "Creating Robomaker configuration in $S3_PATH/$DR_LOCAL_S3_TRAINING_PARAMS_FILE" 108 | fi 109 | export ROBOMAKER_COMMAND="./run.sh multi distributed_training.launch" 110 | 111 | else 112 | export ROBOMAKER_COMMAND="./run.sh run distributed_training.launch" 113 | echo "Creating Robomaker configuration in $S3_PATH/$DR_LOCAL_S3_TRAINING_PARAMS_FILE" 114 | fi 115 | 116 | # Check if we are using Host X -- ensure variables are populated 117 | if [[ "${DR_HOST_X,,}" == "true" ]]; 118 | then 119 | if [[ -n "$DR_DISPLAY" ]]; then 120 | ROBO_DISPLAY=$DR_DISPLAY 121 | else 122 | ROBO_DISPLAY=$DISPLAY 123 | fi 124 | 125 | if ! 
DISPLAY=$ROBO_DISPLAY timeout 1s xset q &>/dev/null; then
        # NOTE(review): exits with status 0 even though this is a failure path;
        # callers checking $? cannot distinguish this from success — confirm intent.
        echo "No X Server running on display $ROBO_DISPLAY. Exiting"
        exit 0
    fi

    # Fall back to ~/.Xauthority when XAUTHORITY is not set; stop if neither exists.
    if [[ -z "$XAUTHORITY" ]]; then
        export XAUTHORITY=~/.Xauthority
        if [[ ! -f "$XAUTHORITY" ]]; then
            # NOTE(review): also exits 0 on failure — see note above.
            echo "No XAUTHORITY defined. .Xauthority does not exist. Stopping."
            exit 0
        fi
    fi

fi

# Check if we will use Docker Swarm or Docker Compose
if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]];
then
    # Count swarm nodes labelled for Robomaker placement; at least one is required.
    ROBOMAKER_NODES=$(docker node ls --format '{{.ID}}' | xargs docker inspect | jq '.[] | select (.Spec.Labels.Robomaker == "true") | .ID' | wc -l)
    if [[ "$ROBOMAKER_NODES" -eq 0 ]];
    then
        echo "ERROR: No Swarm Nodes labelled for placement of Robomaker. Please add Robomaker node."
        echo "       Example: docker node update --label-add Robomaker=true $(docker node inspect self | jq .[0].ID -r)"
        exit 0
    fi

    # Same check for Sagemaker placement.
    SAGEMAKER_NODES=$(docker node ls --format '{{.ID}}' | xargs docker inspect | jq '.[] | select (.Spec.Labels.Sagemaker == "true") | .ID' | wc -l)
    if [[ "$SAGEMAKER_NODES" -eq 0 ]];
    then
        echo "ERROR: No Swarm Nodes labelled for placement of Sagemaker. Please add Sagemaker node."
        echo "       Example: docker node update --label-add Sagemaker=true $(docker node inspect self | jq .[0].ID -r)"
        exit 0
    fi

    # Swarm mode: COMPOSE_FILES already carries the "-c file" pairs for stack deploy.
    DISPLAY=$ROBO_DISPLAY docker stack deploy $COMPOSE_FILES $STACK_NAME

else
    # Compose mode: scale the robomaker service to the requested worker count.
    DISPLAY=$ROBO_DISPLAY docker-compose $COMPOSE_FILES -p $STACK_NAME --log-level ERROR up -d --scale robomaker=$DR_WORKERS
fi

# Viewer
if [ -n "$OPT_VIEWER" ]; then
    # Delay so the containers are up before the viewer page is refreshed.
    (sleep 5; dr-update-viewer)
fi

# Request to be quiet. Quitting here.
171 | if [ -n "$OPT_QUIET" ]; then 172 | exit 0 173 | fi 174 | 175 | # Trigger requested log-file 176 | if [[ "${OPT_DISPLAY,,}" == "all" && -n "${DISPLAY}" && "${DR_HOST_X,,}" == "true" ]]; then 177 | dr-logs-sagemaker -w 15 178 | if [ "${DR_WORKERS}" -gt 1 ]; then 179 | for i in $(seq 1 ${DR_WORKERS}) 180 | do 181 | dr-logs-robomaker -w 15 -n $i 182 | done 183 | else 184 | dr-logs-robomaker -w 15 185 | fi 186 | elif [[ "${OPT_DISPLAY,,}" == "robomaker" ]]; then 187 | dr-logs-robomaker -w 15 -n $OPT_ROBOMAKER 188 | elif [[ "${OPT_DISPLAY,,}" == "sagemaker" ]]; then 189 | dr-logs-sagemaker -w 15 190 | fi 191 | 192 | -------------------------------------------------------------------------------- /utils/sample-createspot.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ## This is sample code that will generally show you how to launch a spot instance on aws and leverage the 4 | ## automation built into deepracer-for-cloud to automatically start training 5 | ## Changes required to work: 6 | ## Input location where your training will take place -- S3_LOCATION 7 | ## Input security group, iam role, and key-name 8 | 9 | ## First you need to tell the script where in s3 your training will take place 10 | ## can be either a bucket at the root level, or a bucket/prefix. don't include the s3:// 11 | 12 | S3_LOCATION=<#########> 13 | 14 | ## extract bucket location 15 | BUCKET=${S3_LOCATION%%/*} 16 | 17 | ## extract prefix location 18 | if [[ "$S3_LOCATION" == *"/"* ]] 19 | then 20 | PREFIX=${S3_LOCATION#*/} 21 | else 22 | PREFIX="" 23 | fi 24 | 25 | ## Fill these out with your custom information if you want to upload and submit to leaderboard. 
## (not required to run)
DR_UPLOAD_S3_PREFIX=########

## set the instance type you want to launch
INSTANCE_TYPE=c5.2xlarge

## if you want to modify additional variables from the default, add them here, then add them to the
## "replace static parameters" section further below. Only World name is handled for now.
WORLD_NAME=FS_June2020

## modify this if you want additional robomaker workers
DR_WORKERS=1

## select which images you want to use. these will be used later for a docker pull
DR_SAGEMAKER_IMAGE=cpu-avx-mkl
DR_ROBOMAKER_IMAGE=cpu-avx2

## check the s3 location for existing training folders
## automatically determine the latest training run (highest number), and set model parameters accordingly
## this script assumes the format rl-deepracer-1, rl-deepracer-2, etc. you will need to modify if your schema differs

LAST_TRAINING=$(aws s3 ls $S3_LOCATION/rl-deepracer | sort -t - -k 3 -g | tail -n 1 | awk '{print $2}')
## drop trailing slash
LAST_TRAINING=$(echo $LAST_TRAINING | sed 's:/*$::')

CONFIG_FILE="./run.env"
OLD_SYSTEMENV="./system.env"

## incorporate logic from increment.sh, slightly modified to use last training
OPT_DELIM='-'
## Reuse the listing result from above instead of issuing a second, identical
## "aws s3 ls" call (the original queried S3 twice and discarded the first result).
CURRENT_RUN_MODEL=$LAST_TRAINING
## get number at the end; require an all-digit suffix.
## (was /[0-9]*/, which matches ANY string via an empty match, so the
## "else print empty" branch was unreachable)
CURRENT_RUN_MODEL_NUM=$(echo "${CURRENT_RUN_MODEL}" | \
  awk -v DELIM="${OPT_DELIM}" '{ n=split($0,a,DELIM); if (a[n] ~ /^[0-9]+$/) print a[n]; else print ""; }')

## Quote the operand so [ -z ] cannot break on whitespace in the key.
if [ -z "$LAST_TRAINING" ]
then
  echo No prior training found
  if [[ $PREFIX == "" ]]
  then
    NEW_RUN_MODEL=rl-deepracer-1
  else
    NEW_RUN_MODEL="$PREFIX/rl-deepracer-1"
  fi
  PRETRAINED=False
  CURRENT_RUN_MODEL=$NEW_RUN_MODEL
else

| NEW_RUN_MODEL_NUM=$(echo "${CURRENT_RUN_MODEL_NUM} + 1" | bc ) 76 | PRETRAINED=True 77 | 78 | if [[ $PREFIX == "" ]] 79 | then 80 | NEW_RUN_MODEL=$(echo $CURRENT_RUN_MODEL | sed "s/${CURRENT_RUN_MODEL_NUM}\$/${NEW_RUN_MODEL_NUM}/") 81 | else 82 | NEW_RUN_MODEL=$(echo $CURRENT_RUN_MODEL | sed "s/${CURRENT_RUN_MODEL_NUM}\$/${NEW_RUN_MODEL_NUM}/") 83 | NEW_RUN_MODEL="$PREFIX/$NEW_RUN_MODEL" 84 | CURRENT_RUN_MODEL="$PREFIX/$CURRENT_RUN_MODEL" 85 | fi 86 | echo Last training was $CURRENT_RUN_MODEL so next training is $NEW_RUN_MODEL 87 | fi 88 | 89 | if [[ $PREFIX == "" ]] 90 | then 91 | CUSTOM_FILES_PREFIX="custom_files" 92 | else 93 | CUSTOM_FILES_PREFIX="$PREFIX/custom_files" 94 | fi 95 | 96 | ## Replace dynamic parameters in run.env (still local to your directory) 97 | sed -i.bak -re "s:(DR_LOCAL_S3_PRETRAINED_PREFIX=).*$:\1$CURRENT_RUN_MODEL:g; s:(DR_LOCAL_S3_PRETRAINED=).*$:\1$PRETRAINED:g; s:(DR_LOCAL_S3_MODEL_PREFIX=).*$:\1$NEW_RUN_MODEL:g; s:(DR_LOCAL_S3_CUSTOM_FILES_PREFIX=).*$:\1$CUSTOM_FILES_PREFIX:g" "$CONFIG_FILE" 98 | sed -i.bak -re "s/(DR_LOCAL_S3_BUCKET=).*$/\1$BUCKET/g" "$CONFIG_FILE" 99 | 100 | ## Replace static parameters in run.env (still local to your directory) 101 | sed -i.bak -re "s/(DR_UPLOAD_S3_PREFIX=).*$/\1$DR_UPLOAD_S3_PREFIX/g" "$CONFIG_FILE" 102 | sed -i.bak -re "s/(DR_WORLD_NAME=).*$/\1$WORLD_NAME/g" "$CONFIG_FILE" 103 | 104 | ## Replace static paramaters in system.env file, including sagemaker and robomaker images (still local to your directory) and the number of DR_workers 105 | sed -i.bak -re "s/(DR_UPLOAD_S3_BUCKET=).*$/\1$DR_UPLOAD_S3_BUCKET/g; s/(DR_SAGEMAKER_IMAGE=).*$/\1$DR_SAGEMAKER_IMAGE/g; s/(DR_ROBOMAKER_IMAGE=).*$/\1$DR_ROBOMAKER_IMAGE/g; s/(DR_WORKERS=).*$/\1$DR_WORKERS/g" "$OLD_SYSTEMENV" 106 | 107 | ## upload the new run.env and system.env files into your S3 bucket (same s3 location identified earlier) 108 | ## files are loaded into the node-config folder/prefix. 
You can also upload other files to node config, and they 109 | ## will sync to the EC2 instance as part of the autorun script later. If you add other files, make sure they are 110 | ## in node-config in the same directory structure as DRfc; example: s3location/node-config/scripts/training/.start.sh 111 | RUNENV_LOCATION=$S3_LOCATION/node-config/run.env 112 | SYSENV_LOCATION=$S3_LOCATION/node-config/system.env 113 | 114 | aws s3 cp ./run.env s3://$RUNENV_LOCATION 115 | aws s3 cp ./system.env s3://$SYSENV_LOCATION 116 | 117 | ## upload a custom autorun script to S3. there is a default autorun script in the repo that will be used unless a custom one is specified here instead 118 | #aws s3 cp ./autorun.sh s3://$S3_LOCATION/autorun.sh 119 | 120 | ## upload custom files -- if you dont want this, comment these lines out 121 | aws s3 cp ./model_metadata.json s3://$S3_LOCATION/custom_files/model_metadata.json 122 | aws s3 cp ./reward_function.py s3://$S3_LOCATION/custom_files/reward_function.py 123 | aws s3 cp ./hyperparameters.json s3://$S3_LOCATION/custom_files/hyperparameters.json 124 | 125 | ## launch an ec2 126 | ## update with your own settings, including key-name, security-group, and iam-instance-profile at a minimum 127 | ## user data includes a command to create a .txt file which simply contains the name of the s3 location 128 | ## this filename will be used as fundamental input to autorun.sh script run later on that instance 129 | ## you need to ensure you have proper IAM permissions to launch this instance 130 | 131 | aws ec2 run-instances \ 132 | --image-id ami-085925f297f89fce1 \ 133 | --count 1 \ 134 | --instance-type $INSTANCE_TYPE \ 135 | --key-name <####keyname####> \ 136 | --security-group-ids sg-<####sgid####> \ 137 | --block-device-mappings 'DeviceName=/dev/sda1,Ebs={DeleteOnTermination=true,VolumeSize=40}' \ 138 | --iam-instance-profile Arn=arn:aws:iam::<####acct_num####>:instance-profile/<####role_name####> \ 139 | --instance-market-options 
MarketType=spot \ 140 | --user-data "#!/bin/bash 141 | su -c 'git clone https://github.com/aws-deepracer-community/deepracer-for-cloud.git && echo "$S3_LOCATION/node-config" > /home/ubuntu/deepracer-for-cloud/autorun.s3url && /home/ubuntu/deepracer-for-cloud/bin/prepare.sh' - ubuntu" 142 | -------------------------------------------------------------------------------- /scripts/evaluation/prepare-config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import boto3 4 | import sys 5 | import os 6 | import time 7 | import json 8 | import io 9 | import yaml 10 | 11 | def str2bool(v): 12 | return v.lower() in ("yes", "true", "t", "1") 13 | 14 | config = {} 15 | config['CAR_COLOR'] = [] 16 | config['BODY_SHELL_TYPE'] = [] 17 | config['RACER_NAME'] = [] 18 | config['DISPLAY_NAME'] = [] 19 | config['MODEL_S3_PREFIX'] = [] 20 | config['MODEL_S3_BUCKET'] = [] 21 | config['SIMTRACE_S3_PREFIX'] = [] 22 | config['SIMTRACE_S3_BUCKET'] = [] 23 | config['KINESIS_VIDEO_STREAM_NAME'] = [] 24 | config['METRICS_S3_BUCKET'] = [] 25 | config['METRICS_S3_OBJECT_KEY'] = [] 26 | config['MP4_S3_BUCKET'] = [] 27 | config['MP4_S3_OBJECT_PREFIX'] = [] 28 | 29 | # Basic configuration; including all buckets etc. 
# Region and job-level settings; each value falls back to a sane default when
# the corresponding DR_* environment variable is not set.
config['AWS_REGION'] = os.environ.get('DR_AWS_APP_REGION', 'us-east-1')
config['JOB_TYPE'] = 'EVALUATION'
config['KINESIS_VIDEO_STREAM_NAME'] = os.environ.get('DR_KINESIS_STREAM_NAME', 'my-kinesis-stream')
# NOTE(review): the env-var name here is an empty string, so this ALWAYS
# resolves to 'Dummy'. Likely a lost variable name — confirm against upstream.
config['ROBOMAKER_SIMULATION_JOB_ACCOUNT_ID'] = os.environ.get('', 'Dummy')

# Primary racer: model and simtrace share the same bucket/prefix.
config['MODEL_S3_PREFIX'].append(os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker'))
config['MODEL_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket'))
config['SIMTRACE_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket'))
config['SIMTRACE_S3_PREFIX'].append(os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker'))

# Metrics: key is timestamped so repeated evaluations never overwrite each other.
config['METRICS_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket'))
metrics_prefix = os.environ.get('DR_LOCAL_S3_METRICS_PREFIX', None)
if metrics_prefix is not None:
    config['METRICS_S3_OBJECT_KEY'].append('{}/EvaluationMetrics-{}.json'.format(metrics_prefix, str(round(time.time()))))
else:
    config['METRICS_S3_OBJECT_KEY'].append('DeepRacer-Metrics/EvaluationMetrics-{}.json'.format(str(round(time.time()))))

# MP4 configuration: only populated when DR_EVAL_SAVE_MP4 is truthy.
save_mp4 = str2bool(os.environ.get("DR_EVAL_SAVE_MP4", "False"))
if save_mp4:
    config['MP4_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket'))
    config['MP4_S3_OBJECT_PREFIX'].append('{}/{}'.format(os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'bucket'),'mp4'))

# Checkpoint selection ('last' or 'best').
config['EVAL_CHECKPOINT'] = os.environ.get('DR_EVAL_CHECKPOINT', 'last')

# Car and training; CAR_COLOR only applies to the stock 'deepracer' shell.
body_shell_type = os.environ.get('DR_CAR_BODY_SHELL_TYPE', 'deepracer')
config['BODY_SHELL_TYPE'].append(body_shell_type)
if body_shell_type == 'deepracer':
    config['CAR_COLOR'].append(os.environ.get('DR_CAR_COLOR', 'Red'))
config['DISPLAY_NAME'].append(os.environ.get('DR_DISPLAY_NAME', 'racer1'))
config['RACER_NAME'].append(os.environ.get('DR_RACER_NAME', 'racer1')) 64 | 65 | config['RACE_TYPE'] = os.environ.get('DR_RACE_TYPE', 'TIME_TRIAL') 66 | config['WORLD_NAME'] = os.environ.get('DR_WORLD_NAME', 'LGSWide') 67 | config['NUMBER_OF_TRIALS'] = os.environ.get('DR_EVAL_NUMBER_OF_TRIALS', '5') 68 | config['ENABLE_DOMAIN_RANDOMIZATION'] = os.environ.get('DR_ENABLE_DOMAIN_RANDOMIZATION', 'false') 69 | config['RESET_BEHIND_DIST'] = os.environ.get('DR_EVAL_RESET_BEHIND_DIST', '1.0') 70 | 71 | config['IS_CONTINUOUS'] = os.environ.get('DR_EVAL_IS_CONTINUOUS', 'True') 72 | config['NUMBER_OF_RESETS'] = os.environ.get('DR_EVAL_MAX_RESETS', '0') 73 | 74 | config['OFF_TRACK_PENALTY'] = os.environ.get('DR_EVAL_OFF_TRACK_PENALTY', '5.0') 75 | config['COLLISION_PENALTY'] = os.environ.get('DR_COLLISION_PENALTY', '5.0') 76 | 77 | # Object Avoidance 78 | if config['RACE_TYPE'] == 'OBJECT_AVOIDANCE': 79 | config['NUMBER_OF_OBSTACLES'] = os.environ.get('DR_OA_NUMBER_OF_OBSTACLES', '6') 80 | config['MIN_DISTANCE_BETWEEN_OBSTACLES'] = os.environ.get('DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES', '2.0') 81 | config['RANDOMIZE_OBSTACLE_LOCATIONS'] = os.environ.get('DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS', 'True') 82 | config['IS_OBSTACLE_BOT_CAR'] = os.environ.get('DR_OA_IS_OBSTACLE_BOT_CAR', 'false') 83 | 84 | object_position_str = os.environ.get('DR_OA_OBJECT_POSITIONS', "") 85 | if object_position_str != "": 86 | object_positions = [] 87 | for o in object_position_str.split(";"): 88 | object_positions.append(o) 89 | config['OBJECT_POSITIONS'] = object_positions 90 | config['NUMBER_OF_OBSTACLES'] = str(len(object_positions)) 91 | 92 | # Head to Bot 93 | if config['RACE_TYPE'] == 'HEAD_TO_BOT': 94 | config['IS_LANE_CHANGE'] = os.environ.get('DR_H2B_IS_LANE_CHANGE', 'False') 95 | config['LOWER_LANE_CHANGE_TIME'] = os.environ.get('DR_H2B_LOWER_LANE_CHANGE_TIME', '3.0') 96 | config['UPPER_LANE_CHANGE_TIME'] = os.environ.get('DR_H2B_UPPER_LANE_CHANGE_TIME', '5.0') 97 | 
config['LANE_CHANGE_DISTANCE'] = os.environ.get('DR_H2B_LANE_CHANGE_DISTANCE', '1.0') 98 | config['NUMBER_OF_BOT_CARS'] = os.environ.get('DR_H2B_NUMBER_OF_BOT_CARS', '0') 99 | config['MIN_DISTANCE_BETWEEN_BOT_CARS'] = os.environ.get('DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS', '2.0') 100 | config['RANDOMIZE_BOT_CAR_LOCATIONS'] = os.environ.get('DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS', 'False') 101 | config['BOT_CAR_SPEED'] = os.environ.get('DR_H2B_BOT_CAR_SPEED', '0.2') 102 | config['PENALTY_SECONDS'] = os.environ.get('DR_H2B_BOT_CAR_PENALTY', '2.0') 103 | 104 | # Head to Model 105 | if config['RACE_TYPE'] == 'HEAD_TO_MODEL': 106 | config['MODEL_S3_PREFIX'].append(os.environ.get('DR_EVAL_OPP_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker')) 107 | config['MODEL_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket')) 108 | config['SIMTRACE_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket')) 109 | config['SIMTRACE_S3_PREFIX'].append(os.environ.get('DR_EVAL_OPP_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker')) 110 | 111 | # Metrics 112 | config['METRICS_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket')) 113 | metrics_prefix = os.environ.get('DR_EVAL_OPP_S3_METRICS_PREFIX', '{}/{}'.format(os.environ.get('DR_EVAL_OPP_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker'),'metrics')) 114 | if metrics_prefix is not None: 115 | config['METRICS_S3_OBJECT_KEY'].append('{}/EvaluationMetrics-{}.json'.format(metrics_prefix, str(round(time.time())))) 116 | else: 117 | config['METRICS_S3_OBJECT_KEY'].append('DeepRacer-Metrics/EvaluationMetrics-{}.json'.format(str(round(time.time())))) 118 | 119 | # MP4 configuration / sav 120 | save_mp4 = str2bool(os.environ.get("DR_EVAL_SAVE_MP4", "False")) 121 | if save_mp4: 122 | config['MP4_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket')) 123 | config['MP4_S3_OBJECT_PREFIX'].append('{}/{}'.format(os.environ.get('DR_EVAL_OPP_MODEL_PREFIX', 'bucket'),'mp4')) 124 | 125 | # Car and training 126 | 
config['DISPLAY_NAME'].append(os.environ.get('DR_EVAL_OPP_DISPLAY_NAME', 'racer1')) 127 | config['RACER_NAME'].append(os.environ.get('DR_EVAL_OPP_RACER_NAME', 'racer1')) 128 | 129 | body_shell_type = os.environ.get('DR_EVAL_OPP_CAR_BODY_SHELL_TYPE', 'deepracer') 130 | config['BODY_SHELL_TYPE'].append(body_shell_type) 131 | config['VIDEO_JOB_TYPE'] = 'EVALUATION' 132 | config['CAR_COLOR'] = ['Purple', 'Orange'] 133 | config['MODEL_NAME'] = config['DISPLAY_NAME'] 134 | 135 | # S3 Setup / write and upload file 136 | s3_endpoint_url = os.environ.get('DR_LOCAL_S3_ENDPOINT_URL', None) 137 | s3_region = config['AWS_REGION'] 138 | s3_bucket = config['MODEL_S3_BUCKET'][0] 139 | s3_prefix = config['MODEL_S3_PREFIX'][0] 140 | s3_mode = os.environ.get('DR_LOCAL_S3_AUTH_MODE','profile') 141 | if s3_mode == 'profile': 142 | s3_profile = os.environ.get('DR_LOCAL_S3_PROFILE', 'default') 143 | else: # mode is 'role' 144 | s3_profile = None 145 | s3_yaml_name = os.environ.get('DR_LOCAL_S3_EVAL_PARAMS_FILE', 'eval_params.yaml') 146 | yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name)) 147 | 148 | session = boto3.session.Session(profile_name=s3_profile) 149 | s3_client = session.client('s3', region_name=s3_region, endpoint_url=s3_endpoint_url) 150 | 151 | yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name)) 152 | local_yaml_path = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'tmp', 'eval-params-' + str(round(time.time())) + '.yaml')) 153 | 154 | with open(local_yaml_path, 'w') as yaml_file: 155 | yaml.dump(config, yaml_file, default_flow_style=False, default_style='\'', explicit_start=True) 156 | 157 | s3_client.upload_file(Bucket=s3_bucket, Key=yaml_key, Filename=local_yaml_path) 158 | -------------------------------------------------------------------------------- /bin/init.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | trap ctrl_c INT 4 | 5 | function ctrl_c() { 6 | echo 
"Requested to stop." 7 | exit 1 8 | } 9 | 10 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 11 | INSTALL_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." >/dev/null 2>&1 && pwd )" 12 | 13 | OPT_ARCH="gpu" 14 | OPT_CLOUD="" 15 | 16 | while getopts ":m:c:a:" opt; do 17 | case $opt in 18 | a) OPT_ARCH="$OPTARG" 19 | ;; 20 | m) OPT_MOUNT="$OPTARG" 21 | ;; 22 | c) OPT_CLOUD="$OPTARG" 23 | ;; 24 | \?) echo "Invalid option -$OPTARG" >&2 25 | exit 1 26 | ;; 27 | esac 28 | done 29 | 30 | if [[ -z "$OPT_CLOUD" ]]; then 31 | source $SCRIPT_DIR/detect.sh 32 | OPT_CLOUD=$CLOUD_NAME 33 | echo "Detected cloud type to be $CLOUD_NAME" 34 | fi 35 | 36 | # Find CPU Level 37 | CPU_LEVEL="cpu-avx" 38 | 39 | if [[ -f /proc/cpuinfo ]] && [[ "$(cat /proc/cpuinfo | grep avx2 | wc -l)" > 0 ]]; then 40 | CPU_LEVEL="cpu-avx2" 41 | elif [[ "$(type sysctl 2> /dev/null)" ]] && [[ "$(sysctl -n hw.optional.avx2_0)" == 1 ]]; then 42 | CPU_LEVEL="cpu-avx2" 43 | fi 44 | 45 | # Check if Intel (to ensure MKN) 46 | if [[ -f /proc/cpuinfo ]] && [[ "$(cat /proc/cpuinfo | grep GenuineIntel | wc -l)" > 0 ]]; then 47 | CPU_INTEL="true" 48 | elif [[ "$(type sysctl 2> /dev/null)" ]] && [[ "$(sysctl -n machdep.cpu.vendor)" == "GenuineIntel" ]]; then 49 | CPU_INTEL="true" 50 | fi 51 | 52 | # Check GPU 53 | if [[ "${OPT_ARCH}" == "gpu" ]] 54 | then 55 | docker build -t local/gputest - < $INSTALL_DIR/utils/Dockerfile.gpu-detect 56 | GPUS=$(docker run --rm --gpus all local/gputest 2> /dev/null | awk '/Device: ./' | wc -l ) 57 | if [ $? -ne 0 ] || [ $GPUS -eq 0 ] 58 | then 59 | echo "No GPU detected in docker. Using CPU". 
60 | OPT_ARCH="cpu-avx" 61 | fi 62 | fi 63 | 64 | cd $INSTALL_DIR 65 | 66 | # create directory structure for docker volumes 67 | mkdir -p $INSTALL_DIR/data $INSTALL_DIR/data/minio $INSTALL_DIR/data/minio/bucket 68 | mkdir -p $INSTALL_DIR/data/logs $INSTALL_DIR/data/analysis $INSTALL_DIR/tmp 69 | sudo mkdir -p /tmp/sagemaker 70 | sudo chmod -R g+w /tmp/sagemaker 71 | 72 | # create symlink to current user's home .aws directory 73 | # NOTE: AWS cli must be installed for this to work 74 | # https://docs.aws.amazon.com/cli/latest/userguide/install-linux-al2017.html 75 | mkdir -p $(eval echo "~${USER}")/.aws $INSTALL_DIR/docker/volumes/ 76 | ln -sf $(eval echo "~${USER}")/.aws $INSTALL_DIR/docker/volumes/ 77 | 78 | # copy rewardfunctions 79 | mkdir -p $INSTALL_DIR/custom_files 80 | cp $INSTALL_DIR/defaults/hyperparameters.json $INSTALL_DIR/custom_files/ 81 | cp $INSTALL_DIR/defaults/model_metadata.json $INSTALL_DIR/custom_files/ 82 | cp $INSTALL_DIR/defaults/reward_function.py $INSTALL_DIR/custom_files/ 83 | 84 | cp $INSTALL_DIR/defaults/template-system.env $INSTALL_DIR/system.env 85 | cp $INSTALL_DIR/defaults/template-run.env $INSTALL_DIR/run.env 86 | if [[ "${OPT_CLOUD}" == "aws" ]]; then 87 | AWS_EC2_AVAIL_ZONE=`curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone` 88 | AWS_REGION="`echo \"$AWS_EC2_AVAIL_ZONE\" | sed 's/[a-z]$//'`" 89 | sed -i "s//not-defined/g" $INSTALL_DIR/system.env 90 | sed -i "s//default/g" $INSTALL_DIR/system.env 91 | elif [[ "${OPT_CLOUD}" == "azure" ]]; then 92 | AWS_REGION="us-east-1" 93 | sed -i "s//azure/g" $INSTALL_DIR/system.env 94 | sed -i "s//not-defined/g" $INSTALL_DIR/system.env 95 | echo "Please run 'aws configure --profile azure' to set the credentials" 96 | elif [[ "${OPT_CLOUD}" == "remote" ]]; then 97 | AWS_REGION="us-east-1" 98 | sed -i "s//minio/g" $INSTALL_DIR/system.env 99 | sed -i "s//not-defined/g" $INSTALL_DIR/system.env 100 | echo "Please run 'aws configure --profile minio' to set the credentials" 
101 | echo "Please define DR_REMOTE_MINIO_URL in system.env to point to remote minio instance." 102 | else 103 | AWS_REGION="us-east-1" 104 | sed -i "s//minio/g" $INSTALL_DIR/system.env 105 | sed -i "s//not-defined/g" $INSTALL_DIR/system.env 106 | echo "Please run 'aws configure --profile minio' to set the credentials" 107 | fi 108 | sed -i "s//to-be-defined/g" $INSTALL_DIR/system.env 109 | sed -i "s//$OPT_CLOUD/g" $INSTALL_DIR/system.env 110 | sed -i "s//$AWS_REGION/g" $INSTALL_DIR/system.env 111 | 112 | 113 | if [[ "${OPT_ARCH}" == "gpu" ]]; then 114 | SAGEMAKER_TAG="gpu" 115 | elif [[ -n "${CPU_INTEL}" ]]; then 116 | SAGEMAKER_TAG="cpu" 117 | else 118 | SAGEMAKER_TAG="cpu" 119 | fi 120 | 121 | #set proxys if required 122 | for arg in "$@"; 123 | do 124 | IFS='=' read -ra part <<< "$arg" 125 | if [ "${part[0]}" == "--http_proxy" ] || [ "${part[0]}" == "--https_proxy" ] || [ "${part[0]}" == "--no_proxy" ]; then 126 | var=${part[0]:2}=${part[1]} 127 | args="${args} --build-arg ${var}" 128 | fi 129 | done 130 | 131 | # Download docker images. Change to build statements if locally built images are desired. 
# Resolve container versions from defaults/dependencies.json and record them
# in system.env, then pre-pull the matching images.
COACH_VERSION=$(jq -r '.containers.rl_coach | select (.!=null)' $INSTALL_DIR/defaults/dependencies.json)
# NOTE(review): the sed pattern below is empty; the template placeholder token
# appears to have been lost from this copy. Verify against template-system.env.
sed -i "s//$COACH_VERSION/g" $INSTALL_DIR/system.env

ROBOMAKER_VERSION=$(jq -r '.containers.robomaker | select (.!=null)' $INSTALL_DIR/defaults/dependencies.json)
# Quote the operand: an unquoted [ -n $VAR ] collapses to [ -n ] and is always
# true, so the fallback branch was unreachable.
if [ -n "$ROBOMAKER_VERSION" ]; then
  ROBOMAKER_VERSION=$ROBOMAKER_VERSION-$CPU_LEVEL
else
  ROBOMAKER_VERSION=$CPU_LEVEL
fi
# NOTE(review): empty sed pattern — see note above.
sed -i "s//$ROBOMAKER_VERSION/g" $INSTALL_DIR/system.env

SAGEMAKER_VERSION=$(jq -r '.containers.sagemaker | select (.!=null)' $INSTALL_DIR/defaults/dependencies.json)
# Same quoting fix as for ROBOMAKER_VERSION above.
if [ -n "$SAGEMAKER_VERSION" ]; then
  SAGEMAKER_VERSION=$SAGEMAKER_VERSION-$SAGEMAKER_TAG
else
  SAGEMAKER_VERSION=$SAGEMAKER_TAG
fi
# NOTE(review): empty sed pattern — see note above.
sed -i "s//$SAGEMAKER_VERSION/g" $INSTALL_DIR/system.env

docker pull awsdeepracercommunity/deepracer-rlcoach:$COACH_VERSION
docker pull awsdeepracercommunity/deepracer-robomaker:$ROBOMAKER_VERSION
docker pull awsdeepracercommunity/deepracer-sagemaker:$SAGEMAKER_VERSION

# Create the sagemaker-local network; initialise a single-node swarm and label
# it for both Sagemaker and Robomaker placement.
SAGEMAKER_NW='sagemaker-local'
docker swarm init
SWARM_NODE=$(docker node inspect self | jq .[0].ID -r)
docker node update --label-add Sagemaker=true $SWARM_NODE
docker node update --label-add Robomaker=true $SWARM_NODE
# Recreate the attachable overlay network if it already exists so a stale
# definition never lingers.
docker network ls | grep -q $SAGEMAKER_NW
if [ $? -ne 0 ]
then
  docker network create $SAGEMAKER_NW -d overlay --attachable --scope swarm
else
  docker network rm $SAGEMAKER_NW
  docker network create $SAGEMAKER_NW -d overlay --attachable --scope swarm
fi

# ensure our variables are set on startup - not for local setup.
171 | if [[ "${OPT_CLOUD}" != "local" ]]; then 172 | NUM_IN_PROFILE=$(cat $HOME/.profile | grep "$INSTALL_DIR/bin/activate.sh" | wc -l) 173 | if [ "$NUM_IN_PROFILE" -eq 0 ]; then 174 | echo "source $INSTALL_DIR/bin/activate.sh" >> $HOME/.profile 175 | fi 176 | fi 177 | 178 | # mark as done 179 | date | tee $INSTALL_DIR/DONE 180 | 181 | ## Optional auturun feature 182 | # if using automation scripts to auto configure and run 183 | # you must pass s3_training_location.txt to this instance in order for this to work 184 | if [[ -f "$INSTALL_DIR/autorun.s3url" ]] 185 | then 186 | ## read in first line. first line always assumed to be training location regardless what else is in file 187 | TRAINING_LOC=$(awk 'NR==1 {print; exit}' $INSTALL_DIR/autorun.s3url) 188 | 189 | #get bucket name 190 | TRAINING_BUCKET=${TRAINING_LOC%%/*} 191 | #get prefix. minor exception handling in case there is no prefix and a root bucket is passed 192 | if [[ "$TRAINING_LOC" == *"/"* ]] 193 | then 194 | TRAINING_PREFIX=${TRAINING_LOC#*/} 195 | else 196 | TRAINING_PREFIX="" 197 | fi 198 | 199 | ##check if custom autorun script exists in s3 training bucket. 
If not, use default in this repo 200 | aws s3api head-object --bucket $TRAINING_BUCKET --key $TRAINING_PREFIX/autorun.sh || not_exist=true 201 | if [ $not_exist ]; then 202 | echo "custom file does not exist, using local copy" 203 | else 204 | echo "custom script does exist, use it" 205 | aws s3 cp s3://$TRAINING_LOC/autorun.sh $INSTALL_DIR/bin/autorun.sh 206 | fi 207 | chmod +x $INSTALL_DIR/bin/autorun.sh 208 | bash -c "source $INSTALL_DIR/bin/autorun.sh" 209 | fi 210 | 211 | -------------------------------------------------------------------------------- /bin/activate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | verlte() { 4 | [ "$1" = "`echo -e "$1\n$2" | sort -V | head -n1`" ] 5 | } 6 | 7 | function dr-update-env { 8 | 9 | if [[ -f "$DIR/system.env" ]] 10 | then 11 | LINES=$(grep -v '^#' $DIR/system.env) 12 | for l in $LINES; do 13 | env_var=$(echo $l | cut -f1 -d\=) 14 | env_val=$(echo $l | cut -f2 -d\=) 15 | eval "export $env_var=$env_val" 16 | done 17 | else 18 | echo "File system.env does not exist." 19 | return 1 20 | fi 21 | 22 | if [[ -f "$DR_CONFIG" ]] 23 | then 24 | LINES=$(grep -v '^#' $DR_CONFIG) 25 | for l in $LINES; do 26 | env_var=$(echo $l | cut -f1 -d\=) 27 | env_val=$(echo $l | cut -f2 -d\=) 28 | eval "export $env_var=$env_val" 29 | done 30 | else 31 | echo "File run.env does not exist." 
32 | return 1 33 | fi 34 | 35 | if [[ -z "${DR_RUN_ID}" ]]; then 36 | export DR_RUN_ID=0 37 | fi 38 | 39 | if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; 40 | then 41 | export DR_ROBOMAKER_TRAIN_PORT=$(expr 8080 + $DR_RUN_ID) 42 | export DR_ROBOMAKER_EVAL_PORT=$(expr 8180 + $DR_RUN_ID) 43 | export DR_ROBOMAKER_GUI_PORT=$(expr 5900 + $DR_RUN_ID) 44 | else 45 | export DR_ROBOMAKER_TRAIN_PORT="8080-8089" 46 | export DR_ROBOMAKER_EVAL_PORT="8080-8089" 47 | export DR_ROBOMAKER_GUI_PORT="5901-5920" 48 | fi 49 | 50 | } 51 | 52 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 53 | DIR="$( dirname $SCRIPT_DIR )" 54 | export DR_DIR=$DIR 55 | 56 | if [[ -f "$1" ]]; 57 | then 58 | export DR_CONFIG=$(readlink -f $1) 59 | dr-update-env 60 | elif [[ -f "$DIR/run.env" ]]; 61 | then 62 | export DR_CONFIG="$DIR/run.env" 63 | dr-update-env 64 | else 65 | echo "No configuration file." 66 | return 1 67 | fi 68 | 69 | # Check if Docker runs -- if not, then start it. 70 | if [[ "$(type service 2> /dev/null)" ]]; then 71 | service docker status > /dev/null || sudo service docker start 72 | fi 73 | 74 | # Check if we will use Docker Swarm or Docker Compose 75 | # If not defined then use Swarm 76 | if [[ -z "${DR_DOCKER_STYLE}" ]]; then 77 | export DR_DOCKER_STYLE="swarm" 78 | fi 79 | 80 | if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; 81 | then 82 | export DR_DOCKER_FILE_SEP="-c" 83 | SWARM_NODE=$(docker node inspect self | jq .[0].ID -r) 84 | SWARM_NODE_UPDATE=$(docker node update --label-add Sagemaker=true $SWARM_NODE) 85 | else 86 | export DR_DOCKER_FILE_SEP="-f" 87 | fi 88 | 89 | # Prepare the docker compose files depending on parameters 90 | if [[ "${DR_CLOUD,,}" == "azure" ]]; 91 | then 92 | export DR_LOCAL_S3_ENDPOINT_URL="http://localhost:9000" 93 | export DR_MINIO_URL="http://minio:9000" 94 | DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_S3_ENDPOINT_URL" 95 | DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP 
$DIR/docker/docker-compose-training.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" 96 | DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" 97 | DR_MINIO_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-azure.yml" 98 | elif [[ "${DR_CLOUD,,}" == "local" ]]; 99 | then 100 | export DR_LOCAL_S3_ENDPOINT_URL="http://localhost:9000" 101 | export DR_MINIO_URL="http://minio:9000" 102 | DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_S3_ENDPOINT_URL" 103 | DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" 104 | DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" 105 | DR_MINIO_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-local.yml" 106 | elif [[ "${DR_CLOUD,,}" == "remote" ]]; 107 | then 108 | export DR_LOCAL_S3_ENDPOINT_URL="$DR_REMOTE_MINIO_URL" 109 | export DR_MINIO_URL="$DR_REMOTE_MINIO_URL" 110 | DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_S3_ENDPOINT_URL" 111 | DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" 112 | DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" 113 | DR_MINIO_COMPOSE_FILE="" 114 | else 115 | DR_LOCAL_PROFILE_ENDPOINT_URL="" 116 | DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training.yml" 117 | DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml" 118 | fi 119 | 120 | # Prevent docker swarms to restart 121 | if [[ "${DR_HOST_X,,}" == "true" ]]; 122 | then 123 | DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP 
$DIR/docker/docker-compose-local-xorg.yml"
    DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-local-xorg.yml"
fi

# Add the swarm override files when running in Docker Swarm mode
# (among other things these prevent the swarm services from auto-restarting).
if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]];
then
    DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training-swarm.yml"
    DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval-swarm.yml"
fi

# Enable logs in CloudWatch
if [[ "${DR_CLOUD_WATCH_ENABLE,,}" == "true" ]]; then
    DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-cwlog.yml"
    DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-cwlog.yml"
fi

## Check if we have an AWS IAM assumed role, or if we need to set specific credentials.
# With an assumed role on AWS the containers rely on instance credentials;
# otherwise the profile's keys are exported and injected via docker-compose-keys.yml.
if [ "${DR_CLOUD,,}" == "aws" ] && [ $(aws --output json sts get-caller-identity 2> /dev/null | jq '.Arn' | awk /assumed-role/ | wc -l ) -gt 0 ];
then
    export DR_LOCAL_S3_AUTH_MODE="role"
else
    export DR_LOCAL_ACCESS_KEY_ID=$(aws --profile $DR_LOCAL_S3_PROFILE configure get aws_access_key_id | xargs)
    export DR_LOCAL_SECRET_ACCESS_KEY=$(aws --profile $DR_LOCAL_S3_PROFILE configure get aws_secret_access_key | xargs)
    DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-keys.yml"
    DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-keys.yml"
    export DR_UPLOAD_PROFILE="--profile $DR_UPLOAD_S3_PROFILE"
    export DR_LOCAL_S3_AUTH_MODE="profile"
fi

export DR_TRAIN_COMPOSE_FILE
export DR_EVAL_COMPOSE_FILE
export DR_LOCAL_PROFILE_ENDPOINT_URL

# When a minio compose file is defined (local/azure), run the S3 stack with the
# invoking user's uid/gid so files in the bucket directory stay user-owned.
if [[ -n "${DR_MINIO_COMPOSE_FILE}" ]]; then
    export MINIO_UID=$(id -u)
    export MINIO_USERNAME=$(id -u -n)
    export
MINIO_GID=$(id -g) 161 | export MINIO_GROUPNAME=$(id -g -n) 162 | if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; 163 | then 164 | docker stack deploy $DR_MINIO_COMPOSE_FILE s3 165 | else 166 | docker-compose $DR_MINIO_COMPOSE_FILE -p s3 --log-level ERROR up -d 167 | fi 168 | 169 | fi 170 | 171 | ## Version check 172 | DEPENDENCY_VERSION=$(jq -r '.master_version | select (.!=null)' $DIR/defaults/dependencies.json) 173 | 174 | SAGEMAKER_VER=$(docker inspect awsdeepracercommunity/deepracer-sagemaker:$DR_SAGEMAKER_IMAGE 2> /dev/null | jq -r .[].Config.Labels.version) 175 | if [ -z "$SAGEMAKER_VER" ]; then SAGEMAKER_VER=$DR_SAGEMAKER_IMAGE; fi 176 | if ! verlte $DEPENDENCY_VERSION $SAGEMAKER_VER; then 177 | echo "WARNING: Incompatible version of Deepracer Sagemaker. Expected >$DEPENDENCY_VERSION. Got $SAGEMAKER_VER." 178 | fi 179 | 180 | ROBOMAKER_VER=$(docker inspect awsdeepracercommunity/deepracer-robomaker:$DR_ROBOMAKER_IMAGE 2> /dev/null | jq -r .[].Config.Labels.version ) 181 | if [ -z "$ROBOMAKER_VER" ]; then ROBOMAKER_VER=$DR_ROBOMAKER_IMAGE; fi 182 | if ! verlte $DEPENDENCY_VERSION $ROBOMAKER_VER; then 183 | echo "WARNING: Incompatible version of Deepracer Robomaker. Expected >$DEPENDENCY_VERSION. Got $ROBOMAKER_VER." 184 | fi 185 | 186 | COACH_VER=$(docker inspect awsdeepracercommunity/deepracer-rlcoach:$DR_COACH_IMAGE 2> /dev/null | jq -r .[].Config.Labels.version) 187 | if [ -z "$COACH_VER" ]; then COACH_VER=$DR_COACH_IMAGE; fi 188 | if ! verlte $DEPENDENCY_VERSION $COACH_VER; then 189 | echo "WARNING: Incompatible version of Deepracer-for-Cloud Coach. Expected >$DEPENDENCY_VERSION. Got $COACH_VER." 
190 | fi 191 | 192 | source $SCRIPT_DIR/scripts_wrapper.sh 193 | 194 | function dr-update { 195 | dr-update-env 196 | } 197 | 198 | function dr-reload { 199 | source $DIR/bin/activate.sh $DR_CONFIG 200 | } 201 | -------------------------------------------------------------------------------- /scripts/upload/upload-model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage(){ 4 | echo "Usage: $0 [-f] [-w] [-d] [-b] [-c ] [-p ]" 5 | echo " -f Force upload. No confirmation question." 6 | echo " -w Wipes the target AWS DeepRacer model structure before upload." 7 | echo " -d Dry-Run mode. Does not perform any write or delete operatios on target." 8 | echo " -b Uploads best checkpoint. Default is last checkpoint." 9 | echo " -p model Uploads model in specified S3 prefix." 10 | echo " -i Import model with the upload name" 11 | echo " -I name Import model with a specific name" 12 | echo " -1 Increment upload name with 1 (dr-increment-upload-model)" 13 | exit 1 14 | } 15 | 16 | trap ctrl_c INT 17 | 18 | function ctrl_c() { 19 | echo "Requested to stop." 20 | exit 1 21 | } 22 | 23 | while getopts ":fwdhbp:c:1iI:" opt; do 24 | case $opt in 25 | b) OPT_CHECKPOINT="Best" 26 | ;; 27 | c) OPT_CHECKPOINT_NUM="$OPTARG" 28 | ;; 29 | f) OPT_FORCE="-f" 30 | ;; 31 | d) OPT_DRYRUN="--dryrun" 32 | ;; 33 | p) OPT_PREFIX="$OPTARG" 34 | ;; 35 | w) OPT_WIPE="--delete" 36 | ;; 37 | i) OPT_IMPORT="$DR_UPLOAD_S3_PREFIX" 38 | ;; 39 | I) OPT_IMPORT="$OPTARG" 40 | ;; 41 | 1) OPT_INCREMENT="Yes" 42 | ;; 43 | h) usage 44 | ;; 45 | \?) 
echo "Invalid option -$OPTARG" >&2 46 | usage 47 | ;; 48 | esac 49 | done 50 | 51 | if [[ -n "${OPT_DRYRUN}" ]]; 52 | then 53 | echo "*** DRYRUN MODE ***" 54 | fi 55 | 56 | if [[ -n "${OPT_INCREMENT}" ]]; 57 | then 58 | source $DR_DIR/scripts/upload/increment.sh ${OPT_FORCE} 59 | OPT_IMPORT="$DR_UPLOAD_S3_PREFIX" 60 | fi 61 | 62 | export TARGET_S3_BUCKET=${DR_UPLOAD_S3_BUCKET} 63 | export TARGET_S3_PREFIX=${DR_UPLOAD_S3_PREFIX} 64 | 65 | if [[ -z "${DR_UPLOAD_S3_BUCKET}" ]]; 66 | then 67 | echo "No upload bucket defined. Exiting." 68 | exit 1 69 | fi 70 | 71 | if [[ -z "${DR_UPLOAD_S3_PREFIX}" ]]; 72 | then 73 | echo "No upload prefix defined. Exiting." 74 | exit 1 75 | fi 76 | 77 | SOURCE_S3_BUCKET=${DR_LOCAL_S3_BUCKET} 78 | if [[ -n "${OPT_PREFIX}" ]]; 79 | then 80 | SOURCE_S3_MODEL_PREFIX=${OPT_PREFIX} 81 | else 82 | SOURCE_S3_MODEL_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} 83 | fi 84 | SOURCE_S3_CONFIG=${DR_LOCAL_S3_CUSTOM_FILES_PREFIX} 85 | SOURCE_S3_REWARD=${DR_LOCAL_S3_REWARD_KEY} 86 | SOURCE_S3_METRICS="${DR_LOCAL_S3_METRICS_PREFIX}/TrainingMetrics.json" 87 | 88 | export WORK_DIR=${DR_DIR}/tmp/upload/ 89 | mkdir -p ${WORK_DIR} && rm -rf ${WORK_DIR} && mkdir -p ${WORK_DIR}model ${WORK_DIR}ip 90 | 91 | # Upload information on model. 
92 | TARGET_PARAMS_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/training_params.yaml" 93 | TARGET_REWARD_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/reward_function.py" 94 | TARGET_HYPERPARAM_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/ip/hyperparameters.json" 95 | TARGET_METRICS_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/TrainingMetrics.json" 96 | 97 | # Check if metadata-files are available 98 | REWARD_IN_ROOT=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 ls s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/reward_function.py 2> /dev/null | wc -l) 99 | if [ "$REWARD_IN_ROOT" -ne 0 ]; 100 | then 101 | REWARD_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/reward_function.py ${WORK_DIR} --no-progress | awk '/reward/ {print $4}'| xargs readlink -f 2> /dev/null) 102 | else 103 | echo "Looking for Reward Function in s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_REWARD}" 104 | REWARD_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_REWARD} ${WORK_DIR} --no-progress | awk '/reward/ {print $4}'| xargs readlink -f 2> /dev/null) 105 | fi 106 | 107 | METADATA_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/model_metadata.json ${WORK_DIR} --no-progress | awk '/model_metadata.json$/ {print $4}'| xargs readlink -f 2> /dev/null) 108 | HYPERPARAM_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/ip/hyperparameters.json ${WORK_DIR} --no-progress | awk '/hyperparameters.json$/ {print $4}'| xargs readlink -f 2> /dev/null) 109 | METRICS_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_METRICS} ${WORK_DIR} --no-progress | awk '/metric/ {print $4}'| xargs readlink -f 2> /dev/null) 110 | 111 | if [ -n "$METADATA_FILE" ] && [ -n "$REWARD_FILE" ] && [ -n "$HYPERPARAM_FILE" ] && [ -n "$METRICS_FILE" ]; 112 | then 113 | 
echo "All meta-data files found. Looking for checkpoint." 114 | else 115 | echo "Meta-data files are not found. Exiting." 116 | exit 1 117 | fi 118 | 119 | # Download checkpoint file 120 | echo "Looking for model to upload from s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/" 121 | CHECKPOINT_INDEX=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/deepracer_checkpoints.json ${WORK_DIR}model/ --no-progress | awk '{print $4}' | xargs readlink -f 2> /dev/null) 122 | 123 | if [ -z "$CHECKPOINT_INDEX" ]; then 124 | echo "No checkpoint file available at s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model. Exiting." 125 | exit 1 126 | fi 127 | 128 | if [ -n "$OPT_CHECKPOINT_NUM" ]; then 129 | echo "Checking for checkpoint $OPT_CHECKPOINT_NUM" 130 | export OPT_CHECKPOINT_NUM 131 | CHECKPOINT_FILE=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 ls s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/ | perl -ne'print "$1\n" if /.*\s($ENV{OPT_CHECKPOINT_NUM}_Step-[0-9]{1,7}\.ckpt)\.index/') 132 | CHECKPOINT=`echo $CHECKPOINT_FILE | cut -f1 -d_` 133 | TIMESTAMP=`date +%s` 134 | CHECKPOINT_JSON_PART=$(jq -n '{ checkpoint: { name: $name, time_stamp: $timestamp | tonumber, avg_comp_pct: 50.0 } }' --arg name $CHECKPOINT_FILE --arg timestamp $TIMESTAMP) 135 | CHECKPOINT_JSON=$(echo $CHECKPOINT_JSON_PART | jq '. | {last_checkpoint: .checkpoint, best_checkpoint: .checkpoint}') 136 | elif [ -z "$OPT_CHECKPOINT" ]; then 137 | echo "Checking for latest tested checkpoint" 138 | CHECKPOINT_FILE=`jq -r .last_checkpoint.name < $CHECKPOINT_INDEX` 139 | CHECKPOINT=`echo $CHECKPOINT_FILE | cut -f1 -d_` 140 | CHECKPOINT_JSON=$(jq '. 
| {last_checkpoint: .last_checkpoint, best_checkpoint: .last_checkpoint}' < $CHECKPOINT_INDEX ) 141 | echo "Latest checkpoint = $CHECKPOINT" 142 | else 143 | echo "Checking for best checkpoint" 144 | CHECKPOINT_FILE=`jq -r .best_checkpoint.name < $CHECKPOINT_INDEX` 145 | CHECKPOINT=`echo $CHECKPOINT_FILE | cut -f1 -d_` 146 | CHECKPOINT_JSON=$(jq '. | {last_checkpoint: .best_checkpoint, best_checkpoint: .best_checkpoint}' < $CHECKPOINT_INDEX ) 147 | echo "Best checkpoint: $CHECKPOINT" 148 | fi 149 | 150 | # Find checkpoint & model files - download 151 | if [ -n "$CHECKPOINT" ]; then 152 | CHECKPOINT_MODEL_FILES=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 sync s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/ ${WORK_DIR}model/ --exclude "*" --include "${CHECKPOINT}*" --include "model_${CHECKPOINT}.pb" --include "deepracer_checkpoints.json" --no-progress | awk '{print $4}' | xargs readlink -f 2> /dev/null) 153 | CHECKPOINT_MODEL_FILE_COUNT=$(echo $CHECKPOINT_MODEL_FILES | wc -l) 154 | if [ "$CHECKPOINT_MODEL_FILE_COUNT" -eq 0 ]; then 155 | echo "No model files found. Files possibly deleted. Try again." 156 | exit 1 157 | fi 158 | cp ${METADATA_FILE} ${WORK_DIR}model/ 159 | # echo "model_checkpoint_path: \"${CHECKPOINT_FILE}\"" | tee ${WORK_DIR}model/checkpoint 160 | echo ${CHECKPOINT_FILE} | tee ${WORK_DIR}model/.coach_checkpoint > /dev/null 161 | else 162 | echo "Checkpoint not found. Exiting." 163 | exit 1 164 | fi 165 | 166 | # Create Training Params Yaml. 167 | PARAMS_FILE=$(python3 $DR_DIR/scripts/upload/prepare-config.py) 168 | 169 | # Upload files 170 | if [[ -z "${OPT_FORCE}" ]]; 171 | then 172 | echo "Ready to upload model ${SOURCE_S3_MODEL_PREFIX} to s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/" 173 | read -r -p "Are you sure? [y/N] " response 174 | if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]] 175 | then 176 | echo "Aborting." 
177 | exit 1 178 | fi 179 | fi 180 | 181 | # echo "" > ${WORK_DIR}model/.ready 182 | cd ${WORK_DIR} 183 | echo ${CHECKPOINT_JSON} > ${WORK_DIR}model/deepracer_checkpoints.json 184 | aws ${DR_UPLOAD_PROFILE} s3 sync ${WORK_DIR}model/ s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/model/ ${OPT_DRYRUN} ${OPT_WIPE} 185 | aws ${DR_UPLOAD_PROFILE} s3 cp ${REWARD_FILE} ${TARGET_REWARD_FILE_S3_KEY} ${OPT_DRYRUN} 186 | aws ${DR_UPLOAD_PROFILE} s3 cp ${METRICS_FILE} ${TARGET_METRICS_FILE_S3_KEY} ${OPT_DRYRUN} 187 | aws ${DR_UPLOAD_PROFILE} s3 cp ${PARAMS_FILE} ${TARGET_PARAMS_FILE_S3_KEY} ${OPT_DRYRUN} 188 | aws ${DR_UPLOAD_PROFILE} s3 cp ${HYPERPARAM_FILE} ${TARGET_HYPERPARAM_FILE_S3_KEY} ${OPT_DRYRUN} 189 | 190 | # After upload trigger the import 191 | if [[ -n "${OPT_IMPORT}" ]]; 192 | then 193 | $DR_DIR/scripts/upload/import-model.py "${DR_UPLOAD_S3_PROFILE}" "${DR_UPLOAD_S3_ROLE}" "${TARGET_S3_BUCKET}" "${TARGET_S3_PREFIX}" "${OPT_IMPORT}" 194 | fi -------------------------------------------------------------------------------- /docs/installation.md: -------------------------------------------------------------------------------- 1 | # Installing Deepracer-for-Cloud 2 | 3 | ## Requirements 4 | 5 | Depending on your needs as well as specific needs of the cloud platform you can configure your VM to your liking. Both CPU-only as well as GPU systems are supported. 6 | 7 | **AWS**: 8 | 9 | * EC2 instance of type G3, G4, P2 or P3 - recommendation is g4dn.2xlarge - for GPU enabled training. C5 or M6 types - recommendation is c5.2xlarge - for CPU training. 10 | * Ubuntu 20.04 11 | * Minimum 30 GB, preferred 40 GB of OS disk. 12 | * Ephemeral Drive connected 13 | * Minimum of 8 GB GPU-RAM if running with GPU. 14 | * Recommended at least 6 VCPUs 15 | * S3 bucket. Preferrably in same region as EC2 instance. 
16 | 17 | **Azure**: 18 | 19 | * N-Series VM that comes with NVIDIA Graphics Adapter - recommendation is NC6_Standard 20 | * Ubuntu 20.04 21 | * Standard 30 GB OS drive is sufficient to get started. 22 | * Recommended to add an additional 32 GB data disk if you want to use the Log Analysis container. 23 | * Minimum 8 GB GPU-RAM 24 | * Recommended at least 6 VCPUs 25 | * Storage Account with one Blob container configured for Access Key authentication. 26 | 27 | **Local**: 28 | 29 | * A modern, comparatively powerful, Intel based system. 30 | * Ubuntu 20.04, other Linux-distros likely to work. 31 | * 4 core-CPU, equivalent to 8 vCPUs; the more the better. 32 | * NVIDIA Graphics adapter with minimum 8 GB RAM for Sagemaker to run GPU. Robomaker enabled GPU instances need ~1 GB each. 33 | * System RAM + GPU RAM should be at least 32 GB. 34 | * Running DRfC Ubuntu 20.04 on Windows using Windows Subsystem for Linux 2 is possible. See [Installing on Windows](windows.md) 35 | 36 | ## Installation 37 | 38 | The package comes with preparation and setup scripts that would allow a turn-key setup for a fresh virtual machine. 39 | 40 | ```shell 41 | git clone https://github.com/aws-deepracer-community/deepracer-for-cloud.git 42 | ``` 43 | 44 | **For cloud setup** execute: 45 | 46 | ```shell 47 | cd deepracer-for-cloud && ./bin/prepare.sh 48 | ``` 49 | 50 | This will prepare the VM by partitioning additional drives as well as installing all prerequisites. After a reboot it will continue to run `./bin/init.sh` setting up the full repository and downloading the core Docker images. Depending on your environment this may take up to 30 minutes. The scripts will create a file `DONE` once completed. 51 | 52 | The installation script will adapt `.profile` to ensure that all settings are applied on login. Otherwise run the activation with `source bin/activate.sh`.
53 | 54 | **For local install** it is recommended *not* to run the `bin/prepare.sh` script; it might do more changes than what you want. Rather ensure that all prerequisites are set up and run `bin/init.sh` directly. 55 | 56 | The Init Script takes a few parameters: 57 | 58 | | Variable | Description | 59 | |----------|-------------| 60 | | `-c ` | Sets the cloud version to be configured, automatically updates the `DR_CLOUD` parameter in `system.env`. Options are `azure`, `aws` or `local`. Default is `local` | 61 | | `-a ` | Sets the architecture to be configured. Either `cpu` or `gpu`. Default is `gpu`. | 62 | 63 | *TODO: Document how to configure via cloud-init.* 64 | 65 | ## Environment Setup 66 | 67 | The initialization script will attempt to auto-detect your environment (`Azure`, `AWS` or `Local`), and store the outcome in the `DR_CLOUD` parameter in `system.env`. You can also pass in a `-c ` parameter to override it, e.g. if you want to run the minio-based `local` mode in the cloud. 68 | 69 | The main difference between the mode is based on authentication mechanisms and type of storage being configured. The next chapters will review each type of environment on its own. 70 | 71 | ### AWS 72 | 73 | In AWS it is possible to set up authentication to S3 in two ways: Integrated sign-on using [IAM Roles](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html) or using access keys. 74 | 75 | #### IAM Role 76 | 77 | To use IAM Roles: 78 | 79 | * An empty S3 bucket in the same region as the EC2 instance. 80 | * An IAM Role that has permissions to: 81 | * Access both the *new* S3 bucket as well as the DeepRacer bucket. 82 | * AmazonVPCReadOnlyAccess 83 | * AmazonKinesisVideoStreamsFullAccess if you want to stream to Kinesis 84 | * CloudWatch 85 | * An EC2 instance with the defined IAM Role assigned. 
86 | * Configure `system.env` as follows: 87 | * `DR_LOCAL_S3_PROFILE=default` 88 | * `DR_LOCAL_S3_BUCKET=` 89 | * `DR_UPLOAD_S3_PROFILE=default` 90 | * `DR_UPLOAD_S3_BUCKET=` 91 | * Run `dr-update` for configuration to take effect. 92 | 93 | #### Manual setup 94 | 95 | For access with IAM user: 96 | 97 | * An empty S3 bucket in the same region as the EC2 instance. 98 | * A real AWS IAM user set up with access keys: 99 | * User should have permissions to access the *new* bucket as well as the dedicated DeepRacer S3 bucket. 100 | * Use `aws configure` to configure this into the default profile. 101 | * Configure `system.env` as follows: 102 | * `DR_LOCAL_S3_PROFILE=default` 103 | * `DR_LOCAL_S3_BUCKET=` 104 | * `DR_UPLOAD_S3_PROFILE=default` 105 | * `DR_UPLOAD_S3_BUCKET=` 106 | * Run `dr-update` for configuration to take effect. 107 | 108 | ### Azure 109 | 110 | In Azure mode the script-set requires the following: 111 | 112 | * A storage account with a blob container set up with access keys: 113 | * Use `aws configure --profile ` to configure this into a specific profile. 114 | * `` can be defined by the user, but do not use `default`. 115 | * Access Key ID is the Storage Account name. 116 | * Secret Access Key is the Access Key for the Storage Account. 117 | * The blob container is equivalent to the S3 bucket. 118 | * A real AWS IAM user configured with `aws configure` to enable upload of models into AWS DeepRacer. 119 | * Configure `system.env` as follows: 120 | * `DR_LOCAL_S3_PROFILE=default` 121 | * `DR_LOCAL_S3_BUCKET=` 122 | * `DR_UPLOAD_S3_PROFILE=default` 123 | * `DR_UPLOAD_S3_BUCKET=` 124 | * Run `dr-update` for configuration to take effect. 125 | 126 | As Azure does not natively support S3 a [minio](https://min.io/product/overview) proxy is set up on port 9000 to allow the containers to communicate and store models. 
127 | 128 | If you want to use awscli (`aws`) to manually move files then use `aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 ...`, as this will set both `--profile` and `--endpoint-url` parameters to match your configuration. 129 | 130 | ### Local 131 | 132 | Local mode runs a minio server that hosts the data in the `docker/volumes` directory. It is otherwise command-compatible with the Azure setup; as the data is accessible via Minio and not via native S3. 133 | 134 | In Local mode the script-set requires the following: 135 | 136 | * Configure the Minio credentials with `aws configure --profile minio`. The default configuration will use the `minio` profile to configure MINIO. You can choose any username or password, but username needs to be at least length 3, and password at least length 8. 137 | * A real AWS IAM user configured with `aws configure` to enable upload of models into AWS DeepRacer. 138 | * Configure `system.env` as follows: 139 | * `DR_LOCAL_S3_PROFILE=default` 140 | * `DR_LOCAL_S3_BUCKET=` 141 | * `DR_UPLOAD_S3_PROFILE=default` 142 | * `DR_UPLOAD_S3_BUCKET=` 143 | * Run `dr-update` for configuration to take effect. 144 | 145 | ## First Run 146 | 147 | For the first run the following final steps are needed. This creates a training run with all default values in 148 | 149 | * Define your custom files in `custom_files/` - samples can be found in `defaults` which you must copy over: 150 | * `hyperparameters.json` - definining the training hyperparameters 151 | * `model_metadata.json` - defining the action space and sensors 152 | * `reward_function.py` - defining the reward function 153 | * Upload the files into the bucket with `dr-upload-custom-files`. This will also start minio if required. 154 | * Start training with `dr-start-training` 155 | 156 | After a while you will see the sagemaker logs on the screen. 
157 | 158 | ## Troubleshooting 159 | 160 | Here are some hints for troubleshooting specific issues you may encounter 161 | 162 | ### Local training troubleshooting 163 | 164 | | Issue | Troubleshooting hint | 165 | |------------- | ---------------------| 166 | Get messages like "Sagemaker is not running" | Run `docker ps -a` to see if the containers are running or if they stopped due to some errors 167 | Check docker errors for specific container | Run `docker logs -f ` 168 | Get message "Error response from daemon: could not choose an IP address to advertise since this system has multiple addresses on interface ..." when running `./bin/init.sh -c local -a cpu` | It means you have multiple IP addresses and you need to specify one within `./bin/init.sh`.
If you don't care which one to use, you can get the first one by running ```ifconfig \| grep $(route \| awk '/^default/ {print $8}') -a1 \| grep -o -P '(?<=inet ).*(?= netmask)'```.
Edit `./bin/init.sh` and locate line `docker swarm init` and change it to `docker swarm init --advertise-addr `.
Rerun `./bin/init.sh -c local -a cpu` 169 | -------------------------------------------------------------------------------- /bin/scripts_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function dr-upload-custom-files { 4 | eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) 5 | echo "Uploading files to $CUSTOM_TARGET" 6 | aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $DR_DIR/custom_files/ $CUSTOM_TARGET 7 | } 8 | 9 | function dr-upload-model { 10 | dr-update-env && ${DR_DIR}/scripts/upload/upload-model.sh "$@" 11 | } 12 | 13 | function dr-download-model { 14 | dr-update-env && ${DR_DIR}/scripts/upload/download-model.sh "$@" 15 | } 16 | 17 | function dr-upload-car-zip { 18 | dr-update-env && ${DR_DIR}/scripts/upload/upload-car.sh "$@" 19 | } 20 | 21 | function dr-list-aws-models { 22 | echo "Due to changes in AWS DeepRacer Console this command is no longer available." 23 | } 24 | 25 | function dr-set-upload-model { 26 | echo "Due to changes in AWS DeepRacer Console this command is no longer available." 
27 | } 28 | 29 | function dr-increment-upload-model { 30 | dr-update-env && ${DR_DIR}/scripts/upload/increment.sh "$@" && dr-update-env 31 | } 32 | 33 | function dr-download-custom-files { 34 | eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) 35 | echo "Downloading files from $CUSTOM_TARGET" 36 | aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $CUSTOM_TARGET $DR_DIR/custom_files/ 37 | } 38 | 39 | function dr-start-training { 40 | dr-update-env 41 | $DR_DIR/scripts/training/start.sh "$@" 42 | } 43 | 44 | function dr-increment-training { 45 | dr-update-env && ${DR_DIR}/scripts/training/increment.sh "$@" && dr-update-env 46 | } 47 | 48 | function dr-stop-training { 49 | ROBOMAKER_COMMAND="" bash -c "cd $DR_DIR/scripts/training && ./stop.sh" 50 | } 51 | 52 | function dr-start-evaluation { 53 | dr-update-env 54 | $DR_DIR/scripts/evaluation/start.sh "$@" 55 | } 56 | 57 | function dr-stop-evaluation { 58 | ROBOMAKER_COMMAND="" bash -c "cd $DR_DIR/scripts/evaluation && ./stop.sh" 59 | } 60 | 61 | 62 | function dr-start-tournament { 63 | echo "Tournaments are no longer supported. Use Head-to-Model evaluation instead." 64 | } 65 | 66 | 67 | function dr-start-loganalysis { 68 | ROBOMAKER_COMMAND="" bash -c "cd $DR_DIR/scripts/log-analysis && ./start.sh" 69 | } 70 | 71 | 72 | function dr-stop-loganalysis { 73 | eval LOG_ANALYSIS_ID=$(docker ps | awk ' /loganalysis/ { print $1 }') 74 | if [ -n "$LOG_ANALYSIS_ID" ]; then 75 | ROBOMAKER_COMMAND="" bash -c "cd $DR_DIR/scripts/log-analysis && ./stop.sh" 76 | else 77 | echo "Log-analysis is not running." 78 | fi 79 | 80 | } 81 | 82 | function dr-logs-sagemaker { 83 | 84 | local OPTIND 85 | OPT_TIME="--since 5m" 86 | 87 | while getopts ":w:a" opt; do 88 | case $opt in 89 | w) OPT_WAIT=$OPTARG 90 | ;; 91 | a) OPT_TIME="" 92 | ;; 93 | \?) 
echo "Invalid option -$OPTARG" >&2 94 | ;; 95 | esac 96 | done 97 | 98 | SAGEMAKER_CONTAINER=$(dr-find-sagemaker) 99 | 100 | if [[ -z "$SAGEMAKER_CONTAINER" ]]; 101 | then 102 | if [[ -n "$OPT_WAIT" ]]; then 103 | WAIT_TIME=$OPT_WAIT 104 | echo "Waiting up to $WAIT_TIME seconds for Sagemaker to start up..." 105 | until [ -n "$SAGEMAKER_CONTAINER" ] 106 | do 107 | sleep 1 108 | ((WAIT_TIME--)) 109 | if [ "$WAIT_TIME" -lt 1 ]; then 110 | echo "Sagemaker is not running." 111 | return 1 112 | fi 113 | SAGEMAKER_CONTAINER=$(dr-find-sagemaker) 114 | done 115 | else 116 | echo "Sagemaker is not running." 117 | return 1 118 | fi 119 | fi 120 | 121 | if [[ "${DR_HOST_X,,}" == "true" && -n "$DISPLAY" ]]; 122 | then 123 | if [ -x "$(command -v gnome-terminal)" ]; 124 | then 125 | gnome-terminal --tab --title "DR-${DR_RUN_ID}: Sagemaker - ${SAGEMAKER_CONTAINER}" -- /usr/bin/bash -c "docker logs $OPT_TIME -f ${SAGEMAKER_CONTAINER}" 2> /dev/null 126 | echo "Sagemaker container $SAGEMAKER_CONTAINER logs opened in separate gnome-terminal. " 127 | elif [ -x "$(command -v x-terminal-emulator)" ]; 128 | then 129 | x-terminal-emulator -e /bin/sh -c "docker logs $OPT_TIME -f ${SAGEMAKER_CONTAINER}" 2> /dev/null 130 | echo "Sagemaker container $SAGEMAKER_CONTAINER logs opened in separate terminal. " 131 | else 132 | echo 'Could not find a defined x-terminal-emulator. Displaying inline.' 
133 | docker logs $OPT_TIME -f $SAGEMAKER_CONTAINER 134 | fi 135 | else 136 | docker logs $OPT_TIME -f $SAGEMAKER_CONTAINER 137 | fi 138 | 139 | } 140 | 141 | function dr-find-sagemaker { 142 | 143 | STACK_NAME="deepracer-$DR_RUN_ID" 144 | RUN_NAME=${DR_LOCAL_S3_MODEL_PREFIX} 145 | 146 | SAGEMAKER_CONTAINERS=$(docker ps | awk ' /sagemaker/ { print $1 } '| xargs ) 147 | 148 | if [[ -n $SAGEMAKER_CONTAINERS ]]; 149 | then 150 | for CONTAINER in $SAGEMAKER_CONTAINERS; do 151 | CONTAINER_NAME=$(docker ps --format '{{.Names}}' --filter id=$CONTAINER) 152 | CONTAINER_PREFIX=$(echo $CONTAINER_NAME | perl -n -e'/(.*)_(algo(.*))_./; print $1') 153 | COMPOSE_SERVICE_NAME=$(echo $CONTAINER_NAME | perl -n -e'/(.*)_(algo(.*))_./; print $2') 154 | COMPOSE_FILE=$(sudo find /tmp/sagemaker -name docker-compose.yaml -exec grep -l "$RUN_NAME" {} + | grep $CONTAINER_PREFIX) 155 | if [[ -n $COMPOSE_FILE ]]; then 156 | echo $CONTAINER 157 | return 158 | fi 159 | done 160 | fi 161 | 162 | } 163 | 164 | function dr-logs-robomaker { 165 | 166 | OPT_REPLICA=1 167 | OPT_EVAL="" 168 | local OPTIND 169 | OPT_TIME="--since 5m" 170 | 171 | while getopts ":w:n:ea" opt; do 172 | case $opt in 173 | w) OPT_WAIT=$OPTARG 174 | ;; 175 | n) OPT_REPLICA=$OPTARG 176 | ;; 177 | e) OPT_EVAL="-e" 178 | ;; 179 | a) OPT_TIME="" 180 | ;; 181 | \?) echo "Invalid option -$OPTARG" >&2 182 | ;; 183 | esac 184 | done 185 | 186 | ROBOMAKER_CONTAINER=$(dr-find-robomaker -n ${OPT_REPLICA} ${OPT_EVAL}) 187 | 188 | if [[ -z "$ROBOMAKER_CONTAINER" ]]; 189 | then 190 | if [[ -n "$OPT_WAIT" ]]; then 191 | WAIT_TIME=$OPT_WAIT 192 | echo "Waiting up to $WAIT_TIME seconds for Robomaker #${OPT_REPLICA} to start up..." 193 | until [ -n "$ROBOMAKER_CONTAINER" ] 194 | do 195 | sleep 1 196 | ((WAIT_TIME--)) 197 | if [ "$WAIT_TIME" -lt 1 ]; then 198 | echo "Robomaker #${OPT_REPLICA} is not running." 
199 | return 1 200 | fi 201 | ROBOMAKER_CONTAINER=$(dr-find-robomaker -n ${OPT_REPLICA} ${OPT_EVAL}) 202 | done 203 | else 204 | echo "Robomaker #${OPT_REPLICA} is not running." 205 | return 1 206 | fi 207 | fi 208 | 209 | if [[ "${DR_HOST_X,,}" == "true" && -n "$DISPLAY" ]]; 210 | then 211 | if [ -x "$(command -v gnome-terminal)" ]; 212 | then 213 | gnome-terminal --tab --title "DR-${DR_RUN_ID}: Robomaker #${OPT_REPLICA} - ${ROBOMAKER_CONTAINER}" -- /usr/bin/bash -c "docker logs $OPT_TIME -f ${ROBOMAKER_CONTAINER}" 2> /dev/null 214 | echo "Robomaker #${OPT_REPLICA} ($ROBOMAKER_CONTAINER) logs opened in separate gnome-terminal. " 215 | elif [ -x "$(command -v x-terminal-emulator)" ]; 216 | then 217 | x-terminal-emulator -e /bin/sh -c "docker logs $OPT_TIME -f ${ROBOMAKER_CONTAINER}" 2> /dev/null 218 | echo "Robomaker #${OPT_REPLICA} ($ROBOMAKER_CONTAINER) logs opened in separate terminal. " 219 | else 220 | echo 'Could not find a defined x-terminal-emulator. Displaying inline.' 221 | docker logs $OPT_TIME -f $ROBOMAKER_CONTAINER 222 | fi 223 | else 224 | docker logs $OPT_TIME -f $ROBOMAKER_CONTAINER 225 | fi 226 | 227 | } 228 | 229 | function dr-find-robomaker { 230 | 231 | local OPTIND 232 | 233 | OPT_PREFIX="deepracer" 234 | 235 | while getopts ":n:e" opt; do 236 | case $opt in 237 | n) OPT_REPLICA=$OPTARG 238 | ;; 239 | e) OPT_PREFIX="-eval" 240 | ;; 241 | \?) echo "Invalid option -$OPTARG" >&2 242 | ;; 243 | esac 244 | done 245 | 246 | eval ROBOMAKER_ID=$(docker ps | grep "${OPT_PREFIX}-${DR_RUN_ID}_robomaker.${OPT_REPLICA}" | cut -f1 -d\ | head -1) 247 | if [ -n "$ROBOMAKER_ID" ]; then 248 | echo $ROBOMAKER_ID 249 | fi 250 | } 251 | 252 | function dr-get-robomaker-stats { 253 | 254 | local OPTIND 255 | OPT_REPLICA=1 256 | 257 | while getopts ":n:" opt; do 258 | case $opt in 259 | n) OPT_REPLICA=$OPTARG 260 | ;; 261 | \?) 
echo "Invalid option -$OPTARG" >&2 262 | ;; 263 | esac 264 | done 265 | 266 | eval ROBOMAKER_ID=$(dr-find-robomaker -n $OPT_REPLICA ) 267 | if [ -n "$ROBOMAKER_ID" ]; then 268 | echo "Showing statistics for Robomaker #$OPT_REPLICA - container $ROBOMAKER_ID" 269 | docker exec -ti $ROBOMAKER_ID bash -c "gz stats" 270 | else 271 | echo "Robomaker #$OPT_REPLICA is not running." 272 | fi 273 | } 274 | 275 | function dr-logs-loganalysis { 276 | eval LOG_ANALYSIS_ID=$(docker ps | awk ' /loganalysis/ { print $1 }') 277 | if [ -n "$LOG_ANALYSIS_ID" ]; then 278 | docker logs -f $LOG_ANALYSIS_ID 279 | else 280 | echo "Log-analysis is not running." 281 | fi 282 | 283 | } 284 | 285 | function dr-url-loganalysis { 286 | eval LOG_ANALYSIS_ID=$(docker ps | awk ' /loganalysis/ { print $1 }') 287 | if [ -n "$LOG_ANALYSIS_ID" ]; then 288 | docker exec "$LOG_ANALYSIS_ID" bash -c "jupyter server list" 289 | else 290 | echo "Log-analysis is not running." 291 | fi 292 | } 293 | 294 | function dr-view-stream { 295 | ${DR_DIR}/utils/start-local-browser.sh "$@" 296 | } 297 | 298 | function dr-start-viewer { 299 | $DR_DIR/scripts/viewer/start.sh "$@" 300 | } 301 | 302 | function dr-stop-viewer { 303 | $DR_DIR/scripts/viewer/stop.sh "$@" 304 | } 305 | 306 | function dr-update-viewer { 307 | $DR_DIR/scripts/viewer/stop.sh "$@" 308 | $DR_DIR/scripts/viewer/start.sh "$@" 309 | } 310 | -------------------------------------------------------------------------------- /docs/reference.md: -------------------------------------------------------------------------------- 1 | # Deepracer-for-Cloud Reference 2 | 3 | ## Environment Variables 4 | 5 | The scripts assume that two files `system.env` containing constant configuration values and `run.env` with run specific values is populated with the required values. Which values go into which file is not really important. 
6 | 7 | | Variable | Description | 8 | |----------|-------------| 9 | | `DR_RUN_ID` | Used if you have multiple independent training jobs on a single DRfC instance. This is an advanced configuration and generally you should just leave this as the default `0`.| 10 | | `DR_WORLD_NAME` | Defines the track to be used.| 11 | | `DR_RACE_TYPE` | Valid options are `TIME_TRIAL`, `OBJECT_AVOIDANCE`, and `HEAD_TO_BOT`.| 12 | | `DR_CAR_COLOR` | Valid options are `Black`, `Grey`, `Blue`, `Red`, `Orange`, `White`, and `Purple`.| 13 | | `DR_CAR_NAME` | Display name of car; shows in Deepracer Console when uploading.| 14 | | `DR_ENABLE_DOMAIN_RANDOMIZATION` | If `True`, this cycles through different environment colors and lighting each episode. This is typically used to make your model more robust and generalized instead of tightly aligned with the simulator| 15 | | `DR_UPLOAD_S3_PREFIX` | Prefix of the target location. (Typically starts with `DeepRacer-SageMaker-RoboMaker-comm-`)| 16 | | `DR_EVAL_NUMBER_OF_TRIALS` | How many laps to complete for evaluation simulations.| 17 | | `DR_EVAL_IS_CONTINUOUS` | If False, your evaluation trial will end if your car goes off track or is in a collision. If True, your car will take the penalty times as configured in those parameters, but continue evaluating the trial.| 18 | | `DR_EVAL_OFF_TRACK_PENALTY` | Number of seconds penalty time added for an off track during evaluation. Only takes effect if `DR_EVAL_IS_CONTINUOUS` is set to True.| 19 | | `DR_EVAL_COLLISION_PENALTY` | Number of seconds penalty time added for a collision during evaluation. Only takes effect if `DR_EVAL_IS_CONTINUOUS` is set to True.| 20 | | `DR_EVAL_SAVE_MP4` | Set to `True` to save MP4 of an evaluation run. | 21 | | `DR_TRAIN_CHANGE_START_POSITION` | Determines if the racer shall round-robin the starting position during training sessions. (Recommended to be `True` for initial training.)| 22 | | `DR_TRAIN_ALTERNATE_DRIVING_DIRECTION` | `True` or `False`.
If `True`, the car will alternate driving between clockwise and counter-clockwise each episode.| 23 | | `DR_TRAIN_START_POSITION_OFFSET` | Used to control where to start the training from on first episode.| 24 | | `DR_TRAIN_ROUND_ROBIN_ADVANCE_DISTANCE` | How far to progress each episode in round robin. 0.05 is 5% of the track. Generally best to try and keep this to even numbers that match with your total number of episodes to allow for even distribution around the track. For example, if 20 episodes per iteration, .05 or .10 or .20 would be good.| 25 | | `DR_TRAIN_MULTI_CONFIG` | `True` or `False`. This is used if you want to use different run.env configurations for each worker in a multi worker training run. See multi config documentation for more details on how to set this up.| 26 | | `DR_TRAIN_MIN_EVAL_TRIALS` | The minimum number of evaluation trials run between each training iteration. Evaluations will continue as long as policy training is occurring and may be more than this number. This establishes the minimum, and is generally useful if you want to speed up training especially when using gpu sagemaker containers.| 27 | | `DR_LOCAL_S3_PRETRAINED` | Determines if training or evaluation shall be based on the model created in a previous session, held in `s3://{DR_LOCAL_S3_BUCKET}/{LOCAL_S3_PRETRAINED_PREFIX}`, accessible by credentials held in profile `{DR_LOCAL_S3_PROFILE}`.| 28 | | `DR_LOCAL_S3_PRETRAINED_PREFIX` | Prefix of pretrained model within S3 bucket.| 29 | | `DR_LOCAL_S3_MODEL_PREFIX` | Prefix of model within S3 bucket.| 30 | | `DR_LOCAL_S3_BUCKET` | Name of S3 bucket which will be used during the session.| 31 | | `DR_LOCAL_S3_CUSTOM_FILES_PREFIX` | Prefix of configuration files within S3 bucket.| 32 | | `DR_LOCAL_S3_TRAINING_PARAMS_FILE` | Name of YAML file that holds parameters sent to robomaker container for configuration during training. 
Filename is relative to `s3://{DR_LOCAL_S3_BUCKET}/{LOCAL_S3_PRETRAINED_PREFIX}`.| 33 | | `DR_LOCAL_S3_EVAL_PARAMS_FILE` | Name of YAML file that holds parameters sent to robomaker container for configuration during evaluations. Filename is relative to `s3://{DR_LOCAL_S3_BUCKET}/{LOCAL_S3_PRETRAINED_PREFIX}`.| 34 | | `DR_LOCAL_S3_MODEL_METADATA_KEY` | Location where the `model_metadata.json` file is stored.| 35 | | `DR_LOCAL_S3_HYPERPARAMETERS_KEY` | Location where the `hyperparameters.json` file is stored.| 36 | | `DR_LOCAL_S3_REWARD_KEY` | Location where the `reward_function.py` file is stored.| 37 | | `DR_LOCAL_S3_METRICS_PREFIX` | Location where the metrics will be stored.| 38 | | `DR_OA_NUMBER_OF_OBSTACLES` | For Object Avoidance, the number of obstacles on the track.| 39 | | `DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES` | Minimum distance in meters between obstacles.| 40 | | `DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS` | If True, obstacle locations will randomly change after each episode.| 41 | | `DR_OA_IS_OBSTACLE_BOT_CAR` | If True, obstacles will appear as a stationary car instead of a box.| 42 | | `DR_OA_OBJECT_POSITIONS` | Positions of boxes on the track. Tuples consisting of progress (fraction [0..1]) and inside or outside lane (-1 or 1). 
Example: `"0.23,-1;0.46,1"`| 43 | | `DR_H2B_IS_LANE_CHANGE` | If True, bot cars will change lanes based on configuration.| 44 | | `DR_H2B_LOWER_LANE_CHANGE_TIME` | Minimum time in seconds before car will change lanes.| 45 | | `DR_H2B_UPPER_LANE_CHANGE_TIME` | Maximum time in seconds before car will change lanes.| 46 | | `DR_H2B_LANE_CHANGE_DISTANCE` | Distance in meters over which the car will change lanes.| 47 | | `DR_H2B_NUMBER_OF_BOT_CARS` | Number of bot cars on the track.| 48 | | `DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS` | Minimum distance between bot cars.| 49 | | `DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS` | If True, bot car locations will randomly change after each episode.| 50 | | `DR_H2B_BOT_CAR_SPEED` | How fast the bot cars go in meters per second.| 51 | | `DR_CLOUD` | Can be `azure`, `aws`, `local` or `remote`; determines how the storage will be configured.| 52 | | `DR_AWS_APP_REGION` | (AWS only) Region for other AWS resources (e.g. Kinesis) | 53 | | `DR_UPLOAD_S3_PROFILE` | AWS Cli profile to be used that holds the 'real' S3 credentials needed to upload a model into AWS DeepRacer.| 54 | | `DR_UPLOAD_S3_BUCKET` | Name of the AWS DeepRacer bucket where models will be uploaded. (Typically starts with `aws-deepracer-`.)| 55 | | `DR_LOCAL_S3_PROFILE` | Name of AWS profile with credentials to be used. Stored in `~/.aws/credentials` unless AWS IAM Roles are used.| 56 | | `DR_GUI_ENABLE` | Enable or disable the Gazebo GUI in Robomaker | 57 | | `DR_KINESIS_STREAM_NAME` | Kinesis stream name. Used if you actually publish to the AWS KVS service. Leave blank if you do not want this. | 58 | | `DR_KINESIS_STREAM_ENABLE` | Enable or disable 'Kinesis Stream', True both publishes to an AWS KVS stream (if name not None), and to the topic `/racecar/deepracer/kvs_stream`. Leave True if you want to watch the car racing. 
| 59 | | `DR_SAGEMAKER_IMAGE` | Determines which sagemaker image will be used for training.| 60 | | `DR_ROBOMAKER_IMAGE` | Determines which robomaker image will be used for training or evaluation.| 61 | | `DR_COACH_IMAGE` | Determines which coach image will be used for training.| 62 | | `DR_WORKERS` | Number of Robomaker workers to be used for training. See additional documentation for more information about this feature.| 63 | | `DR_ROBOMAKER_MOUNT_LOGS` | TODO.| 64 | | `DR_CLOUD_WATCH_ENABLE` | Send log files to AWS CloudWatch.| 65 | | `DR_DOCKER_STYLE` | Valid Options are `Swarm` and `Compose`. Use Compose for openGL optimized containers.| 66 | | `DR_HOST_X` | Uses the host X-windows server, rather than starting one inside of Robomaker. Required for OpenGL images.| 67 | | `DR_WEBVIEWER_PORT` | Port for the web-viewer proxy which enables the streaming of all robomaker workers at once.| 68 | | `CUDA_VISIBLE_DEVICES` | Used in multi-GPU configurations. See additional documentation for more information about this feature.| 69 | 70 | ## Commands 71 | 72 | | Command | Description | 73 | |---------|-------------| 74 | | `dr-update` | Loads in all scripts and environment variables again.| 75 | | `dr-update-env` | Loads in all environment variables from `system.env` and `run.env`.| 76 | | `dr-upload-custom-files` | Uploads changed configuration files from `custom_files/` into `s3://{DR_LOCAL_S3_BUCKET}/custom_files`.| 77 | | `dr-download-custom-files` | Downloads changed configuration files from `s3://{DR_LOCAL_S3_BUCKET}/custom_files` into `custom_files/`.| 78 | | `dr-start-training` | Starts a training session in the local VM based on current configuration.| 79 | | `dr-increment-training` | Updates configuration, setting the current model prefix to pretrained, and incrementing a serial.| 80 | | `dr-stop-training` | Stops the current local training session. 
Uploads log files.| 81 | | `dr-start-evaluation` | Starts an evaluation session in the local VM based on current configuration.| 82 | | `dr-stop-evaluation` | Stops the current local evaluation session. Uploads log files.| 83 | | `dr-start-loganalysis` | Starts a Jupyter log-analysis container, available on port 8888.| 84 | | `dr-stop-loganalysis` | Stops the Jupyter log-analysis container.| 85 | | `dr-start-viewer` | Starts an NGINX proxy to stream all the robomaker streams; accessible remotely.| 86 | | `dr-stop-viewer` | Stops the NGINX proxy.| 87 | | `dr-logs-sagemaker` | Displays the logs from the running Sagemaker container.| 88 | | `dr-logs-robomaker` | Displays the logs from the running Robomaker container.| 89 | | `dr-list-aws-models` | Lists the models that are currently stored in your AWS DeepRacer S3 bucket. | 90 | | `dr-set-upload-model` | Updates the `run.env` with the prefix and name of your selected model. | 91 | | `dr-upload-model` | Uploads the model defined in `DR_LOCAL_S3_MODEL_PREFIX` to the AWS DeepRacer S3 prefix defined in `DR_UPLOAD_S3_PREFIX` | 92 | | `dr-download-model` | Downloads a file from a 'real' S3 location into a local prefix of choice. 
| 93 | -------------------------------------------------------------------------------- /utils/submit-monitor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import getopt 5 | import os 6 | import traceback 7 | import pickle 8 | import urllib.request 9 | 10 | import boto3 11 | from botocore.exceptions import ClientError 12 | 13 | try: 14 | import pandas as pd 15 | from deepracer import boto3_enhancer 16 | except ImportError: 17 | print("You need to install pandas and deepracer-utils to use this utility.") 18 | sys.exit(1) 19 | 20 | dr = None 21 | 22 | 23 | def main(): 24 | 25 | # Parse Arguments 26 | try: 27 | opts, _ = getopt.getopt( 28 | sys.argv[1:], 29 | "lvsghm:b:", 30 | ["logs", "verbose", "summary", "graphics", "help", "model=", "board="], 31 | ) 32 | except getopt.GetoptError as err: 33 | # print help information and exit: 34 | print(err) # will print something like "option -x not recognized" 35 | usage() 36 | sys.exit(2) 37 | 38 | logs_path = "{}/data/logs/leaderboards".format(os.environ.get("DR_DIR", None)) 39 | 40 | download_logs = False 41 | download_videos = False 42 | verbose = False 43 | create_summary = False 44 | model_name = None 45 | leaderboard_guid = None 46 | leaderboard_arn = None 47 | 48 | for opt, arg in opts: 49 | if opt in ("-l", "--logs"): 50 | download_logs = True 51 | elif opt in ("-g", "--graphics"): 52 | download_videos = True 53 | elif opt in ("-v", "--verbose"): 54 | verbose = True 55 | elif opt in ("-s", "--summary"): 56 | create_summary = True 57 | elif opt in ("-m", "--model"): 58 | model_name = arg.strip() 59 | elif opt in ("-b", "--board"): 60 | leaderboard_guid = arg.strip() 61 | elif opt in ("-h", "--help"): 62 | usage() 63 | sys.exit() 64 | 65 | # Prepare Boto3 66 | session = boto3.session.Session( 67 | region_name="us-east-1", 68 | profile_name=os.environ.get("DR_UPLOAD_S3_PROFILE", None), 69 | ) 70 | 71 | global dr 72 | dr = 
boto3_enhancer.deepracer_client(session=session) 73 | 74 | # Find the ARN for my model 75 | my_model = find_model(model_name) 76 | 77 | if my_model is not None: 78 | my_model_arn = my_model["ModelArn"].values[0] 79 | if verbose: 80 | print("Found ModelARN for model {}: {}".format(model_name, my_model_arn)) 81 | else: 82 | print("Did not find model with name {}".format(model_name)) 83 | sys.exit(1) 84 | 85 | if leaderboard_guid.startswith('arn'): 86 | leaderboard_arn = leaderboard_guid 87 | 88 | # Find the leaderboard 89 | if not leaderboard_arn: 90 | leaderboard_arn = find_leaderboard(leaderboard_guid) 91 | 92 | if leaderboard_arn is not None: 93 | if verbose: 94 | print("Found Leaderboard with ARN {}".format(leaderboard_arn)) 95 | else: 96 | print("Did not find Leaderboard with ARN {}".format(leaderboard_arn)) 97 | sys.exit(1) 98 | 99 | # Load summary from file if we are interested in it! 100 | if create_summary: 101 | 102 | pkl_f = "{}/{}/summary.pkl".format(logs_path, leaderboard_guid) 103 | if os.path.isfile(pkl_f): 104 | infile = open(pkl_f, "rb") 105 | my_submissions = pickle.load(infile) 106 | infile.close() 107 | else: 108 | my_submissions = {} 109 | my_submissions["LeaderboardSubmissions"] = [] 110 | 111 | dir_path = os.path.dirname(pkl_f) 112 | os.makedirs(dir_path, exist_ok=True) 113 | 114 | # Collect data about latest submission 115 | submission_response = dr.get_latest_user_submission(LeaderboardArn=leaderboard_arn) 116 | latest_submission = submission_response["LeaderboardSubmission"] 117 | if latest_submission: 118 | jobid = latest_submission["ActivityArn"].split("/", 1)[1] 119 | print( 120 | "Job {} has status {}".format( 121 | jobid, latest_submission["LeaderboardSubmissionStatusType"] 122 | ) 123 | ) 124 | 125 | if latest_submission["LeaderboardSubmissionStatusType"] == "SUCCESS": 126 | if download_logs: 127 | try: 128 | f_url = dr.get_asset_url( 129 | Arn=latest_submission["ActivityArn"], 130 | AssetType="LOGS", 131 | )["Url"] 132 | 
download_file( 133 | "{}/{}/robomaker-{}-{}.tar.gz".format( 134 | logs_path, 135 | leaderboard_guid, 136 | latest_submission["SubmissionTime"], 137 | jobid, 138 | ), 139 | f_url, 140 | ) 141 | except ClientError: 142 | print(("WARNING: Logfile for job {} not available.").format(jobid)) 143 | traceback.print_exc() 144 | 145 | if download_videos: 146 | download_file( 147 | "{}/{}/video-{}-{}.mp4".format( 148 | logs_path, 149 | leaderboard_guid, 150 | latest_submission["SubmissionTime"], 151 | jobid, 152 | ), 153 | latest_submission["SubmissionVideoS3path"], 154 | ) 155 | 156 | # Submit again 157 | _ = dr.create_leaderboard_submission( 158 | ModelArn=my_model_arn, LeaderboardArn=leaderboard_arn 159 | ) 160 | print("Submitted {} to {}.".format(model_name, leaderboard_arn)) 161 | 162 | elif latest_submission["LeaderboardSubmissionStatusType"] == "ERROR" or latest_submission["LeaderboardSubmissionStatusType"] == "FAILED": 163 | print("Error in previous submission") 164 | if download_logs: 165 | try: 166 | f_url = dr.get_asset_url( 167 | Arn=latest_submission["ActivityArn"], 168 | AssetType="LOGS", 169 | )["Url"] 170 | download_file( 171 | "{}/{}/robomaker-{}-{}.tar.gz".format( 172 | logs_path, 173 | leaderboard_guid, 174 | latest_submission["SubmissionTime"], 175 | jobid, 176 | ), 177 | f_url, 178 | ) 179 | except ClientError: 180 | print(("WARNING: Logfile for job {} not available.").format(jobid)) 181 | traceback.print_exc() 182 | 183 | # Submit again 184 | _ = dr.create_leaderboard_submission( 185 | ModelArn=my_model_arn, LeaderboardArn=leaderboard_arn 186 | ) 187 | print("Submitted {} to {}.".format(model_name, leaderboard_arn)) 188 | 189 | # Maintain our summary 190 | if create_summary: 191 | for idx, i in enumerate(my_submissions["LeaderboardSubmissions"]): 192 | if "SubmissionTime" in i: 193 | if i["SubmissionTime"] == latest_submission["SubmissionTime"]: 194 | del my_submissions["LeaderboardSubmissions"][idx] 195 | else: 196 | del 
my_submissions["LeaderboardSubmissions"][idx] 197 | my_submissions["LeaderboardSubmissions"].append(latest_submission) 198 | 199 | # Save summary 200 | outfile = open(pkl_f, "wb") 201 | pickle.dump(my_submissions, outfile) 202 | outfile.close() 203 | 204 | # Display summary 205 | if verbose: 206 | display_submissions(my_submissions) 207 | 208 | 209 | def download_file(f_name, url): 210 | 211 | dir_path = os.path.dirname(f_name) 212 | os.makedirs(dir_path, exist_ok=True) 213 | if not os.path.isfile(f_name): 214 | print("Downloading {}".format(os.path.basename(f_name))) 215 | urllib.request.urlretrieve(url, f_name) 216 | 217 | 218 | def find_model(model_name): 219 | 220 | m_response = dr.list_models(ModelType="REINFORCEMENT_LEARNING", MaxResults=25) 221 | model_dict = m_response["Models"] 222 | models = pd.DataFrame.from_dict(model_dict) 223 | my_model = models[models["ModelName"] == model_name] 224 | 225 | if my_model.size > 0: 226 | return my_model 227 | 228 | while "NextToken" in m_response: 229 | m_response = dr.list_models( 230 | ModelType="REINFORCEMENT_LEARNING", 231 | MaxResults=50, 232 | NextToken=m_response["NextToken"], 233 | ) 234 | model_dict = m_response["Models"] 235 | 236 | models = pd.DataFrame.from_dict(model_dict) 237 | my_model = models[models["ModelName"] == model_name] 238 | if my_model.size > 0: 239 | return my_model 240 | 241 | return None 242 | 243 | 244 | def find_leaderboard(leaderboard_guid): 245 | leaderboard_arn = "arn:aws:deepracer:::leaderboard/{}".format(leaderboard_guid) 246 | 247 | l_response = dr.list_leaderboards(MaxResults=25) 248 | lboards_dict = l_response["Leaderboards"] 249 | leaderboards = pd.DataFrame.from_dict(l_response["Leaderboards"]) 250 | if leaderboards[leaderboards["Arn"] == leaderboard_arn].size > 0: 251 | return leaderboard_arn 252 | 253 | while "NextToken" in l_response: 254 | l_response = dr.list_leaderboards( 255 | MaxResults=50, NextToken=l_response["NextToken"] 256 | ) 257 | lboards_dict = 
l_response["Leaderboards"] 258 | 259 | leaderboards = pd.DataFrame.from_dict(lboards_dict) 260 | if leaderboards[leaderboards["Arn"] == leaderboard_arn].size > 0: 261 | return leaderboard_arn 262 | 263 | return None 264 | 265 | 266 | def display_submissions(submissions_dict): 267 | # Display status 268 | my_columns = [ 269 | "SubmissionTime", 270 | "TotalLapTime", 271 | "BestLapTime", 272 | "ResetCount", 273 | "CollisionCount", 274 | "OffTrackCount", 275 | "Model", 276 | "JobId", 277 | "Status", 278 | ] 279 | my_submissions_df = pd.DataFrame.from_dict( 280 | submissions_dict["LeaderboardSubmissions"] 281 | ) 282 | my_submissions_df["SubmissionTime"] = ( 283 | my_submissions_df["SubmissionTime"] 284 | .values.astype(dtype="datetime64[ms]") 285 | .astype(dtype="datetime64[s]") 286 | ) 287 | my_submissions_df["TotalLapTime"] = my_submissions_df["TotalLapTime"].values.astype( 288 | dtype="datetime64[ms]" 289 | ) 290 | my_submissions_df["TotalLapTime"] = ( 291 | my_submissions_df["TotalLapTime"].dt.strftime("%M:%S.%f").str[:-4] 292 | ) 293 | my_submissions_df["BestLapTime"] = my_submissions_df["BestLapTime"].values.astype( 294 | dtype="datetime64[ms]" 295 | ) 296 | my_submissions_df["BestLapTime"] = ( 297 | my_submissions_df["BestLapTime"].dt.strftime("%M:%S.%f").str[:-4] 298 | ) 299 | my_submissions_df["JobId"] = my_submissions_df["ActivityArn"].str.split("/").str[1] 300 | my_submissions_df["Status"] = my_submissions_df["LeaderboardSubmissionStatusType"] 301 | my_submissions_df[[None, None, "Model"]] = my_submissions_df.ModelArn.str.split( 302 | "/", expand=True, 303 | ) 304 | 305 | # Display 306 | print("") 307 | print(my_submissions_df[my_columns]) 308 | 309 | 310 | def usage(): 311 | print( 312 | "Usage: submit-monitor.py [-v] [-s] [-l] [-g] -m -b " 313 | ) 314 | print(" -v Verbose output.") 315 | print(" -s Store a summary of all submissions.") 316 | print(" -l Download robomaker logfiles.") 317 | print(" -g Download video recordings.") 318 | print(" -m Display 
name of the model to submit.") 319 | print(" -b GUID or ARN of the leaderboard to submit to.") 320 | sys.exit(1) 321 | 322 | 323 | if __name__ == "__main__": 324 | main() 325 | -------------------------------------------------------------------------------- /scripts/training/prepare-config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import boto3 4 | import sys 5 | import os 6 | import time 7 | import json 8 | import io 9 | import yaml 10 | 11 | config = {} 12 | config['AWS_REGION'] = os.environ.get('DR_AWS_APP_REGION', 'us-east-1') 13 | config['JOB_TYPE'] = 'TRAINING' 14 | config['KINESIS_VIDEO_STREAM_NAME'] = os.environ.get('DR_KINESIS_STREAM_NAME', 'my-kinesis-stream') 15 | config['METRICS_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') 16 | 17 | metrics_prefix = os.environ.get('DR_LOCAL_S3_METRICS_PREFIX', None) 18 | if metrics_prefix is not None: 19 | config['METRICS_S3_OBJECT_KEY'] = '{}/TrainingMetrics.json'.format(metrics_prefix) 20 | else: 21 | config['METRICS_S3_OBJECT_KEY'] = 'DeepRacer-Metrics/TrainingMetrics-{}.json'.format(str(round(time.time()))) 22 | 23 | config['MODEL_METADATA_FILE_S3_KEY'] = os.environ.get('DR_LOCAL_S3_MODEL_METADATA_KEY', 'custom_files/model_metadata.json') 24 | config['REWARD_FILE_S3_KEY'] = os.environ.get('DR_LOCAL_S3_REWARD_KEY', 'custom_files/reward_function.py') 25 | config['ROBOMAKER_SIMULATION_JOB_ACCOUNT_ID'] = os.environ.get('', 'Dummy') 26 | config['NUM_WORKERS'] = os.environ.get('DR_WORKERS', 1) 27 | config['SAGEMAKER_SHARED_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') 28 | config['SAGEMAKER_SHARED_S3_PREFIX'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker') 29 | config['SIMTRACE_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') 30 | config['SIMTRACE_S3_PREFIX'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker') 31 | config['TRAINING_JOB_ARN'] = 'arn:Dummy' 32 | 33 | # 
Car and training 34 | config['BODY_SHELL_TYPE'] = os.environ.get('DR_CAR_BODY_SHELL_TYPE', 'deepracer') 35 | if config['BODY_SHELL_TYPE'] == 'deepracer': 36 | config['CAR_COLOR'] = os.environ.get('DR_CAR_COLOR', 'Red') 37 | config['CAR_COLOR'] = os.environ.get('DR_CAR_COLOR', 'Red') 38 | config['CAR_NAME'] = os.environ.get('DR_CAR_NAME', 'MyCar') 39 | config['RACE_TYPE'] = os.environ.get('DR_RACE_TYPE', 'TIME_TRIAL') 40 | config['WORLD_NAME'] = os.environ.get('DR_WORLD_NAME', 'LGSWide') 41 | config['DISPLAY_NAME'] = os.environ.get('DR_DISPLAY_NAME', 'racer1') 42 | config['RACER_NAME'] = os.environ.get('DR_RACER_NAME', 'racer1') 43 | 44 | config['ALTERNATE_DRIVING_DIRECTION'] = os.environ.get('DR_TRAIN_ALTERNATE_DRIVING_DIRECTION', os.environ.get('DR_ALTERNATE_DRIVING_DIRECTION', 'false')) 45 | config['CHANGE_START_POSITION'] = os.environ.get('DR_TRAIN_CHANGE_START_POSITION', os.environ.get('DR_CHANGE_START_POSITION', 'true')) 46 | config['ROUND_ROBIN_ADVANCE_DIST'] = os.environ.get('DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST', '0.05') 47 | config['START_POSITION_OFFSET'] = os.environ.get('DR_TRAIN_START_POSITION_OFFSET', '0.00') 48 | config['ENABLE_DOMAIN_RANDOMIZATION'] = os.environ.get('DR_ENABLE_DOMAIN_RANDOMIZATION', 'false') 49 | config['MIN_EVAL_TRIALS'] = os.environ.get('DR_TRAIN_MIN_EVAL_TRIALS', '5') 50 | 51 | # Object Avoidance 52 | if config['RACE_TYPE'] == 'OBJECT_AVOIDANCE': 53 | config['NUMBER_OF_OBSTACLES'] = os.environ.get('DR_OA_NUMBER_OF_OBSTACLES', '6') 54 | config['MIN_DISTANCE_BETWEEN_OBSTACLES'] = os.environ.get('DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES', '2.0') 55 | config['RANDOMIZE_OBSTACLE_LOCATIONS'] = os.environ.get('DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS', 'True') 56 | config['IS_OBSTACLE_BOT_CAR'] = os.environ.get('DR_OA_IS_OBSTACLE_BOT_CAR', 'false') 57 | 58 | object_position_str = os.environ.get('DR_OA_OBJECT_POSITIONS', "") 59 | if object_position_str != "": 60 | object_positions = [] 61 | for o in object_position_str.split(";"): 62 | 
object_positions.append(o) 63 | config['OBJECT_POSITIONS'] = object_positions 64 | config['NUMBER_OF_OBSTACLES'] = str(len(object_positions)) 65 | 66 | # Head to Bot 67 | if config['RACE_TYPE'] == 'HEAD_TO_BOT': 68 | config['IS_LANE_CHANGE'] = os.environ.get('DR_H2B_IS_LANE_CHANGE', 'False') 69 | config['LOWER_LANE_CHANGE_TIME'] = os.environ.get('DR_H2B_LOWER_LANE_CHANGE_TIME', '3.0') 70 | config['UPPER_LANE_CHANGE_TIME'] = os.environ.get('DR_H2B_UPPER_LANE_CHANGE_TIME', '5.0') 71 | config['LANE_CHANGE_DISTANCE'] = os.environ.get('DR_H2B_LANE_CHANGE_DISTANCE', '1.0') 72 | config['NUMBER_OF_BOT_CARS'] = os.environ.get('DR_H2B_NUMBER_OF_BOT_CARS', '0') 73 | config['MIN_DISTANCE_BETWEEN_BOT_CARS'] = os.environ.get('DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS', '2.0') 74 | config['RANDOMIZE_BOT_CAR_LOCATIONS'] = os.environ.get('DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS', 'False') 75 | config['BOT_CAR_SPEED'] = os.environ.get('DR_H2B_BOT_CAR_SPEED', '0.2') 76 | config['PENALTY_SECONDS'] = os.environ.get('DR_H2B_BOT_CAR_PENALTY', '2.0') 77 | 78 | s3_endpoint_url = os.environ.get('DR_LOCAL_S3_ENDPOINT_URL', None) 79 | s3_region = config['AWS_REGION'] 80 | s3_bucket = config['SAGEMAKER_SHARED_S3_BUCKET'] 81 | s3_prefix = config['SAGEMAKER_SHARED_S3_PREFIX'] 82 | s3_mode = os.environ.get('DR_LOCAL_S3_AUTH_MODE','profile') 83 | if s3_mode == 'profile': 84 | s3_profile = os.environ.get('DR_LOCAL_S3_PROFILE', 'default') 85 | else: # mode is 'role' 86 | s3_profile = None 87 | s3_yaml_name = os.environ.get('DR_LOCAL_S3_TRAINING_PARAMS_FILE', 'training_params.yaml') 88 | yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name)) 89 | 90 | session = boto3.session.Session(profile_name=s3_profile) 91 | s3_client = session.client('s3', region_name=s3_region, endpoint_url=s3_endpoint_url) 92 | 93 | yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name)) 94 | local_yaml_path = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'tmp', 'training-params-' + str(round(time.time())) 
+ '.yaml')) 95 | 96 | with open(local_yaml_path, 'w') as yaml_file: 97 | yaml.dump(config, yaml_file, default_flow_style=False, default_style='\'', explicit_start=True) 98 | 99 | # Copy the reward function to the s3 prefix bucket for compatability with DeepRacer console. 100 | reward_function_key = os.path.normpath(os.path.join(s3_prefix, "reward_function.py")) 101 | copy_source = { 102 | 'Bucket': s3_bucket, 103 | 'Key': config['REWARD_FILE_S3_KEY'] 104 | } 105 | s3_client.copy(copy_source, Bucket=s3_bucket, Key=reward_function_key) 106 | 107 | # Training with different configurations on each worker (aka Multi Config training) 108 | config['MULTI_CONFIG'] = os.environ.get('DR_TRAIN_MULTI_CONFIG', 'False') 109 | num_workers = int(config['NUM_WORKERS']) 110 | 111 | if config['MULTI_CONFIG'] == "True" and num_workers > 0: 112 | 113 | multi_config = {} 114 | multi_config['multi_config'] = [None] * num_workers 115 | 116 | for i in range(1,num_workers+1,1): 117 | if i == 1: 118 | # copy training_params to training_params_1 119 | s3_yaml_name_list = s3_yaml_name.split('.') 120 | s3_yaml_name_temp = s3_yaml_name_list[0] + "_%d.yaml" % i 121 | 122 | #upload additional training params files 123 | yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name_temp)) 124 | s3_client.upload_file(Bucket=s3_bucket, Key=yaml_key, Filename=local_yaml_path) 125 | 126 | # Store in multi_config array 127 | multi_config['multi_config'][i - 1] = {'config_file': s3_yaml_name_temp, 128 | 'world_name': config['WORLD_NAME']} 129 | 130 | else: # i >= 2 131 | #read in additional configuration file. 
format of file must be worker#-run.env 132 | location = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'worker-{}.env'.format(i))) 133 | with open(location, 'r') as fh: 134 | vars_dict = dict( 135 | tuple(line.split('=')) 136 | for line in fh.read().splitlines() if not line.startswith('#') 137 | ) 138 | 139 | # Reset parameters for the configuration of this worker number 140 | os.environ.update(vars_dict) 141 | 142 | # Update car and training parameters 143 | config.update({'WORLD_NAME': os.environ.get('DR_WORLD_NAME')}) 144 | config.update({'RACE_TYPE': os.environ.get('DR_RACE_TYPE')}) 145 | config.update({'CAR_COLOR': os.environ.get('DR_CAR_COLOR')}) 146 | config.update({'ALTERNATE_DRIVING_DIRECTION': os.environ.get('DR_TRAIN_ALTERNATE_DRIVING_DIRECTION')}) 147 | config.update({'CHANGE_START_POSITION': os.environ.get('DR_TRAIN_CHANGE_START_POSITION')}) 148 | config.update({'ROUND_ROBIN_ADVANCE_DIST': os.environ.get('DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST')}) 149 | config.update({'ENABLE_DOMAIN_RANDOMIZATION': os.environ.get('DR_ENABLE_DOMAIN_RANDOMIZATION')}) 150 | config.update({'START_POSITION_OFFSET': os.environ.get('DR_TRAIN_START_POSITION_OFFSET', '0.00')}) 151 | 152 | # Update Object Avoidance parameters 153 | if config['RACE_TYPE'] == 'OBJECT_AVOIDANCE': 154 | config.update({'NUMBER_OF_OBSTACLES': os.environ.get('DR_OA_NUMBER_OF_OBSTACLES')}) 155 | config.update({'MIN_DISTANCE_BETWEEN_OBSTACLES': os.environ.get('DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES')}) 156 | config.update({'RANDOMIZE_OBSTACLE_LOCATIONS': os.environ.get('DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS')}) 157 | config.update({'IS_OBSTACLE_BOT_CAR': os.environ.get('DR_OA_IS_OBSTACLE_BOT_CAR')}) 158 | object_position_str = os.environ.get('DR_OA_OBJECT_POSITIONS', "") 159 | if object_position_str != "": 160 | object_positions = [] 161 | for o in object_position_str.replace('"','').split(";"): 162 | object_positions.append(o) 163 | config.update({'OBJECT_POSITIONS': object_positions}) 164 | 
config.update({'NUMBER_OF_OBSTACLES': str(len(object_positions))}) 165 | else: 166 | config.pop('OBJECT_POSITIONS',[]) 167 | else: 168 | config.pop('NUMBER_OF_OBSTACLES', None) 169 | config.pop('MIN_DISTANCE_BETWEEN_OBSTACLES', None) 170 | config.pop('RANDOMIZE_OBSTACLE_LOCATIONS', None) 171 | config.pop('IS_OBSTACLE_BOT_CAR', None) 172 | config.pop('OBJECT_POSITIONS',[]) 173 | 174 | # Update Head to Bot parameters 175 | if config['RACE_TYPE'] == 'HEAD_TO_BOT': 176 | config.update({'IS_LANE_CHANGE': os.environ.get('DR_H2B_IS_LANE_CHANGE')}) 177 | config.update({'LOWER_LANE_CHANGE_TIME': os.environ.get('DR_H2B_LOWER_LANE_CHANGE_TIME')}) 178 | config.update({'UPPER_LANE_CHANGE_TIME': os.environ.get('DR_H2B_UPPER_LANE_CHANGE_TIME')}) 179 | config.update({'LANE_CHANGE_DISTANCE': os.environ.get('DR_H2B_LANE_CHANGE_DISTANCE')}) 180 | config.update({'NUMBER_OF_BOT_CARS': os.environ.get('DR_H2B_NUMBER_OF_BOT_CARS')}) 181 | config.update({'MIN_DISTANCE_BETWEEN_BOT_CARS': os.environ.get('DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS')}) 182 | config.update({'RANDOMIZE_BOT_CAR_LOCATIONS': os.environ.get('DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS')}) 183 | config.update({'BOT_CAR_SPEED': os.environ.get('DR_H2B_BOT_CAR_SPEED')}) 184 | config.update({'PENALTY_SECONDS': os.environ.get('DR_H2B_BOT_CAR_PENALTY')}) 185 | else: 186 | config.pop('IS_LANE_CHANGE', None) 187 | config.pop('LOWER_LANE_CHANGE_TIME', None) 188 | config.pop('UPPER_LANE_CHANGE_TIME', None) 189 | config.pop('LANE_CHANGE_DISTANCE', None) 190 | config.pop('NUMBER_OF_BOT_CARS', None) 191 | config.pop('MIN_DISTANCE_BETWEEN_BOT_CARS', None) 192 | config.pop('RANDOMIZE_BOT_CAR_LOCATIONS', None) 193 | config.pop('BOT_CAR_SPEED', None) 194 | 195 | #split string s3_yaml_name, insert the worker number, and add back on the .yaml extension 196 | s3_yaml_name_list = s3_yaml_name.split('.') 197 | s3_yaml_name_temp = s3_yaml_name_list[0] + "_%d.yaml" % i 198 | 199 | #upload additional training params files 200 | yaml_key = 
os.path.normpath(os.path.join(s3_prefix, s3_yaml_name_temp)) 201 | local_yaml_path = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'tmp', 'training-params-' + str(round(time.time())) + '.yaml')) 202 | with open(local_yaml_path, 'w') as yaml_file: 203 | yaml.dump(config, yaml_file, default_flow_style=False, default_style='\'', explicit_start=True) 204 | s3_client.upload_file(Bucket=s3_bucket, Key=yaml_key, Filename=local_yaml_path) 205 | 206 | # Store in multi_config array 207 | multi_config['multi_config'][i - 1] = {'config_file': s3_yaml_name_temp, 208 | 'world_name': config['WORLD_NAME']} 209 | 210 | print(json.dumps(multi_config)) 211 | 212 | else: 213 | s3_client.upload_file(Bucket=s3_bucket, Key=yaml_key, Filename=local_yaml_path) 214 | --------------------------------------------------------------------------------