├── scripts ├── log-analysis │ ├── stop.sh │ └── start.sh ├── viewer │ ├── stop.sh │ └── start.sh ├── evaluation │ ├── stop.sh │ ├── start.sh │ └── prepare-config.py ├── training │ ├── stop.sh │ ├── increment.sh │ ├── start.sh │ └── prepare-config.py └── upload │ ├── upload-car.sh │ ├── import-model.py │ ├── increment.sh │ ├── download-model.sh │ ├── prepare-config.py │ └── upload-model.sh ├── utils ├── Dockerfile.sagemaker-gpu ├── cuda-check.sh ├── Dockerfile.gpu-detect ├── cuda-check-tf.py ├── start-xorg.sh ├── setup-xorg.sh ├── start-local-browser.sh ├── sample-createspot.sh └── submit-monitor.py ├── docker ├── docker-compose-mount.yml ├── docker-compose-robomaker-multi.yml ├── docker-compose-endpoint.yml ├── docker-compose-local-xorg.yml ├── docker-compose-webviewer-swarm.yml ├── docker-compose-keys.yml ├── docker-compose-webviewer.yml ├── docker-compose-eval-swarm.yml ├── docker-compose-training-swarm.yml ├── docker-compose-azure.yml ├── docker-compose-cwlog.yml ├── docker-compose-local.yml ├── docker-compose-eval.yml └── docker-compose-training.yml ├── defaults ├── dependencies.json ├── model_metadata_sac.json ├── hyperparameters.json ├── template-system.env ├── model_metadata.json ├── template-worker.env ├── reward_function.py ├── debug-reward_function.py └── template-run.env ├── docs ├── _config.yml ├── head-to-head.md ├── multi_run.md ├── video.md ├── multi_worker.md ├── docker.md ├── opengl.md ├── index.md ├── upload.md ├── windows.md ├── multi_gpu.md ├── installation.md └── reference.md ├── .gitignore ├── bin ├── detect.sh ├── autorun.sh ├── runonce.sh ├── prepare.sh ├── init.sh ├── activate.sh └── scripts_wrapper.sh └── README.md /scripts/log-analysis/stop.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | docker stop loganalysis 4 | -------------------------------------------------------------------------------- /utils/Dockerfile.sagemaker-gpu: 
-------------------------------------------------------------------------------- 1 | FROM awsdeepracercommunity/deepracer-sagemaker:5.0.0-gpu 2 | ENV CUDA_VISIBLE_DEVICES=0 3 | -------------------------------------------------------------------------------- /docker/docker-compose-mount.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | services: 4 | robomaker: 5 | volumes: 6 | - "${DR_MOUNT_DIR}:/root/.ros/log" 7 | -------------------------------------------------------------------------------- /docker/docker-compose-robomaker-multi.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | services: 4 | robomaker: 5 | volumes: 6 | - "${DR_DIR}/tmp/comms.${DR_RUN_ID}:/mnt/comms" 7 | -------------------------------------------------------------------------------- /defaults/dependencies.json: -------------------------------------------------------------------------------- 1 | { 2 | "master_version": "5.0", 3 | "containers": { 4 | "rl_coach": "5.0.0", 5 | "robomaker": "5.0.3", 6 | "sagemaker": "5.0.0" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | theme: jekyll-theme-slate 3 | markdown: GFM 4 | name: Deepracer-for-Cloud 5 | plugins: 6 | - jekyll-relative-links 7 | relative_links: 8 | enabled: true 9 | collections: false -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | custom_files/ 3 | logs/ 4 | docker/volumes/ 5 | recording/ 6 | recording 7 | /*.env 8 | /*.bak 9 | /*.json 10 | DONE 11 | data/ 12 | tmp/ 13 | autorun.s3url 14 | nohup.out 15 | start.sh -------------------------------------------------------------------------------- 
/docker/docker-compose-endpoint.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | services: 4 | rl_coach: 5 | environment: 6 | - S3_ENDPOINT_URL=${DR_MINIO_URL} 7 | robomaker: 8 | environment: 9 | - S3_ENDPOINT_URL=${DR_MINIO_URL} 10 | -------------------------------------------------------------------------------- /utils/cuda-check.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CONTAINER_ID=$(docker create --rm -ti -e CUDA_VISIBLE_DEVICES --name cuda-check awsdeepracercommunity/deepracer-robomaker:$DR_ROBOMAKER_IMAGE "python3 cuda-check-tf.py") 4 | docker cp $DR_DIR/utils/cuda-check-tf.py $CONTAINER_ID:/opt/install/ 5 | docker start -a $CONTAINER_ID -------------------------------------------------------------------------------- /utils/Dockerfile.gpu-detect: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.4.2-base-ubuntu18.04 2 | RUN apt-get update && apt-get install -y --no-install-recommends wget python3 3 | RUN wget https://gist.githubusercontent.com/f0k/63a664160d016a491b2cbea15913d549/raw/f25b6b38932cfa489150966ee899e5cc899bf4a6/cuda_check.py 4 | CMD ["python3","cuda_check.py"] -------------------------------------------------------------------------------- /defaults/model_metadata_sac.json: -------------------------------------------------------------------------------- 1 | { 2 | "action_space": {"speed": {"high": 2, "low": 1}, "steering_angle": {"high": 30, "low": -30}}, 3 | "sensor": ["FRONT_FACING_CAMERA"], 4 | "neural_network": "DEEP_CONVOLUTIONAL_NETWORK_SHALLOW", 5 | "training_algorithm": "sac", 6 | "action_space_type": "continuous", 7 | "version": "4" 8 | } 9 | -------------------------------------------------------------------------------- /docker/docker-compose-local-xorg.yml: -------------------------------------------------------------------------------- 1 | 
version: '3.7' 2 | 3 | services: 4 | robomaker: 5 | environment: 6 | - DISPLAY 7 | - USE_EXTERNAL_X=${DR_HOST_X} 8 | - XAUTHORITY=/root/.Xauthority 9 | - QT_X11_NO_MITSHM=1 10 | volumes: 11 | - '/tmp/.X11-unix/:/tmp/.X11-unix' 12 | - '${XAUTHORITY}:/root/.Xauthority' -------------------------------------------------------------------------------- /docker/docker-compose-webviewer-swarm.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | networks: 4 | default: 5 | external: true 6 | name: sagemaker-local 7 | 8 | services: 9 | proxy: 10 | deploy: 11 | restart_policy: 12 | condition: none 13 | replicas: 1 14 | placement: 15 | constraints: [node.labels.Sagemaker == true ] 16 | -------------------------------------------------------------------------------- /docker/docker-compose-keys.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | services: 4 | rl_coach: 5 | environment: 6 | - AWS_ACCESS_KEY_ID=${DR_LOCAL_ACCESS_KEY_ID} 7 | - AWS_SECRET_ACCESS_KEY=${DR_LOCAL_SECRET_ACCESS_KEY} 8 | robomaker: 9 | environment: 10 | - AWS_ACCESS_KEY_ID=${DR_LOCAL_ACCESS_KEY_ID} 11 | - AWS_SECRET_ACCESS_KEY=${DR_LOCAL_SECRET_ACCESS_KEY} 12 | -------------------------------------------------------------------------------- /docker/docker-compose-webviewer.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | networks: 4 | default: 5 | external: true 6 | name: sagemaker-local 7 | 8 | services: 9 | proxy: 10 | image: nginx 11 | ports: 12 | - "${DR_WEBVIEWER_PORT}:80" 13 | volumes: 14 | - ${DR_VIEWER_HTML}:/usr/share/nginx/html/index.html 15 | - ${DR_NGINX_CONF}:/etc/nginx/conf.d/default.conf 16 | 17 | -------------------------------------------------------------------------------- /scripts/viewer/stop.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 
3 | STACK_NAME="deepracer-$DR_RUN_ID-viewer" 4 | COMPOSE_FILES=$DR_DIR/docker/docker-compose-webviewer.yml 5 | 6 | # Check if we will use Docker Swarm or Docker Compose 7 | if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; 8 | then 9 | docker stack rm $STACK_NAME 10 | else 11 | docker-compose -f $COMPOSE_FILES -p $STACK_NAME --log-level ERROR down 12 | fi -------------------------------------------------------------------------------- /scripts/log-analysis/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | docker run --rm -d -p "8888:8888" \ 4 | -v `pwd`/../../data/logs:/workspace/logs \ 5 | -v `pwd`/../../docker/volumes/.aws:/root/.aws \ 6 | -v `pwd`/../../data/analysis:/workspace/analysis \ 7 | -v `pwd`/../../data/minio:/workspace/minio \ 8 | --name loganalysis \ 9 | --network sagemaker-local \ 10 | awsdeepracercommunity/deepracer-analysis:$DR_ANALYSIS_IMAGE 11 | 12 | docker logs -f loganalysis -------------------------------------------------------------------------------- /utils/cuda-check-tf.py: -------------------------------------------------------------------------------- 1 | from tensorflow.python.client import device_lib 2 | import tensorflow as tf 3 | 4 | def get_available_gpus(): 5 | local_device_protos = device_lib.list_local_devices() 6 | return [x.name for x in local_device_protos if x.device_type == 'GPU'] 7 | 8 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.05) 9 | sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 10 | print(get_available_gpus()) 11 | -------------------------------------------------------------------------------- /defaults/hyperparameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "batch_size": 64, 3 | "beta_entropy": 0.01, 4 | "discount_factor": 0.995, 5 | "e_greedy_value": 0.05, 6 | "epsilon_steps": 10000, 7 | "exploration_type": "categorical", 8 | "loss_type": "huber", 9 | 
"lr": 0.0003, 10 | "num_episodes_between_training": 20, 11 | "num_epochs": 10, 12 | "stack_size": 1, 13 | "term_cond_avg_score": 350.0, 14 | "term_cond_max_episodes": 1000, 15 | "sac_alpha": 0.2 16 | } -------------------------------------------------------------------------------- /docker/docker-compose-eval-swarm.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | services: 4 | rl_coach: 5 | deploy: 6 | restart_policy: 7 | condition: none 8 | placement: 9 | constraints: [node.labels.Sagemaker == true ] 10 | robomaker: 11 | deploy: 12 | restart_policy: 13 | condition: none 14 | replicas: 1 15 | placement: 16 | constraints: [node.labels.Robomaker == true ] 17 | environment: 18 | - DOCKER_REPLICA_SLOT={{.Task.Slot}} -------------------------------------------------------------------------------- /docker/docker-compose-training-swarm.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | services: 4 | rl_coach: 5 | deploy: 6 | restart_policy: 7 | condition: none 8 | placement: 9 | constraints: [node.labels.Sagemaker == true ] 10 | robomaker: 11 | deploy: 12 | restart_policy: 13 | condition: none 14 | replicas: ${DR_WORKERS} 15 | placement: 16 | constraints: [node.labels.Robomaker == true ] 17 | environment: 18 | - DOCKER_REPLICA_SLOT={{.Task.Slot}} -------------------------------------------------------------------------------- /docker/docker-compose-azure.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | networks: 4 | default: 5 | external: true 6 | name: sagemaker-local 7 | 8 | services: 9 | minio: 10 | image: minio/minio 11 | ports: 12 | - "9000:9000" 13 | command: gateway azure 14 | environment: 15 | - MINIO_ACCESS_KEY=${DR_LOCAL_ACCESS_KEY_ID} 16 | - MINIO_SECRET_KEY=${DR_LOCAL_SECRET_ACCESS_KEY} 17 | - AWS_ACCESS_KEY_ID=${DR_LOCAL_ACCESS_KEY_ID} 18 | - 
AWS_SECRET_ACCESS_KEY=${DR_LOCAL_SECRET_ACCESS_KEY} 19 | 20 | -------------------------------------------------------------------------------- /scripts/evaluation/stop.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | STACK_NAME="deepracer-eval-$DR_RUN_ID" 4 | RUN_NAME=${DR_LOCAL_S3_MODEL_PREFIX} 5 | 6 | # Check if we will use Docker Swarm or Docker Compose 7 | if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; 8 | then 9 | docker stack rm $STACK_NAME 10 | else 11 | COMPOSE_FILES=$(echo ${DR_EVAL_COMPOSE_FILE} | cut -f1-2 -d\ ) 12 | export DR_CURRENT_PARAMS_FILE="" 13 | export ROBOMAKER_COMMAND="" 14 | docker-compose $COMPOSE_FILES -p $STACK_NAME --log-level ERROR down 15 | fi -------------------------------------------------------------------------------- /docker/docker-compose-cwlog.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | services: 4 | rl_coach: 5 | logging: 6 | driver: awslogs 7 | options: 8 | awslogs-group: '/deepracer-for-cloud' 9 | awslogs-create-group: 'true' 10 | awslogs-region: ${DR_AWS_APP_REGION} 11 | tag: "{{.Name}}" 12 | robomaker: 13 | logging: 14 | driver: awslogs 15 | options: 16 | awslogs-group: '/deepracer-for-cloud' 17 | awslogs-create-group: 'true' 18 | awslogs-region: ${DR_AWS_APP_REGION} 19 | tag: "{{.Name}}" -------------------------------------------------------------------------------- /docker/docker-compose-local.yml: -------------------------------------------------------------------------------- 1 | 2 | version: '3.7' 3 | 4 | networks: 5 | default: 6 | external: true 7 | name: sagemaker-local 8 | 9 | services: 10 | minio: 11 | image: minio/minio 12 | ports: 13 | - "9000:9000" 14 | - "9001:9001" 15 | command: server /data --console-address ":9001" 16 | environment: 17 | - MINIO_ROOT_USER=${DR_LOCAL_ACCESS_KEY_ID} 18 | - MINIO_ROOT_PASSWORD=${DR_LOCAL_SECRET_ACCESS_KEY} 19 | - MINIO_UID 20 | - MINIO_GID 21 | 
- MINIO_USERNAME 22 | - MINIO_GROUPNAME 23 | volumes: 24 | - ${DR_DIR}/data/minio:/data 25 | -------------------------------------------------------------------------------- /utils/start-xorg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export DISPLAY=$DR_DISPLAY 3 | 4 | nohup sudo xinit /usr/bin/jwm -- /usr/lib/xorg/Xorg $DISPLAY -config $DR_DIR/tmp/xorg.conf > $DR_DIR/tmp/xorg.log 2>&1 & 5 | sleep 1 6 | 7 | if [[ "${DR_GUI_ENABLE,,}" == "true" ]]; then 8 | xrandr -s 1400x900 9 | x11vnc -bg -forever -no6 -nopw -rfbport 5901 -rfbportv6 -1 -loop -display WAIT$DISPLAY & 10 | sleep 1 11 | fi 12 | 13 | xauth generate $DISPLAY 14 | export XAUTHORITY=~/.Xauthority 15 | 16 | if timeout 1s xset q &>/dev/null; then 17 | echo "X Server started on display $DISPLAY" 18 | else 19 | echo "Server failed to start on display $DISPLAY" 20 | fi -------------------------------------------------------------------------------- /defaults/template-system.env: -------------------------------------------------------------------------------- 1 | DR_CLOUD= 2 | DR_AWS_APP_REGION= 3 | DR_UPLOAD_S3_PROFILE=default 4 | DR_UPLOAD_S3_BUCKET= 5 | DR_UPLOAD_S3_ROLE= 6 | DR_LOCAL_S3_BUCKET=bucket 7 | DR_LOCAL_S3_PROFILE= 8 | DR_GUI_ENABLE=False 9 | DR_KINESIS_STREAM_NAME= 10 | DR_KINESIS_STREAM_ENABLE=True 11 | DR_SAGEMAKER_IMAGE= 12 | DR_ROBOMAKER_IMAGE= 13 | DR_ANALYSIS_IMAGE=cpu 14 | DR_COACH_IMAGE= 15 | DR_WORKERS=1 16 | DR_ROBOMAKER_MOUNT_LOGS=False 17 | DR_CLOUD_WATCH_ENABLE=False 18 | DR_DOCKER_STYLE=swarm 19 | DR_HOST_X=False 20 | DR_WEBVIEWER_PORT=8100 21 | # DR_DISPLAY=:99 22 | # DR_REMOTE_MINIO_URL=http://mynas:9000 23 | # CUDA_VISIBLE_DEVICES=0 -------------------------------------------------------------------------------- /utils/setup-xorg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script to install basic X-Windows on a headless instance (e.g. 
in EC2) 4 | 5 | # Install additional packages 6 | sudo apt-get install xinit xserver-xorg-legacy x11-xserver-utils x11-utils \ 7 | menu mesa-utils xterm jwm x11vnc pkg-config -y --no-install-recommends 8 | 9 | # Configure 10 | sudo sed -i -e "s/console/anybody/" /etc/X11/Xwrapper.config 11 | BUS_ID=$(nvidia-xconfig --query-gpu-info | grep "PCI BusID" | cut -f2- -d: | sed -e 's/^[[:space:]]*//' | head -1) 12 | sudo nvidia-xconfig --busid=$BUS_ID -o $DR_DIR/tmp/xorg.conf 13 | 14 | touch ~/.Xauthority 15 | 16 | sudo tee -a $DR_DIR/tmp/xorg.conf << EOF 17 | 18 | Section "DRI" 19 | Mode 0666 20 | EndSection 21 | EOF 22 | -------------------------------------------------------------------------------- /defaults/model_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "action_space": [ 3 | { 4 | "steering_angle": -30, 5 | "speed": 0.6 6 | }, 7 | { 8 | "steering_angle": -15, 9 | "speed": 0.6 10 | }, 11 | { 12 | "steering_angle": 0, 13 | "speed": 0.6 14 | }, 15 | { 16 | "steering_angle": 15, 17 | "speed": 0.6 18 | }, 19 | { 20 | "steering_angle": 30, 21 | "speed": 0.6 22 | } 23 | ], 24 | "sensor": ["FRONT_FACING_CAMERA"], 25 | "neural_network": "DEEP_CONVOLUTIONAL_NETWORK_SHALLOW", 26 | "training_algorithm": "clipped_ppo", 27 | "action_space_type": "discrete", 28 | "version": "4" 29 | } 30 | -------------------------------------------------------------------------------- /bin/detect.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ## What am I? 4 | if [[ -f /var/run/cloud-init/instance-data.json ]]; 5 | then 6 | # We have a cloud-init environment (Azure or AWS). 
7 | CLOUD_NAME=$(jq -r '.v1."cloud-name"' /var/run/cloud-init/instance-data.json) 8 | if [[ "${CLOUD_NAME}" == "azure" ]]; 9 | then 10 | export CLOUD_NAME 11 | export CLOUD_INSTANCETYPE=$(jq -r '.ds."meta_data".imds.compute."vmSize"' /var/run/cloud-init/instance-data.json) 12 | elif [[ "${CLOUD_NAME}" == "aws" ]]; 13 | then 14 | export CLOUD_NAME 15 | export CLOUD_INSTANCETYPE=$(jq -r '.ds."meta-data"."instance-type"' /var/run/cloud-init/instance-data.json) 16 | else 17 | export CLOUD_NAME=local 18 | fi 19 | else 20 | export CLOUD_NAME=local 21 | fi -------------------------------------------------------------------------------- /defaults/template-worker.env: -------------------------------------------------------------------------------- 1 | DR_WORLD_NAME=reInvent2019_track 2 | DR_RACE_TYPE=TIME_TRIAL 3 | DR_CAR_COLOR=Blue 4 | DR_ENABLE_DOMAIN_RANDOMIZATION=False 5 | DR_TRAIN_CHANGE_START_POSITION=True 6 | DR_TRAIN_ALTERNATE_DRIVING_DIRECTION=False 7 | DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST=0.05 8 | DR_TRAIN_START_POSITION_OFFSET=0.0 9 | DR_OA_NUMBER_OF_OBSTACLES=6 10 | DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES=2.0 11 | DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS=False 12 | DR_OA_IS_OBSTACLE_BOT_CAR=False 13 | DR_OA_OBJECT_POSITIONS= 14 | DR_H2B_IS_LANE_CHANGE=False 15 | DR_H2B_LOWER_LANE_CHANGE_TIME=3.0 16 | DR_H2B_UPPER_LANE_CHANGE_TIME=5.0 17 | DR_H2B_LANE_CHANGE_DISTANCE=1.0 18 | DR_H2B_NUMBER_OF_BOT_CARS=3 19 | DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS=2.0 20 | DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS=False 21 | DR_H2B_BOT_CAR_SPEED=0.2 22 | -------------------------------------------------------------------------------- /docs/head-to-head.md: -------------------------------------------------------------------------------- 1 | # Head-to-Head Race (Beta) 2 | 3 | It is possible to run a head-to-head race, similar to the races in the brackets 4 | run by AWS in the Virtual Circuits to determine the winner of the head-to-bot races. 5 | 6 | This replaces the "Tournament Mode". 
7 | 8 | ## Introduction 9 | 10 | The concept is that you have two models racing each other, one Purple and one Orange Car. One car 11 | is powered by our primary configured model, and the second car is powered by the model in `DR_EVAL_OPP_S3_MODEL_PREFIX` 12 | 13 | ## Configuration 14 | 15 | ### run.env 16 | 17 | Configure `run.env` with the following parameters: 18 | * `DR_RACE_TYPE` should be `HEAD_TO_MODEL`. 19 | * `DR_EVAL_OPP_S3_MODEL_PREFIX` will be the S3 prefix for the secondary model. 20 | * `DR_EVAL_OPP_CAR_NAME` is the display name of this model. 21 | 22 | Metrics, Traces and Videos will be stored in each models' prefix. 23 | 24 | ## Run 25 | 26 | Run the race with `dr-start-evaluation`; one race will be run. -------------------------------------------------------------------------------- /docs/multi_run.md: -------------------------------------------------------------------------------- 1 | # Running Multiple Experiments 2 | 3 | It is possible to run multiple experiments on one computer in parallel. This is possible both in `swarm` and `compose` mode, and is controlled by `DR_RUN_ID` in `run.env`. 4 | 5 | The feature works by creating unique prefixes to the container names: 6 | * In Swarm mode this is done through defining a stack name (default: deepracer-0) 7 | * In Compose mode this is done through adding a project name. 8 | 9 | ## Suggested way to use the feature 10 | 11 | By default `run.env` is loaded when DRfC is activated - but it is possible to load a separate configuration through `source bin/activate.sh `. 12 | 13 | The best way to use this feature is to have a bash-shell per experiment, and to load a separate configuration per shell. 14 | 15 | After activating one can control each experiment independently through using the `dr-*` commands. 16 | 17 | If using local or Azure the S3 / Minio instance will be shared, and is running only once. 
-------------------------------------------------------------------------------- /docker/docker-compose-eval.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | 3 | networks: 4 | default: 5 | external: true 6 | name: sagemaker-local 7 | 8 | services: 9 | rl_coach: 10 | image: awsdeepracercommunity/deepracer-rlcoach:${DR_COACH_IMAGE} 11 | command: ["/bin/bash", "-c", "echo No work for coach in Evaluation Mode"] 12 | robomaker: 13 | image: awsdeepracercommunity/deepracer-robomaker:${DR_ROBOMAKER_IMAGE} 14 | command: ["${ROBOMAKER_COMMAND}"] 15 | ports: 16 | - "${DR_ROBOMAKER_EVAL_PORT}:8080" 17 | environment: 18 | - CUDA_VISIBLE_DEVICES 19 | - DEBUG_REWARD=${DR_EVAL_DEBUG_REWARD} 20 | - WORLD_NAME=${DR_WORLD_NAME} 21 | - NUMBER_OF_TRIALS=${DR_NUMBER_OF_EPISODES} 22 | - MODEL_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} 23 | - MODEL_S3_BUCKET=${DR_LOCAL_S3_BUCKET} 24 | - APP_REGION=${DR_AWS_APP_REGION} 25 | - S3_YAML_NAME=${DR_CURRENT_PARAMS_FILE} 26 | - KINESIS_VIDEO_STREAM_NAME=${DR_KINESIS_STREAM_NAME} 27 | - ENABLE_KINESIS=${DR_KINESIS_STREAM_ENABLE} 28 | - ENABLE_GUI=${DR_GUI_ENABLE} 29 | - ROLLOUT_IDX=0 30 | - RTF_OVERRIDE=${DR_EVAL_RTF} 31 | -------------------------------------------------------------------------------- /defaults/reward_function.py: -------------------------------------------------------------------------------- 1 | def reward_function(params): 2 | ''' 3 | Example of penalize steering, which helps mitigate zig-zag behaviors 4 | ''' 5 | 6 | # Read input parameters 7 | distance_from_center = params['distance_from_center'] 8 | track_width = params['track_width'] 9 | steering = abs(params['steering_angle']) # Only need the absolute steering angle 10 | 11 | # Calculate 3 marks that are farther and father away from the center line 12 | marker_1 = 0.1 * track_width 13 | marker_2 = 0.25 * track_width 14 | marker_3 = 0.5 * track_width 15 | 16 | # Give higher reward if the car is closer to center line 
and vice versa 17 | if distance_from_center <= marker_1: 18 | reward = 1 19 | elif distance_from_center <= marker_2: 20 | reward = 0.5 21 | elif distance_from_center <= marker_3: 22 | reward = 0.1 23 | else: 24 | reward = 1e-3 # likely crashed/ close to off track 25 | 26 | # Steering penality threshold, change the number based on your action space setting 27 | ABS_STEERING_THRESHOLD = 15 28 | 29 | # Penalize reward if the car is steering too much 30 | if steering > ABS_STEERING_THRESHOLD: 31 | reward *= 0.8 32 | 33 | return float(reward) 34 | -------------------------------------------------------------------------------- /bin/autorun.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ## this is the default autorun script 4 | ## file should run automatically after init.sh completes. 5 | ## this script downloads your configured run.env, system.env and any custom container requests 6 | 7 | INSTALL_DIR_TEMP="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." 
>/dev/null 2>&1 && pwd )" 8 | 9 | ## retrieve the s3_location name you sent the instance in user data launch 10 | ## assumed to first line of file 11 | S3_LOCATION=$(awk 'NR==1 {print; exit}' $INSTALL_DIR_TEMP/autorun.s3url) 12 | 13 | source $INSTALL_DIR_TEMP/bin/activate.sh 14 | 15 | ## get the updatated run.env and system.env files and any others you stashed in s3 16 | aws s3 sync s3://$S3_LOCATION $INSTALL_DIR_TEMP 17 | 18 | ## get the right docker containers, if needed 19 | SYSENV="$INSTALL_DIR_TEMP/system.env" 20 | SAGEMAKER_IMAGE=$(cat $SYSENV | grep DR_SAGEMAKER_IMAGE | sed 's/.*=//') 21 | ROBOMAKER_IMAGE=$(cat $SYSENV | grep DR_ROBOMAKER_IMAGE | sed 's/.*=//') 22 | 23 | docker pull awsdeepracercommunity/deepracer-sagemaker:$SAGEMAKER_IMAGE 24 | docker pull awsdeepracercommunity/deepracer-robomaker:$ROBOMAKER_IMAGE 25 | 26 | dr-reload 27 | 28 | date | tee $INSTALL_DIR_TEMP/DONE-AUTORUN 29 | 30 | ## start training 31 | cd $INSTALL_DIR_TEMP/scripts/training 32 | ./start.sh 33 | 34 | -------------------------------------------------------------------------------- /scripts/training/stop.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | STACK_NAME="deepracer-$DR_RUN_ID" 4 | RUN_NAME=${DR_LOCAL_S3_MODEL_PREFIX} 5 | 6 | SAGEMAKER_CONTAINERS=$(docker ps | awk ' /sagemaker/ { print $1 } '| xargs ) 7 | 8 | if [[ -n $SAGEMAKER_CONTAINERS ]]; 9 | then 10 | for CONTAINER in $SAGEMAKER_CONTAINERS; do 11 | CONTAINER_NAME=$(docker ps --format '{{.Names}}' --filter id=$CONTAINER) 12 | CONTAINER_PREFIX=$(echo $CONTAINER_NAME | perl -n -e'/(.*)_(algo(.*))_./; print $1') 13 | COMPOSE_SERVICE_NAME=$(echo $CONTAINER_NAME | perl -n -e'/(.*)_(algo(.*))_./; print $2') 14 | COMPOSE_FILE=$(sudo find /tmp/sagemaker -name docker-compose.yaml -exec grep -l "$RUN_NAME" {} + | grep $CONTAINER_PREFIX) 15 | if [[ -n $COMPOSE_FILE ]]; then 16 | sudo docker-compose -f $COMPOSE_FILE stop $COMPOSE_SERVICE_NAME 17 | docker 
container rm $CONTAINER 18 | fi 19 | done 20 | fi 21 | 22 | # Check if we will use Docker Swarm or Docker Compose 23 | if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; 24 | then 25 | docker stack rm $STACK_NAME 26 | else 27 | COMPOSE_FILES=$(echo ${DR_TRAIN_COMPOSE_FILE} | cut -f1-2 -d\ ) 28 | export DR_CURRENT_PARAMS_FILE="" 29 | export ROBOMAKER_COMMAND="" 30 | docker-compose $COMPOSE_FILES -p $STACK_NAME --log-level ERROR down 31 | fi -------------------------------------------------------------------------------- /scripts/upload/upload-car.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script creates the tar.gz file necessary to operate inside a deepracer physical car 4 | # The file is created directly from within the sagemaker container, using the most recent checkpoint 5 | 6 | # Find name of sagemaker container 7 | SAGEMAKER_CONTAINERS=$(docker ps | awk ' /sagemaker/ { print $1 } '| xargs ) 8 | if [[ -n $SAGEMAKER_CONTAINERS ]]; 9 | then 10 | for CONTAINER in $SAGEMAKER_CONTAINERS; do 11 | CONTAINER_NAME=$(docker ps --format '{{.Names}}' --filter id=$CONTAINER) 12 | CONTAINER_PREFIX=$(echo $CONTAINER_NAME | perl -n -e'/(.*)_(algo(.*))_./; print $1') 13 | echo $CONTAINER_NAME 14 | done 15 | fi 16 | 17 | #create tmp directory if it doesnt already exit 18 | mkdir -p $DR_DIR/tmp/car_upload 19 | cd $DR_DIR/tmp/car_upload 20 | #ensure directory is empty 21 | rm -r $DR_DIR/tmp/car_upload/* 22 | #The files we want are located inside the sagemaker container at /opt/ml/model. 
Copy them to the tmp directory 23 | docker cp $CONTAINER_NAME:/opt/ml/model $DR_DIR/tmp/car_upload 24 | cd $DR_DIR/tmp/car_upload/model 25 | #create a tar.gz file containing all of these files 26 | tar -czvf carfile.tar.gz * 27 | 28 | #upload to s3 29 | aws ${DR_UPLOAD_PROFILE} s3 cp carfile.tar.gz s3://${DR_UPLOAD_S3_BUCKET}/${DR_UPLOAD_S3_PREFIX}/carfile.tar.gz 30 | 31 | -------------------------------------------------------------------------------- /bin/runonce.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $# -eq 0 ]]; then 4 | echo "Schedules a command to be run after the next reboot." 5 | echo "Usage: $(basename $0) " 6 | echo " $(basename $0) -p " 7 | echo " $(basename $0) -r " 8 | else 9 | REMOVE=0 10 | COMMAND=${!#} 11 | SCRIPTPATH=$PATH 12 | 13 | while getopts ":r:p:" optionName; do 14 | case "$optionName" in 15 | r) REMOVE=1; COMMAND=$OPTARG;; 16 | p) SCRIPTPATH=$OPTARG;; 17 | esac 18 | done 19 | 20 | SCRIPT="${HOME}/.$(basename $0)_$(echo $COMMAND | sed 's/[^a-zA-Z0-9_]/_/g')" 21 | 22 | if [[ ! 
-f $SCRIPT ]]; then 23 | echo "PATH=$SCRIPTPATH" >> $SCRIPT 24 | echo "cd $(pwd)" >> $SCRIPT 25 | echo "logger -t $(basename $0) -p local3.info \"COMMAND=$COMMAND ; USER=\$(whoami) ($(logname)) ; PWD=$(pwd) ; PATH=\$PATH\"" >> $SCRIPT 26 | echo "$COMMAND | logger -t $(basename $0) -p local3.info" >> $SCRIPT 27 | echo "$0 -r \"$(echo $COMMAND | sed 's/\"/\\\"/g')\"" >> $SCRIPT 28 | chmod +x $SCRIPT 29 | fi 30 | 31 | CRONTAB="${HOME}/.$(basename $0)_temp_crontab_$RANDOM" 32 | ENTRY="@reboot $SCRIPT" 33 | 34 | echo "$(crontab -l 2>/dev/null)" | grep -v "$ENTRY" | grep -v "^# DO NOT EDIT THIS FILE - edit the master and reinstall.$" | grep -v "^# ([^ ]* installed on [^)]*)$" | grep -v "^# (Cron version [^$]*\$[^$]*\$)$" > $CRONTAB 35 | 36 | if [[ $REMOVE -eq 0 ]]; then 37 | echo "$ENTRY" >> $CRONTAB 38 | fi 39 | 40 | crontab $CRONTAB 41 | rm $CRONTAB 42 | 43 | if [[ $REMOVE -ne 0 ]]; then 44 | rm $SCRIPT 45 | fi 46 | fi 47 | -------------------------------------------------------------------------------- /docker/docker-compose-training.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | networks: 4 | default: 5 | external: true 6 | name: sagemaker-local 7 | 8 | services: 9 | rl_coach: 10 | image: awsdeepracercommunity/deepracer-rlcoach:${DR_COACH_IMAGE} 11 | environment: 12 | - SAGEMAKER_IMAGE=${DR_SAGEMAKER_IMAGE} 13 | - PRETRAINED=${DR_LOCAL_S3_PRETRAINED} 14 | - PRETRAINED_S3_PREFIX=${DR_LOCAL_S3_PRETRAINED_PREFIX} 15 | - PRETRAINED_S3_BUCKET=${DR_LOCAL_S3_BUCKET} 16 | - PRETRAINED_CHECKPOINT=${DR_LOCAL_S3_PRETRAINED_CHECKPOINT} 17 | - MODEL_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} 18 | - MODEL_S3_BUCKET=${DR_LOCAL_S3_BUCKET} 19 | - HYPERPARAMETER_FILE_S3_KEY=${DR_LOCAL_S3_HYPERPARAMETERS_KEY} 20 | - MODELMETADATA_FILE_S3_KEY=${DR_LOCAL_S3_MODEL_METADATA_KEY} 21 | volumes: 22 | - "/var/run/docker.sock:/var/run/docker.sock" 23 | - "/tmp/sagemaker:/tmp/sagemaker" 24 | robomaker: 25 | image: 
awsdeepracercommunity/deepracer-robomaker:${DR_ROBOMAKER_IMAGE} 26 | command: ["${ROBOMAKER_COMMAND}"] 27 | ports: 28 | - "${DR_ROBOMAKER_TRAIN_PORT}:8080" 29 | - "${DR_ROBOMAKER_GUI_PORT}:5900" 30 | environment: 31 | - WORLD_NAME=${DR_WORLD_NAME} 32 | - SAGEMAKER_SHARED_S3_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} 33 | - SAGEMAKER_SHARED_S3_BUCKET=${DR_LOCAL_S3_BUCKET} 34 | - APP_REGION=${DR_AWS_APP_REGION} 35 | - S3_YAML_NAME=${DR_CURRENT_PARAMS_FILE} 36 | - KINESIS_VIDEO_STREAM_NAME=${DR_KINESIS_STREAM_NAME} 37 | - ENABLE_KINESIS=${DR_KINESIS_STREAM_ENABLE} 38 | - ENABLE_GUI=${DR_GUI_ENABLE} 39 | - CUDA_VISIBLE_DEVICES 40 | - MULTI_CONFIG 41 | - RTF_OVERRIDE=${DR_TRAIN_RTF} 42 | -------------------------------------------------------------------------------- /defaults/debug-reward_function.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy 3 | import rospy 4 | import time 5 | 6 | class Reward: 7 | 8 | ''' 9 | Debugging reward function to be used to track performance of local training. 10 | Will print out the Real-Time-Factor (RTF), as well as how many 11 | steps-per-second (sim-time) that the system is able to deliver. 
12 | ''' 13 | 14 | def __init__(self, verbose=False, track_time=False): 15 | self.verbose = verbose 16 | self.track_time = track_time 17 | 18 | if track_time: 19 | TIME_WINDOW=10 20 | self.time = numpy.zeros([TIME_WINDOW, 2]) 21 | 22 | if verbose: 23 | print("Initializing Reward Class") 24 | 25 | def get_time(self): 26 | 27 | wall_time_incr = numpy.max(self.time[:,0]) - numpy.min(self.time[:,0]) 28 | sim_time_incr = numpy.max(self.time[:,1]) - numpy.min(self.time[:,1]) 29 | 30 | rtf = sim_time_incr / wall_time_incr 31 | fps = (self.time.shape[0] - 1) / sim_time_incr 32 | 33 | return rtf, fps 34 | 35 | def record_time(self, steps): 36 | 37 | index = int(steps) % self.time.shape[0] 38 | self.time[index,0] = time.time() 39 | self.time[index,1] = rospy.get_time() 40 | 41 | def reward_function(self, params): 42 | 43 | # Read input parameters 44 | steps = params["steps"] 45 | 46 | if self.track_time: 47 | self.record_time(steps) 48 | 49 | if self.track_time: 50 | if steps >= self.time.shape[0]: 51 | rtf, fps = self.get_time() 52 | print("TIME: s: {}, rtf: {}, fps:{}".format(int(steps), round(rtf, 2), round(fps, 2) )) 53 | 54 | return 1.0 55 | 56 | 57 | reward_object = Reward(verbose=False, track_time=True) 58 | 59 | def reward_function(params): 60 | return reward_object.reward_function(params) 61 | -------------------------------------------------------------------------------- /scripts/upload/import-model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import boto3 4 | import sys 5 | import os 6 | import time 7 | import json 8 | import io 9 | import yaml 10 | from botocore.loaders import UnknownServiceError 11 | 12 | try: 13 | import pandas as pd 14 | import deepracer 15 | except ImportError: 16 | print("You need to install pandas and deepracer-utils to use the import function.") 17 | exit(1) 18 | 19 | # Read in command 20 | aws_profile = sys.argv[1] 21 | aws_s3_role = sys.argv[2] 22 | aws_s3_bucket 
= sys.argv[3] 23 | aws_s3_prefix = sys.argv[4] 24 | dr_model_name = sys.argv[5] 25 | 26 | if not aws_s3_role: 27 | print("You must configure an IAM role with access to the S3 bucket in variable DR_UPLOAD_S3_ROLE ") 28 | exit(1) 29 | 30 | session = boto3.session.Session(region_name='us-east-1', profile_name=aws_profile) 31 | 32 | try: 33 | dr = session.client('deepracer') 34 | except UnknownServiceError: 35 | print ("Boto3 service 'deepracer' is not installed. Cannot import model.") 36 | print ("Install with 'pip install deepracer-utils' and 'python -m deepracer install-cli --force'") 37 | exit(1) 38 | 39 | # Load model to check if it already exists 40 | a = dr.list_models(ModelType='REINFORCEMENT_LEARNING', MaxResults=25) 41 | model_dict = a['Models'] 42 | while "NextToken" in a: 43 | a = dr.list_models(ModelType='REINFORCEMENT_LEARNING', MaxResults=25, NextToken=a["NextToken"]) 44 | model_dict.extend(a['Models']) 45 | 46 | models = pd.DataFrame.from_dict(model_dict) 47 | 48 | if models[models['ModelName']==dr_model_name].size > 0: 49 | sys.exit('Model {} already exists.'.format(dr_model_name)) 50 | 51 | # Import from S3 52 | print('Importing from s3://{}/{}'.format(aws_s3_bucket,aws_s3_prefix)) 53 | response = dr.import_model(Name=dr_model_name, ModelArtifactsS3Path='s3://{}/{}'.format(aws_s3_bucket,aws_s3_prefix), RoleArn=aws_s3_role, Type='REINFORCEMENT_LEARNING') 54 | 55 | if response['ResponseMetadata']['HTTPStatusCode'] == 200: 56 | print('Model importing as {}'.format(response['ModelArn'])) 57 | else: 58 | sys.exit('Error occcured when uploading') -------------------------------------------------------------------------------- /docs/video.md: -------------------------------------------------------------------------------- 1 | # Watching the car 2 | 3 | There are multiple ways to watch the car during training and evaluation. The ports and 'features' depend on the docker mode (swarm vs. compose) as well as between training and evaluation. 
4 | 5 | ## Training using Viewer 6 | 7 | DRfC has a built in viewer that supports showing the video stream from up to 6 workers on one webpage. 8 | 9 | The view can be started with `dr-start-viewer` and is available on `http://localhost:8100` or `http://127.0.0.1:8100`. The viewer must be updated if training is restarted using `dr-update-viewer`, as it needs to connect to the new containers. 10 | 11 | It is also possible to automatically start/update the viewer using the `-v` flag to `dr-start-training`. 12 | 13 | ## ROS Stream Viewer 14 | 15 | The ROS Stream Viewer is a built in ROS feature that will stream any topic in ROS that publishing ROSImg messages. The viewer starts automatically. 16 | 17 | ### Ports 18 | 19 | | Docker Mode | Training | Evaluation | Comment 20 | | -------- | -------- | -------- | -------- | 21 | | swarm | 8080 + `DR_RUN_ID` | 8180 + `DR_RUN_ID` | Default 8080/8180. Multiple workers share one port, press F5 to cycle between them. 22 | | compose | 8080-8089 | 8080-8089 | Each worker gets a unique port. 23 | 24 | ### Topics 25 | 26 | | Topic | Description | 27 | | -------- | -------- | 28 | | `/racecar/camera/zed/rgb/image_rect_color` | In-car video stream. This is used for inference. | 29 | | `/racecar/main_camera/zed/rgb/image_rect_color` | Camera following the car. Stream without overlay | 30 | | `/sub_camera/zed/rgb/image_rect_color` | Top-view of the track | 31 | | `/racecar/deepracer/kvs_stream` | Camera following the car. Stream with overlay. Different overlay in Training and Evaluation | 32 | | `/racecar/deepracer/main_camera_stream` | Same as `kvs_stream`, topic used for MP4 production. Only active in Evaluation if `DR_EVAL_SAVE_MP4=True` | 33 | 34 | ## Saving Evaluation to File 35 | 36 | During evaluation (`dr-start-evaluation`), if `DR_EVAL_SAVE_MP4=True` then three MP4 files are created in the S3 bucket's MP4 folder. They contain the in-car camera, top-camera and the camera following the car. 
-------------------------------------------------------------------------------- /utils/start-local-browser.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | usage(){ 4 | echo "Usage: $0 [-t topic] [-w width] [-h height] [-q quality] -b [browser-command]" 5 | echo " -w Width of individual stream." 6 | echo " -h Heigth of individual stream." 7 | echo " -q Quality of the stream image." 8 | echo " -t Topic to follow - default /racecar/deepracer/kvs_stream" 9 | echo " -b Browser command (default: firefox --new-tab)" 10 | exit 1 11 | } 12 | 13 | trap ctrl_c INT 14 | 15 | function ctrl_c() { 16 | echo "Requested to stop." 17 | exit 1 18 | } 19 | 20 | # Stream definition 21 | TOPIC="/racecar/deepracer/kvs_stream" 22 | WIDTH=480 23 | HEIGHT=360 24 | QUALITY=75 25 | BROWSER="firefox --new-tab" 26 | 27 | while getopts ":w:h:q:t:b:" opt; do 28 | case $opt in 29 | w) WIDTH="$OPTARG" 30 | ;; 31 | h) HEIGHT="$OPTARG" 32 | ;; 33 | q) QUALITY="$OPTARG" 34 | ;; 35 | t) TOPIC="$OPTARG" 36 | ;; 37 | b) BROWSER="$OPTARG" 38 | ;; 39 | \?) echo "Invalid option -$OPTARG" >&2 40 | usage 41 | ;; 42 | esac 43 | done 44 | 45 | FILE=$DR_DIR/tmp/streams-$DR_RUN_ID.html 46 | 47 | # Check if we will use Docker Swarm or Docker Compose 48 | if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; 49 | then 50 | echo "This script does not support swarm mode. Use `dr-start-viewer`." 51 | exit 52 | fi 53 | 54 | echo "DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC

DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC

" > $FILE 55 | 56 | ROBOMAKER_CONTAINERS=$(docker ps --format "{{.ID}}" --filter name=deepracer-$DR_RUN_ID --filter "ancestor=awsdeepracercommunity/deepracer-robomaker:$DR_ROBOMAKER_IMAGE") 57 | if [ -z "$ROBOMAKER_CONTAINERS" ]; then 58 | echo "No running robomakers. Exiting." 59 | exit 60 | fi 61 | 62 | for c in $ROBOMAKER_CONTAINERS; do 63 | C_PORT=$(docker inspect $c | jq -r '.[0].NetworkSettings.Ports["8080/tcp"][0].HostPort') 64 | C_URL="http://localhost:${C_PORT}/stream?topic=${TOPIC}&quality=${QUALITY}&width=${WIDTH}&height=${HEIGHT}" 65 | C_IMG="" 66 | echo $C_IMG >> $FILE 67 | done 68 | 69 | echo "" >> $FILE 70 | echo "Starting browser '$BROWSER'." 71 | $BROWSER `readlink -f $FILE ` & -------------------------------------------------------------------------------- /docs/multi_worker.md: -------------------------------------------------------------------------------- 1 | # Using multiple Robomaker workers 2 | 3 | One way to accelerate training is to launch multiple Robomaker workers that feed into one Sagemaker instance. 4 | 5 | The number of workers is configured through setting `system.env` `DR_WORKERS` to the desired number of workers. The result is that the number of episodes (hyperparameter `num_episodes_between_training`) will be divivided over the number of workers. The theoretical maximum number of workers equals `num_episodes_between_training`. 6 | 7 | The training can be started as normal. 8 | 9 | ## How many workers do I need? 10 | 11 | One Robomaker worker requires 2-4 vCPUs. Tests show that a `c5.4xlarge` instance can run 3 workers and the Sagemaker without a drop in performance. Using OpenGL images reduces the number of vCPUs required per worker. 12 | 13 | To avoid issues with the position from which evaluations are run ensure that `( num_episodes_between_training / DR_WORKERS) * DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST = 1.0`. 14 | 15 | Example: With 3 workers set `num_episodes_between_training: 30` and `DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST=0.1`. 
16 | 17 | Note: Sagemaker will stop collecting experiences once you have reached 10,000 steps (3-layer CNN) in an iteration. For longer tracks with 600-1000 steps per completed episode this will define the upper bound for the number of workers and episodes per iteration. 18 | 19 | ## Training with different parameters for each worker 20 | 21 | It is also possible to use different configurations between workers, such as different tracks (WORLD_NAME). To enable, set DR_TRAIN_MULTI_CONFIG=True inside run.env, then make copies of defaults/template-worker.env in the main deepracer-for-cloud directory with format worker-2.env, worker-3.env, etc. (So alongside run.env, you should have worker-2.env, worker-3.env, etc. run.env is still used for worker 1) Modify the worker env files with your desired changes, which can be more than just the world_name. These additional worker env files are only used if you are training with multiple workers. 22 | 23 | ## Watching the streams 24 | 25 | If you want to watch the streams -- and are in `compose` mode -- you can use the script `utils/start-local-browser.sh` to dynamically create an HTML page that streams the KVS stream from ALL workers at a time. 
26 | -------------------------------------------------------------------------------- /defaults/template-run.env: -------------------------------------------------------------------------------- 1 | DR_RUN_ID=0 2 | DR_WORLD_NAME=reInvent2019_track 3 | DR_RACE_TYPE=TIME_TRIAL 4 | DR_CAR_NAME=FastCar 5 | DR_CAR_BODY_SHELL_TYPE=deepracer 6 | DR_CAR_COLOR=Red 7 | DR_DISPLAY_NAME=$DR_CAR_NAME 8 | DR_RACER_NAME=$DR_CAR_NAME 9 | DR_ENABLE_DOMAIN_RANDOMIZATION=False 10 | DR_EVAL_NUMBER_OF_TRIALS=3 11 | DR_EVAL_IS_CONTINUOUS=True 12 | DR_EVAL_MAX_RESETS=100 13 | DR_EVAL_OFF_TRACK_PENALTY=5.0 14 | DR_EVAL_COLLISION_PENALTY=5.0 15 | DR_EVAL_SAVE_MP4=False 16 | DR_EVAL_CHECKPOINT=last 17 | DR_EVAL_OPP_S3_MODEL_PREFIX=rl-deepracer-sagemaker 18 | DR_EVAL_OPP_CAR_BODY_SHELL_TYPE=deepracer 19 | DR_EVAL_OPP_CAR_NAME=FasterCar 20 | DR_EVAL_OPP_DISPLAY_NAME=$DR_EVAL_OPP_CAR_NAME 21 | DR_EVAL_OPP_RACER_NAME=$DR_EVAL_OPP_CAR_NAME 22 | DR_EVAL_DEBUG_REWARD=False 23 | DR_EVAL_RESET_BEHIND_DIST=1.0 24 | #DR_EVAL_RTF=1.0 25 | DR_TRAIN_CHANGE_START_POSITION=True 26 | DR_TRAIN_ALTERNATE_DRIVING_DIRECTION=False 27 | DR_TRAIN_START_POSITION_OFFSET=0.0 28 | DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST=0.05 29 | DR_TRAIN_MULTI_CONFIG=False 30 | DR_TRAIN_MIN_EVAL_TRIALS=5 31 | #DR_TRAIN_RTF=1.0 32 | DR_LOCAL_S3_MODEL_PREFIX=rl-deepracer-sagemaker 33 | DR_LOCAL_S3_PRETRAINED=False 34 | DR_LOCAL_S3_PRETRAINED_PREFIX=rl-sagemaker-pretrained 35 | DR_LOCAL_S3_PRETRAINED_CHECKPOINT=last 36 | DR_LOCAL_S3_CUSTOM_FILES_PREFIX=custom_files 37 | DR_LOCAL_S3_TRAINING_PARAMS_FILE=training_params.yaml 38 | DR_LOCAL_S3_EVAL_PARAMS_FILE=evaluation_params.yaml 39 | DR_LOCAL_S3_MODEL_METADATA_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/model_metadata.json 40 | DR_LOCAL_S3_HYPERPARAMETERS_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/hyperparameters.json 41 | DR_LOCAL_S3_REWARD_KEY=$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/reward_function.py 42 | DR_LOCAL_S3_METRICS_PREFIX=$DR_LOCAL_S3_MODEL_PREFIX/metrics 43 | 
DR_UPLOAD_S3_PREFIX=$DR_LOCAL_S3_MODEL_PREFIX 44 | DR_OA_NUMBER_OF_OBSTACLES=6 45 | DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES=2.0 46 | DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS=False 47 | DR_OA_IS_OBSTACLE_BOT_CAR=False 48 | DR_OA_OBJECT_POSITIONS= 49 | DR_H2B_IS_LANE_CHANGE=False 50 | DR_H2B_LOWER_LANE_CHANGE_TIME=3.0 51 | DR_H2B_UPPER_LANE_CHANGE_TIME=5.0 52 | DR_H2B_LANE_CHANGE_DISTANCE=1.0 53 | DR_H2B_NUMBER_OF_BOT_CARS=3 54 | DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS=2.0 55 | DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS=False 56 | DR_H2B_BOT_CAR_SPEED=0.2 57 | DR_H2B_BOT_CAR_PENALTY=5.0 -------------------------------------------------------------------------------- /docs/docker.md: -------------------------------------------------------------------------------- 1 | # About the Docker setup 2 | 3 | DRfC supports running Docker in two modes `swarm` and `compose` - this behaviour is configured in `system.env` through `DR_DOCKER_STYLE`. 4 | 5 | ## Swarm Mode 6 | 7 | Docker Swarm mode is the default. Docker Swarm makes it possible to connect multiple hosts together to spread the load -- esp. useful if one wants to run multiple Robomaker workers, but can also be useful locally if one has two computers that each are not powerful enough to run DeepRacer. 8 | 9 | In Swarm mode DRfC creates Stacks, using `docker stack`. During operations one can check running stacks through `docker stack ls`, and running services through `docker service ls`. 10 | 11 | DRfC is installed only on the manager. (The first installed host.) Swarm workers are 'dumb' and do not need to have DRfC installed. 12 | 13 | ### Key features 14 | 15 | * Allows user to connect multiple computers on the same network. (In AWS the instances must be connected on same VPC, and instances must be allowed to communicate.) 
16 | * Supports [multiple Robomaker workers](multi_worker.md) 17 | * Supports [running multiple parallel experiments](multi_run.md) 18 | 19 | ### Limitations 20 | 21 | * The Sagemaker container can only be run on the manager. 22 | * Docker images are downloaded from Docker Hub. Locally built images are allowed only if they have a unique tag, not in Docker Hub. If you have multiple Docker nodes ensure that they all have the image available. 23 | 24 | ### Connecting Workers 25 | 26 | * On the manager run `docker swarm join-token worker`. 27 | * On the worker run the command that was displayed on the manager `docker swarm join --token <token> <manager-ip>:<port>`. 28 | 29 | ### Ports 30 | 31 | Docker Swarm will automatically put a load-balancer in front of all replicas in a service. This means that the ROS Web View, which provides a video stream of the DeepRacer during training, will be load balanced - sharing one port (`8080`). If you have multiple workers (even across multiple hosts) then press F5 to cycle through them. 32 | 33 | ## Compose Mode 34 | 35 | In Compose mode DRfC creates Services, using `docker-compose`. During operations one can check the running services through `docker-compose ps` or `docker ps`. 36 | 37 | ### Key features 38 | 39 | * Supports [multiple Robomaker workers](multi_worker.md) 40 | * Supports [running multiple parallel experiments](multi_run.md) 41 | * Supports [GPU Accelerated OpenGL for Robomaker](opengl.md) 42 | 43 | ### Limitations 44 | 45 | * Workload cannot be spread across multiple hosts. 46 | 47 | ### Ports 48 | 49 | In the case of using Docker Compose the different Robomaker workers will require unique ports for ROS Web View and VNC. Docker will assign these dynamically. Use `docker ps` to see which container has been assigned which ports. 
50 | -------------------------------------------------------------------------------- /docs/opengl.md: -------------------------------------------------------------------------------- 1 | # GPU Accelerated OpenGL for Robomaker 2 | 3 | One way to improve performance, especially of Robomaker, is to enable GPU-accelerated OpenGL. OpenGL can significantly improve Gazebo performance, even where the GPU does not have enough GPU RAM, or is too old, to support Tensorflow. 4 | 5 | ## Desktop 6 | 7 | On a Ubuntu desktop running Unity there are hardly any additional steps required. 8 | 9 | * Ensure that a recent Nvidia driver is installed and is running. 10 | * Ensure that nvidia-docker is installed; review `bin/prepare.sh` for steps if you do not want to directly run the script. 11 | * Configure DRfC using the following settings in `system.env`: 12 | * `DR_HOST_X=True`; uses the local X server rather than starting one within the docker container. 13 | * `DR_ROBOMAKER_IMAGE`; choose the tag for an OpenGL enabled image - e.g. `cpu-gl-avx` for an image where Tensorflow will use CPU or `gpu-gl` for an image where also Tensorflow will use the GPU. 14 | 15 | Before running `dr-start-training` ensure that environment variables `DISPLAY` and `XAUTHORITY` are defined. 16 | 17 | NOTE: Users have experienced issues to start training process from remote (SSH / RDP), as a local X session may not be running / you are not allowed to connect to it. Workaround is to start an additional X server following the steps for Headless Server. 18 | 19 | With recent Nvidia drivers you can confirm that the setup is working by running `nvidia-smi` on the host and see that `gzserver` is listed as running on the GPU. Older drivers (e.g. 390 for NVS 315) may not support showing which processes are running on the GPU. 20 | 21 | ## Headless Server 22 | 23 | Also a headless server with a GPU, e.g. an EC2 instance, or a local computer with a displayless GPU (e.g. Tesla K40, K80, M40). 
24 | 25 | * Ensure that a Nvidia driver and nvidia-docker is installed; review `bin/prepare.sh` for steps if you do not want to directly run the script. 26 | * Setup an X-server on the host. `utils/setup-xorg.sh` is a basic installation script. 27 | * Configure DRfC using the following settings in `system.env`: 28 | * `DR_HOST_X=True`; uses the local X server rather than starting one within the docker container. 29 | * `DR_ROBOMAKER_IMAGE`; choose the tag for an OpenGL enabled image - e.g. `cpu-gl-avx` for an image where Tensorflow will use CPU or `gpu-gl` for an image where also Tensorflow will use the GPU. 30 | * `DR_DISPLAY`; the X display that the headless X server will start on. (Default is `:99`.) 31 | 32 | Start up the X server with `utils/start-xorg.sh`. 33 | 34 | If `DR_GUI_ENABLE=True` then a VNC server will be started on port 5900 so that you can connect and interact with the Gazebo UI. 35 | 36 | With recent Nvidia drivers you can confirm that the setup is working by running `nvidia-smi` on the host and see that `gzserver` is listed as running on the GPU. 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepRacer-For-Cloud 2 | Provides a quick and easy way to get up and running with a DeepRacer training environment in Azure or AWS, using either the Azure [N-Series Virtual Machines](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu) or [AWS EC2 Accelerated Computing instances](https://aws.amazon.com/ec2/instance-types/?nc1=h_ls#Accelerated_Computing). 3 | 4 | ## Introduction 5 | 6 | DeepRacer-For-Cloud (DRfC) started as an extension of the work done by Alex (https://github.com/alexschultz/deepracer-for-dummies), which is again a wrapper around the amazing work done by Chris (https://github.com/crr0004/deepracer). 
With the introduction of the second generation Deepracer Console the repository has been split up. This repository contains the scripts needed to *run* the training, but depends on Docker Hub to provide pre-built docker images. All the under-the-hood building capabilities are in the [Deepracer Build](https://github.com/aws-deepracer-community/deepracer) repository. 7 | 8 | ## Main Features 9 | 10 | DRfC supports a wide set of features to ensure that you can focus on creating the best model: 11 | * User-friendly 12 | * Based on the continously updated community [Robomaker](https://github.com/aws-deepracer-community/deepracer-simapp) and [Sagemaker](https://github.com/aws-deepracer-community/deepracer-sagemaker-container) containers, supporting a wide range of CPU and GPU setups. 13 | * Wide set of scripts (`dr-*`) enables effortless training. 14 | * Detection of your AWS DeepRacer Console models; allows upload of a locally trained model to any of them. 15 | * Modes 16 | * Time Trial 17 | * Object Avoidance 18 | * Head-to-Bot 19 | * Training 20 | * Multiple Robomaker instances per Sagemaker (N:1) to improve training progress. 21 | * Multiple training sessions in parallel - each being (N:1) if hardware supports it - to test out things in parallel. 22 | * Connect multiple nodes together (Swarm-mode only) to combine the powers of multiple computers/instances. 23 | * Evaluation 24 | * Evaluate independently from training. 25 | * Save evaluation run to MP4 file in S3. 26 | * Logging 27 | * Training metrics and trace files are stored to S3. 28 | * Optional integration with AWS CloudWatch. 29 | * Optional exposure of Robomaker internal log-files. 30 | * Technology 31 | * Supports both Docker Swarm (used for connecting multiple nodes together) and Docker Compose (used to support OpenGL) 32 | 33 | ## Documentation 34 | 35 | Full documentation can be found on the [Deepracer-for-Cloud GitHub Pages](https://aws-deepracer-community.github.io/deepracer-for-cloud). 
36 | 37 | ## Support 38 | 39 | * For general support it is suggested to join the [AWS DeepRacing Community](https://deepracing.io/). The Community Slack has a channel #dr-drfc-setup where the community provides active support. 40 | * Create a GitHub issue if you find an actual code issue, or where updates to documentation would be required. 41 | -------------------------------------------------------------------------------- /scripts/upload/increment.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage(){ 4 | echo "Usage: $0 [-f] [-w] [-p ] [-d ]" 5 | echo "" 6 | echo "Command will increment a numerical suffix on the current upload model." 7 | echo "-p model Sets the to-be name to be rather than auto-incremeneting the previous model." 8 | echo "-d delim Delimiter in model-name (e.g. '-' in 'test-model-1')" 9 | echo "-f Force. Ask for no confirmations." 10 | echo "-w Wipe the S3 prefix to ensure that two models are not mixed." 11 | exit 1 12 | } 13 | 14 | trap ctrl_c INT 15 | 16 | function ctrl_c() { 17 | echo "Requested to stop." 18 | exit 1 19 | } 20 | 21 | OPT_DELIM='-' 22 | 23 | while getopts ":fwp:d:" opt; do 24 | case $opt in 25 | 26 | f) OPT_FORCE="True" 27 | ;; 28 | p) OPT_PREFIX="$OPTARG" 29 | ;; 30 | w) OPT_WIPE="--delete" 31 | ;; 32 | d) OPT_DELIM="$OPTARG" 33 | ;; 34 | h) usage 35 | ;; 36 | \?) echo "Invalid option -$OPTARG" >&2 37 | usage 38 | ;; 39 | esac 40 | done 41 | 42 | CONFIG_FILE=$DR_CONFIG 43 | echo "Configuration file $CONFIG_FILE will be updated." 
44 | 45 | ## Read in data 46 | CURRENT_UPLOAD_MODEL=$(grep -e "^DR_UPLOAD_S3_PREFIX" ${CONFIG_FILE} | awk '{split($0,a,"="); print a[2] }') 47 | CURRENT_UPLOAD_MODEL_NUM=$(echo "${CURRENT_UPLOAD_MODEL}" | \ 48 | awk -v DELIM="${OPT_DELIM}" '{ n=split($0,a,DELIM); if (a[n] ~ /[0-9]*/) print a[n]; else print ""; }') 49 | if [[ -z ${CURRENT_UPLOAD_MODEL_NUM} ]]; 50 | then 51 | NEW_UPLOAD_MODEL="${CURRENT_UPLOAD_MODEL}${OPT_DELIM}1" 52 | else 53 | NEW_UPLOAD_MODEL_NUM=$(echo "${CURRENT_UPLOAD_MODEL_NUM} + 1" | bc ) 54 | NEW_UPLOAD_MODEL=$(echo $CURRENT_UPLOAD_MODEL | sed "s/${CURRENT_UPLOAD_MODEL_NUM}\$/${NEW_UPLOAD_MODEL_NUM}/") 55 | fi 56 | 57 | if [[ -n "${NEW_UPLOAD_MODEL}" ]]; 58 | then 59 | echo "Incrementing model from ${CURRENT_UPLOAD_MODEL} to ${NEW_UPLOAD_MODEL}" 60 | if [[ -z "${OPT_FORCE}" ]]; 61 | then 62 | read -r -p "Are you sure? [y/N] " response 63 | if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]] 64 | then 65 | echo "Aborting." 66 | exit 1 67 | fi 68 | fi 69 | sed -i.bak -re "s/(DR_UPLOAD_S3_PREFIX=).*$/\1$NEW_UPLOAD_MODEL/g" "$CONFIG_FILE" && echo "Done." 70 | else 71 | echo "Error in determining new model. Aborting." 72 | exit 1 73 | fi 74 | 75 | export DR_UPLOAD_S3_PREFIX=$(eval echo "${NEW_UPLOAD_MODEL}") 76 | 77 | if [[ -n "${OPT_WIPE}" ]]; 78 | then 79 | MODEL_DIR_S3=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 ls s3://${DR_LOCAL_S3_BUCKET}/${NEW_UPLOAD_MODEL} ) 80 | if [[ -n "${MODEL_DIR_S3}" ]]; 81 | then 82 | echo "The new model's S3 prefix s3://${DR_LOCAL_S3_BUCKET}/${NEW_UPLOAD_MODEL} exists. Will wipe." 83 | fi 84 | if [[ -z "${OPT_FORCE}" ]]; 85 | then 86 | read -r -p "Are you sure? [y/N] " response 87 | if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]] 88 | then 89 | echo "Aborting." 
90 | exit 1 91 | fi 92 | fi 93 | aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 rm s3://${DR_LOCAL_S3_BUCKET}/${NEW_UPLOAD_MODEL} --recursive 94 | fi 95 | -------------------------------------------------------------------------------- /scripts/training/increment.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage(){ 4 | echo "Usage: $0 [-f] [-w] [-p ] [-d ]" 5 | echo "" 6 | echo "Command will set the current model to be the pre-trained model and increment a numerical suffix." 7 | echo "-p model Sets the to-be name to be rather than auto-incremeneting the previous model." 8 | echo "-d delim Delimiter in model-name (e.g. '-' in 'test-model-1')" 9 | echo "-f Force. Ask for no confirmations." 10 | echo "-w Wipe the S3 prefix to ensure that two models are not mixed." 11 | exit 1 12 | } 13 | 14 | trap ctrl_c INT 15 | 16 | function ctrl_c() { 17 | echo "Requested to stop." 18 | exit 1 19 | } 20 | 21 | OPT_DELIM='-' 22 | 23 | while getopts ":fwp:d:" opt; do 24 | case $opt in 25 | 26 | f) OPT_FORCE="True" 27 | ;; 28 | p) OPT_PREFIX="$OPTARG" 29 | ;; 30 | w) OPT_WIPE="--delete" 31 | ;; 32 | d) OPT_DELIM="$OPTARG" 33 | ;; 34 | h) usage 35 | ;; 36 | \?) echo "Invalid option -$OPTARG" >&2 37 | usage 38 | ;; 39 | esac 40 | done 41 | 42 | CONFIG_FILE=$DR_CONFIG 43 | echo "Configuration file $CONFIG_FILE will be updated." 
44 | 45 | ## Read in data 46 | CURRENT_RUN_MODEL=$(grep -e "^DR_LOCAL_S3_MODEL_PREFIX" ${CONFIG_FILE} | awk '{split($0,a,"="); print a[2] }') 47 | CURRENT_RUN_MODEL_NUM=$(echo "${CURRENT_RUN_MODEL}" | \ 48 | awk -v DELIM="${OPT_DELIM}" '{ n=split($0,a,DELIM); if (a[n] ~ /[0-9]*/) print a[n]; else print ""; }') 49 | if [[ -z ${CURRENT_RUN_MODEL_NUM} ]]; 50 | then 51 | NEW_RUN_MODEL="${CURRENT_RUN_MODEL}${OPT_DELIM}1" 52 | else 53 | NEW_RUN_MODEL_NUM=$(echo "${CURRENT_RUN_MODEL_NUM} + 1" | bc ) 54 | NEW_RUN_MODEL=$(echo $CURRENT_RUN_MODEL | sed "s/${CURRENT_RUN_MODEL_NUM}\$/${NEW_RUN_MODEL_NUM}/") 55 | fi 56 | 57 | if [[ -n "${NEW_RUN_MODEL}" ]]; 58 | then 59 | echo "Incrementing model from ${CURRENT_RUN_MODEL} to ${NEW_RUN_MODEL}" 60 | if [[ -z "${OPT_FORCE}" ]]; 61 | then 62 | read -r -p "Are you sure? [y/N] " response 63 | if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]] 64 | then 65 | echo "Aborting." 66 | exit 1 67 | fi 68 | fi 69 | sed -i.bak -re "s/(DR_LOCAL_S3_PRETRAINED_PREFIX=).*$/\1$CURRENT_RUN_MODEL/g; s/(DR_LOCAL_S3_PRETRAINED=).*$/\1True/g; ; s/(DR_LOCAL_S3_MODEL_PREFIX=).*$/\1$NEW_RUN_MODEL/g" "$CONFIG_FILE" && echo "Done." 70 | else 71 | echo "Error in determining new model. Aborting." 72 | exit 1 73 | fi 74 | 75 | if [[ -n "${OPT_WIPE}" ]]; 76 | then 77 | MODEL_DIR_S3=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 ls s3://${DR_LOCAL_S3_BUCKET}/${NEW_RUN_MODEL} ) 78 | if [[ -n "${MODEL_DIR_S3}" ]]; 79 | then 80 | echo "The new model's S3 prefix s3://${DR_LOCAL_S3_BUCKET}/${NEW_RUN_MODEL} exists. Will wipe." 81 | fi 82 | if [[ -z "${OPT_FORCE}" ]]; 83 | then 84 | read -r -p "Are you sure? [y/N] " response 85 | if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]] 86 | then 87 | echo "Aborting." 
88 | exit 1 89 | fi 90 | fi 91 | aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 rm s3://${DR_LOCAL_S3_BUCKET}/${NEW_RUN_MODEL} --recursive 92 | fi 93 | -------------------------------------------------------------------------------- /scripts/evaluation/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | source $DR_DIR/bin/scripts_wrapper.sh 4 | 5 | usage(){ 6 | echo "Usage: $0 [-q] [-c]" 7 | echo " -q Quiet - does not start log tracing." 8 | echo " -c Clone - copies model into new prefix before evaluating." 9 | exit 1 10 | } 11 | 12 | trap ctrl_c INT 13 | 14 | function ctrl_c() { 15 | echo "Requested to stop." 16 | exit 1 17 | } 18 | 19 | while getopts ":qc" opt; do 20 | case $opt in 21 | q) OPT_QUIET="QUIET" 22 | ;; 23 | c) OPT_CLONE="CLONE" 24 | ;; 25 | h) usage 26 | ;; 27 | \?) echo "Invalid option -$OPTARG" >&2 28 | usage 29 | ;; 30 | esac 31 | done 32 | 33 | # clone if required 34 | if [ -n "$OPT_CLONE" ]; then 35 | echo "Cloning model into s3://$DR_LOCAL_S3_BUCKET/${DR_LOCAL_S3_MODEL_PREFIX}-E" 36 | aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX/model s3://$DR_LOCAL_S3_BUCKET/${DR_LOCAL_S3_MODEL_PREFIX}-E/model 37 | aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX/ip s3://$DR_LOCAL_S3_BUCKET/${DR_LOCAL_S3_MODEL_PREFIX}-E/ip 38 | export DR_LOCAL_S3_MODEL_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX}-E 39 | fi 40 | 41 | # set evaluation specific environment variables 42 | S3_PATH="s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX" 43 | STACK_NAME="deepracer-eval-$DR_RUN_ID" 44 | 45 | export ROBOMAKER_COMMAND="./run.sh run evaluation.launch" 46 | export DR_CURRENT_PARAMS_FILE=${DR_LOCAL_S3_EVAL_PARAMS_FILE} 47 | 48 | if [ ${DR_ROBOMAKER_MOUNT_LOGS,,} = "true" ]; 49 | then 50 | COMPOSE_FILES="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DR_DIR/docker/docker-compose-mount.yml" 51 | export 
DR_MOUNT_DIR="$DR_DIR/data/logs/robomaker/$DR_LOCAL_S3_MODEL_PREFIX" 52 | mkdir -p $DR_MOUNT_DIR 53 | else 54 | COMPOSE_FILES="$DR_EVAL_COMPOSE_FILE" 55 | fi 56 | 57 | echo "Creating Robomaker configuration in $S3_PATH/$DR_CURRENT_PARAMS_FILE" 58 | python3 $DR_DIR/scripts/evaluation/prepare-config.py 59 | 60 | # Check if we are using Host X -- ensure variables are populated 61 | if [[ "${DR_HOST_X,,}" == "true" ]]; 62 | then 63 | if [[ -n "$DR_DISPLAY" ]]; then 64 | ROBO_DISPLAY=$DR_DISPLAY 65 | else 66 | ROBO_DISPLAY=$DISPLAY 67 | fi 68 | 69 | if ! DISPLAY=$ROBO_DISPLAY timeout 1s xset q &>/dev/null; then 70 | echo "No X Server running on display $ROBO_DISPLAY. Exiting" 71 | exit 0 72 | fi 73 | 74 | if [[ -z "$XAUTHORITY" ]]; then 75 | export XAUTHORITY=~/.Xauthority 76 | if [[ ! -f "$XAUTHORITY" ]]; then 77 | echo "No XAUTHORITY defined. .Xauthority does not exist. Stopping." 78 | exit 0 79 | fi 80 | fi 81 | fi 82 | 83 | 84 | # Check if we will use Docker Swarm or Docker Compose 85 | if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; 86 | then 87 | DISPLAY=$ROBO_DISPLAY docker stack deploy $COMPOSE_FILES $STACK_NAME 88 | else 89 | DISPLAY=$ROBO_DISPLAY docker-compose $COMPOSE_FILES --log-level ERROR -p $STACK_NAME up -d 90 | fi 91 | 92 | # Request to be quiet. Quitting here. 
93 | if [ -n "$OPT_QUIET" ]; then 94 | exit 0 95 | fi 96 | 97 | # Trigger requested log-file 98 | dr-logs-robomaker -w 15 -e 99 | 100 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | Provides a quick and easy way to get up and running with a DeepRacer training environment in AWS or Azure, using either the Azure [N-Series Virtual Machines](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu) or [AWS EC2 Accelerated Computing instances](https://aws.amazon.com/ec2/instance-types/?nc1=h_ls#Accelerated_Computing), or locally on your own desktop or server. 4 | 5 | DeepRacer-For-Cloud (DRfC) started as an extension of the work done by Alex (https://github.com/alexschultz/deepracer-for-dummies), which is again a wrapper around the amazing work done by Chris (https://github.com/crr0004/deepracer). With the introduction of the second generation Deepracer Console the repository has been split up. This repository contains the scripts needed to *run* the training, but depends on Docker Hub to provide pre-built docker images. All the under-the-hood building capabilities have been moved to my [Deepracer Build](https://github.com/aws-deepracer-community/deepracer) repository. 6 | 7 | # Main Features 8 | 9 | DRfC supports a wide set of features to ensure that you can focus on creating the best model: 10 | * User-friendly 11 | * Based on the continously updated community [Robomaker](https://github.com/aws-deepracer-community/deepracer-simapp) and [Sagemaker](https://github.com/aws-deepracer-community/deepracer-sagemaker-container) containers, supporting a wide range of CPU and GPU setups. 12 | * Wide set of scripts (`dr-*`) enables effortless training. 13 | * Detection of your AWS DeepRacer Console models; allows upload of a locally trained model to any of them. 
14 | * Modes 15 | * Time Trial 16 | * Object Avoidance 17 | * Head-to-Bot 18 | * Training 19 | * Multiple Robomaker instances per Sagemaker (N:1) to improve training progress. 20 | * Multiple training sessions in parallel - each being (N:1) if hardware supports it - to test out things in parallel. 21 | * Connect multiple nodes together (Swarm-mode only) to combine the powers of multiple computers/instances. 22 | * Evaluation 23 | * Evaluate independently from training. 24 | * Save evaluation run to MP4 file in S3. 25 | * Logging 26 | * Training metrics and trace files are stored to S3. 27 | * Optional integration with AWS CloudWatch. 28 | * Optional exposure of Robomaker internal log-files. 29 | * Technology 30 | * Supports both Docker Swarm (used for connecting multiple nodes together) and Docker Compose (used to support OpenGL) 31 | 32 | # Documentation 33 | 34 | * [Initial Installation](installation.md) 35 | * [Upload Model to Console](upload.md) 36 | * [Reference](reference.md) 37 | * [Using multiple Robomaker workers](multi_worker.md) 38 | * [Running multiple parallel experiments](multi_run.md) 39 | * [GPU Accelerated OpenGL for Robomaker](opengl.md) 40 | * [Having multiple GPUs in one Computer](multi_gpu.md) 41 | * [Installing on Windows](windows.md) 42 | * [Run a Head-to-Head Race](head-to-head.md) 43 | * [Watching the car](video.md) 44 | 45 | # Support 46 | 47 | * For general support it is suggested to join the [AWS DeepRacing Community](https://deepracing.io/). The Community Slack has a channel #dr-training-local where the community provides active support. 48 | * Create a GitHub issue if you find an actual code issue, or where updates to documentation would be required. 
49 | -------------------------------------------------------------------------------- /docs/upload.md: -------------------------------------------------------------------------------- 1 | # Upload Model to AWS Console 2 | 3 | Starting end July 2020 the AWS DeepRacer Console was re-designed which is now changing the way 4 | that models need to be uploaded to enable them to be evaluated or submitted to the AWS hosted Summit or Virtual League events. 5 | 6 | ## Create Upload Bucket 7 | 8 | The recommendation is to create a unique bucket in `us-east-1` which is used as 'transit' between your training bucket, local or in an AWS region close to your EC2 instances. 9 | 10 | The bucket needs to be defined so that 'Objects can be public'; AWS will create a specific IAM policy to access the data in your bucket as part of the import. 11 | 12 | ## Configure Upload Bucket 13 | 14 | In `system.env` set `DR_UPLOAD_S3_BUCKET` to the name of your created bucket. 15 | 16 | In `run.env` set the `DR_UPLOAD_S3_PREFIX` to any prefix of your choice. 17 | 18 | ## Upload Model 19 | 20 | After configuring the system you can run `dr-upload-model`; it will copy out the required parts of `s3://DR_LOCAL_S3_BUCKET/DR_LOCAL_S3_PREFIX` into `s3://DR_UPLOAD_S3_BUCKET/DR_UPLOAD_S3_PREFIX`. 21 | 22 | Once uploaded you can use the [Import model](https://console.aws.amazon.com/deepracer/home?region=us-east-1#models/importModel) feature of the AWS DeepRacer console to load the model into the model store. 
23 | 24 | ## Things to know 25 | 26 | ### Upload switches 27 | There are several useful switches to the upload command: 28 | * f - this will force upload, no confirmation question if you want to proceed with upload 29 | * w - wipes the target AWS DeepRacer model structure before upload in the designated bucket/prefix 30 | * d - dry-run mode, does not perform any write or delete operations on target 31 | * b - uploads best checkpoint instead of default which is last checkpoint 32 | * p prefix - uploads model into specified S3 prefix 33 | * i - imports model using the prefix as the model name 34 | * I name - import model with a specific model name 35 | 36 | ### Import 37 | If you want to use the import switches (`-i` or `-I`) there are a few pre-requisites. 38 | 39 | * Python packages to be installed with `pip install`: 40 | * pandas 41 | * deepracer-utils 42 | * Install boto3 service `deepracer` with `python -m deepracer install-cli --force`. 43 | * Create an IAM Role which the Deepracer service can use to access S3. Declare the ARN in `DR_UPLOAD_S3_ROLE` in `system.env`. 44 | 45 | ### Managing your models 46 | You should decide how you're going to manage your models. Upload to AWS does not preserve all the files created locally so if you delete your local files you will find it hard to go back to a previous model and resume training. 47 | 48 | ### Create file formatted for physical car, and upload to S3 49 | You can also create the file in the format necessary to run on the physical car directly from DRfC, without going through the AWS console. 50 | This is executed by running 'dr-upload-car-zip'; it will copy files out of the running sagemaker container, format them into the proper .tar.gz file, and upload that file to `s3://DR_LOCAL_S3_BUCKET/DR_LOCAL_S3_PREFIX`. One of the limitations of this approach is that it only uses the latest checkpoint, and does not have the option to use the "best" checkpoint, or an earlier checkpoint.
Another limitation is that the sagemaker container must be running at the time this command is executed. 51 | -------------------------------------------------------------------------------- /scripts/upload/download-model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage(){ 4 | echo "Usage: $0 [-f] [-w] [-d] -s -t &2 38 | usage 39 | ;; 40 | esac 41 | done 42 | 43 | if [[ -n "${OPT_DRYRUN}" ]]; 44 | then 45 | echo "*** DRYRUN MODE ***" 46 | fi 47 | 48 | SOURCE_S3_URL="${OPT_SOURCE}" 49 | 50 | if [[ -z "${SOURCE_S3_URL}" ]]; 51 | then 52 | echo "No source URL to download model from." 53 | exit 1 54 | fi 55 | 56 | TARGET_S3_BUCKET=${DR_LOCAL_S3_BUCKET} 57 | TARGET_S3_PREFIX=${OPT_TARGET} 58 | if [[ -z "${TARGET_S3_PREFIX}" ]]; 59 | then 60 | echo "No target prefix defined. Exiting." 61 | exit 1 62 | fi 63 | 64 | SOURCE_REWARD_FILE_S3_KEY="${SOURCE_S3_URL}/reward_function.py" 65 | SOURCE_HYPERPARAM_FILE_S3_KEY="${SOURCE_S3_URL}/ip/hyperparameters.json" 66 | SOURCE_METADATA_S3_KEY="${SOURCE_S3_URL}/model/model_metadata.json" 67 | 68 | WORK_DIR=${DR_DIR}/tmp/download 69 | mkdir -p ${WORK_DIR} && rm -rf ${WORK_DIR} && mkdir -p ${WORK_DIR}/config ${WORK_DIR}/full 70 | 71 | # Check if metadata-files are available 72 | REWARD_FILE=$(aws ${DR_UPLOAD_PROFILE} s3 cp "${SOURCE_REWARD_FILE_S3_KEY}" ${WORK_DIR}/config/ --no-progress | awk '/reward/ {print $4}'| xargs readlink -f 2> /dev/null) 73 | METADATA_FILE=$(aws ${DR_UPLOAD_PROFILE} s3 cp "${SOURCE_METADATA_S3_KEY}" ${WORK_DIR}/config/ --no-progress | awk '/model_metadata.json$/ {print $4}'| xargs readlink -f 2> /dev/null) 74 | HYPERPARAM_FILE=$(aws ${DR_UPLOAD_PROFILE} s3 cp "${SOURCE_HYPERPARAM_FILE_S3_KEY}" ${WORK_DIR}/config/ --no-progress | awk '/hyperparameters.json$/ {print $4}'| xargs readlink -f 2> /dev/null) 75 | 76 | if [ -n "$METADATA_FILE" ] && [ -n "$REWARD_FILE" ] && [ -n "$HYPERPARAM_FILE" ]; 77 | then 78 | echo "All meta-data files found. 
Source model ${SOURCE_S3_URL} valid." 79 | else 80 | echo "Meta-data files are not found. Source model ${SOURCE_S3_URL} not valid. Exiting." 81 | exit 1 82 | fi 83 | 84 | # Upload files 85 | if [[ -z "${OPT_FORCE}" ]]; 86 | then 87 | echo "Ready to download model ${SOURCE_S3_URL} to local ${TARGET_S3_PREFIX}" 88 | read -r -p "Are you sure? [y/N] " response 89 | if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]] 90 | then 91 | echo "Aborting." 92 | exit 1 93 | fi 94 | fi 95 | 96 | cd ${WORK_DIR} 97 | aws ${DR_UPLOAD_PROFILE} s3 sync "${SOURCE_S3_URL}" ${WORK_DIR}/full/ ${OPT_DRYRUN} 98 | aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 sync ${WORK_DIR}/full/ s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/ ${OPT_DRYRUN} ${OPT_WIPE} 99 | 100 | if [[ -n "${OPT_CONFIG}" ]]; 101 | then 102 | echo "Copy configuration to custom_files" 103 | cp ${WORK_DIR}/config/* ${DR_DIR}/custom_files/ 104 | fi 105 | 106 | echo "Done." 107 | -------------------------------------------------------------------------------- /docs/windows.md: -------------------------------------------------------------------------------- 1 | # Installing on Windows 2 | 3 | ## Prerequisites 4 | 5 | The basic installation steps to get a NVIDIA GPU / CUDA enabled Ubuntu subsystem on Windows can be found in the [Cuda on WSL User Guide](https://docs.nvidia.com/cuda/wsl-user-guide/index.html). Ensure your windows has an updated [nvidia cuda enabled driver](https://developer.nvidia.com/cuda/wsl/download) that will work with WSL. 6 | 7 | The further instructions assume that you have a basic working WSL using the default Ubuntu distribution. 8 | 9 | 10 | ## Additional steps 11 | 12 | The typical `bin/prepare.sh` script will not work for a Ubuntu WSL installation, hence alternate steps will be required. 
13 | 14 | ### Adding required packages 15 | 16 | Install additional packages with the following command: 17 | 18 | ``` 19 | sudo apt-get install jq awscli python3-boto3 docker-compose 20 | ``` 21 | 22 | ### Install and configure docker and nvidia-docker 23 | ``` 24 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - 25 | sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" 26 | sudo apt-get update && sudo apt-get install -y --no-install-recommends docker-ce docker-ce-cli containerd.io 27 | 28 | distribution=$(. /etc/os-release;echo $ID$VERSION_ID) 29 | curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - 30 | curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list 31 | 32 | cat /etc/docker/daemon.json | jq 'del(."default-runtime") + {"default-runtime": "nvidia"}' | sudo tee /etc/docker/daemon.json 33 | sudo usermod -a -G docker $(id -un) 34 | ``` 35 | 36 | 37 | ### Install DRfC 38 | 39 | You can now run `bin/init.sh -a gpu -c local` to setup DRfC, and follow the typical DRfC startup instructions 40 | 41 | ## Known Issues 42 | 43 | * `init.sh` is not able to detect the GPU given differences in the Nvidia drivers, and the WSL2 Linux Kernel. You need to manually set the GPU image in `system.env`. 44 | * Docker does not start automatically when you launch Ubuntu. Start it manually with `sudo service docker start` 45 | 46 | You can also configure the service to start automatically using the Windows Task Scheduler 47 | 48 | *1)* Create a new file at /etc/init-wsl (sudo vi /etc/init-wsl) with the following contents. 
49 | 50 | ``` 51 | #!/bin/sh 52 | service docker start 53 | ``` 54 | 55 | *2)* Make the script executable `sudo chmod +x /etc/init-wsl` 56 | 57 | *3)* Open Task Scheduler in Windows 10 58 | 59 | - On the left, click **Task Scheduler Library** option, and then on the right, click **Create Task** 60 | 61 | - In **General** Tab, Enter Name **WSL Startup**, and select **Run whether user is logged on or not** and **Run with highest privileges** options. 62 | 63 | - In **Trigger** tab, click New ... > Begin the task: **At startup** > OK 64 | 65 | - In **Actions** tab, click New ... > Action: **Start a program** 66 | 67 | program/script: **wsl** 68 | 69 | add arguments: **-u root /etc/init-wsl** 70 | 71 | - Click OK to exit 72 | 73 | *4)* You can run the task manually to confirm, or after Windows reboot docker should now automatically start. 74 | 75 | * Video streams may not load using the localhost address. To access the html video streams from your windows browser, you may need to use the IP address of the WSL VM. From a WSL terminal, determine your IP address by the command 'ip addr' and look for **eth0** then **inet** (e.g. ip = 172.29.38.21). Then from your windows browser (edge, chrome, etc) navigate to **ip:8080** (e.g.
172.29.38.21:8080) 76 | 77 | -------------------------------------------------------------------------------- /scripts/upload/prepare-config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import boto3 4 | import sys 5 | import os 6 | import time 7 | import json 8 | import io 9 | import yaml 10 | 11 | config = {} 12 | config['AWS_REGION'] = os.environ.get('DR_AWS_APP_REGION', 'us-east-1') 13 | config['JOB_TYPE'] = 'TRAINING' 14 | config['METRICS_S3_BUCKET'] = os.environ.get('TARGET_S3_BUCKET', 'bucket') 15 | config['METRICS_S3_OBJECT_KEY'] = "{}/TrainingMetrics.json".format(os.environ.get('TARGET_S3_PREFIX', 'bucket')) 16 | config['MODEL_METADATA_FILE_S3_KEY'] = "{}/model/model_metadata.json".format(os.environ.get('TARGET_S3_PREFIX', 'bucket')) 17 | config['REWARD_FILE_S3_KEY'] = "{}/reward_function.py".format(os.environ.get('TARGET_S3_PREFIX', 'bucket')) 18 | config['SAGEMAKER_SHARED_S3_BUCKET'] = os.environ.get('TARGET_S3_BUCKET', 'bucket') 19 | config['SAGEMAKER_SHARED_S3_PREFIX'] = os.environ.get('TARGET_S3_PREFIX', 'rl-deepracer-sagemaker') 20 | 21 | # Car and training 22 | config['BODY_SHELL_TYPE'] = os.environ.get('DR_CAR_BODY_SHELL_TYPE', 'deepracer') 23 | if config['BODY_SHELL_TYPE'] == 'deepracer': 24 | config['CAR_COLOR'] = os.environ.get('DR_CAR_COLOR', 'Red') 25 | config['CAR_NAME'] = os.environ.get('DR_CAR_NAME', 'MyCar') 26 | config['RACE_TYPE'] = os.environ.get('DR_RACE_TYPE', 'TIME_TRIAL') 27 | config['WORLD_NAME'] = os.environ.get('DR_WORLD_NAME', 'LGSWide') 28 | config['DISPLAY_NAME'] = os.environ.get('DR_DISPLAY_NAME', 'racer1') 29 | config['RACER_NAME'] = os.environ.get('DR_RACER_NAME', 'racer1') 30 | 31 | config['ALTERNATE_DRIVING_DIRECTION'] = os.environ.get('DR_TRAIN_ALTERNATE_DRIVING_DIRECTION', os.environ.get('DR_ALTERNATE_DRIVING_DIRECTION', 'false')) 32 | config['CHANGE_START_POSITION'] = os.environ.get('DR_TRAIN_CHANGE_START_POSITION', 
os.environ.get('DR_CHANGE_START_POSITION', 'true')) 33 | config['ROUND_ROBIN_ADVANCE_DIST'] = os.environ.get('DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST', '0.05') 34 | config['START_POSITION_OFFSET'] = os.environ.get('DR_TRAIN_START_POSITION_OFFSET', '0.00') 35 | config['ENABLE_DOMAIN_RANDOMIZATION'] = os.environ.get('DR_ENABLE_DOMAIN_RANDOMIZATION', 'false') 36 | config['MIN_EVAL_TRIALS'] = os.environ.get('DR_TRAIN_MIN_EVAL_TRIALS', '5') 37 | 38 | # Object Avoidance 39 | if config['RACE_TYPE'] == 'OBJECT_AVOIDANCE': 40 | config['NUMBER_OF_OBSTACLES'] = os.environ.get('DR_OA_NUMBER_OF_OBSTACLES', '6') 41 | config['MIN_DISTANCE_BETWEEN_OBSTACLES'] = os.environ.get('DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES', '2.0') 42 | config['RANDOMIZE_OBSTACLE_LOCATIONS'] = os.environ.get('DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS', 'True') 43 | config['IS_OBSTACLE_BOT_CAR'] = os.environ.get('DR_OA_IS_OBSTACLE_BOT_CAR', 'false') 44 | 45 | object_position_str = os.environ.get('DR_OA_OBJECT_POSITIONS', "") 46 | if object_position_str != "": 47 | object_positions = [] 48 | for o in object_position_str.split(";"): 49 | object_positions.append(o) 50 | config['OBJECT_POSITIONS'] = object_positions 51 | config['NUMBER_OF_OBSTACLES'] = str(len(object_positions)) 52 | 53 | # Head to Bot 54 | if config['RACE_TYPE'] == 'HEAD_TO_BOT': 55 | config['IS_LANE_CHANGE'] = os.environ.get('DR_H2B_IS_LANE_CHANGE', 'False') 56 | config['LOWER_LANE_CHANGE_TIME'] = os.environ.get('DR_H2B_LOWER_LANE_CHANGE_TIME', '3.0') 57 | config['UPPER_LANE_CHANGE_TIME'] = os.environ.get('DR_H2B_UPPER_LANE_CHANGE_TIME', '5.0') 58 | config['LANE_CHANGE_DISTANCE'] = os.environ.get('DR_H2B_LANE_CHANGE_DISTANCE', '1.0') 59 | config['NUMBER_OF_BOT_CARS'] = os.environ.get('DR_H2B_NUMBER_OF_BOT_CARS', '0') 60 | config['MIN_DISTANCE_BETWEEN_BOT_CARS'] = os.environ.get('DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS', '2.0') 61 | config['RANDOMIZE_BOT_CAR_LOCATIONS'] = os.environ.get('DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS', 'False') 62 | 
config['BOT_CAR_SPEED'] = os.environ.get('DR_H2B_BOT_CAR_SPEED', '0.2') 63 | 64 | local_yaml_path = os.path.abspath(os.path.join(os.environ.get('WORK_DIR'),'training_params.yaml')) 65 | print(local_yaml_path) 66 | with open(local_yaml_path, 'w') as yaml_file: 67 | yaml.dump(config, yaml_file, default_flow_style=False, default_style='\'', explicit_start=True) -------------------------------------------------------------------------------- /docs/multi_gpu.md: -------------------------------------------------------------------------------- 1 | # Training on a Computer with more than one GPU 2 | 3 | In some cases you might end up with having a computer with more than one GPU. This may be common on a workstation 4 | which may have one GPU for general graphics (e.g. GTX 10-series, RTX 20-series), as well as a data center GPU 5 | like a Tesla K40, K80 or M40. 6 | 7 | In this setting it can get a bit chaotic as DeepRacer will 'greedily' put any workload on any GPU - which will 8 | lead to Out-of-Memory somewhere down the road. 9 | 10 | ## Checking available GPUs 11 | 12 | You can use Tensorflow to give you an overview of available devices running `utils/cuda-check.sh`. 
13 | 14 | It will say something like: 15 | ``` 16 | 2020-07-04 12:25:55.179580: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA 17 | 2020-07-04 12:25:55.547206: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1411] Found device 0 with properties: 18 | name: GeForce GTX 1650 major: 7 minor: 5 memoryClockRate(GHz): 1.68 19 | pciBusID: 0000:04:00.0 20 | totalMemory: 3.82GiB freeMemory: 3.30GiB 21 | 2020-07-04 12:25:55.732066: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1411] Found device 1 with properties: 22 | name: Tesla M40 24GB major: 5 minor: 2 memoryClockRate(GHz): 1.112 23 | pciBusID: 0000:81:00.0 24 | totalMemory: 22.41GiB freeMemory: 22.30GiB 25 | 2020-07-04 12:25:55.732141: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1490] Adding visible gpu devices: 0, 1 26 | 2020-07-04 12:25:56.745647: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] Device interconnect StreamExecutor with strength 1 edge matrix: 27 | 2020-07-04 12:25:56.745719: I tensorflow/core/common_runtime/gpu/gpu_device.cc:977] 0 1 28 | 2020-07-04 12:25:56.745732: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990] 0: N N 29 | 2020-07-04 12:25:56.745743: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990] 1: N N 30 | 2020-07-04 12:25:56.745973: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1103] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 195 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1650, pci bus id: 0000:04:00.0, compute capability: 7.5) 31 | 2020-07-04 12:25:56.750352: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1103] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 1147 MB memory) -> physical GPU (device: 1, name: Tesla M40 24GB, pci bus id: 0000:81:00.0, compute capability: 5.2) 32 | 2020-07-04 12:25:56.774305: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1490] Adding visible 
gpu devices: 0, 1 33 | 2020-07-04 12:25:56.774408: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] Device interconnect StreamExecutor with strength 1 edge matrix: 34 | 2020-07-04 12:25:56.774425: I tensorflow/core/common_runtime/gpu/gpu_device.cc:977] 0 1 35 | 2020-07-04 12:25:56.774436: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990] 0: N N 36 | 2020-07-04 12:25:56.774446: I tensorflow/core/common_runtime/gpu/gpu_device.cc:990] 1: N N 37 | 2020-07-04 12:25:56.774551: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1103] Created TensorFlow device (/device:GPU:0 with 195 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1650, pci bus id: 0000:04:00.0, compute capability: 7.5) 38 | 2020-07-04 12:25:56.774829: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1103] Created TensorFlow device (/device:GPU:1 with 1147 MB memory) -> physical GPU (device: 1, name: Tesla M40 24GB, pci bus id: 0000:81:00.0, compute capability: 5.2) 39 | ['/device:GPU:0', '/device:GPU:1'] 40 | ``` 41 | In this case the CUDA device #0 is the GTX 1650 and the CUDA device #1 is the Tesla M40. 42 | 43 | ### Selecting Device 44 | 45 | #### Robomaker 46 | To control the Robomaker then add the following to `system.env`: 47 | 48 | ``` 49 | CUDA_VISIBLE_DEVICES=1 50 | ``` 51 | The number is the CUDA number of the GPU you want the Robomakers to use. 52 | 53 | #### Sagemaker 54 | 55 | Sagemaker is more critical to place, but also more complicated, as you will have to build a new Docker image for it to work. 56 | 57 | A template is in `utils/Dockerfile.sagemaker-gpu`. Open it to alter the source image in `FROM`, and adapt `CUDA_VISIBLE_DEVICES`. 58 | 59 | Build the image with `docker build -t awsdeepracercommunity/deepracer-sagemaker:gpu-x -f utils/Dockerfile.sagemaker-gpu .` with x being anything you like. 60 | 61 | Update `system.env` to use the new image. 
62 | -------------------------------------------------------------------------------- /scripts/viewer/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | usage(){ 4 | echo "Usage: $0 [-t topic] [-w width] [-h height] [-q quality] -b [browser-command]" 5 | echo " -w Width of individual stream." 6 | echo " -h Heigth of individual stream." 7 | echo " -q Quality of the stream image." 8 | echo " -t Topic to follow - default /racecar/deepracer/kvs_stream" 9 | echo " -b Browser command (default: firefox --new-tab)" 10 | exit 1 11 | } 12 | 13 | trap ctrl_c INT 14 | 15 | function ctrl_c() { 16 | echo "Requested to stop." 17 | exit 1 18 | } 19 | 20 | # Stream definition 21 | TOPIC="/racecar/deepracer/kvs_stream" 22 | WIDTH=480 23 | HEIGHT=360 24 | QUALITY=75 25 | BROWSER="firefox --new-tab" 26 | 27 | while getopts ":w:h:q:t:b:" opt; do 28 | case $opt in 29 | w) WIDTH="$OPTARG" 30 | ;; 31 | h) HEIGHT="$OPTARG" 32 | ;; 33 | q) QUALITY="$OPTARG" 34 | ;; 35 | t) TOPIC="$OPTARG" 36 | ;; 37 | b) BROWSER="$OPTARG" 38 | ;; 39 | \?) echo "Invalid option -$OPTARG" >&2 40 | usage 41 | ;; 42 | esac 43 | done 44 | 45 | export DR_VIEWER_HTML=$DR_DIR/tmp/streams-$DR_RUN_ID.html 46 | export DR_NGINX_CONF=$DR_DIR/tmp/streams-$DR_RUN_ID.conf 47 | 48 | cat << EOF > $DR_NGINX_CONF 49 | server { 50 | listen 80; 51 | location / { 52 | root /usr/share/nginx/html; 53 | index index.html index.htm; 54 | } 55 | EOF 56 | echo "DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC
DR-$DR_RUN_ID - $DR_LOCAL_S3_MODEL_PREFIX - $TOPIC
" > $DR_VIEWER_HTML 57 | 58 | if [[ "${DR_DOCKER_STYLE,,}" != "swarm" ]]; then 59 | ROBOMAKER_CONTAINERS=$(docker ps --format "{{.ID}} {{.Names}}" --filter name="deepracer-${DR_RUN_ID}" | grep robomaker | cut -f1 -d\ ) 60 | else 61 | ROBOMAKER_SERVICE_REPLICAS=$(docker service ps deepracer-${DR_RUN_ID}_robomaker | awk '/robomaker/ { print $1 }') 62 | for c in $ROBOMAKER_SERVICE_REPLICAS; do 63 | ROBOMAKER_CONTAINER_IP=$(docker inspect $c | jq -r '.[].NetworksAttachments[] | select (.Network.Spec.Name == "sagemaker-local") | .Addresses[0] ' | cut -f1 -d/) 64 | ROBOMAKER_CONTAINERS="${ROBOMAKER_CONTAINERS} ${ROBOMAKER_CONTAINER_IP}" 65 | done 66 | fi 67 | 68 | if [ -z "$ROBOMAKER_CONTAINERS" ]; then 69 | echo "No running robomakers. Exiting." 70 | exit 71 | fi 72 | 73 | 74 | for c in $ROBOMAKER_CONTAINERS; do 75 | C_URL="/$c/stream?topic=${TOPIC}&quality=${QUALITY}&width=${WIDTH}&height=${HEIGHT}" 76 | C_IMG="
" 77 | echo $C_IMG >> $DR_VIEWER_HTML 78 | echo " location /$c { proxy_pass http://$c:8080; rewrite /$c/(.*) /\$1 break; }" >> $DR_NGINX_CONF 79 | done 80 | 81 | echo "
" >> $DR_VIEWER_HTML 82 | echo "}" >> $DR_NGINX_CONF 83 | 84 | # Check if we will use Docker Swarm or Docker Compose 85 | STACK_NAME="deepracer-$DR_RUN_ID-viewer" 86 | COMPOSE_FILES=$DR_DIR/docker/docker-compose-webviewer.yml 87 | 88 | if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; 89 | then 90 | COMPOSE_FILES="$COMPOSE_FILES -c $DR_DIR/docker/docker-compose-webviewer-swarm.yml" 91 | docker stack deploy -c $COMPOSE_FILES $STACK_NAME 92 | else 93 | docker-compose -f $COMPOSE_FILES -p $STACK_NAME --log-level ERROR up -d 94 | fi 95 | 96 | # Starting browser if using local X and having display defined. 97 | if [[ -n "${DISPLAY}" && "${DR_HOST_X,,}" == "true" ]]; then 98 | echo "Starting browser '$BROWSER'." 99 | if [ "${DR_DOCKER_STYLE,,}" == "swarm" ]; 100 | then 101 | sleep 5 102 | fi 103 | $BROWSER "http://127.0.01:8100" & 104 | fi 105 | 106 | -------------------------------------------------------------------------------- /bin/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap ctrl_c INT 4 | 5 | function ctrl_c() { 6 | echo "Requested to stop." 7 | exit 1 8 | } 9 | 10 | 11 | 12 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 13 | 14 | ## Patch system 15 | sudo apt-get update && sudo apt-mark hold grub-pc && sudo DEBIAN_FRONTEND=noninteractive apt-get -y -o \ 16 | DPkg::options::="--force-confdef" -o DPkg::options::="--force-confold" -qq --force-yes upgrade && \ 17 | sudo apt-get install --no-install-recommends -y jq 18 | source $DIR/detect.sh 19 | echo "Detected cloud type ${CLOUD_NAME}" 20 | 21 | ## Do I have a GPU 22 | GPUS=$(lspci | awk '/NVIDIA/ && ( /VGA/ || /3D controller/ ) ' | wc -l ) 23 | if [ $? -ne 0 ] || [ $GPUS -eq 0 ]; 24 | then 25 | ARCH="cpu" 26 | echo "No NVIDIA GPU detected. Will not install drivers." 
27 | else 28 | ARCH="gpu" 29 | fi 30 | 31 | ## Do I have an additional disk for Docker images - looking for /dev/sdc (Azure) 32 | 33 | if [[ "${CLOUD_NAME}" == "azure" ]]; 34 | then 35 | ADDL_DISK=$(lsblk | awk '/^sdc/ {print $1}') 36 | ADDL_PART=$(lsblk -l | awk -v DISK="$ADDL_DISK" '($0 ~ DISK) && ($0 ~ /part/) {print $1}') 37 | 38 | if [ -n "$ADDL_DISK" ] && [ -z "$ADDL_PART" ]; 39 | then 40 | echo "Found $ADDL_DISK, preparing it for use" 41 | echo -e "g\nn\np\n1\n\n\nw\n" | sudo fdisk /dev/$ADDL_DISK 42 | sleep 1s 43 | ADDL_DEVICE=$(echo "/dev/"$ADDL_DISK"1") 44 | sudo mkfs.ext4 $ADDL_DEVICE 45 | sudo mkdir -p /var/lib/docker 46 | echo "$ADDL_DEVICE /var/lib/docker ext4 rw,user,auto 0 0" | sudo tee -a /etc/fstab 47 | mount /var/lib/docker 48 | if [ $? -ne 0 ] 49 | then 50 | echo "Error during preparing of additional disk. Exiting." 51 | exit 1 52 | fi 53 | elif [ -n "$ADDL_DISK" ] && [ -n "$ADDL_PART" ]; 54 | then 55 | echo "Found $ADDL_DISK - $ADDL_PART already mounted. Installing into present drive/directory structure." 56 | 57 | else 58 | echo "Did not find $ADDL_DISK. Installing into present drive/directory structure." 59 | fi 60 | fi 61 | 62 | ## Adding Nvidia Drivers 63 | if [[ "${ARCH}" == "gpu" ]]; 64 | then 65 | distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID | sed 's/\.//') 66 | sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64/3bf863cc.pub 67 | sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/$distribution/x86_64/7fa2af80.pub 68 | echo "deb http://developer.download.nvidia.com/compute/cuda/repos/$distribution/x86_64 /" | sudo tee /etc/apt/sources.list.d/cuda.list 69 | echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/$distribution/x86_64 /" | sudo tee /etc/apt/sources.list.d/cuda_learn.list 70 | sudo apt update && sudo apt install -y nvidia-driver-470-server cuda-minimal-build-11-4 --no-install-recommends -o Dpkg::Options::="--force-overwrite" 71 | fi 72 | 73 | ## Adding AWSCli 74 | sudo apt-get install -y --no-install-recommends awscli python3-boto3 75 | 76 | ## Installing Docker 77 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - 78 | sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" 79 | sudo apt-get update && sudo apt-get install -y --no-install-recommends docker-ce docker-ce-cli containerd.io 80 | 81 | if [[ "${ARCH}" == "gpu" ]]; 82 | then 83 | distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) 84 | curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - 85 | curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list 86 | 87 | sudo apt-get update && sudo apt-get install -y --no-install-recommends nvidia-docker2 nvidia-container-toolkit nvidia-container-runtime 88 | cat /etc/docker/daemon.json | jq 'del(."default-runtime") + {"default-runtime": "nvidia"}' | sudo tee /etc/docker/daemon.json 89 | fi 90 | sudo systemctl enable docker 91 | sudo systemctl restart docker 92 | 93 | ## Ensure user can run docker 94 | sudo usermod -a -G docker $(id -un) 95 | 96 | ## Installing Docker Compose 97 | sudo curl -L https://github.com/docker/compose/releases/download/1.29.2/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose 98 | sudo chmod +x /usr/local/bin/docker-compose 99 | 100 | ## Reboot to load driver -- continue install if in cloud-init 101 | CLOUD_INIT=$(pstree -s $BASHPID | awk /cloud-init/ | wc -l) 102 | 103 | if [[ "$CLOUD_INIT" -ne 0 ]]; 104 | then 105 | echo "Rebooting in 5 seconds. Will continue with install." 106 | cd $DIR 107 | ./runonce.sh "./init.sh -c ${CLOUD_NAME} -a ${ARCH}" 108 | sleep 5s 109 | sudo reboot 110 | else 111 | echo "First stage done. Please reboot and run init.sh -c ${CLOUD_NAME} -a ${ARCH}" 112 | fi 113 | -------------------------------------------------------------------------------- /scripts/training/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | source $DR_DIR/bin/scripts_wrapper.sh 4 | 5 | usage(){ 6 | echo "Usage: $0 [-w] [-q | -s | -r [n] | -a ] [-v]" 7 | echo " -w Wipes the target AWS DeepRacer model structure before upload." 8 | echo " -q Do not output / follow a log when starting." 9 | echo " -a Follow all Sagemaker and Robomaker logs." 10 | echo " -s Follow Sagemaker logs (default)." 
11 | echo " -v Updates the viewer webpage." 12 | echo " -r [n] Follow Robomaker logs for worker n (default worker 0 / replica 1)." 13 | exit 1 14 | } 15 | 16 | trap ctrl_c INT 17 | 18 | function ctrl_c() { 19 | echo "Requested to stop." 20 | exit 1 21 | } 22 | 23 | OPT_DISPLAY="SAGEMAKER" 24 | 25 | while getopts ":whqsavr:" opt; do 26 | case $opt in 27 | w) OPT_WIPE="WIPE" 28 | ;; 29 | q) OPT_QUIET="QUIET" 30 | ;; 31 | s) OPT_DISPLAY="SAGEMAKER" 32 | ;; 33 | a) OPT_DISPLAY="ALL" 34 | ;; 35 | r) # Check if value is in numeric format. 36 | OPT_DISPLAY="ROBOMAKER" 37 | if [[ $OPTARG =~ ^[0-9]+$ ]]; then 38 | OPT_ROBOMAKER=$OPTARG 39 | else 40 | OPT_ROBOMAKER=0 41 | ((OPTIND--)) 42 | fi 43 | ;; 44 | v) OPT_VIEWER="VIEWER" 45 | ;; 46 | h) usage 47 | ;; 48 | \?) echo "Invalid option -$OPTARG" >&2 49 | usage 50 | ;; 51 | esac 52 | done 53 | 54 | # Ensure Sagemaker's folder is there 55 | if [ ! -d /tmp/sagemaker ]; then 56 | sudo mkdir -p /tmp/sagemaker 57 | sudo chmod -R g+w /tmp/sagemaker 58 | fi 59 | 60 | #Check if files are available 61 | S3_PATH="s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_MODEL_PREFIX" 62 | 63 | S3_FILES=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 ls ${S3_PATH} | wc -l) 64 | if [[ "$S3_FILES" -gt 0 ]]; 65 | then 66 | if [[ -z $OPT_WIPE ]]; 67 | then 68 | echo "Selected path $S3_PATH exists. Delete it, or use -w option. Exiting." 69 | exit 1 70 | else 71 | echo "Wiping path $S3_PATH." 
72 | aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 rm --recursive ${S3_PATH} 73 | fi 74 | fi 75 | 76 | # Base compose file 77 | if [ ${DR_ROBOMAKER_MOUNT_LOGS,,} = "true" ]; 78 | then 79 | COMPOSE_FILES="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DR_DIR/docker/docker-compose-mount.yml" 80 | export DR_MOUNT_DIR="$DR_DIR/data/logs/robomaker/$DR_LOCAL_S3_MODEL_PREFIX" 81 | mkdir -p $DR_MOUNT_DIR 82 | else 83 | COMPOSE_FILES="$DR_TRAIN_COMPOSE_FILE" 84 | fi 85 | 86 | # set evaluation specific environment variables 87 | STACK_NAME="deepracer-$DR_RUN_ID" 88 | 89 | export DR_CURRENT_PARAMS_FILE=${DR_LOCAL_S3_TRAINING_PARAMS_FILE} 90 | 91 | WORKER_CONFIG=$(python3 $DR_DIR/scripts/training/prepare-config.py) 92 | 93 | if [ "$DR_WORKERS" -gt 1 ]; then 94 | echo "Starting $DR_WORKERS workers" 95 | 96 | if [[ "${DR_DOCKER_STYLE,,}" != "swarm" ]]; 97 | then 98 | mkdir -p $DR_DIR/tmp/comms.$DR_RUN_ID 99 | rm -rf $DR_DIR/tmp/comms.$DR_RUN_ID/* 100 | COMPOSE_FILES="$COMPOSE_FILES $DR_DOCKER_FILE_SEP $DR_DIR/docker/docker-compose-robomaker-multi.yml" 101 | fi 102 | 103 | if [ "$DR_TRAIN_MULTI_CONFIG" == "True" ]; then 104 | export MULTI_CONFIG=$WORKER_CONFIG 105 | echo "Multi-config training, creating multiple Robomaker configurations in $S3_PATH" 106 | else 107 | echo "Creating Robomaker configuration in $S3_PATH/$DR_LOCAL_S3_TRAINING_PARAMS_FILE" 108 | fi 109 | export ROBOMAKER_COMMAND="./run.sh multi distributed_training.launch" 110 | 111 | else 112 | export ROBOMAKER_COMMAND="./run.sh run distributed_training.launch" 113 | echo "Creating Robomaker configuration in $S3_PATH/$DR_LOCAL_S3_TRAINING_PARAMS_FILE" 114 | fi 115 | 116 | # Check if we are using Host X -- ensure variables are populated 117 | if [[ "${DR_HOST_X,,}" == "true" ]]; 118 | then 119 | if [[ -n "$DR_DISPLAY" ]]; then 120 | ROBO_DISPLAY=$DR_DISPLAY 121 | else 122 | ROBO_DISPLAY=$DISPLAY 123 | fi 124 | 125 | if ! 
DISPLAY=$ROBO_DISPLAY timeout 1s xset q &>/dev/null; then
        # NOTE(review): exits with status 0 even though this is a failure path;
        # callers checking $? cannot distinguish this from success — confirm intent.
        echo "No X Server running on display $ROBO_DISPLAY. Exiting"
        exit 0
    fi

    # Fall back to ~/.Xauthority when XAUTHORITY is not set; stop if neither exists.
    if [[ -z "$XAUTHORITY" ]]; then
        export XAUTHORITY=~/.Xauthority
        if [[ ! -f "$XAUTHORITY" ]]; then
            # NOTE(review): also exits 0 on failure — see note above.
            echo "No XAUTHORITY defined. .Xauthority does not exist. Stopping."
            exit 0
        fi
    fi

fi

# Check if we will use Docker Swarm or Docker Compose
if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]];
then
    # Count swarm nodes labelled for Robomaker placement; at least one is required.
    ROBOMAKER_NODES=$(docker node ls --format '{{.ID}}' | xargs docker inspect | jq '.[] | select (.Spec.Labels.Robomaker == "true") | .ID' | wc -l)
    if [[ "$ROBOMAKER_NODES" -eq 0 ]];
    then
        echo "ERROR: No Swarm Nodes labelled for placement of Robomaker. Please add Robomaker node."
        echo "       Example: docker node update --label-add Robomaker=true $(docker node inspect self | jq .[0].ID -r)"
        exit 0
    fi

    # Same check for Sagemaker placement.
    SAGEMAKER_NODES=$(docker node ls --format '{{.ID}}' | xargs docker inspect | jq '.[] | select (.Spec.Labels.Sagemaker == "true") | .ID' | wc -l)
    if [[ "$SAGEMAKER_NODES" -eq 0 ]];
    then
        echo "ERROR: No Swarm Nodes labelled for placement of Sagemaker. Please add Sagemaker node."
        echo "       Example: docker node update --label-add Sagemaker=true $(docker node inspect self | jq .[0].ID -r)"
        exit 0
    fi

    # Swarm mode: COMPOSE_FILES already carries the "-c file" pairs for stack deploy.
    DISPLAY=$ROBO_DISPLAY docker stack deploy $COMPOSE_FILES $STACK_NAME

else
    # Compose mode: scale the robomaker service to the requested worker count.
    DISPLAY=$ROBO_DISPLAY docker-compose $COMPOSE_FILES -p $STACK_NAME --log-level ERROR up -d --scale robomaker=$DR_WORKERS
fi

# Viewer
if [ -n "$OPT_VIEWER" ]; then
    # Delay so the containers are up before the viewer page is refreshed.
    (sleep 5; dr-update-viewer)
fi

# Request to be quiet. Quitting here.
171 | if [ -n "$OPT_QUIET" ]; then 172 | exit 0 173 | fi 174 | 175 | # Trigger requested log-file 176 | if [[ "${OPT_DISPLAY,,}" == "all" && -n "${DISPLAY}" && "${DR_HOST_X,,}" == "true" ]]; then 177 | dr-logs-sagemaker -w 15 178 | if [ "${DR_WORKERS}" -gt 1 ]; then 179 | for i in $(seq 1 ${DR_WORKERS}) 180 | do 181 | dr-logs-robomaker -w 15 -n $i 182 | done 183 | else 184 | dr-logs-robomaker -w 15 185 | fi 186 | elif [[ "${OPT_DISPLAY,,}" == "robomaker" ]]; then 187 | dr-logs-robomaker -w 15 -n $OPT_ROBOMAKER 188 | elif [[ "${OPT_DISPLAY,,}" == "sagemaker" ]]; then 189 | dr-logs-sagemaker -w 15 190 | fi 191 | 192 | -------------------------------------------------------------------------------- /utils/sample-createspot.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ## This is sample code that will generally show you how to launch a spot instance on aws and leverage the 4 | ## automation built into deepracer-for-cloud to automatically start training 5 | ## Changes required to work: 6 | ## Input location where your training will take place -- S3_LOCATION 7 | ## Input security group, iam role, and key-name 8 | 9 | ## First you need to tell the script where in s3 your training will take place 10 | ## can be either a bucket at the root level, or a bucket/prefix. don't include the s3:// 11 | 12 | S3_LOCATION=<#########> 13 | 14 | ## extract bucket location 15 | BUCKET=${S3_LOCATION%%/*} 16 | 17 | ## extract prefix location 18 | if [[ "$S3_LOCATION" == *"/"* ]] 19 | then 20 | PREFIX=${S3_LOCATION#*/} 21 | else 22 | PREFIX="" 23 | fi 24 | 25 | ## Fill these out with your custom information if you want to upload and submit to leaderboard. 
## (not required to run)
DR_UPLOAD_S3_PREFIX=########

## set the instance type you want to launch
INSTANCE_TYPE=c5.2xlarge

## if you want to modify additional variables from the default, add them here, then add them to the
## "replace static parameters" section further below. Only World name is handled for now.
WORLD_NAME=FS_June2020

## modify this if you want additional robomaker workers
DR_WORKERS=1

## select which images you want to use. these will be used later for a docker pull
DR_SAGEMAKER_IMAGE=cpu-avx-mkl
DR_ROBOMAKER_IMAGE=cpu-avx2

## check the s3 location for existing training folders
## automatically determine the latest training run (highest number), and set model parameters accordingly
## this script assumes the format rl-deepracer-1, rl-deepracer-2, etc. you will need to modify if your schema differs

LAST_TRAINING=$(aws s3 ls $S3_LOCATION/rl-deepracer | sort -t - -k 3 -g | tail -n 1 | awk '{print $2}')
## drop trailing slash
LAST_TRAINING=$(echo $LAST_TRAINING | sed 's:/*$::')

CONFIG_FILE="./run.env"
OLD_SYSTEMENV="./system.env"

## incorporate logic from increment.sh, slightly modified to use last training
OPT_DELIM='-'
## Reuse the listing result from above instead of issuing a second, identical
## "aws s3 ls" call (the original queried S3 twice and discarded the first result).
CURRENT_RUN_MODEL=$LAST_TRAINING
## get number at the end; require an all-digit suffix.
## (was /[0-9]*/, which matches ANY string via an empty match, so the
## "else print empty" branch was unreachable)
CURRENT_RUN_MODEL_NUM=$(echo "${CURRENT_RUN_MODEL}" | \
  awk -v DELIM="${OPT_DELIM}" '{ n=split($0,a,DELIM); if (a[n] ~ /^[0-9]+$/) print a[n]; else print ""; }')

## Quote the operand so [ -z ] cannot break on whitespace in the key.
if [ -z "$LAST_TRAINING" ]
then
  echo No prior training found
  if [[ $PREFIX == "" ]]
  then
    NEW_RUN_MODEL=rl-deepracer-1
  else
    NEW_RUN_MODEL="$PREFIX/rl-deepracer-1"
  fi
  PRETRAINED=False
  CURRENT_RUN_MODEL=$NEW_RUN_MODEL
else

| NEW_RUN_MODEL_NUM=$(echo "${CURRENT_RUN_MODEL_NUM} + 1" | bc ) 76 | PRETRAINED=True 77 | 78 | if [[ $PREFIX == "" ]] 79 | then 80 | NEW_RUN_MODEL=$(echo $CURRENT_RUN_MODEL | sed "s/${CURRENT_RUN_MODEL_NUM}\$/${NEW_RUN_MODEL_NUM}/") 81 | else 82 | NEW_RUN_MODEL=$(echo $CURRENT_RUN_MODEL | sed "s/${CURRENT_RUN_MODEL_NUM}\$/${NEW_RUN_MODEL_NUM}/") 83 | NEW_RUN_MODEL="$PREFIX/$NEW_RUN_MODEL" 84 | CURRENT_RUN_MODEL="$PREFIX/$CURRENT_RUN_MODEL" 85 | fi 86 | echo Last training was $CURRENT_RUN_MODEL so next training is $NEW_RUN_MODEL 87 | fi 88 | 89 | if [[ $PREFIX == "" ]] 90 | then 91 | CUSTOM_FILES_PREFIX="custom_files" 92 | else 93 | CUSTOM_FILES_PREFIX="$PREFIX/custom_files" 94 | fi 95 | 96 | ## Replace dynamic parameters in run.env (still local to your directory) 97 | sed -i.bak -re "s:(DR_LOCAL_S3_PRETRAINED_PREFIX=).*$:\1$CURRENT_RUN_MODEL:g; s:(DR_LOCAL_S3_PRETRAINED=).*$:\1$PRETRAINED:g; s:(DR_LOCAL_S3_MODEL_PREFIX=).*$:\1$NEW_RUN_MODEL:g; s:(DR_LOCAL_S3_CUSTOM_FILES_PREFIX=).*$:\1$CUSTOM_FILES_PREFIX:g" "$CONFIG_FILE" 98 | sed -i.bak -re "s/(DR_LOCAL_S3_BUCKET=).*$/\1$BUCKET/g" "$CONFIG_FILE" 99 | 100 | ## Replace static parameters in run.env (still local to your directory) 101 | sed -i.bak -re "s/(DR_UPLOAD_S3_PREFIX=).*$/\1$DR_UPLOAD_S3_PREFIX/g" "$CONFIG_FILE" 102 | sed -i.bak -re "s/(DR_WORLD_NAME=).*$/\1$WORLD_NAME/g" "$CONFIG_FILE" 103 | 104 | ## Replace static paramaters in system.env file, including sagemaker and robomaker images (still local to your directory) and the number of DR_workers 105 | sed -i.bak -re "s/(DR_UPLOAD_S3_BUCKET=).*$/\1$DR_UPLOAD_S3_BUCKET/g; s/(DR_SAGEMAKER_IMAGE=).*$/\1$DR_SAGEMAKER_IMAGE/g; s/(DR_ROBOMAKER_IMAGE=).*$/\1$DR_ROBOMAKER_IMAGE/g; s/(DR_WORKERS=).*$/\1$DR_WORKERS/g" "$OLD_SYSTEMENV" 106 | 107 | ## upload the new run.env and system.env files into your S3 bucket (same s3 location identified earlier) 108 | ## files are loaded into the node-config folder/prefix. 
You can also upload other files to node config, and they 109 | ## will sync to the EC2 instance as part of the autorun script later. If you add other files, make sure they are 110 | ## in node-config in the same directory structure as DRfc; example: s3location/node-config/scripts/training/.start.sh 111 | RUNENV_LOCATION=$S3_LOCATION/node-config/run.env 112 | SYSENV_LOCATION=$S3_LOCATION/node-config/system.env 113 | 114 | aws s3 cp ./run.env s3://$RUNENV_LOCATION 115 | aws s3 cp ./system.env s3://$SYSENV_LOCATION 116 | 117 | ## upload a custom autorun script to S3. there is a default autorun script in the repo that will be used unless a custom one is specified here instead 118 | #aws s3 cp ./autorun.sh s3://$S3_LOCATION/autorun.sh 119 | 120 | ## upload custom files -- if you dont want this, comment these lines out 121 | aws s3 cp ./model_metadata.json s3://$S3_LOCATION/custom_files/model_metadata.json 122 | aws s3 cp ./reward_function.py s3://$S3_LOCATION/custom_files/reward_function.py 123 | aws s3 cp ./hyperparameters.json s3://$S3_LOCATION/custom_files/hyperparameters.json 124 | 125 | ## launch an ec2 126 | ## update with your own settings, including key-name, security-group, and iam-instance-profile at a minimum 127 | ## user data includes a command to create a .txt file which simply contains the name of the s3 location 128 | ## this filename will be used as fundamental input to autorun.sh script run later on that instance 129 | ## you need to ensure you have proper IAM permissions to launch this instance 130 | 131 | aws ec2 run-instances \ 132 | --image-id ami-085925f297f89fce1 \ 133 | --count 1 \ 134 | --instance-type $INSTANCE_TYPE \ 135 | --key-name <####keyname####> \ 136 | --security-group-ids sg-<####sgid####> \ 137 | --block-device-mappings 'DeviceName=/dev/sda1,Ebs={DeleteOnTermination=true,VolumeSize=40}' \ 138 | --iam-instance-profile Arn=arn:aws:iam::<####acct_num####>:instance-profile/<####role_name####> \ 139 | --instance-market-options 
MarketType=spot \ 140 | --user-data "#!/bin/bash 141 | su -c 'git clone https://github.com/aws-deepracer-community/deepracer-for-cloud.git && echo "$S3_LOCATION/node-config" > /home/ubuntu/deepracer-for-cloud/autorun.s3url && /home/ubuntu/deepracer-for-cloud/bin/prepare.sh' - ubuntu" 142 | -------------------------------------------------------------------------------- /scripts/evaluation/prepare-config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import boto3 4 | import sys 5 | import os 6 | import time 7 | import json 8 | import io 9 | import yaml 10 | 11 | def str2bool(v): 12 | return v.lower() in ("yes", "true", "t", "1") 13 | 14 | config = {} 15 | config['CAR_COLOR'] = [] 16 | config['BODY_SHELL_TYPE'] = [] 17 | config['RACER_NAME'] = [] 18 | config['DISPLAY_NAME'] = [] 19 | config['MODEL_S3_PREFIX'] = [] 20 | config['MODEL_S3_BUCKET'] = [] 21 | config['SIMTRACE_S3_PREFIX'] = [] 22 | config['SIMTRACE_S3_BUCKET'] = [] 23 | config['KINESIS_VIDEO_STREAM_NAME'] = [] 24 | config['METRICS_S3_BUCKET'] = [] 25 | config['METRICS_S3_OBJECT_KEY'] = [] 26 | config['MP4_S3_BUCKET'] = [] 27 | config['MP4_S3_OBJECT_PREFIX'] = [] 28 | 29 | # Basic configuration; including all buckets etc. 
# Region and job-level settings; each value falls back to a sane default when
# the corresponding DR_* environment variable is not set.
config['AWS_REGION'] = os.environ.get('DR_AWS_APP_REGION', 'us-east-1')
config['JOB_TYPE'] = 'EVALUATION'
config['KINESIS_VIDEO_STREAM_NAME'] = os.environ.get('DR_KINESIS_STREAM_NAME', 'my-kinesis-stream')
# NOTE(review): the env-var name here is an empty string, so this ALWAYS
# resolves to 'Dummy'. Likely a lost variable name — confirm against upstream.
config['ROBOMAKER_SIMULATION_JOB_ACCOUNT_ID'] = os.environ.get('', 'Dummy')

# Primary racer: model and simtrace share the same bucket/prefix.
config['MODEL_S3_PREFIX'].append(os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker'))
config['MODEL_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket'))
config['SIMTRACE_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket'))
config['SIMTRACE_S3_PREFIX'].append(os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker'))

# Metrics: key is timestamped so repeated evaluations never overwrite each other.
config['METRICS_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket'))
metrics_prefix = os.environ.get('DR_LOCAL_S3_METRICS_PREFIX', None)
if metrics_prefix is not None:
    config['METRICS_S3_OBJECT_KEY'].append('{}/EvaluationMetrics-{}.json'.format(metrics_prefix, str(round(time.time()))))
else:
    config['METRICS_S3_OBJECT_KEY'].append('DeepRacer-Metrics/EvaluationMetrics-{}.json'.format(str(round(time.time()))))

# MP4 configuration: only populated when DR_EVAL_SAVE_MP4 is truthy.
save_mp4 = str2bool(os.environ.get("DR_EVAL_SAVE_MP4", "False"))
if save_mp4:
    config['MP4_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket'))
    config['MP4_S3_OBJECT_PREFIX'].append('{}/{}'.format(os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'bucket'),'mp4'))

# Checkpoint selection ('last' or 'best').
config['EVAL_CHECKPOINT'] = os.environ.get('DR_EVAL_CHECKPOINT', 'last')

# Car and training; CAR_COLOR only applies to the stock 'deepracer' shell.
body_shell_type = os.environ.get('DR_CAR_BODY_SHELL_TYPE', 'deepracer')
config['BODY_SHELL_TYPE'].append(body_shell_type)
if body_shell_type == 'deepracer':
    config['CAR_COLOR'].append(os.environ.get('DR_CAR_COLOR', 'Red'))
config['DISPLAY_NAME'].append(os.environ.get('DR_DISPLAY_NAME', 'racer1'))
config['RACER_NAME'].append(os.environ.get('DR_RACER_NAME', 'racer1')) 64 | 65 | config['RACE_TYPE'] = os.environ.get('DR_RACE_TYPE', 'TIME_TRIAL') 66 | config['WORLD_NAME'] = os.environ.get('DR_WORLD_NAME', 'LGSWide') 67 | config['NUMBER_OF_TRIALS'] = os.environ.get('DR_EVAL_NUMBER_OF_TRIALS', '5') 68 | config['ENABLE_DOMAIN_RANDOMIZATION'] = os.environ.get('DR_ENABLE_DOMAIN_RANDOMIZATION', 'false') 69 | config['RESET_BEHIND_DIST'] = os.environ.get('DR_EVAL_RESET_BEHIND_DIST', '1.0') 70 | 71 | config['IS_CONTINUOUS'] = os.environ.get('DR_EVAL_IS_CONTINUOUS', 'True') 72 | config['NUMBER_OF_RESETS'] = os.environ.get('DR_EVAL_MAX_RESETS', '0') 73 | 74 | config['OFF_TRACK_PENALTY'] = os.environ.get('DR_EVAL_OFF_TRACK_PENALTY', '5.0') 75 | config['COLLISION_PENALTY'] = os.environ.get('DR_COLLISION_PENALTY', '5.0') 76 | 77 | # Object Avoidance 78 | if config['RACE_TYPE'] == 'OBJECT_AVOIDANCE': 79 | config['NUMBER_OF_OBSTACLES'] = os.environ.get('DR_OA_NUMBER_OF_OBSTACLES', '6') 80 | config['MIN_DISTANCE_BETWEEN_OBSTACLES'] = os.environ.get('DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES', '2.0') 81 | config['RANDOMIZE_OBSTACLE_LOCATIONS'] = os.environ.get('DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS', 'True') 82 | config['IS_OBSTACLE_BOT_CAR'] = os.environ.get('DR_OA_IS_OBSTACLE_BOT_CAR', 'false') 83 | 84 | object_position_str = os.environ.get('DR_OA_OBJECT_POSITIONS', "") 85 | if object_position_str != "": 86 | object_positions = [] 87 | for o in object_position_str.split(";"): 88 | object_positions.append(o) 89 | config['OBJECT_POSITIONS'] = object_positions 90 | config['NUMBER_OF_OBSTACLES'] = str(len(object_positions)) 91 | 92 | # Head to Bot 93 | if config['RACE_TYPE'] == 'HEAD_TO_BOT': 94 | config['IS_LANE_CHANGE'] = os.environ.get('DR_H2B_IS_LANE_CHANGE', 'False') 95 | config['LOWER_LANE_CHANGE_TIME'] = os.environ.get('DR_H2B_LOWER_LANE_CHANGE_TIME', '3.0') 96 | config['UPPER_LANE_CHANGE_TIME'] = os.environ.get('DR_H2B_UPPER_LANE_CHANGE_TIME', '5.0') 97 | 
config['LANE_CHANGE_DISTANCE'] = os.environ.get('DR_H2B_LANE_CHANGE_DISTANCE', '1.0') 98 | config['NUMBER_OF_BOT_CARS'] = os.environ.get('DR_H2B_NUMBER_OF_BOT_CARS', '0') 99 | config['MIN_DISTANCE_BETWEEN_BOT_CARS'] = os.environ.get('DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS', '2.0') 100 | config['RANDOMIZE_BOT_CAR_LOCATIONS'] = os.environ.get('DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS', 'False') 101 | config['BOT_CAR_SPEED'] = os.environ.get('DR_H2B_BOT_CAR_SPEED', '0.2') 102 | config['PENALTY_SECONDS'] = os.environ.get('DR_H2B_BOT_CAR_PENALTY', '2.0') 103 | 104 | # Head to Model 105 | if config['RACE_TYPE'] == 'HEAD_TO_MODEL': 106 | config['MODEL_S3_PREFIX'].append(os.environ.get('DR_EVAL_OPP_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker')) 107 | config['MODEL_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket')) 108 | config['SIMTRACE_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket')) 109 | config['SIMTRACE_S3_PREFIX'].append(os.environ.get('DR_EVAL_OPP_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker')) 110 | 111 | # Metrics 112 | config['METRICS_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket')) 113 | metrics_prefix = os.environ.get('DR_EVAL_OPP_S3_METRICS_PREFIX', '{}/{}'.format(os.environ.get('DR_EVAL_OPP_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker'),'metrics')) 114 | if metrics_prefix is not None: 115 | config['METRICS_S3_OBJECT_KEY'].append('{}/EvaluationMetrics-{}.json'.format(metrics_prefix, str(round(time.time())))) 116 | else: 117 | config['METRICS_S3_OBJECT_KEY'].append('DeepRacer-Metrics/EvaluationMetrics-{}.json'.format(str(round(time.time())))) 118 | 119 | # MP4 configuration / sav 120 | save_mp4 = str2bool(os.environ.get("DR_EVAL_SAVE_MP4", "False")) 121 | if save_mp4: 122 | config['MP4_S3_BUCKET'].append(os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket')) 123 | config['MP4_S3_OBJECT_PREFIX'].append('{}/{}'.format(os.environ.get('DR_EVAL_OPP_MODEL_PREFIX', 'bucket'),'mp4')) 124 | 125 | # Car and training 126 | 
config['DISPLAY_NAME'].append(os.environ.get('DR_EVAL_OPP_DISPLAY_NAME', 'racer1')) 127 | config['RACER_NAME'].append(os.environ.get('DR_EVAL_OPP_RACER_NAME', 'racer1')) 128 | 129 | body_shell_type = os.environ.get('DR_EVAL_OPP_CAR_BODY_SHELL_TYPE', 'deepracer') 130 | config['BODY_SHELL_TYPE'].append(body_shell_type) 131 | config['VIDEO_JOB_TYPE'] = 'EVALUATION' 132 | config['CAR_COLOR'] = ['Purple', 'Orange'] 133 | config['MODEL_NAME'] = config['DISPLAY_NAME'] 134 | 135 | # S3 Setup / write and upload file 136 | s3_endpoint_url = os.environ.get('DR_LOCAL_S3_ENDPOINT_URL', None) 137 | s3_region = config['AWS_REGION'] 138 | s3_bucket = config['MODEL_S3_BUCKET'][0] 139 | s3_prefix = config['MODEL_S3_PREFIX'][0] 140 | s3_mode = os.environ.get('DR_LOCAL_S3_AUTH_MODE','profile') 141 | if s3_mode == 'profile': 142 | s3_profile = os.environ.get('DR_LOCAL_S3_PROFILE', 'default') 143 | else: # mode is 'role' 144 | s3_profile = None 145 | s3_yaml_name = os.environ.get('DR_LOCAL_S3_EVAL_PARAMS_FILE', 'eval_params.yaml') 146 | yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name)) 147 | 148 | session = boto3.session.Session(profile_name=s3_profile) 149 | s3_client = session.client('s3', region_name=s3_region, endpoint_url=s3_endpoint_url) 150 | 151 | yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name)) 152 | local_yaml_path = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'tmp', 'eval-params-' + str(round(time.time())) + '.yaml')) 153 | 154 | with open(local_yaml_path, 'w') as yaml_file: 155 | yaml.dump(config, yaml_file, default_flow_style=False, default_style='\'', explicit_start=True) 156 | 157 | s3_client.upload_file(Bucket=s3_bucket, Key=yaml_key, Filename=local_yaml_path) 158 | -------------------------------------------------------------------------------- /bin/init.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | trap ctrl_c INT 4 | 5 | function ctrl_c() { 6 | echo 
"Requested to stop." 7 | exit 1 8 | } 9 | 10 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 11 | INSTALL_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." >/dev/null 2>&1 && pwd )" 12 | 13 | OPT_ARCH="gpu" 14 | OPT_CLOUD="" 15 | 16 | while getopts ":m:c:a:" opt; do 17 | case $opt in 18 | a) OPT_ARCH="$OPTARG" 19 | ;; 20 | m) OPT_MOUNT="$OPTARG" 21 | ;; 22 | c) OPT_CLOUD="$OPTARG" 23 | ;; 24 | \?) echo "Invalid option -$OPTARG" >&2 25 | exit 1 26 | ;; 27 | esac 28 | done 29 | 30 | if [[ -z "$OPT_CLOUD" ]]; then 31 | source $SCRIPT_DIR/detect.sh 32 | OPT_CLOUD=$CLOUD_NAME 33 | echo "Detected cloud type to be $CLOUD_NAME" 34 | fi 35 | 36 | # Find CPU Level 37 | CPU_LEVEL="cpu-avx" 38 | 39 | if [[ -f /proc/cpuinfo ]] && [[ "$(cat /proc/cpuinfo | grep avx2 | wc -l)" > 0 ]]; then 40 | CPU_LEVEL="cpu-avx2" 41 | elif [[ "$(type sysctl 2> /dev/null)" ]] && [[ "$(sysctl -n hw.optional.avx2_0)" == 1 ]]; then 42 | CPU_LEVEL="cpu-avx2" 43 | fi 44 | 45 | # Check if Intel (to ensure MKN) 46 | if [[ -f /proc/cpuinfo ]] && [[ "$(cat /proc/cpuinfo | grep GenuineIntel | wc -l)" > 0 ]]; then 47 | CPU_INTEL="true" 48 | elif [[ "$(type sysctl 2> /dev/null)" ]] && [[ "$(sysctl -n machdep.cpu.vendor)" == "GenuineIntel" ]]; then 49 | CPU_INTEL="true" 50 | fi 51 | 52 | # Check GPU 53 | if [[ "${OPT_ARCH}" == "gpu" ]] 54 | then 55 | docker build -t local/gputest - < $INSTALL_DIR/utils/Dockerfile.gpu-detect 56 | GPUS=$(docker run --rm --gpus all local/gputest 2> /dev/null | awk '/Device: ./' | wc -l ) 57 | if [ $? -ne 0 ] || [ $GPUS -eq 0 ] 58 | then 59 | echo "No GPU detected in docker. Using CPU". 
60 | OPT_ARCH="cpu-avx" 61 | fi 62 | fi 63 | 64 | cd $INSTALL_DIR 65 | 66 | # create directory structure for docker volumes 67 | mkdir -p $INSTALL_DIR/data $INSTALL_DIR/data/minio $INSTALL_DIR/data/minio/bucket 68 | mkdir -p $INSTALL_DIR/data/logs $INSTALL_DIR/data/analysis $INSTALL_DIR/tmp 69 | sudo mkdir -p /tmp/sagemaker 70 | sudo chmod -R g+w /tmp/sagemaker 71 | 72 | # create symlink to current user's home .aws directory 73 | # NOTE: AWS cli must be installed for this to work 74 | # https://docs.aws.amazon.com/cli/latest/userguide/install-linux-al2017.html 75 | mkdir -p $(eval echo "~${USER}")/.aws $INSTALL_DIR/docker/volumes/ 76 | ln -sf $(eval echo "~${USER}")/.aws $INSTALL_DIR/docker/volumes/ 77 | 78 | # copy rewardfunctions 79 | mkdir -p $INSTALL_DIR/custom_files 80 | cp $INSTALL_DIR/defaults/hyperparameters.json $INSTALL_DIR/custom_files/ 81 | cp $INSTALL_DIR/defaults/model_metadata.json $INSTALL_DIR/custom_files/ 82 | cp $INSTALL_DIR/defaults/reward_function.py $INSTALL_DIR/custom_files/ 83 | 84 | cp $INSTALL_DIR/defaults/template-system.env $INSTALL_DIR/system.env 85 | cp $INSTALL_DIR/defaults/template-run.env $INSTALL_DIR/run.env 86 | if [[ "${OPT_CLOUD}" == "aws" ]]; then 87 | AWS_EC2_AVAIL_ZONE=`curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone` 88 | AWS_REGION="`echo \"$AWS_EC2_AVAIL_ZONE\" | sed 's/[a-z]$//'`" 89 | sed -i "s//not-defined/g" $INSTALL_DIR/system.env 90 | sed -i "s//default/g" $INSTALL_DIR/system.env 91 | elif [[ "${OPT_CLOUD}" == "azure" ]]; then 92 | AWS_REGION="us-east-1" 93 | sed -i "s//azure/g" $INSTALL_DIR/system.env 94 | sed -i "s//not-defined/g" $INSTALL_DIR/system.env 95 | echo "Please run 'aws configure --profile azure' to set the credentials" 96 | elif [[ "${OPT_CLOUD}" == "remote" ]]; then 97 | AWS_REGION="us-east-1" 98 | sed -i "s//minio/g" $INSTALL_DIR/system.env 99 | sed -i "s//not-defined/g" $INSTALL_DIR/system.env 100 | echo "Please run 'aws configure --profile minio' to set the credentials" 
101 | echo "Please define DR_REMOTE_MINIO_URL in system.env to point to remote minio instance." 102 | else 103 | AWS_REGION="us-east-1" 104 | sed -i "s//minio/g" $INSTALL_DIR/system.env 105 | sed -i "s//not-defined/g" $INSTALL_DIR/system.env 106 | echo "Please run 'aws configure --profile minio' to set the credentials" 107 | fi 108 | sed -i "s//to-be-defined/g" $INSTALL_DIR/system.env 109 | sed -i "s//$OPT_CLOUD/g" $INSTALL_DIR/system.env 110 | sed -i "s//$AWS_REGION/g" $INSTALL_DIR/system.env 111 | 112 | 113 | if [[ "${OPT_ARCH}" == "gpu" ]]; then 114 | SAGEMAKER_TAG="gpu" 115 | elif [[ -n "${CPU_INTEL}" ]]; then 116 | SAGEMAKER_TAG="cpu" 117 | else 118 | SAGEMAKER_TAG="cpu" 119 | fi 120 | 121 | #set proxys if required 122 | for arg in "$@"; 123 | do 124 | IFS='=' read -ra part <<< "$arg" 125 | if [ "${part[0]}" == "--http_proxy" ] || [ "${part[0]}" == "--https_proxy" ] || [ "${part[0]}" == "--no_proxy" ]; then 126 | var=${part[0]:2}=${part[1]} 127 | args="${args} --build-arg ${var}" 128 | fi 129 | done 130 | 131 | # Download docker images. Change to build statements if locally built images are desired. 
# Resolve container versions from defaults/dependencies.json and record them
# in system.env, then pre-pull the matching images.
COACH_VERSION=$(jq -r '.containers.rl_coach | select (.!=null)' $INSTALL_DIR/defaults/dependencies.json)
# NOTE(review): the sed pattern below is empty; the template placeholder token
# appears to have been lost from this copy. Verify against template-system.env.
sed -i "s//$COACH_VERSION/g" $INSTALL_DIR/system.env

ROBOMAKER_VERSION=$(jq -r '.containers.robomaker | select (.!=null)' $INSTALL_DIR/defaults/dependencies.json)
# Quote the operand: an unquoted [ -n $VAR ] collapses to [ -n ] and is always
# true, so the fallback branch was unreachable.
if [ -n "$ROBOMAKER_VERSION" ]; then
  ROBOMAKER_VERSION=$ROBOMAKER_VERSION-$CPU_LEVEL
else
  ROBOMAKER_VERSION=$CPU_LEVEL
fi
# NOTE(review): empty sed pattern — see note above.
sed -i "s//$ROBOMAKER_VERSION/g" $INSTALL_DIR/system.env

SAGEMAKER_VERSION=$(jq -r '.containers.sagemaker | select (.!=null)' $INSTALL_DIR/defaults/dependencies.json)
# Same quoting fix as for ROBOMAKER_VERSION above.
if [ -n "$SAGEMAKER_VERSION" ]; then
  SAGEMAKER_VERSION=$SAGEMAKER_VERSION-$SAGEMAKER_TAG
else
  SAGEMAKER_VERSION=$SAGEMAKER_TAG
fi
# NOTE(review): empty sed pattern — see note above.
sed -i "s//$SAGEMAKER_VERSION/g" $INSTALL_DIR/system.env

docker pull awsdeepracercommunity/deepracer-rlcoach:$COACH_VERSION
docker pull awsdeepracercommunity/deepracer-robomaker:$ROBOMAKER_VERSION
docker pull awsdeepracercommunity/deepracer-sagemaker:$SAGEMAKER_VERSION

# Create the sagemaker-local network; initialise a single-node swarm and label
# it for both Sagemaker and Robomaker placement.
SAGEMAKER_NW='sagemaker-local'
docker swarm init
SWARM_NODE=$(docker node inspect self | jq .[0].ID -r)
docker node update --label-add Sagemaker=true $SWARM_NODE
docker node update --label-add Robomaker=true $SWARM_NODE
# Recreate the attachable overlay network if it already exists so a stale
# definition never lingers.
docker network ls | grep -q $SAGEMAKER_NW
if [ $? -ne 0 ]
then
  docker network create $SAGEMAKER_NW -d overlay --attachable --scope swarm
else
  docker network rm $SAGEMAKER_NW
  docker network create $SAGEMAKER_NW -d overlay --attachable --scope swarm
fi

# ensure our variables are set on startup - not for local setup.
171 | if [[ "${OPT_CLOUD}" != "local" ]]; then 172 | NUM_IN_PROFILE=$(cat $HOME/.profile | grep "$INSTALL_DIR/bin/activate.sh" | wc -l) 173 | if [ "$NUM_IN_PROFILE" -eq 0 ]; then 174 | echo "source $INSTALL_DIR/bin/activate.sh" >> $HOME/.profile 175 | fi 176 | fi 177 | 178 | # mark as done 179 | date | tee $INSTALL_DIR/DONE 180 | 181 | ## Optional auturun feature 182 | # if using automation scripts to auto configure and run 183 | # you must pass s3_training_location.txt to this instance in order for this to work 184 | if [[ -f "$INSTALL_DIR/autorun.s3url" ]] 185 | then 186 | ## read in first line. first line always assumed to be training location regardless what else is in file 187 | TRAINING_LOC=$(awk 'NR==1 {print; exit}' $INSTALL_DIR/autorun.s3url) 188 | 189 | #get bucket name 190 | TRAINING_BUCKET=${TRAINING_LOC%%/*} 191 | #get prefix. minor exception handling in case there is no prefix and a root bucket is passed 192 | if [[ "$TRAINING_LOC" == *"/"* ]] 193 | then 194 | TRAINING_PREFIX=${TRAINING_LOC#*/} 195 | else 196 | TRAINING_PREFIX="" 197 | fi 198 | 199 | ##check if custom autorun script exists in s3 training bucket. 
If not, use default in this repo 200 | aws s3api head-object --bucket $TRAINING_BUCKET --key $TRAINING_PREFIX/autorun.sh || not_exist=true 201 | if [ $not_exist ]; then 202 | echo "custom file does not exist, using local copy" 203 | else 204 | echo "custom script does exist, use it" 205 | aws s3 cp s3://$TRAINING_LOC/autorun.sh $INSTALL_DIR/bin/autorun.sh 206 | fi 207 | chmod +x $INSTALL_DIR/bin/autorun.sh 208 | bash -c "source $INSTALL_DIR/bin/autorun.sh" 209 | fi 210 | 211 | -------------------------------------------------------------------------------- /bin/activate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | verlte() { 4 | [ "$1" = "`echo -e "$1\n$2" | sort -V | head -n1`" ] 5 | } 6 | 7 | function dr-update-env { 8 | 9 | if [[ -f "$DIR/system.env" ]] 10 | then 11 | LINES=$(grep -v '^#' $DIR/system.env) 12 | for l in $LINES; do 13 | env_var=$(echo $l | cut -f1 -d\=) 14 | env_val=$(echo $l | cut -f2 -d\=) 15 | eval "export $env_var=$env_val" 16 | done 17 | else 18 | echo "File system.env does not exist." 19 | return 1 20 | fi 21 | 22 | if [[ -f "$DR_CONFIG" ]] 23 | then 24 | LINES=$(grep -v '^#' $DR_CONFIG) 25 | for l in $LINES; do 26 | env_var=$(echo $l | cut -f1 -d\=) 27 | env_val=$(echo $l | cut -f2 -d\=) 28 | eval "export $env_var=$env_val" 29 | done 30 | else 31 | echo "File run.env does not exist." 
32 | return 1 33 | fi 34 | 35 | if [[ -z "${DR_RUN_ID}" ]]; then 36 | export DR_RUN_ID=0 37 | fi 38 | 39 | if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; 40 | then 41 | export DR_ROBOMAKER_TRAIN_PORT=$(expr 8080 + $DR_RUN_ID) 42 | export DR_ROBOMAKER_EVAL_PORT=$(expr 8180 + $DR_RUN_ID) 43 | export DR_ROBOMAKER_GUI_PORT=$(expr 5900 + $DR_RUN_ID) 44 | else 45 | export DR_ROBOMAKER_TRAIN_PORT="8080-8089" 46 | export DR_ROBOMAKER_EVAL_PORT="8080-8089" 47 | export DR_ROBOMAKER_GUI_PORT="5901-5920" 48 | fi 49 | 50 | } 51 | 52 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 53 | DIR="$( dirname $SCRIPT_DIR )" 54 | export DR_DIR=$DIR 55 | 56 | if [[ -f "$1" ]]; 57 | then 58 | export DR_CONFIG=$(readlink -f $1) 59 | dr-update-env 60 | elif [[ -f "$DIR/run.env" ]]; 61 | then 62 | export DR_CONFIG="$DIR/run.env" 63 | dr-update-env 64 | else 65 | echo "No configuration file." 66 | return 1 67 | fi 68 | 69 | # Check if Docker runs -- if not, then start it. 70 | if [[ "$(type service 2> /dev/null)" ]]; then 71 | service docker status > /dev/null || sudo service docker start 72 | fi 73 | 74 | # Check if we will use Docker Swarm or Docker Compose 75 | # If not defined then use Swarm 76 | if [[ -z "${DR_DOCKER_STYLE}" ]]; then 77 | export DR_DOCKER_STYLE="swarm" 78 | fi 79 | 80 | if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; 81 | then 82 | export DR_DOCKER_FILE_SEP="-c" 83 | SWARM_NODE=$(docker node inspect self | jq .[0].ID -r) 84 | SWARM_NODE_UPDATE=$(docker node update --label-add Sagemaker=true $SWARM_NODE) 85 | else 86 | export DR_DOCKER_FILE_SEP="-f" 87 | fi 88 | 89 | # Prepare the docker compose files depending on parameters 90 | if [[ "${DR_CLOUD,,}" == "azure" ]]; 91 | then 92 | export DR_LOCAL_S3_ENDPOINT_URL="http://localhost:9000" 93 | export DR_MINIO_URL="http://minio:9000" 94 | DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_S3_ENDPOINT_URL" 95 | DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP 
$DIR/docker/docker-compose-training.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" 96 | DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" 97 | DR_MINIO_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-azure.yml" 98 | elif [[ "${DR_CLOUD,,}" == "local" ]]; 99 | then 100 | export DR_LOCAL_S3_ENDPOINT_URL="http://localhost:9000" 101 | export DR_MINIO_URL="http://minio:9000" 102 | DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_S3_ENDPOINT_URL" 103 | DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" 104 | DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" 105 | DR_MINIO_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-local.yml" 106 | elif [[ "${DR_CLOUD,,}" == "remote" ]]; 107 | then 108 | export DR_LOCAL_S3_ENDPOINT_URL="$DR_REMOTE_MINIO_URL" 109 | export DR_MINIO_URL="$DR_REMOTE_MINIO_URL" 110 | DR_LOCAL_PROFILE_ENDPOINT_URL="--profile $DR_LOCAL_S3_PROFILE --endpoint-url $DR_LOCAL_S3_ENDPOINT_URL" 111 | DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" 112 | DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-endpoint.yml" 113 | DR_MINIO_COMPOSE_FILE="" 114 | else 115 | DR_LOCAL_PROFILE_ENDPOINT_URL="" 116 | DR_TRAIN_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training.yml" 117 | DR_EVAL_COMPOSE_FILE="$DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval.yml" 118 | fi 119 | 120 | # Prevent docker swarms to restart 121 | if [[ "${DR_HOST_X,,}" == "true" ]]; 122 | then 123 | DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP 
$DIR/docker/docker-compose-local-xorg.yml"
    DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-local-xorg.yml"
fi

# Add the swarm override files when running in Docker Swarm mode
# (among other things these prevent the swarm services from auto-restarting).
if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]];
then
    DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-training-swarm.yml"
    DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-eval-swarm.yml"
fi

# Enable logs in CloudWatch
if [[ "${DR_CLOUD_WATCH_ENABLE,,}" == "true" ]]; then
    DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-cwlog.yml"
    DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-cwlog.yml"
fi

## Check if we have an AWS IAM assumed role, or if we need to set specific credentials.
# With an assumed role on AWS the containers rely on instance credentials;
# otherwise the profile's keys are exported and injected via docker-compose-keys.yml.
if [ "${DR_CLOUD,,}" == "aws" ] && [ $(aws --output json sts get-caller-identity 2> /dev/null | jq '.Arn' | awk /assumed-role/ | wc -l ) -gt 0 ];
then
    export DR_LOCAL_S3_AUTH_MODE="role"
else
    export DR_LOCAL_ACCESS_KEY_ID=$(aws --profile $DR_LOCAL_S3_PROFILE configure get aws_access_key_id | xargs)
    export DR_LOCAL_SECRET_ACCESS_KEY=$(aws --profile $DR_LOCAL_S3_PROFILE configure get aws_secret_access_key | xargs)
    DR_TRAIN_COMPOSE_FILE="$DR_TRAIN_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-keys.yml"
    DR_EVAL_COMPOSE_FILE="$DR_EVAL_COMPOSE_FILE $DR_DOCKER_FILE_SEP $DIR/docker/docker-compose-keys.yml"
    export DR_UPLOAD_PROFILE="--profile $DR_UPLOAD_S3_PROFILE"
    export DR_LOCAL_S3_AUTH_MODE="profile"
fi

export DR_TRAIN_COMPOSE_FILE
export DR_EVAL_COMPOSE_FILE
export DR_LOCAL_PROFILE_ENDPOINT_URL

# When a minio compose file is defined (local/azure), run the S3 stack with the
# invoking user's uid/gid so files in the bucket directory stay user-owned.
if [[ -n "${DR_MINIO_COMPOSE_FILE}" ]]; then
    export MINIO_UID=$(id -u)
    export MINIO_USERNAME=$(id -u -n)
    export
MINIO_GID=$(id -g) 161 | export MINIO_GROUPNAME=$(id -g -n) 162 | if [[ "${DR_DOCKER_STYLE,,}" == "swarm" ]]; 163 | then 164 | docker stack deploy $DR_MINIO_COMPOSE_FILE s3 165 | else 166 | docker-compose $DR_MINIO_COMPOSE_FILE -p s3 --log-level ERROR up -d 167 | fi 168 | 169 | fi 170 | 171 | ## Version check 172 | DEPENDENCY_VERSION=$(jq -r '.master_version | select (.!=null)' $DIR/defaults/dependencies.json) 173 | 174 | SAGEMAKER_VER=$(docker inspect awsdeepracercommunity/deepracer-sagemaker:$DR_SAGEMAKER_IMAGE 2> /dev/null | jq -r .[].Config.Labels.version) 175 | if [ -z "$SAGEMAKER_VER" ]; then SAGEMAKER_VER=$DR_SAGEMAKER_IMAGE; fi 176 | if ! verlte $DEPENDENCY_VERSION $SAGEMAKER_VER; then 177 | echo "WARNING: Incompatible version of Deepracer Sagemaker. Expected >$DEPENDENCY_VERSION. Got $SAGEMAKER_VER." 178 | fi 179 | 180 | ROBOMAKER_VER=$(docker inspect awsdeepracercommunity/deepracer-robomaker:$DR_ROBOMAKER_IMAGE 2> /dev/null | jq -r .[].Config.Labels.version ) 181 | if [ -z "$ROBOMAKER_VER" ]; then ROBOMAKER_VER=$DR_ROBOMAKER_IMAGE; fi 182 | if ! verlte $DEPENDENCY_VERSION $ROBOMAKER_VER; then 183 | echo "WARNING: Incompatible version of Deepracer Robomaker. Expected >$DEPENDENCY_VERSION. Got $ROBOMAKER_VER." 184 | fi 185 | 186 | COACH_VER=$(docker inspect awsdeepracercommunity/deepracer-rlcoach:$DR_COACH_IMAGE 2> /dev/null | jq -r .[].Config.Labels.version) 187 | if [ -z "$COACH_VER" ]; then COACH_VER=$DR_COACH_IMAGE; fi 188 | if ! verlte $DEPENDENCY_VERSION $COACH_VER; then 189 | echo "WARNING: Incompatible version of Deepracer-for-Cloud Coach. Expected >$DEPENDENCY_VERSION. Got $COACH_VER." 
190 | fi 191 | 192 | source $SCRIPT_DIR/scripts_wrapper.sh 193 | 194 | function dr-update { 195 | dr-update-env 196 | } 197 | 198 | function dr-reload { 199 | source $DIR/bin/activate.sh $DR_CONFIG 200 | } 201 | -------------------------------------------------------------------------------- /scripts/upload/upload-model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage(){ 4 | echo "Usage: $0 [-f] [-w] [-d] [-b] [-c ] [-p ]" 5 | echo " -f Force upload. No confirmation question." 6 | echo " -w Wipes the target AWS DeepRacer model structure before upload." 7 | echo " -d Dry-Run mode. Does not perform any write or delete operatios on target." 8 | echo " -b Uploads best checkpoint. Default is last checkpoint." 9 | echo " -p model Uploads model in specified S3 prefix." 10 | echo " -i Import model with the upload name" 11 | echo " -I name Import model with a specific name" 12 | echo " -1 Increment upload name with 1 (dr-increment-upload-model)" 13 | exit 1 14 | } 15 | 16 | trap ctrl_c INT 17 | 18 | function ctrl_c() { 19 | echo "Requested to stop." 20 | exit 1 21 | } 22 | 23 | while getopts ":fwdhbp:c:1iI:" opt; do 24 | case $opt in 25 | b) OPT_CHECKPOINT="Best" 26 | ;; 27 | c) OPT_CHECKPOINT_NUM="$OPTARG" 28 | ;; 29 | f) OPT_FORCE="-f" 30 | ;; 31 | d) OPT_DRYRUN="--dryrun" 32 | ;; 33 | p) OPT_PREFIX="$OPTARG" 34 | ;; 35 | w) OPT_WIPE="--delete" 36 | ;; 37 | i) OPT_IMPORT="$DR_UPLOAD_S3_PREFIX" 38 | ;; 39 | I) OPT_IMPORT="$OPTARG" 40 | ;; 41 | 1) OPT_INCREMENT="Yes" 42 | ;; 43 | h) usage 44 | ;; 45 | \?) 
echo "Invalid option -$OPTARG" >&2 46 | usage 47 | ;; 48 | esac 49 | done 50 | 51 | if [[ -n "${OPT_DRYRUN}" ]]; 52 | then 53 | echo "*** DRYRUN MODE ***" 54 | fi 55 | 56 | if [[ -n "${OPT_INCREMENT}" ]]; 57 | then 58 | source $DR_DIR/scripts/upload/increment.sh ${OPT_FORCE} 59 | OPT_IMPORT="$DR_UPLOAD_S3_PREFIX" 60 | fi 61 | 62 | export TARGET_S3_BUCKET=${DR_UPLOAD_S3_BUCKET} 63 | export TARGET_S3_PREFIX=${DR_UPLOAD_S3_PREFIX} 64 | 65 | if [[ -z "${DR_UPLOAD_S3_BUCKET}" ]]; 66 | then 67 | echo "No upload bucket defined. Exiting." 68 | exit 1 69 | fi 70 | 71 | if [[ -z "${DR_UPLOAD_S3_PREFIX}" ]]; 72 | then 73 | echo "No upload prefix defined. Exiting." 74 | exit 1 75 | fi 76 | 77 | SOURCE_S3_BUCKET=${DR_LOCAL_S3_BUCKET} 78 | if [[ -n "${OPT_PREFIX}" ]]; 79 | then 80 | SOURCE_S3_MODEL_PREFIX=${OPT_PREFIX} 81 | else 82 | SOURCE_S3_MODEL_PREFIX=${DR_LOCAL_S3_MODEL_PREFIX} 83 | fi 84 | SOURCE_S3_CONFIG=${DR_LOCAL_S3_CUSTOM_FILES_PREFIX} 85 | SOURCE_S3_REWARD=${DR_LOCAL_S3_REWARD_KEY} 86 | SOURCE_S3_METRICS="${DR_LOCAL_S3_METRICS_PREFIX}/TrainingMetrics.json" 87 | 88 | export WORK_DIR=${DR_DIR}/tmp/upload/ 89 | mkdir -p ${WORK_DIR} && rm -rf ${WORK_DIR} && mkdir -p ${WORK_DIR}model ${WORK_DIR}ip 90 | 91 | # Upload information on model. 
92 | TARGET_PARAMS_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/training_params.yaml" 93 | TARGET_REWARD_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/reward_function.py" 94 | TARGET_HYPERPARAM_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/ip/hyperparameters.json" 95 | TARGET_METRICS_FILE_S3_KEY="s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/TrainingMetrics.json" 96 | 97 | # Check if metadata-files are available 98 | REWARD_IN_ROOT=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 ls s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/reward_function.py 2> /dev/null | wc -l) 99 | if [ "$REWARD_IN_ROOT" -ne 0 ]; 100 | then 101 | REWARD_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/reward_function.py ${WORK_DIR} --no-progress | awk '/reward/ {print $4}'| xargs readlink -f 2> /dev/null) 102 | else 103 | echo "Looking for Reward Function in s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_REWARD}" 104 | REWARD_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_REWARD} ${WORK_DIR} --no-progress | awk '/reward/ {print $4}'| xargs readlink -f 2> /dev/null) 105 | fi 106 | 107 | METADATA_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/model_metadata.json ${WORK_DIR} --no-progress | awk '/model_metadata.json$/ {print $4}'| xargs readlink -f 2> /dev/null) 108 | HYPERPARAM_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/ip/hyperparameters.json ${WORK_DIR} --no-progress | awk '/hyperparameters.json$/ {print $4}'| xargs readlink -f 2> /dev/null) 109 | METRICS_FILE=$(aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_METRICS} ${WORK_DIR} --no-progress | awk '/metric/ {print $4}'| xargs readlink -f 2> /dev/null) 110 | 111 | if [ -n "$METADATA_FILE" ] && [ -n "$REWARD_FILE" ] && [ -n "$HYPERPARAM_FILE" ] && [ -n "$METRICS_FILE" ]; 112 | then 113 | 
echo "All meta-data files found. Looking for checkpoint." 114 | else 115 | echo "Meta-data files are not found. Exiting." 116 | exit 1 117 | fi 118 | 119 | # Download checkpoint file 120 | echo "Looking for model to upload from s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/" 121 | CHECKPOINT_INDEX=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 cp s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/deepracer_checkpoints.json ${WORK_DIR}model/ --no-progress | awk '{print $4}' | xargs readlink -f 2> /dev/null) 122 | 123 | if [ -z "$CHECKPOINT_INDEX" ]; then 124 | echo "No checkpoint file available at s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model. Exiting." 125 | exit 1 126 | fi 127 | 128 | if [ -n "$OPT_CHECKPOINT_NUM" ]; then 129 | echo "Checking for checkpoint $OPT_CHECKPOINT_NUM" 130 | export OPT_CHECKPOINT_NUM 131 | CHECKPOINT_FILE=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 ls s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/ | perl -ne'print "$1\n" if /.*\s($ENV{OPT_CHECKPOINT_NUM}_Step-[0-9]{1,7}\.ckpt)\.index/') 132 | CHECKPOINT=`echo $CHECKPOINT_FILE | cut -f1 -d_` 133 | TIMESTAMP=`date +%s` 134 | CHECKPOINT_JSON_PART=$(jq -n '{ checkpoint: { name: $name, time_stamp: $timestamp | tonumber, avg_comp_pct: 50.0 } }' --arg name $CHECKPOINT_FILE --arg timestamp $TIMESTAMP) 135 | CHECKPOINT_JSON=$(echo $CHECKPOINT_JSON_PART | jq '. | {last_checkpoint: .checkpoint, best_checkpoint: .checkpoint}') 136 | elif [ -z "$OPT_CHECKPOINT" ]; then 137 | echo "Checking for latest tested checkpoint" 138 | CHECKPOINT_FILE=`jq -r .last_checkpoint.name < $CHECKPOINT_INDEX` 139 | CHECKPOINT=`echo $CHECKPOINT_FILE | cut -f1 -d_` 140 | CHECKPOINT_JSON=$(jq '. 
| {last_checkpoint: .last_checkpoint, best_checkpoint: .last_checkpoint}' < $CHECKPOINT_INDEX ) 141 | echo "Latest checkpoint = $CHECKPOINT" 142 | else 143 | echo "Checking for best checkpoint" 144 | CHECKPOINT_FILE=`jq -r .best_checkpoint.name < $CHECKPOINT_INDEX` 145 | CHECKPOINT=`echo $CHECKPOINT_FILE | cut -f1 -d_` 146 | CHECKPOINT_JSON=$(jq '. | {last_checkpoint: .best_checkpoint, best_checkpoint: .best_checkpoint}' < $CHECKPOINT_INDEX ) 147 | echo "Best checkpoint: $CHECKPOINT" 148 | fi 149 | 150 | # Find checkpoint & model files - download 151 | if [ -n "$CHECKPOINT" ]; then 152 | CHECKPOINT_MODEL_FILES=$(aws ${DR_LOCAL_PROFILE_ENDPOINT_URL} s3 sync s3://${SOURCE_S3_BUCKET}/${SOURCE_S3_MODEL_PREFIX}/model/ ${WORK_DIR}model/ --exclude "*" --include "${CHECKPOINT}*" --include "model_${CHECKPOINT}.pb" --include "deepracer_checkpoints.json" --no-progress | awk '{print $4}' | xargs readlink -f 2> /dev/null) 153 | CHECKPOINT_MODEL_FILE_COUNT=$(echo $CHECKPOINT_MODEL_FILES | wc -l) 154 | if [ "$CHECKPOINT_MODEL_FILE_COUNT" -eq 0 ]; then 155 | echo "No model files found. Files possibly deleted. Try again." 156 | exit 1 157 | fi 158 | cp ${METADATA_FILE} ${WORK_DIR}model/ 159 | # echo "model_checkpoint_path: \"${CHECKPOINT_FILE}\"" | tee ${WORK_DIR}model/checkpoint 160 | echo ${CHECKPOINT_FILE} | tee ${WORK_DIR}model/.coach_checkpoint > /dev/null 161 | else 162 | echo "Checkpoint not found. Exiting." 163 | exit 1 164 | fi 165 | 166 | # Create Training Params Yaml. 167 | PARAMS_FILE=$(python3 $DR_DIR/scripts/upload/prepare-config.py) 168 | 169 | # Upload files 170 | if [[ -z "${OPT_FORCE}" ]]; 171 | then 172 | echo "Ready to upload model ${SOURCE_S3_MODEL_PREFIX} to s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/" 173 | read -r -p "Are you sure? [y/N] " response 174 | if [[ ! "$response" =~ ^([yY][eE][sS]|[yY])$ ]] 175 | then 176 | echo "Aborting." 
177 | exit 1 178 | fi 179 | fi 180 | 181 | # echo "" > ${WORK_DIR}model/.ready 182 | cd ${WORK_DIR} 183 | echo ${CHECKPOINT_JSON} > ${WORK_DIR}model/deepracer_checkpoints.json 184 | aws ${DR_UPLOAD_PROFILE} s3 sync ${WORK_DIR}model/ s3://${TARGET_S3_BUCKET}/${TARGET_S3_PREFIX}/model/ ${OPT_DRYRUN} ${OPT_WIPE} 185 | aws ${DR_UPLOAD_PROFILE} s3 cp ${REWARD_FILE} ${TARGET_REWARD_FILE_S3_KEY} ${OPT_DRYRUN} 186 | aws ${DR_UPLOAD_PROFILE} s3 cp ${METRICS_FILE} ${TARGET_METRICS_FILE_S3_KEY} ${OPT_DRYRUN} 187 | aws ${DR_UPLOAD_PROFILE} s3 cp ${PARAMS_FILE} ${TARGET_PARAMS_FILE_S3_KEY} ${OPT_DRYRUN} 188 | aws ${DR_UPLOAD_PROFILE} s3 cp ${HYPERPARAM_FILE} ${TARGET_HYPERPARAM_FILE_S3_KEY} ${OPT_DRYRUN} 189 | 190 | # After upload trigger the import 191 | if [[ -n "${OPT_IMPORT}" ]]; 192 | then 193 | $DR_DIR/scripts/upload/import-model.py "${DR_UPLOAD_S3_PROFILE}" "${DR_UPLOAD_S3_ROLE}" "${TARGET_S3_BUCKET}" "${TARGET_S3_PREFIX}" "${OPT_IMPORT}" 194 | fi -------------------------------------------------------------------------------- /docs/installation.md: -------------------------------------------------------------------------------- 1 | # Installing Deepracer-for-Cloud 2 | 3 | ## Requirements 4 | 5 | Depending on your needs as well as specific needs of the cloud platform you can configure your VM to your liking. Both CPU-only as well as GPU systems are supported. 6 | 7 | **AWS**: 8 | 9 | * EC2 instance of type G3, G4, P2 or P3 - recommendation is g4dn.2xlarge - for GPU enabled training. C5 or M6 types - recommendation is c5.2xlarge - for CPU training. 10 | * Ubuntu 20.04 11 | * Minimum 30 GB, preferred 40 GB of OS disk. 12 | * Ephemeral Drive connected 13 | * Minimum of 8 GB GPU-RAM if running with GPU. 14 | * Recommended at least 6 VCPUs 15 | * S3 bucket. Preferrably in same region as EC2 instance. 
16 | 17 | **Azure**: 18 | 19 | * N-Series VM that comes with NVIDIA Graphics Adapter - recommendation is NC6_Standard 20 | * Ubuntu 20.04 21 | * Standard 30 GB OS drive is sufficient to get started. 22 | * Recommended to add an additional 32 GB data disk if you want to use the Log Analysis container. 23 | * Minimum 8 GB GPU-RAM 24 | * Recommended at least 6 VCPUs 25 | * Storage Account with one Blob container configured for Access Key authentication. 26 | 27 | **Local**: 28 | 29 | * A modern, comparatively powerful, Intel based system. 30 | * Ubuntu 20.04, other Linux-distros likely to work. 31 | * 4 core-CPU, equivalent to 8 vCPUs; the more the better. 32 | * NVIDIA Graphics adapter with minimum 8 GB RAM for Sagemaker to run GPU. Robomaker enabled GPU instances need ~1 GB each. 33 | * System RAM + GPU RAM should be at least 32 GB. 34 | * Running DRfC Ubuntu 20.04 on Windows using Windows Subsystem for Linux 2 is possible. See [Installing on Windows](windows.md) 35 | 36 | ## Installation 37 | 38 | The package comes with preparation and setup scripts that would allow a turn-key setup for a fresh virtual machine. 39 | 40 | ```shell 41 | git clone https://github.com/aws-deepracer-community/deepracer-for-cloud.git 42 | ``` 43 | 44 | **For cloud setup** execute: 45 | 46 | ```shell 47 | cd deepracer-for-cloud && ./bin/prepare.sh 48 | ``` 49 | 50 | This will prepare the VM by partitioning additional drives as well as installing all prerequisites. After a reboot it will continue to run `./bin/init.sh` setting up the full repository and downloading the core Docker images. Depending on your environment this may take up to 30 minutes. The scripts will create a file `DONE` once completed. 51 | 52 | The installation script will adapt `.profile` to ensure that all settings are applied on login. Otherwise run the activation with `source bin/activate.sh`.
53 | 54 | **For local install** it is recommended *not* to run the `bin/prepare.sh` script; it might do more changes than what you want. Rather ensure that all prerequisites are set up and run `bin/init.sh` directly. 55 | 56 | The Init Script takes a few parameters: 57 | 58 | | Variable | Description | 59 | |----------|-------------| 60 | | `-c ` | Sets the cloud version to be configured, automatically updates the `DR_CLOUD` parameter in `system.env`. Options are `azure`, `aws` or `local`. Default is `local` | 61 | | `-a ` | Sets the architecture to be configured. Either `cpu` or `gpu`. Default is `gpu`. | 62 | 63 | *TODO: Document how to configure via cloud-init.* 64 | 65 | ## Environment Setup 66 | 67 | The initialization script will attempt to auto-detect your environment (`Azure`, `AWS` or `Local`), and store the outcome in the `DR_CLOUD` parameter in `system.env`. You can also pass in a `-c ` parameter to override it, e.g. if you want to run the minio-based `local` mode in the cloud. 68 | 69 | The main difference between the mode is based on authentication mechanisms and type of storage being configured. The next chapters will review each type of environment on its own. 70 | 71 | ### AWS 72 | 73 | In AWS it is possible to set up authentication to S3 in two ways: Integrated sign-on using [IAM Roles](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html) or using access keys. 74 | 75 | #### IAM Role 76 | 77 | To use IAM Roles: 78 | 79 | * An empty S3 bucket in the same region as the EC2 instance. 80 | * An IAM Role that has permissions to: 81 | * Access both the *new* S3 bucket as well as the DeepRacer bucket. 82 | * AmazonVPCReadOnlyAccess 83 | * AmazonKinesisVideoStreamsFullAccess if you want to stream to Kinesis 84 | * CloudWatch 85 | * An EC2 instance with the defined IAM Role assigned. 
86 | * Configure `system.env` as follows: 87 | * `DR_LOCAL_S3_PROFILE=default` 88 | * `DR_LOCAL_S3_BUCKET=` 89 | * `DR_UPLOAD_S3_PROFILE=default` 90 | * `DR_UPLOAD_S3_BUCKET=` 91 | * Run `dr-update` for configuration to take effect. 92 | 93 | #### Manual setup 94 | 95 | For access with IAM user: 96 | 97 | * An empty S3 bucket in the same region as the EC2 instance. 98 | * A real AWS IAM user set up with access keys: 99 | * User should have permissions to access the *new* bucket as well as the dedicated DeepRacer S3 bucket. 100 | * Use `aws configure` to configure this into the default profile. 101 | * Configure `system.env` as follows: 102 | * `DR_LOCAL_S3_PROFILE=default` 103 | * `DR_LOCAL_S3_BUCKET=` 104 | * `DR_UPLOAD_S3_PROFILE=default` 105 | * `DR_UPLOAD_S3_BUCKET=` 106 | * Run `dr-update` for configuration to take effect. 107 | 108 | ### Azure 109 | 110 | In Azure mode the script-set requires the following: 111 | 112 | * A storage account with a blob container set up with access keys: 113 | * Use `aws configure --profile ` to configure this into a specific profile. 114 | * `` can be defined by the user, but do not use `default`. 115 | * Access Key ID is the Storage Account name. 116 | * Secret Access Key is the Access Key for the Storage Account. 117 | * The blob container is equivalent to the S3 bucket. 118 | * A real AWS IAM user configured with `aws configure` to enable upload of models into AWS DeepRacer. 119 | * Configure `system.env` as follows: 120 | * `DR_LOCAL_S3_PROFILE=default` 121 | * `DR_LOCAL_S3_BUCKET=` 122 | * `DR_UPLOAD_S3_PROFILE=default` 123 | * `DR_UPLOAD_S3_BUCKET=` 124 | * Run `dr-update` for configuration to take effect. 125 | 126 | As Azure does not natively support S3 a [minio](https://min.io/product/overview) proxy is set up on port 9000 to allow the containers to communicate and store models. 
127 | 128 | If you want to use awscli (`aws`) to manually move files then use `aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 ...`, as this will set both `--profile` and `--endpoint-url` parameters to match your configuration. 129 | 130 | ### Local 131 | 132 | Local mode runs a minio server that hosts the data in the `docker/volumes` directory. It is otherwise command-compatible with the Azure setup; as the data is accessible via Minio and not via native S3. 133 | 134 | In Local mode the script-set requires the following: 135 | 136 | * Configure the Minio credentials with `aws configure --profile minio`. The default configuration will use the `minio` profile to configure MINIO. You can choose any username or password, but username needs to be at least length 3, and password at least length 8. 137 | * A real AWS IAM user configured with `aws configure` to enable upload of models into AWS DeepRacer. 138 | * Configure `system.env` as follows: 139 | * `DR_LOCAL_S3_PROFILE=default` 140 | * `DR_LOCAL_S3_BUCKET=` 141 | * `DR_UPLOAD_S3_PROFILE=default` 142 | * `DR_UPLOAD_S3_BUCKET=` 143 | * Run `dr-update` for configuration to take effect. 144 | 145 | ## First Run 146 | 147 | For the first run the following final steps are needed. This creates a training run with all default values in 148 | 149 | * Define your custom files in `custom_files/` - samples can be found in `defaults` which you must copy over: 150 | * `hyperparameters.json` - definining the training hyperparameters 151 | * `model_metadata.json` - defining the action space and sensors 152 | * `reward_function.py` - defining the reward function 153 | * Upload the files into the bucket with `dr-upload-custom-files`. This will also start minio if required. 154 | * Start training with `dr-start-training` 155 | 156 | After a while you will see the sagemaker logs on the screen. 
157 | 158 | ## Troubleshooting 159 | 160 | Here are some hints for troubleshooting specific issues you may encounter 161 | 162 | ### Local training troubleshooting 163 | 164 | | Issue | Troubleshooting hint | 165 | |------------- | ---------------------| 166 | Get messages like "Sagemaker is not running" | Run `docker ps -a` to see if the containers are running or if they stopped due to some errors 167 | Check docker errors for specific container | Run `docker logs -f ` 168 | Get message "Error response from daemon: could not choose an IP address to advertise since this system has multiple addresses on interface ..." when running `./bin/init.sh -c local -a cpu` | It means you have multiple IP addresses and you need to specify one within `./bin/init.sh`.
If you don't care which one to use, you can get the first one by running ```ifconfig \| grep $(route \| awk '/^default/ {print $8}') -a1 \| grep -o -P '(?<=inet ).*(?= netmask)'```.
Edit `./bin/init.sh` and locate line `docker swarm init` and change it to `docker swarm init --advertise-addr `.
Rerun `./bin/init.sh -c local -a cpu` 169 | -------------------------------------------------------------------------------- /bin/scripts_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function dr-upload-custom-files { 4 | eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) 5 | echo "Uploading files to $CUSTOM_TARGET" 6 | aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $DR_DIR/custom_files/ $CUSTOM_TARGET 7 | } 8 | 9 | function dr-upload-model { 10 | dr-update-env && ${DR_DIR}/scripts/upload/upload-model.sh "$@" 11 | } 12 | 13 | function dr-download-model { 14 | dr-update-env && ${DR_DIR}/scripts/upload/download-model.sh "$@" 15 | } 16 | 17 | function dr-upload-car-zip { 18 | dr-update-env && ${DR_DIR}/scripts/upload/upload-car.sh "$@" 19 | } 20 | 21 | function dr-list-aws-models { 22 | echo "Due to changes in AWS DeepRacer Console this command is no longer available." 23 | } 24 | 25 | function dr-set-upload-model { 26 | echo "Due to changes in AWS DeepRacer Console this command is no longer available." 
27 | } 28 | 29 | function dr-increment-upload-model { 30 | dr-update-env && ${DR_DIR}/scripts/upload/increment.sh "$@" && dr-update-env 31 | } 32 | 33 | function dr-download-custom-files { 34 | eval CUSTOM_TARGET=$(echo s3://$DR_LOCAL_S3_BUCKET/$DR_LOCAL_S3_CUSTOM_FILES_PREFIX/) 35 | echo "Downloading files from $CUSTOM_TARGET" 36 | aws $DR_LOCAL_PROFILE_ENDPOINT_URL s3 sync $CUSTOM_TARGET $DR_DIR/custom_files/ 37 | } 38 | 39 | function dr-start-training { 40 | dr-update-env 41 | $DR_DIR/scripts/training/start.sh "$@" 42 | } 43 | 44 | function dr-increment-training { 45 | dr-update-env && ${DR_DIR}/scripts/training/increment.sh "$@" && dr-update-env 46 | } 47 | 48 | function dr-stop-training { 49 | ROBOMAKER_COMMAND="" bash -c "cd $DR_DIR/scripts/training && ./stop.sh" 50 | } 51 | 52 | function dr-start-evaluation { 53 | dr-update-env 54 | $DR_DIR/scripts/evaluation/start.sh "$@" 55 | } 56 | 57 | function dr-stop-evaluation { 58 | ROBOMAKER_COMMAND="" bash -c "cd $DR_DIR/scripts/evaluation && ./stop.sh" 59 | } 60 | 61 | 62 | function dr-start-tournament { 63 | echo "Tournaments are no longer supported. Use Head-to-Model evaluation instead." 64 | } 65 | 66 | 67 | function dr-start-loganalysis { 68 | ROBOMAKER_COMMAND="" bash -c "cd $DR_DIR/scripts/log-analysis && ./start.sh" 69 | } 70 | 71 | 72 | function dr-stop-loganalysis { 73 | eval LOG_ANALYSIS_ID=$(docker ps | awk ' /loganalysis/ { print $1 }') 74 | if [ -n "$LOG_ANALYSIS_ID" ]; then 75 | ROBOMAKER_COMMAND="" bash -c "cd $DR_DIR/scripts/log-analysis && ./stop.sh" 76 | else 77 | echo "Log-analysis is not running." 78 | fi 79 | 80 | } 81 | 82 | function dr-logs-sagemaker { 83 | 84 | local OPTIND 85 | OPT_TIME="--since 5m" 86 | 87 | while getopts ":w:a" opt; do 88 | case $opt in 89 | w) OPT_WAIT=$OPTARG 90 | ;; 91 | a) OPT_TIME="" 92 | ;; 93 | \?) 
echo "Invalid option -$OPTARG" >&2 94 | ;; 95 | esac 96 | done 97 | 98 | SAGEMAKER_CONTAINER=$(dr-find-sagemaker) 99 | 100 | if [[ -z "$SAGEMAKER_CONTAINER" ]]; 101 | then 102 | if [[ -n "$OPT_WAIT" ]]; then 103 | WAIT_TIME=$OPT_WAIT 104 | echo "Waiting up to $WAIT_TIME seconds for Sagemaker to start up..." 105 | until [ -n "$SAGEMAKER_CONTAINER" ] 106 | do 107 | sleep 1 108 | ((WAIT_TIME--)) 109 | if [ "$WAIT_TIME" -lt 1 ]; then 110 | echo "Sagemaker is not running." 111 | return 1 112 | fi 113 | SAGEMAKER_CONTAINER=$(dr-find-sagemaker) 114 | done 115 | else 116 | echo "Sagemaker is not running." 117 | return 1 118 | fi 119 | fi 120 | 121 | if [[ "${DR_HOST_X,,}" == "true" && -n "$DISPLAY" ]]; 122 | then 123 | if [ -x "$(command -v gnome-terminal)" ]; 124 | then 125 | gnome-terminal --tab --title "DR-${DR_RUN_ID}: Sagemaker - ${SAGEMAKER_CONTAINER}" -- /usr/bin/bash -c "docker logs $OPT_TIME -f ${SAGEMAKER_CONTAINER}" 2> /dev/null 126 | echo "Sagemaker container $SAGEMAKER_CONTAINER logs opened in separate gnome-terminal. " 127 | elif [ -x "$(command -v x-terminal-emulator)" ]; 128 | then 129 | x-terminal-emulator -e /bin/sh -c "docker logs $OPT_TIME -f ${SAGEMAKER_CONTAINER}" 2> /dev/null 130 | echo "Sagemaker container $SAGEMAKER_CONTAINER logs opened in separate terminal. " 131 | else 132 | echo 'Could not find a defined x-terminal-emulator. Displaying inline.' 
133 | docker logs $OPT_TIME -f $SAGEMAKER_CONTAINER 134 | fi 135 | else 136 | docker logs $OPT_TIME -f $SAGEMAKER_CONTAINER 137 | fi 138 | 139 | } 140 | 141 | function dr-find-sagemaker { 142 | 143 | STACK_NAME="deepracer-$DR_RUN_ID" 144 | RUN_NAME=${DR_LOCAL_S3_MODEL_PREFIX} 145 | 146 | SAGEMAKER_CONTAINERS=$(docker ps | awk ' /sagemaker/ { print $1 } '| xargs ) 147 | 148 | if [[ -n $SAGEMAKER_CONTAINERS ]]; 149 | then 150 | for CONTAINER in $SAGEMAKER_CONTAINERS; do 151 | CONTAINER_NAME=$(docker ps --format '{{.Names}}' --filter id=$CONTAINER) 152 | CONTAINER_PREFIX=$(echo $CONTAINER_NAME | perl -n -e'/(.*)_(algo(.*))_./; print $1') 153 | COMPOSE_SERVICE_NAME=$(echo $CONTAINER_NAME | perl -n -e'/(.*)_(algo(.*))_./; print $2') 154 | COMPOSE_FILE=$(sudo find /tmp/sagemaker -name docker-compose.yaml -exec grep -l "$RUN_NAME" {} + | grep $CONTAINER_PREFIX) 155 | if [[ -n $COMPOSE_FILE ]]; then 156 | echo $CONTAINER 157 | return 158 | fi 159 | done 160 | fi 161 | 162 | } 163 | 164 | function dr-logs-robomaker { 165 | 166 | OPT_REPLICA=1 167 | OPT_EVAL="" 168 | local OPTIND 169 | OPT_TIME="--since 5m" 170 | 171 | while getopts ":w:n:ea" opt; do 172 | case $opt in 173 | w) OPT_WAIT=$OPTARG 174 | ;; 175 | n) OPT_REPLICA=$OPTARG 176 | ;; 177 | e) OPT_EVAL="-e" 178 | ;; 179 | a) OPT_TIME="" 180 | ;; 181 | \?) echo "Invalid option -$OPTARG" >&2 182 | ;; 183 | esac 184 | done 185 | 186 | ROBOMAKER_CONTAINER=$(dr-find-robomaker -n ${OPT_REPLICA} ${OPT_EVAL}) 187 | 188 | if [[ -z "$ROBOMAKER_CONTAINER" ]]; 189 | then 190 | if [[ -n "$OPT_WAIT" ]]; then 191 | WAIT_TIME=$OPT_WAIT 192 | echo "Waiting up to $WAIT_TIME seconds for Robomaker #${OPT_REPLICA} to start up..." 193 | until [ -n "$ROBOMAKER_CONTAINER" ] 194 | do 195 | sleep 1 196 | ((WAIT_TIME--)) 197 | if [ "$WAIT_TIME" -lt 1 ]; then 198 | echo "Robomaker #${OPT_REPLICA} is not running." 
199 | return 1 200 | fi 201 | ROBOMAKER_CONTAINER=$(dr-find-robomaker -n ${OPT_REPLICA} ${OPT_EVAL}) 202 | done 203 | else 204 | echo "Robomaker #${OPT_REPLICA} is not running." 205 | return 1 206 | fi 207 | fi 208 | 209 | if [[ "${DR_HOST_X,,}" == "true" && -n "$DISPLAY" ]]; 210 | then 211 | if [ -x "$(command -v gnome-terminal)" ]; 212 | then 213 | gnome-terminal --tab --title "DR-${DR_RUN_ID}: Robomaker #${OPT_REPLICA} - ${ROBOMAKER_CONTAINER}" -- /usr/bin/bash -c "docker logs $OPT_TIME -f ${ROBOMAKER_CONTAINER}" 2> /dev/null 214 | echo "Robomaker #${OPT_REPLICA} ($ROBOMAKER_CONTAINER) logs opened in separate gnome-terminal. " 215 | elif [ -x "$(command -v x-terminal-emulator)" ]; 216 | then 217 | x-terminal-emulator -e /bin/sh -c "docker logs $OPT_TIME -f ${ROBOMAKER_CONTAINER}" 2> /dev/null 218 | echo "Robomaker #${OPT_REPLICA} ($ROBOMAKER_CONTAINER) logs opened in separate terminal. " 219 | else 220 | echo 'Could not find a defined x-terminal-emulator. Displaying inline.' 221 | docker logs $OPT_TIME -f $ROBOMAKER_CONTAINER 222 | fi 223 | else 224 | docker logs $OPT_TIME -f $ROBOMAKER_CONTAINER 225 | fi 226 | 227 | } 228 | 229 | function dr-find-robomaker { 230 | 231 | local OPTIND 232 | 233 | OPT_PREFIX="deepracer" 234 | 235 | while getopts ":n:e" opt; do 236 | case $opt in 237 | n) OPT_REPLICA=$OPTARG 238 | ;; 239 | e) OPT_PREFIX="-eval" 240 | ;; 241 | \?) echo "Invalid option -$OPTARG" >&2 242 | ;; 243 | esac 244 | done 245 | 246 | eval ROBOMAKER_ID=$(docker ps | grep "${OPT_PREFIX}-${DR_RUN_ID}_robomaker.${OPT_REPLICA}" | cut -f1 -d\ | head -1) 247 | if [ -n "$ROBOMAKER_ID" ]; then 248 | echo $ROBOMAKER_ID 249 | fi 250 | } 251 | 252 | function dr-get-robomaker-stats { 253 | 254 | local OPTIND 255 | OPT_REPLICA=1 256 | 257 | while getopts ":n:" opt; do 258 | case $opt in 259 | n) OPT_REPLICA=$OPTARG 260 | ;; 261 | \?) 
echo "Invalid option -$OPTARG" >&2 262 | ;; 263 | esac 264 | done 265 | 266 | eval ROBOMAKER_ID=$(dr-find-robomaker -n $OPT_REPLICA ) 267 | if [ -n "$ROBOMAKER_ID" ]; then 268 | echo "Showing statistics for Robomaker #$OPT_REPLICA - container $ROBOMAKER_ID" 269 | docker exec -ti $ROBOMAKER_ID bash -c "gz stats" 270 | else 271 | echo "Robomaker #$OPT_REPLICA is not running." 272 | fi 273 | } 274 | 275 | function dr-logs-loganalysis { 276 | eval LOG_ANALYSIS_ID=$(docker ps | awk ' /loganalysis/ { print $1 }') 277 | if [ -n "$LOG_ANALYSIS_ID" ]; then 278 | docker logs -f $LOG_ANALYSIS_ID 279 | else 280 | echo "Log-analysis is not running." 281 | fi 282 | 283 | } 284 | 285 | function dr-url-loganalysis { 286 | eval LOG_ANALYSIS_ID=$(docker ps | awk ' /loganalysis/ { print $1 }') 287 | if [ -n "$LOG_ANALYSIS_ID" ]; then 288 | docker exec "$LOG_ANALYSIS_ID" bash -c "jupyter server list" 289 | else 290 | echo "Log-analysis is not running." 291 | fi 292 | } 293 | 294 | function dr-view-stream { 295 | ${DR_DIR}/utils/start-local-browser.sh "$@" 296 | } 297 | 298 | function dr-start-viewer { 299 | $DR_DIR/scripts/viewer/start.sh "$@" 300 | } 301 | 302 | function dr-stop-viewer { 303 | $DR_DIR/scripts/viewer/stop.sh "$@" 304 | } 305 | 306 | function dr-update-viewer { 307 | $DR_DIR/scripts/viewer/stop.sh "$@" 308 | $DR_DIR/scripts/viewer/start.sh "$@" 309 | } 310 | -------------------------------------------------------------------------------- /docs/reference.md: -------------------------------------------------------------------------------- 1 | # Deepracer-for-Cloud Reference 2 | 3 | ## Environment Variables 4 | 5 | The scripts assume that two files `system.env` containing constant configuration values and `run.env` with run specific values is populated with the required values. Which values go into which file is not really important. 
6 | 7 | | Variable | Description | 8 | |----------|-------------| 9 | | `DR_RUN_ID` | Used if you have multiple independent training jobs on a single DRfC instance. This is an advanced configuration and generally you should just leave this as the default `0`.| 10 | | `DR_WORLD_NAME` | Defines the track to be used.| 11 | | `DR_RACE_TYPE` | Valid options are `TIME_TRIAL`, `OBJECT_AVOIDANCE`, and `HEAD_TO_BOT`.| 12 | | `DR_CAR_COLOR` | Valid options are `Black`, `Grey`, `Blue`, `Red`, `Orange`, `White`, and `Purple`.| 13 | | `DR_CAR_NAME` | Display name of car; shows in Deepracer Console when uploading.| 14 | | `DR_ENABLE_DOMAIN_RANDOMIZATION` | If `True`, this cycles through different environment colors and lighting each episode. This is typically used to make your model more robust and generalized instead of tightly aligned with the simulator| 15 | | `DR_UPLOAD_S3_PREFIX` | Prefix of the target location. (Typically starts with `DeepRacer-SageMaker-RoboMaker-comm-`)| 16 | | `DR_EVAL_NUMBER_OF_TRIALS` | How many laps to complete for evaluation simulations.| 17 | | `DR_EVAL_IS_CONTINUOUS` | If False, your evaluation trial will end if your car goes off track or is in a collision. If True, your car will take the penalty times as configured in those parameters, but continue evaluating the trial.| 18 | | `DR_EVAL_OFF_TRACK_PENALTY` | Number of seconds penalty time added for an off track during evaluation. Only takes effect if `DR_EVAL_IS_CONTINUOUS` is set to True.| 19 | | `DR_EVAL_COLLISION_PENALTY` | Number of seconds penalty time added for a collision during evaluation. Only takes effect if `DR_EVAL_IS_CONTINUOUS` is set to True.| 20 | | `DR_EVAL_SAVE_MP4` | Set to `True` to save MP4 of an evaluation run. | 21 | | `DR_TRAIN_CHANGE_START_POSITION` | Determines if the racer shall round-robin the starting position during training sessions. (Recommended to be `True` for initial training.)| 22 | | `DR_TRAIN_ALTERNATE_DRIVING_DIRECTION` | `True` or `False`.
If `True`, the car will alternate driving between clockwise and counter-clockwise each episode.| 23 | | `DR_TRAIN_START_POSITION_OFFSET` | Used to control where to start the training from on first episode.| 24 | | `DR_TRAIN_ROUND_ROBIN_ADVANCE_DISTANCE` | How far to progress each episode in round robin. 0.05 is 5% of the track. Generally best to try and keep this to even numbers that match with your total number of episodes to allow for even distribution around the track. For example, if 20 episodes per iteration, .05 or .10 or .20 would be good.| 25 | | `DR_TRAIN_MULTI_CONFIG` | `True` or `False`. This is used if you want to use different run.env configurations for each worker in a multi worker training run. See multi config documentation for more details on how to set this up.| 26 | | `DR_TRAIN_MIN_EVAL_TRIALS` | The minimum number of evaluation trials run between each training iteration. Evaluations will continue as long as policy training is occurring and may be more than this number. This establishes the minimum, and is generally useful if you want to speed up training especially when using gpu sagemaker containers.| 27 | | `DR_LOCAL_S3_PRETRAINED` | Determines if training or evaluation shall be based on the model created in a previous session, held in `s3://{DR_LOCAL_S3_BUCKET}/{LOCAL_S3_PRETRAINED_PREFIX}`, accessible by credentials held in profile `{DR_LOCAL_S3_PROFILE}`.| 28 | | `DR_LOCAL_S3_PRETRAINED_PREFIX` | Prefix of pretrained model within S3 bucket.| 29 | | `DR_LOCAL_S3_MODEL_PREFIX` | Prefix of model within S3 bucket.| 30 | | `DR_LOCAL_S3_BUCKET` | Name of S3 bucket which will be used during the session.| 31 | | `DR_LOCAL_S3_CUSTOM_FILES_PREFIX` | Prefix of configuration files within S3 bucket.| 32 | | `DR_LOCAL_S3_TRAINING_PARAMS_FILE` | Name of YAML file that holds parameters sent to robomaker container for configuration during training. 
Filename is relative to `s3://{DR_LOCAL_S3_BUCKET}/{LOCAL_S3_PRETRAINED_PREFIX}`.| 33 | | `DR_LOCAL_S3_EVAL_PARAMS_FILE` | Name of YAML file that holds parameters sent to robomaker container for configuration during evaluations. Filename is relative to `s3://{DR_LOCAL_S3_BUCKET}/{LOCAL_S3_PRETRAINED_PREFIX}`.| 34 | | `DR_LOCAL_S3_MODEL_METADATA_KEY` | Location where the `model_metadata.json` file is stored.| 35 | | `DR_LOCAL_S3_HYPERPARAMETERS_KEY` | Location where the `hyperparameters.json` file is stored.| 36 | | `DR_LOCAL_S3_REWARD_KEY` | Location where the `reward_function.py` file is stored.| 37 | | `DR_LOCAL_S3_METRICS_PREFIX` | Location where the metrics will be stored.| 38 | | `DR_OA_NUMBER_OF_OBSTACLES` | For Object Avoidance, the number of obstacles on the track.| 39 | | `DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES` | Minimum distance in meters between obstacles.| 40 | | `DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS` | If True, obstacle locations will randomly change after each episode.| 41 | | `DR_OA_IS_OBSTACLE_BOT_CAR` | If True, obstacles will appear as a stationary car instead of a box.| 42 | | `DR_OA_OBJECT_POSITIONS` | Positions of boxes on the track. Tuples consisting of progress (fraction [0..1]) and inside or outside lane (-1 or 1). 
Example: `"0.23,-1;0.46,1"`| 43 | | `DR_H2B_IS_LANE_CHANGE` | If True, bot cars will change lanes based on configuration.| 44 | | `DR_H2B_LOWER_LANE_CHANGE_TIME` | Minimum time in seconds before car will change lanes.| 45 | | `DR_H2B_UPPER_LANE_CHANGE_TIME` | Maximum time in seconds before car will change lanes.| 46 | | `DR_H2B_LANE_CHANGE_DISTANCE` | Distance in meters over which the car will change lanes.| 47 | | `DR_H2B_NUMBER_OF_BOT_CARS` | Number of bot cars on the track.| 48 | | `DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS` | Minimum distance between bot cars.| 49 | | `DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS` | If True, bot car locations will randomly change after each episode.| 50 | | `DR_H2B_BOT_CAR_SPEED` | How fast the bot cars go in meters per second.| 51 | | `DR_CLOUD` | Can be `azure`, `aws`, `local` or `remote`; determines how the storage will be configured.| 52 | | `DR_AWS_APP_REGION` | (AWS only) Region for other AWS resources (e.g. Kinesis) | 53 | | `DR_UPLOAD_S3_PROFILE` | AWS Cli profile to be used that holds the 'real' S3 credentials needed to upload a model into AWS DeepRacer.| 54 | | `DR_UPLOAD_S3_BUCKET` | Name of the AWS DeepRacer bucket where models will be uploaded. (Typically starts with `aws-deepracer-`.)| 55 | | `DR_LOCAL_S3_PROFILE` | Name of AWS profile with credentials to be used. Stored in `~/.aws/credentials` unless AWS IAM Roles are used.| 56 | | `DR_GUI_ENABLE` | Enable or disable the Gazebo GUI in Robomaker | 57 | | `DR_KINESIS_STREAM_NAME` | Kinesis stream name. Used if you actually publish to the AWS KVS service. Leave blank if you do not want this. | 58 | | `DR_KINESIS_STREAM_ENABLE` | Enable or disable 'Kinesis Stream', True both publishes to an AWS KVS stream (if name not None), and to the topic `/racecar/deepracer/kvs_stream`. Leave True if you want to watch the car racing. 
| 59 | | `DR_SAGEMAKER_IMAGE` | Determines which sagemaker image will be used for training.| 60 | | `DR_ROBOMAKER_IMAGE` | Determines which robomaker image will be used for training or evaluation.| 61 | | `DR_COACH_IMAGE` | Determines which coach image will be used for training.| 62 | | `DR_WORKERS` | Number of Robomaker workers to be used for training. See additional documentation for more information about this feature.| 63 | | `DR_ROBOMAKER_MOUNT_LOGS` | TODO.| 64 | | `DR_CLOUD_WATCH_ENABLE` | Send log files to AWS CloudWatch.| 65 | | `DR_DOCKER_STYLE` | Valid Options are `Swarm` and `Compose`. Use Compose for openGL optimized containers.| 66 | | `DR_HOST_X` | Uses the host X-windows server, rather than starting one inside of Robomaker. Required for OpenGL images.| 67 | | `DR_WEBVIEWER_PORT` | Port for the web-viewer proxy which enables the streaming of all robomaker workers at once.| 68 | | `CUDA_VISIBLE_DEVICES` | Used in multi-GPU configurations. See additional documentation for more information about this feature.| 69 | 70 | ## Commands 71 | 72 | | Command | Description | 73 | |---------|-------------| 74 | | `dr-update` | Loads in all scripts and environment variables again.| 75 | | `dr-update-env` | Loads in all environment variables from `system.env` and `run.env`.| 76 | | `dr-upload-custom-files` | Uploads changed configuration files from `custom_files/` into `s3://{DR_LOCAL_S3_BUCKET}/custom_files`.| 77 | | `dr-download-custom-files` | Downloads changed configuration files from `s3://{DR_LOCAL_S3_BUCKET}/custom_files` into `custom_files/`.| 78 | | `dr-start-training` | Starts a training session in the local VM based on current configuration.| 79 | | `dr-increment-training` | Updates configuration, setting the current model prefix to pretrained, and incrementing a serial.| 80 | | `dr-stop-training` | Stops the current local training session. 
Uploads log files.| 81 | | `dr-start-evaluation` | Starts an evaluation session in the local VM based on current configuration.| 82 | | `dr-stop-evaluation` | Stops the current local evaluation session. Uploads log files.| 83 | | `dr-start-loganalysis` | Starts a Jupyter log-analysis container, available on port 8888.| 84 | | `dr-stop-loganalysis` | Stops the Jupyter log-analysis container.| 85 | | `dr-start-viewer` | Starts an NGINX proxy to stream all the robomaker streams; accessible remotely.| 86 | | `dr-stop-viewer` | Stops the NGINX proxy.| 87 | | `dr-logs-sagemaker` | Displays the logs from the running Sagemaker container.| 88 | | `dr-logs-robomaker` | Displays the logs from the running Robomaker container.| 89 | | `dr-list-aws-models` | Lists the models that are currently stored in your AWS DeepRacer S3 bucket. | 90 | | `dr-set-upload-model` | Updates the `run.env` with the prefix and name of your selected model. | 91 | | `dr-upload-model` | Uploads the model defined in `DR_LOCAL_S3_MODEL_PREFIX` to the AWS DeepRacer S3 prefix defined in `DR_UPLOAD_S3_PREFIX` | 92 | | `dr-download-model` | Downloads a file from a 'real' S3 location into a local prefix of choice. 
| 93 | -------------------------------------------------------------------------------- /utils/submit-monitor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import getopt 5 | import os 6 | import traceback 7 | import pickle 8 | import urllib.request 9 | 10 | import boto3 11 | from botocore.exceptions import ClientError 12 | 13 | try: 14 | import pandas as pd 15 | from deepracer import boto3_enhancer 16 | except ImportError: 17 | print("You need to install pandas and deepracer-utils to use this utility.") 18 | sys.exit(1) 19 | 20 | dr = None 21 | 22 | 23 | def main(): 24 | 25 | # Parse Arguments 26 | try: 27 | opts, _ = getopt.getopt( 28 | sys.argv[1:], 29 | "lvsghm:b:", 30 | ["logs", "verbose", "summary", "graphics", "help", "model=", "board="], 31 | ) 32 | except getopt.GetoptError as err: 33 | # print help information and exit: 34 | print(err) # will print something like "option -x not recognized" 35 | usage() 36 | sys.exit(2) 37 | 38 | logs_path = "{}/data/logs/leaderboards".format(os.environ.get("DR_DIR", None)) 39 | 40 | download_logs = False 41 | download_videos = False 42 | verbose = False 43 | create_summary = False 44 | model_name = None 45 | leaderboard_guid = None 46 | leaderboard_arn = None 47 | 48 | for opt, arg in opts: 49 | if opt in ("-l", "--logs"): 50 | download_logs = True 51 | elif opt in ("-g", "--graphics"): 52 | download_videos = True 53 | elif opt in ("-v", "--verbose"): 54 | verbose = True 55 | elif opt in ("-s", "--summary"): 56 | create_summary = True 57 | elif opt in ("-m", "--model"): 58 | model_name = arg.strip() 59 | elif opt in ("-b", "--board"): 60 | leaderboard_guid = arg.strip() 61 | elif opt in ("-h", "--help"): 62 | usage() 63 | sys.exit() 64 | 65 | # Prepare Boto3 66 | session = boto3.session.Session( 67 | region_name="us-east-1", 68 | profile_name=os.environ.get("DR_UPLOAD_S3_PROFILE", None), 69 | ) 70 | 71 | global dr 72 | dr = 
boto3_enhancer.deepracer_client(session=session) 73 | 74 | # Find the ARN for my model 75 | my_model = find_model(model_name) 76 | 77 | if my_model is not None: 78 | my_model_arn = my_model["ModelArn"].values[0] 79 | if verbose: 80 | print("Found ModelARN for model {}: {}".format(model_name, my_model_arn)) 81 | else: 82 | print("Did not find model with name {}".format(model_name)) 83 | sys.exit(1) 84 | 85 | if leaderboard_guid.startswith('arn'): 86 | leaderboard_arn = leaderboard_guid 87 | 88 | # Find the leaderboard 89 | if not leaderboard_arn: 90 | leaderboard_arn = find_leaderboard(leaderboard_guid) 91 | 92 | if leaderboard_arn is not None: 93 | if verbose: 94 | print("Found Leaderboard with ARN {}".format(leaderboard_arn)) 95 | else: 96 | print("Did not find Leaderboard with ARN {}".format(leaderboard_arn)) 97 | sys.exit(1) 98 | 99 | # Load summary from file if we are interested in it! 100 | if create_summary: 101 | 102 | pkl_f = "{}/{}/summary.pkl".format(logs_path, leaderboard_guid) 103 | if os.path.isfile(pkl_f): 104 | infile = open(pkl_f, "rb") 105 | my_submissions = pickle.load(infile) 106 | infile.close() 107 | else: 108 | my_submissions = {} 109 | my_submissions["LeaderboardSubmissions"] = [] 110 | 111 | dir_path = os.path.dirname(pkl_f) 112 | os.makedirs(dir_path, exist_ok=True) 113 | 114 | # Collect data about latest submission 115 | submission_response = dr.get_latest_user_submission(LeaderboardArn=leaderboard_arn) 116 | latest_submission = submission_response["LeaderboardSubmission"] 117 | if latest_submission: 118 | jobid = latest_submission["ActivityArn"].split("/", 1)[1] 119 | print( 120 | "Job {} has status {}".format( 121 | jobid, latest_submission["LeaderboardSubmissionStatusType"] 122 | ) 123 | ) 124 | 125 | if latest_submission["LeaderboardSubmissionStatusType"] == "SUCCESS": 126 | if download_logs: 127 | try: 128 | f_url = dr.get_asset_url( 129 | Arn=latest_submission["ActivityArn"], 130 | AssetType="LOGS", 131 | )["Url"] 132 | 
download_file( 133 | "{}/{}/robomaker-{}-{}.tar.gz".format( 134 | logs_path, 135 | leaderboard_guid, 136 | latest_submission["SubmissionTime"], 137 | jobid, 138 | ), 139 | f_url, 140 | ) 141 | except ClientError: 142 | print(("WARNING: Logfile for job {} not available.").format(jobid)) 143 | traceback.print_exc() 144 | 145 | if download_videos: 146 | download_file( 147 | "{}/{}/video-{}-{}.mp4".format( 148 | logs_path, 149 | leaderboard_guid, 150 | latest_submission["SubmissionTime"], 151 | jobid, 152 | ), 153 | latest_submission["SubmissionVideoS3path"], 154 | ) 155 | 156 | # Submit again 157 | _ = dr.create_leaderboard_submission( 158 | ModelArn=my_model_arn, LeaderboardArn=leaderboard_arn 159 | ) 160 | print("Submitted {} to {}.".format(model_name, leaderboard_arn)) 161 | 162 | elif latest_submission["LeaderboardSubmissionStatusType"] == "ERROR" or latest_submission["LeaderboardSubmissionStatusType"] == "FAILED": 163 | print("Error in previous submission") 164 | if download_logs: 165 | try: 166 | f_url = dr.get_asset_url( 167 | Arn=latest_submission["ActivityArn"], 168 | AssetType="LOGS", 169 | )["Url"] 170 | download_file( 171 | "{}/{}/robomaker-{}-{}.tar.gz".format( 172 | logs_path, 173 | leaderboard_guid, 174 | latest_submission["SubmissionTime"], 175 | jobid, 176 | ), 177 | f_url, 178 | ) 179 | except ClientError: 180 | print(("WARNING: Logfile for job {} not available.").format(jobid)) 181 | traceback.print_exc() 182 | 183 | # Submit again 184 | _ = dr.create_leaderboard_submission( 185 | ModelArn=my_model_arn, LeaderboardArn=leaderboard_arn 186 | ) 187 | print("Submitted {} to {}.".format(model_name, leaderboard_arn)) 188 | 189 | # Maintain our summary 190 | if create_summary: 191 | for idx, i in enumerate(my_submissions["LeaderboardSubmissions"]): 192 | if "SubmissionTime" in i: 193 | if i["SubmissionTime"] == latest_submission["SubmissionTime"]: 194 | del my_submissions["LeaderboardSubmissions"][idx] 195 | else: 196 | del 
my_submissions["LeaderboardSubmissions"][idx] 197 | my_submissions["LeaderboardSubmissions"].append(latest_submission) 198 | 199 | # Save summary 200 | outfile = open(pkl_f, "wb") 201 | pickle.dump(my_submissions, outfile) 202 | outfile.close() 203 | 204 | # Display summary 205 | if verbose: 206 | display_submissions(my_submissions) 207 | 208 | 209 | def download_file(f_name, url): 210 | 211 | dir_path = os.path.dirname(f_name) 212 | os.makedirs(dir_path, exist_ok=True) 213 | if not os.path.isfile(f_name): 214 | print("Downloading {}".format(os.path.basename(f_name))) 215 | urllib.request.urlretrieve(url, f_name) 216 | 217 | 218 | def find_model(model_name): 219 | 220 | m_response = dr.list_models(ModelType="REINFORCEMENT_LEARNING", MaxResults=25) 221 | model_dict = m_response["Models"] 222 | models = pd.DataFrame.from_dict(model_dict) 223 | my_model = models[models["ModelName"] == model_name] 224 | 225 | if my_model.size > 0: 226 | return my_model 227 | 228 | while "NextToken" in m_response: 229 | m_response = dr.list_models( 230 | ModelType="REINFORCEMENT_LEARNING", 231 | MaxResults=50, 232 | NextToken=m_response["NextToken"], 233 | ) 234 | model_dict = m_response["Models"] 235 | 236 | models = pd.DataFrame.from_dict(model_dict) 237 | my_model = models[models["ModelName"] == model_name] 238 | if my_model.size > 0: 239 | return my_model 240 | 241 | return None 242 | 243 | 244 | def find_leaderboard(leaderboard_guid): 245 | leaderboard_arn = "arn:aws:deepracer:::leaderboard/{}".format(leaderboard_guid) 246 | 247 | l_response = dr.list_leaderboards(MaxResults=25) 248 | lboards_dict = l_response["Leaderboards"] 249 | leaderboards = pd.DataFrame.from_dict(l_response["Leaderboards"]) 250 | if leaderboards[leaderboards["Arn"] == leaderboard_arn].size > 0: 251 | return leaderboard_arn 252 | 253 | while "NextToken" in l_response: 254 | l_response = dr.list_leaderboards( 255 | MaxResults=50, NextToken=l_response["NextToken"] 256 | ) 257 | lboards_dict = 
l_response["Leaderboards"] 258 | 259 | leaderboards = pd.DataFrame.from_dict(lboards_dict) 260 | if leaderboards[leaderboards["Arn"] == leaderboard_arn].size > 0: 261 | return leaderboard_arn 262 | 263 | return None 264 | 265 | 266 | def display_submissions(submissions_dict): 267 | # Display status 268 | my_columns = [ 269 | "SubmissionTime", 270 | "TotalLapTime", 271 | "BestLapTime", 272 | "ResetCount", 273 | "CollisionCount", 274 | "OffTrackCount", 275 | "Model", 276 | "JobId", 277 | "Status", 278 | ] 279 | my_submissions_df = pd.DataFrame.from_dict( 280 | submissions_dict["LeaderboardSubmissions"] 281 | ) 282 | my_submissions_df["SubmissionTime"] = ( 283 | my_submissions_df["SubmissionTime"] 284 | .values.astype(dtype="datetime64[ms]") 285 | .astype(dtype="datetime64[s]") 286 | ) 287 | my_submissions_df["TotalLapTime"] = my_submissions_df["TotalLapTime"].values.astype( 288 | dtype="datetime64[ms]" 289 | ) 290 | my_submissions_df["TotalLapTime"] = ( 291 | my_submissions_df["TotalLapTime"].dt.strftime("%M:%S.%f").str[:-4] 292 | ) 293 | my_submissions_df["BestLapTime"] = my_submissions_df["BestLapTime"].values.astype( 294 | dtype="datetime64[ms]" 295 | ) 296 | my_submissions_df["BestLapTime"] = ( 297 | my_submissions_df["BestLapTime"].dt.strftime("%M:%S.%f").str[:-4] 298 | ) 299 | my_submissions_df["JobId"] = my_submissions_df["ActivityArn"].str.split("/").str[1] 300 | my_submissions_df["Status"] = my_submissions_df["LeaderboardSubmissionStatusType"] 301 | my_submissions_df[[None, None, "Model"]] = my_submissions_df.ModelArn.str.split( 302 | "/", expand=True, 303 | ) 304 | 305 | # Display 306 | print("") 307 | print(my_submissions_df[my_columns]) 308 | 309 | 310 | def usage(): 311 | print( 312 | "Usage: submit-monitor.py [-v] [-s] [-l] [-g] -m -b " 313 | ) 314 | print(" -v Verbose output.") 315 | print(" -s Store a summary of all submissions.") 316 | print(" -l Download robomaker logfiles.") 317 | print(" -g Download video recordings.") 318 | print(" -m Display 
name of the model to submit.") 319 | print(" -b GUID or ARN of the leaderboard to submit to.") 320 | sys.exit(1) 321 | 322 | 323 | if __name__ == "__main__": 324 | main() 325 | -------------------------------------------------------------------------------- /scripts/training/prepare-config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import boto3 4 | import sys 5 | import os 6 | import time 7 | import json 8 | import io 9 | import yaml 10 | 11 | config = {} 12 | config['AWS_REGION'] = os.environ.get('DR_AWS_APP_REGION', 'us-east-1') 13 | config['JOB_TYPE'] = 'TRAINING' 14 | config['KINESIS_VIDEO_STREAM_NAME'] = os.environ.get('DR_KINESIS_STREAM_NAME', 'my-kinesis-stream') 15 | config['METRICS_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') 16 | 17 | metrics_prefix = os.environ.get('DR_LOCAL_S3_METRICS_PREFIX', None) 18 | if metrics_prefix is not None: 19 | config['METRICS_S3_OBJECT_KEY'] = '{}/TrainingMetrics.json'.format(metrics_prefix) 20 | else: 21 | config['METRICS_S3_OBJECT_KEY'] = 'DeepRacer-Metrics/TrainingMetrics-{}.json'.format(str(round(time.time()))) 22 | 23 | config['MODEL_METADATA_FILE_S3_KEY'] = os.environ.get('DR_LOCAL_S3_MODEL_METADATA_KEY', 'custom_files/model_metadata.json') 24 | config['REWARD_FILE_S3_KEY'] = os.environ.get('DR_LOCAL_S3_REWARD_KEY', 'custom_files/reward_function.py') 25 | config['ROBOMAKER_SIMULATION_JOB_ACCOUNT_ID'] = os.environ.get('', 'Dummy') 26 | config['NUM_WORKERS'] = os.environ.get('DR_WORKERS', 1) 27 | config['SAGEMAKER_SHARED_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') 28 | config['SAGEMAKER_SHARED_S3_PREFIX'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker') 29 | config['SIMTRACE_S3_BUCKET'] = os.environ.get('DR_LOCAL_S3_BUCKET', 'bucket') 30 | config['SIMTRACE_S3_PREFIX'] = os.environ.get('DR_LOCAL_S3_MODEL_PREFIX', 'rl-deepracer-sagemaker') 31 | config['TRAINING_JOB_ARN'] = 'arn:Dummy' 32 | 33 | # 
Car and training 34 | config['BODY_SHELL_TYPE'] = os.environ.get('DR_CAR_BODY_SHELL_TYPE', 'deepracer') 35 | if config['BODY_SHELL_TYPE'] == 'deepracer': 36 | config['CAR_COLOR'] = os.environ.get('DR_CAR_COLOR', 'Red') 37 | config['CAR_COLOR'] = os.environ.get('DR_CAR_COLOR', 'Red') 38 | config['CAR_NAME'] = os.environ.get('DR_CAR_NAME', 'MyCar') 39 | config['RACE_TYPE'] = os.environ.get('DR_RACE_TYPE', 'TIME_TRIAL') 40 | config['WORLD_NAME'] = os.environ.get('DR_WORLD_NAME', 'LGSWide') 41 | config['DISPLAY_NAME'] = os.environ.get('DR_DISPLAY_NAME', 'racer1') 42 | config['RACER_NAME'] = os.environ.get('DR_RACER_NAME', 'racer1') 43 | 44 | config['ALTERNATE_DRIVING_DIRECTION'] = os.environ.get('DR_TRAIN_ALTERNATE_DRIVING_DIRECTION', os.environ.get('DR_ALTERNATE_DRIVING_DIRECTION', 'false')) 45 | config['CHANGE_START_POSITION'] = os.environ.get('DR_TRAIN_CHANGE_START_POSITION', os.environ.get('DR_CHANGE_START_POSITION', 'true')) 46 | config['ROUND_ROBIN_ADVANCE_DIST'] = os.environ.get('DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST', '0.05') 47 | config['START_POSITION_OFFSET'] = os.environ.get('DR_TRAIN_START_POSITION_OFFSET', '0.00') 48 | config['ENABLE_DOMAIN_RANDOMIZATION'] = os.environ.get('DR_ENABLE_DOMAIN_RANDOMIZATION', 'false') 49 | config['MIN_EVAL_TRIALS'] = os.environ.get('DR_TRAIN_MIN_EVAL_TRIALS', '5') 50 | 51 | # Object Avoidance 52 | if config['RACE_TYPE'] == 'OBJECT_AVOIDANCE': 53 | config['NUMBER_OF_OBSTACLES'] = os.environ.get('DR_OA_NUMBER_OF_OBSTACLES', '6') 54 | config['MIN_DISTANCE_BETWEEN_OBSTACLES'] = os.environ.get('DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES', '2.0') 55 | config['RANDOMIZE_OBSTACLE_LOCATIONS'] = os.environ.get('DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS', 'True') 56 | config['IS_OBSTACLE_BOT_CAR'] = os.environ.get('DR_OA_IS_OBSTACLE_BOT_CAR', 'false') 57 | 58 | object_position_str = os.environ.get('DR_OA_OBJECT_POSITIONS', "") 59 | if object_position_str != "": 60 | object_positions = [] 61 | for o in object_position_str.split(";"): 62 | 
object_positions.append(o) 63 | config['OBJECT_POSITIONS'] = object_positions 64 | config['NUMBER_OF_OBSTACLES'] = str(len(object_positions)) 65 | 66 | # Head to Bot 67 | if config['RACE_TYPE'] == 'HEAD_TO_BOT': 68 | config['IS_LANE_CHANGE'] = os.environ.get('DR_H2B_IS_LANE_CHANGE', 'False') 69 | config['LOWER_LANE_CHANGE_TIME'] = os.environ.get('DR_H2B_LOWER_LANE_CHANGE_TIME', '3.0') 70 | config['UPPER_LANE_CHANGE_TIME'] = os.environ.get('DR_H2B_UPPER_LANE_CHANGE_TIME', '5.0') 71 | config['LANE_CHANGE_DISTANCE'] = os.environ.get('DR_H2B_LANE_CHANGE_DISTANCE', '1.0') 72 | config['NUMBER_OF_BOT_CARS'] = os.environ.get('DR_H2B_NUMBER_OF_BOT_CARS', '0') 73 | config['MIN_DISTANCE_BETWEEN_BOT_CARS'] = os.environ.get('DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS', '2.0') 74 | config['RANDOMIZE_BOT_CAR_LOCATIONS'] = os.environ.get('DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS', 'False') 75 | config['BOT_CAR_SPEED'] = os.environ.get('DR_H2B_BOT_CAR_SPEED', '0.2') 76 | config['PENALTY_SECONDS'] = os.environ.get('DR_H2B_BOT_CAR_PENALTY', '2.0') 77 | 78 | s3_endpoint_url = os.environ.get('DR_LOCAL_S3_ENDPOINT_URL', None) 79 | s3_region = config['AWS_REGION'] 80 | s3_bucket = config['SAGEMAKER_SHARED_S3_BUCKET'] 81 | s3_prefix = config['SAGEMAKER_SHARED_S3_PREFIX'] 82 | s3_mode = os.environ.get('DR_LOCAL_S3_AUTH_MODE','profile') 83 | if s3_mode == 'profile': 84 | s3_profile = os.environ.get('DR_LOCAL_S3_PROFILE', 'default') 85 | else: # mode is 'role' 86 | s3_profile = None 87 | s3_yaml_name = os.environ.get('DR_LOCAL_S3_TRAINING_PARAMS_FILE', 'training_params.yaml') 88 | yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name)) 89 | 90 | session = boto3.session.Session(profile_name=s3_profile) 91 | s3_client = session.client('s3', region_name=s3_region, endpoint_url=s3_endpoint_url) 92 | 93 | yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name)) 94 | local_yaml_path = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'tmp', 'training-params-' + str(round(time.time())) 
+ '.yaml')) 95 | 96 | with open(local_yaml_path, 'w') as yaml_file: 97 | yaml.dump(config, yaml_file, default_flow_style=False, default_style='\'', explicit_start=True) 98 | 99 | # Copy the reward function to the s3 prefix bucket for compatability with DeepRacer console. 100 | reward_function_key = os.path.normpath(os.path.join(s3_prefix, "reward_function.py")) 101 | copy_source = { 102 | 'Bucket': s3_bucket, 103 | 'Key': config['REWARD_FILE_S3_KEY'] 104 | } 105 | s3_client.copy(copy_source, Bucket=s3_bucket, Key=reward_function_key) 106 | 107 | # Training with different configurations on each worker (aka Multi Config training) 108 | config['MULTI_CONFIG'] = os.environ.get('DR_TRAIN_MULTI_CONFIG', 'False') 109 | num_workers = int(config['NUM_WORKERS']) 110 | 111 | if config['MULTI_CONFIG'] == "True" and num_workers > 0: 112 | 113 | multi_config = {} 114 | multi_config['multi_config'] = [None] * num_workers 115 | 116 | for i in range(1,num_workers+1,1): 117 | if i == 1: 118 | # copy training_params to training_params_1 119 | s3_yaml_name_list = s3_yaml_name.split('.') 120 | s3_yaml_name_temp = s3_yaml_name_list[0] + "_%d.yaml" % i 121 | 122 | #upload additional training params files 123 | yaml_key = os.path.normpath(os.path.join(s3_prefix, s3_yaml_name_temp)) 124 | s3_client.upload_file(Bucket=s3_bucket, Key=yaml_key, Filename=local_yaml_path) 125 | 126 | # Store in multi_config array 127 | multi_config['multi_config'][i - 1] = {'config_file': s3_yaml_name_temp, 128 | 'world_name': config['WORLD_NAME']} 129 | 130 | else: # i >= 2 131 | #read in additional configuration file. 
format of file must be worker#-run.env 132 | location = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'worker-{}.env'.format(i))) 133 | with open(location, 'r') as fh: 134 | vars_dict = dict( 135 | tuple(line.split('=')) 136 | for line in fh.read().splitlines() if not line.startswith('#') 137 | ) 138 | 139 | # Reset parameters for the configuration of this worker number 140 | os.environ.update(vars_dict) 141 | 142 | # Update car and training parameters 143 | config.update({'WORLD_NAME': os.environ.get('DR_WORLD_NAME')}) 144 | config.update({'RACE_TYPE': os.environ.get('DR_RACE_TYPE')}) 145 | config.update({'CAR_COLOR': os.environ.get('DR_CAR_COLOR')}) 146 | config.update({'ALTERNATE_DRIVING_DIRECTION': os.environ.get('DR_TRAIN_ALTERNATE_DRIVING_DIRECTION')}) 147 | config.update({'CHANGE_START_POSITION': os.environ.get('DR_TRAIN_CHANGE_START_POSITION')}) 148 | config.update({'ROUND_ROBIN_ADVANCE_DIST': os.environ.get('DR_TRAIN_ROUND_ROBIN_ADVANCE_DIST')}) 149 | config.update({'ENABLE_DOMAIN_RANDOMIZATION': os.environ.get('DR_ENABLE_DOMAIN_RANDOMIZATION')}) 150 | config.update({'START_POSITION_OFFSET': os.environ.get('DR_TRAIN_START_POSITION_OFFSET', '0.00')}) 151 | 152 | # Update Object Avoidance parameters 153 | if config['RACE_TYPE'] == 'OBJECT_AVOIDANCE': 154 | config.update({'NUMBER_OF_OBSTACLES': os.environ.get('DR_OA_NUMBER_OF_OBSTACLES')}) 155 | config.update({'MIN_DISTANCE_BETWEEN_OBSTACLES': os.environ.get('DR_OA_MIN_DISTANCE_BETWEEN_OBSTACLES')}) 156 | config.update({'RANDOMIZE_OBSTACLE_LOCATIONS': os.environ.get('DR_OA_RANDOMIZE_OBSTACLE_LOCATIONS')}) 157 | config.update({'IS_OBSTACLE_BOT_CAR': os.environ.get('DR_OA_IS_OBSTACLE_BOT_CAR')}) 158 | object_position_str = os.environ.get('DR_OA_OBJECT_POSITIONS', "") 159 | if object_position_str != "": 160 | object_positions = [] 161 | for o in object_position_str.replace('"','').split(";"): 162 | object_positions.append(o) 163 | config.update({'OBJECT_POSITIONS': object_positions}) 164 | 
config.update({'NUMBER_OF_OBSTACLES': str(len(object_positions))}) 165 | else: 166 | config.pop('OBJECT_POSITIONS',[]) 167 | else: 168 | config.pop('NUMBER_OF_OBSTACLES', None) 169 | config.pop('MIN_DISTANCE_BETWEEN_OBSTACLES', None) 170 | config.pop('RANDOMIZE_OBSTACLE_LOCATIONS', None) 171 | config.pop('IS_OBSTACLE_BOT_CAR', None) 172 | config.pop('OBJECT_POSITIONS',[]) 173 | 174 | # Update Head to Bot parameters 175 | if config['RACE_TYPE'] == 'HEAD_TO_BOT': 176 | config.update({'IS_LANE_CHANGE': os.environ.get('DR_H2B_IS_LANE_CHANGE')}) 177 | config.update({'LOWER_LANE_CHANGE_TIME': os.environ.get('DR_H2B_LOWER_LANE_CHANGE_TIME')}) 178 | config.update({'UPPER_LANE_CHANGE_TIME': os.environ.get('DR_H2B_UPPER_LANE_CHANGE_TIME')}) 179 | config.update({'LANE_CHANGE_DISTANCE': os.environ.get('DR_H2B_LANE_CHANGE_DISTANCE')}) 180 | config.update({'NUMBER_OF_BOT_CARS': os.environ.get('DR_H2B_NUMBER_OF_BOT_CARS')}) 181 | config.update({'MIN_DISTANCE_BETWEEN_BOT_CARS': os.environ.get('DR_H2B_MIN_DISTANCE_BETWEEN_BOT_CARS')}) 182 | config.update({'RANDOMIZE_BOT_CAR_LOCATIONS': os.environ.get('DR_H2B_RANDOMIZE_BOT_CAR_LOCATIONS')}) 183 | config.update({'BOT_CAR_SPEED': os.environ.get('DR_H2B_BOT_CAR_SPEED')}) 184 | config.update({'PENALTY_SECONDS': os.environ.get('DR_H2B_BOT_CAR_PENALTY')}) 185 | else: 186 | config.pop('IS_LANE_CHANGE', None) 187 | config.pop('LOWER_LANE_CHANGE_TIME', None) 188 | config.pop('UPPER_LANE_CHANGE_TIME', None) 189 | config.pop('LANE_CHANGE_DISTANCE', None) 190 | config.pop('NUMBER_OF_BOT_CARS', None) 191 | config.pop('MIN_DISTANCE_BETWEEN_BOT_CARS', None) 192 | config.pop('RANDOMIZE_BOT_CAR_LOCATIONS', None) 193 | config.pop('BOT_CAR_SPEED', None) 194 | 195 | #split string s3_yaml_name, insert the worker number, and add back on the .yaml extension 196 | s3_yaml_name_list = s3_yaml_name.split('.') 197 | s3_yaml_name_temp = s3_yaml_name_list[0] + "_%d.yaml" % i 198 | 199 | #upload additional training params files 200 | yaml_key = 
os.path.normpath(os.path.join(s3_prefix, s3_yaml_name_temp)) 201 | local_yaml_path = os.path.abspath(os.path.join(os.environ.get('DR_DIR'),'tmp', 'training-params-' + str(round(time.time())) + '.yaml')) 202 | with open(local_yaml_path, 'w') as yaml_file: 203 | yaml.dump(config, yaml_file, default_flow_style=False, default_style='\'', explicit_start=True) 204 | s3_client.upload_file(Bucket=s3_bucket, Key=yaml_key, Filename=local_yaml_path) 205 | 206 | # Store in multi_config array 207 | multi_config['multi_config'][i - 1] = {'config_file': s3_yaml_name_temp, 208 | 'world_name': config['WORLD_NAME']} 209 | 210 | print(json.dumps(multi_config)) 211 | 212 | else: 213 | s3_client.upload_file(Bucket=s3_bucket, Key=yaml_key, Filename=local_yaml_path) 214 | --------------------------------------------------------------------------------