├── tests ├── __init__.py ├── smoke │ ├── __init__.py │ └── test_basics.py ├── system │ ├── __init__.py │ └── test_examples.py └── conftest.py ├── requirements.txt ├── src ├── data │ └── .gitignore ├── simple_sagemaker │ ├── worker_toolkit │ │ └── __init__.py │ ├── __init__.py │ ├── constants.py │ ├── shell_launcher.py │ ├── s3_sync.py │ ├── iam_utils.py │ └── ecr_sync.py └── cli_launcher.py ├── examples ├── single_task │ ├── input_data │ │ ├── test2 │ │ └── test │ ├── code │ │ ├── requirements.txt │ │ ├── internal_dependency │ │ │ └── lib2.py │ │ └── algo.py │ ├── expected_output │ │ ├── model │ │ │ ├── algo-1 │ │ │ │ └── model_dir │ │ │ └── algo-2 │ │ │ │ └── model_dir │ │ ├── state │ │ │ ├── algo-1 │ │ │ │ └── state_algo-1 │ │ │ └── algo-2 │ │ │ │ └── state_algo-2 │ │ └── output │ │ │ └── algo-1 │ │ │ ├── output_data_dir │ │ │ └── input_dir_copy │ │ │ ├── data │ │ │ ├── data │ │ │ │ └── test │ │ │ ├── checkpoints-manifest │ │ │ └── data-manifest │ │ │ └── config │ │ │ ├── metric-definition-regex.json │ │ │ ├── upstreamoutputdataconfig.json │ │ │ ├── checkpointconfig.json │ │ │ ├── resourceconfig.json │ │ │ ├── inputdataconfig.json │ │ │ ├── tensorboardoutputconfig.json │ │ │ ├── hyperparameters.json │ │ │ ├── init-config.json │ │ │ └── trainingjobconfig.json │ ├── external_dependency │ │ └── lib1.py │ ├── docker │ │ └── Dockerfile │ └── example.py ├── imagenet │ ├── code │ │ ├── .gitignore │ │ ├── extract.sh │ │ ├── download.sh │ │ └── download-all.sh │ ├── run_local.sh │ └── run_remote.sh ├── processing_cli │ ├── data │ │ └── sample_data.txt │ ├── dep │ │ └── ex1_dep.py │ ├── expected_output │ │ ├── output1 │ │ │ ├── output │ │ │ │ └── output │ │ │ ├── state │ │ │ │ └── state │ │ │ └── logs │ │ │ │ └── logs0 │ │ ├── output2 │ │ │ ├── output │ │ │ │ ├── output │ │ │ │ └── config │ │ │ │ │ ├── resourceconfig.json │ │ │ │ │ └── processingjobconfig.json │ │ │ ├── state │ │ │ │ └── state │ │ │ └── logs │ │ │ │ └── logs0 │ │ └── output3 │ │ │ ├── state │ │ │ └── 
state_sh │ │ │ ├── output │ │ │ ├── output_sh │ │ │ └── config │ │ │ │ ├── resourceconfig.json │ │ │ │ └── processingjobconfig.json │ │ │ └── logs │ │ │ └── logs0 │ ├── ex3.sh │ ├── ex1.py │ └── run.sh ├── dogs_vs_cats │ ├── .gitignore │ └── run_remote.sh ├── cli_multi │ ├── expected_output │ │ └── output1 │ │ │ ├── model │ │ │ ├── output_algo-1 │ │ │ ├── sample_data1.txt_proc_by_algo-1 │ │ │ └── sample_data2.txt_proc_by_algo-1 │ │ │ ├── state │ │ │ └── algo-1 │ │ │ │ └── state_algo-1 │ │ │ └── logs │ │ │ └── logs0 │ ├── run.sh │ └── worker.py ├── readme_examples │ ├── data │ │ └── sample_data.txt │ ├── expected_output_smoke │ │ └── example7 │ │ │ └── output │ │ │ ├── success │ │ │ └── data │ │ │ └── ps__elf │ ├── example6 │ │ ├── data │ │ │ ├── sample_data1.txt │ │ │ └── sample_data2.txt │ │ ├── code │ │ │ ├── requirements.txt │ │ │ ├── internal_dependency │ │ │ │ └── lib2.py │ │ │ └── worker6.py │ │ ├── external_dependency │ │ │ └── lib1.py │ │ └── Dockerfile │ ├── expected_output │ │ ├── example6_1 │ │ │ ├── state │ │ │ │ ├── algo-1 │ │ │ │ │ └── algo-1 │ │ │ │ └── algo-2 │ │ │ │ │ └── algo-2 │ │ │ ├── output │ │ │ │ └── output_algo-1 │ │ │ └── model │ │ │ │ ├── sample_data1.txt_proc_by_algo-1 │ │ │ │ ├── sample_data1.txt_proc_by_algo-2 │ │ │ │ ├── sample_data2.txt_proc_by_algo-1 │ │ │ │ └── sample_data2.txt_proc_by_algo-2 │ │ └── example1 │ │ │ └── logs │ │ │ ├── logs0 │ │ │ └── logs1 │ ├── expected_output_rest │ │ ├── example3 │ │ │ ├── model │ │ │ │ └── model_dir │ │ │ ├── state │ │ │ │ └── state_dir │ │ │ ├── output │ │ │ │ └── output_data_dir │ │ │ └── logs │ │ │ │ └── logs0 │ │ ├── example3_2 │ │ │ ├── model │ │ │ │ └── model_dir │ │ │ ├── state │ │ │ │ └── state_dir │ │ │ ├── output │ │ │ │ └── output_data_dir │ │ │ └── logs │ │ │ │ └── logs0 │ │ ├── example3_2_stdout │ │ ├── example2 │ │ │ └── logs │ │ │ │ └── logs0 │ │ └── example5 │ │ │ └── logs │ │ │ └── logs0 │ ├── worker1.py │ ├── worker2.py │ ├── run_smoke.sh │ ├── run_smoke.bat │ ├── worker3.py │ 
├── worker4.py │ ├── run_rest.sh │ └── run.sh ├── single_file │ ├── data │ │ ├── sample_data1.txt │ │ └── sample_data2.txt │ ├── expected_output │ │ └── output1 │ │ │ ├── model │ │ │ ├── output_algo-1 │ │ │ ├── output_algo-2 │ │ │ ├── sample_data1.txt_proc_by_algo-1 │ │ │ └── sample_data2.txt_proc_by_algo-2 │ │ │ ├── state │ │ │ ├── algo-1 │ │ │ │ └── state_algo-1 │ │ │ └── algo-2 │ │ │ │ └── state_algo-2 │ │ │ └── logs │ │ │ ├── logs0 │ │ │ └── logs1 │ └── example.py ├── debugging │ ├── tensorboard │ │ ├── requirements.txt │ │ └── lightning.py │ ├── metrics_example.JPG │ ├── metrics.py │ └── run.sh ├── medium │ ├── intro │ │ ├── example3 │ │ │ ├── data │ │ │ │ ├── sample_data1.txt │ │ │ │ └── sample_data2.txt │ │ │ ├── code │ │ │ │ ├── requirements.txt │ │ │ │ ├── internal_dependency │ │ │ │ │ └── lib2.py │ │ │ │ └── ssm_ex3_worker.py │ │ │ └── external_dependency │ │ │ │ └── lib1.py │ │ ├── ssm_ex2.py │ │ ├── README.md │ │ ├── run2.sh │ │ ├── run1.sh │ │ └── run3.sh │ └── distributed │ │ ├── cifar10 │ │ ├── run_local.sh │ │ └── run_remote.sh │ │ └── README.md ├── multiple_tasks │ ├── expected_output │ │ ├── output1 │ │ │ ├── model │ │ │ │ ├── algo-1 │ │ │ │ │ └── model_dir │ │ │ │ └── algo-2 │ │ │ │ │ └── model_dir │ │ │ ├── state │ │ │ │ ├── algo-1 │ │ │ │ │ ├── state_algo-1_1 │ │ │ │ │ ├── state_algo-1_10 │ │ │ │ │ ├── state_algo-1_2 │ │ │ │ │ ├── state_algo-1_3 │ │ │ │ │ ├── state_algo-1_4 │ │ │ │ │ ├── state_algo-1_5 │ │ │ │ │ ├── state_algo-1_6 │ │ │ │ │ ├── state_algo-1_7 │ │ │ │ │ ├── state_algo-1_8 │ │ │ │ │ └── state_algo-1_9 │ │ │ │ └── algo-2 │ │ │ │ │ ├── state_algo-2_1 │ │ │ │ │ ├── state_algo-2_10 │ │ │ │ │ ├── state_algo-2_2 │ │ │ │ │ ├── state_algo-2_3 │ │ │ │ │ ├── state_algo-2_4 │ │ │ │ │ ├── state_algo-2_5 │ │ │ │ │ ├── state_algo-2_6 │ │ │ │ │ ├── state_algo-2_7 │ │ │ │ │ ├── state_algo-2_8 │ │ │ │ │ └── state_algo-2_9 │ │ │ └── output │ │ │ │ └── algo-1 │ │ │ │ ├── data_copy │ │ │ │ └── test │ │ │ │ └── state_copy │ │ │ │ └── algo-1 │ │ │ │ 
├── state_algo-1_1 │ │ │ │ ├── state_algo-1_10 │ │ │ │ ├── state_algo-1_2 │ │ │ │ ├── state_algo-1_3 │ │ │ │ ├── state_algo-1_4 │ │ │ │ ├── state_algo-1_5 │ │ │ │ ├── state_algo-1_6 │ │ │ │ ├── state_algo-1_7 │ │ │ │ ├── state_algo-1_8 │ │ │ │ └── state_algo-1_9 │ │ └── output2 │ │ │ └── output │ │ │ └── algo-1 │ │ │ ├── data_copy │ │ │ └── test │ │ │ ├── task1_state1_copy │ │ │ ├── algo-1 │ │ │ │ ├── state_algo-1_1 │ │ │ │ ├── state_algo-1_10 │ │ │ │ ├── state_algo-1_2 │ │ │ │ ├── state_algo-1_3 │ │ │ │ ├── state_algo-1_4 │ │ │ │ ├── state_algo-1_5 │ │ │ │ ├── state_algo-1_6 │ │ │ │ ├── state_algo-1_7 │ │ │ │ ├── state_algo-1_8 │ │ │ │ └── state_algo-1_9 │ │ │ └── algo-2 │ │ │ │ ├── state_algo-2_1 │ │ │ │ ├── state_algo-2_10 │ │ │ │ ├── state_algo-2_2 │ │ │ │ ├── state_algo-2_3 │ │ │ │ ├── state_algo-2_4 │ │ │ │ ├── state_algo-2_5 │ │ │ │ ├── state_algo-2_6 │ │ │ │ ├── state_algo-2_7 │ │ │ │ ├── state_algo-2_8 │ │ │ │ └── state_algo-2_9 │ │ │ ├── task1_state2_copy │ │ │ ├── algo-1 │ │ │ │ ├── state_algo-1_1 │ │ │ │ ├── state_algo-1_2 │ │ │ │ ├── state_algo-1_4 │ │ │ │ ├── state_algo-1_6 │ │ │ │ └── state_algo-1_8 │ │ │ └── algo-2 │ │ │ │ ├── state_algo-2_1 │ │ │ │ ├── state_algo-2_2 │ │ │ │ ├── state_algo-2_4 │ │ │ │ ├── state_algo-2_6 │ │ │ │ └── state_algo-2_8 │ │ │ ├── model_copy │ │ │ └── model.tar.gz │ │ │ └── task1_state3_copy │ │ │ └── output.tar.gz │ ├── code │ │ └── algo_multi.py │ └── example.py ├── update_expected │ └── update.sh └── retcode │ └── run.sh ├── docs ├── metric_example.jpg ├── source │ ├── modules.rst │ ├── index.rst │ ├── simple_sagemaker.task_toolkit.rst │ ├── simple_sagemaker.rst │ └── conf.py ├── Makefile ├── make.bat └── high_level_flow.txt ├── setup.py ├── MANIFEST.in ├── .gitignore ├── pyproject.toml ├── .vscode ├── settings.json └── launch.json ├── setup.cfg ├── tox.ini └── .github └── workflows └── build.yaml /tests/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tox -------------------------------------------------------------------------------- /src/data/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/smoke/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/system/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/single_task/input_data/test2: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/imagenet/code/.gitignore: -------------------------------------------------------------------------------- 1 | main.py -------------------------------------------------------------------------------- /examples/processing_cli/data/sample_data.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/single_task/input_data/test: -------------------------------------------------------------------------------- 1 | hello world! 
-------------------------------------------------------------------------------- /src/simple_sagemaker/worker_toolkit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/simple_sagemaker/__init__.py: -------------------------------------------------------------------------------- 1 | VERSION = "0.9.24" 2 | -------------------------------------------------------------------------------- /examples/dogs_vs_cats/.gitignore: -------------------------------------------------------------------------------- 1 | main.py 2 | data/* 3 | output/* -------------------------------------------------------------------------------- /examples/cli_multi/expected_output/output1/model/output_algo-1: -------------------------------------------------------------------------------- 1 | output -------------------------------------------------------------------------------- /examples/processing_cli/dep/ex1_dep.py: -------------------------------------------------------------------------------- 1 | print("Dependency!!!") 2 | -------------------------------------------------------------------------------- /examples/processing_cli/expected_output/output1/output/output: -------------------------------------------------------------------------------- 1 | output -------------------------------------------------------------------------------- /examples/processing_cli/expected_output/output1/state/state: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/readme_examples/data/sample_data.txt: -------------------------------------------------------------------------------- 1 | sample data content -------------------------------------------------------------------------------- /examples/single_file/data/sample_data1.txt: 
-------------------------------------------------------------------------------- 1 | Single file sample data 1 -------------------------------------------------------------------------------- /examples/single_file/data/sample_data2.txt: -------------------------------------------------------------------------------- 1 | Single file sample data 2 -------------------------------------------------------------------------------- /examples/single_task/code/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==3.0.2 2 | -------------------------------------------------------------------------------- /examples/single_task/expected_output/model/algo-1/model_dir: -------------------------------------------------------------------------------- 1 | model_dir -------------------------------------------------------------------------------- /examples/single_task/expected_output/model/algo-2/model_dir: -------------------------------------------------------------------------------- 1 | model_dir -------------------------------------------------------------------------------- /examples/cli_multi/expected_output/output1/state/algo-1/state_algo-1: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/debugging/tensorboard/requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch-lightning 2 | -------------------------------------------------------------------------------- /examples/medium/intro/example3/data/sample_data1.txt: -------------------------------------------------------------------------------- 1 | sample data content 1 -------------------------------------------------------------------------------- /examples/medium/intro/example3/data/sample_data2.txt: -------------------------------------------------------------------------------- 
1 | sample data content 2 -------------------------------------------------------------------------------- /examples/processing_cli/expected_output/output2/output/output: -------------------------------------------------------------------------------- 1 | output 2 | -------------------------------------------------------------------------------- /examples/processing_cli/expected_output/output2/state/state: -------------------------------------------------------------------------------- 1 | state 2 | -------------------------------------------------------------------------------- /examples/processing_cli/expected_output/output3/state/state_sh: -------------------------------------------------------------------------------- 1 | state 2 | -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_smoke/example7/output/success: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/single_file/expected_output/output1/model/output_algo-1: -------------------------------------------------------------------------------- 1 | output -------------------------------------------------------------------------------- /examples/single_file/expected_output/output1/model/output_algo-2: -------------------------------------------------------------------------------- 1 | output -------------------------------------------------------------------------------- /examples/medium/intro/example3/code/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==3.0.2 2 | -------------------------------------------------------------------------------- /examples/processing_cli/expected_output/output3/output/output_sh: -------------------------------------------------------------------------------- 1 | output 2 | 
-------------------------------------------------------------------------------- /examples/readme_examples/example6/data/sample_data1.txt: -------------------------------------------------------------------------------- 1 | sample data content 1 -------------------------------------------------------------------------------- /examples/readme_examples/example6/data/sample_data2.txt: -------------------------------------------------------------------------------- 1 | sample data content 2 -------------------------------------------------------------------------------- /examples/single_file/expected_output/output1/state/algo-1/state_algo-1: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/single_file/expected_output/output1/state/algo-2/state_algo-2: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/single_task/expected_output/state/algo-1/state_algo-1: -------------------------------------------------------------------------------- 1 | state_algo-1 -------------------------------------------------------------------------------- /examples/single_task/expected_output/state/algo-2/state_algo-2: -------------------------------------------------------------------------------- 1 | state_algo-2 -------------------------------------------------------------------------------- /src/cli_launcher.py: -------------------------------------------------------------------------------- 1 | from simple_sagemaker import cli 2 | 3 | cli.main() 4 | -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/model/algo-1/model_dir: -------------------------------------------------------------------------------- 1 | model_dir 
-------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/model/algo-2/model_dir: -------------------------------------------------------------------------------- 1 | model_dir -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-1/state_algo-1_1: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-1/state_algo-1_10: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-1/state_algo-1_2: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-1/state_algo-1_3: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-1/state_algo-1_4: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-1/state_algo-1_5: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-1/state_algo-1_6: -------------------------------------------------------------------------------- 1 | state 
-------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-1/state_algo-1_7: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-1/state_algo-1_8: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-1/state_algo-1_9: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-2/state_algo-2_1: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-2/state_algo-2_10: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-2/state_algo-2_2: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-2/state_algo-2_3: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-2/state_algo-2_4: -------------------------------------------------------------------------------- 1 | 
state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-2/state_algo-2_5: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-2/state_algo-2_6: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-2/state_algo-2_7: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-2/state_algo-2_8: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-2/state_algo-2_9: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/readme_examples/example6/code/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==3.0.2 2 | -------------------------------------------------------------------------------- /examples/readme_examples/expected_output/example6_1/state/algo-1/algo-1: -------------------------------------------------------------------------------- 1 | state_algo-1 -------------------------------------------------------------------------------- /examples/readme_examples/expected_output/example6_1/state/algo-2/algo-2: -------------------------------------------------------------------------------- 1 | 
state_algo-2 -------------------------------------------------------------------------------- /examples/single_task/expected_output/output/algo-1/output_data_dir: -------------------------------------------------------------------------------- 1 | output_data_dir -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/output/algo-1/data_copy/test: -------------------------------------------------------------------------------- 1 | hello world! -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/data_copy/test: -------------------------------------------------------------------------------- 1 | hello world! -------------------------------------------------------------------------------- /examples/readme_examples/expected_output/example6_1/output/output_algo-1: -------------------------------------------------------------------------------- 1 | output_algo-1 -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_rest/example3/model/model_dir: -------------------------------------------------------------------------------- 1 | model_dir file -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_rest/example3/state/state_dir: -------------------------------------------------------------------------------- 1 | state_dir file -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_rest/example3_2/model/model_dir: -------------------------------------------------------------------------------- 1 | model_dir file -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_rest/example3_2/state/state_dir: 
-------------------------------------------------------------------------------- 1 | state_dir file -------------------------------------------------------------------------------- /examples/single_task/external_dependency/lib1.py: -------------------------------------------------------------------------------- 1 | print("-- External Lib1 imported!") 2 | -------------------------------------------------------------------------------- /examples/single_task/code/internal_dependency/lib2.py: -------------------------------------------------------------------------------- 1 | print("-- Internal Lib2 imported!") 2 | -------------------------------------------------------------------------------- /examples/single_task/expected_output/output/algo-1/input_dir_copy/data/data/test: -------------------------------------------------------------------------------- 1 | hello world! -------------------------------------------------------------------------------- /examples/medium/intro/example3/external_dependency/lib1.py: -------------------------------------------------------------------------------- 1 | print("-- External Lib1 imported!") 2 | -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_rest/example3/output/output_data_dir: -------------------------------------------------------------------------------- 1 | output_data_dir file -------------------------------------------------------------------------------- /examples/medium/intro/example3/code/internal_dependency/lib2.py: -------------------------------------------------------------------------------- 1 | print("-- Internal Lib2 imported!") 2 | -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/output/algo-1/state_copy/algo-1/state_algo-1_1: -------------------------------------------------------------------------------- 1 | state 
-------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/output/algo-1/state_copy/algo-1/state_algo-1_10: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/output/algo-1/state_copy/algo-1/state_algo-1_2: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/output/algo-1/state_copy/algo-1/state_algo-1_3: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/output/algo-1/state_copy/algo-1/state_algo-1_4: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/output/algo-1/state_copy/algo-1/state_algo-1_5: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/output/algo-1/state_copy/algo-1/state_algo-1_6: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/output/algo-1/state_copy/algo-1/state_algo-1_7: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- 
/examples/multiple_tasks/expected_output/output1/output/algo-1/state_copy/algo-1/state_algo-1_8: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/output/algo-1/state_copy/algo-1/state_algo-1_9: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/readme_examples/example6/external_dependency/lib1.py: -------------------------------------------------------------------------------- 1 | print("-- External Lib1 imported!") 2 | -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_rest/example3_2/output/output_data_dir: -------------------------------------------------------------------------------- 1 | output_data_dir file -------------------------------------------------------------------------------- /examples/single_task/expected_output/output/algo-1/input_dir_copy/config/metric-definition-regex.json: -------------------------------------------------------------------------------- 1 | [] -------------------------------------------------------------------------------- /docs/metric_example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiftan/simple_sagemaker/HEAD/docs/metric_example.jpg -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-1/state_algo-1_1: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- 
/examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-1/state_algo-1_10: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-1/state_algo-1_2: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-1/state_algo-1_3: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-1/state_algo-1_4: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-1/state_algo-1_5: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-1/state_algo-1_6: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-1/state_algo-1_7: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- 
/examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-1/state_algo-1_8: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-1/state_algo-1_9: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-2/state_algo-2_1: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-2/state_algo-2_10: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-2/state_algo-2_2: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-2/state_algo-2_3: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-2/state_algo-2_4: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- 
/examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-2/state_algo-2_5: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-2/state_algo-2_6: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-2/state_algo-2_7: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-2/state_algo-2_8: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-2/state_algo-2_9: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state2_copy/algo-1/state_algo-1_1: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state2_copy/algo-1/state_algo-1_2: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- 
/examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state2_copy/algo-1/state_algo-1_4: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state2_copy/algo-1/state_algo-1_6: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state2_copy/algo-1/state_algo-1_8: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state2_copy/algo-2/state_algo-2_1: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state2_copy/algo-2/state_algo-2_2: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state2_copy/algo-2/state_algo-2_4: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state2_copy/algo-2/state_algo-2_6: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- 
/examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state2_copy/algo-2/state_algo-2_8: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/readme_examples/example6/code/internal_dependency/lib2.py: -------------------------------------------------------------------------------- 1 | print("-- Internal Lib2 imported!") 2 | -------------------------------------------------------------------------------- /examples/single_task/expected_output/output/algo-1/input_dir_copy/config/upstreamoutputdataconfig.json: -------------------------------------------------------------------------------- 1 | [] -------------------------------------------------------------------------------- /examples/single_task/expected_output/output/algo-1/input_dir_copy/config/checkpointconfig.json: -------------------------------------------------------------------------------- 1 | {"LocalPath":"/state"} -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import setuptools 4 | 5 | if __name__ == "__main__": 6 | setuptools.setup() -------------------------------------------------------------------------------- /examples/cli_multi/expected_output/output1/model/sample_data1.txt_proc_by_algo-1: -------------------------------------------------------------------------------- 1 | Single file sample data 1 processed by algo-1 -------------------------------------------------------------------------------- /examples/cli_multi/expected_output/output1/model/sample_data2.txt_proc_by_algo-1: -------------------------------------------------------------------------------- 1 | Single file sample data 2 processed by algo-1 -------------------------------------------------------------------------------- 
/examples/processing_cli/expected_output/output2/output/config/resourceconfig.json: -------------------------------------------------------------------------------- 1 | {"current_host":"algo-1","hosts":["algo-1"]} -------------------------------------------------------------------------------- /examples/processing_cli/expected_output/output3/output/config/resourceconfig.json: -------------------------------------------------------------------------------- 1 | {"current_host":"algo-1","hosts":["algo-1"]} -------------------------------------------------------------------------------- /examples/single_file/expected_output/output1/model/sample_data1.txt_proc_by_algo-1: -------------------------------------------------------------------------------- 1 | Single file sample data 1 processed by algo-1 -------------------------------------------------------------------------------- /examples/single_file/expected_output/output1/model/sample_data2.txt_proc_by_algo-2: -------------------------------------------------------------------------------- 1 | Single file sample data 2 processed by algo-2 -------------------------------------------------------------------------------- /examples/readme_examples/expected_output/example6_1/model/sample_data1.txt_proc_by_algo-1: -------------------------------------------------------------------------------- 1 | sample data content 1 processed by algo-1 -------------------------------------------------------------------------------- /examples/readme_examples/expected_output/example6_1/model/sample_data1.txt_proc_by_algo-2: -------------------------------------------------------------------------------- 1 | sample data content 1 processed by algo-2 -------------------------------------------------------------------------------- /examples/readme_examples/expected_output/example6_1/model/sample_data2.txt_proc_by_algo-1: -------------------------------------------------------------------------------- 1 | sample data content 2 processed by 
algo-1 -------------------------------------------------------------------------------- /examples/readme_examples/expected_output/example6_1/model/sample_data2.txt_proc_by_algo-2: -------------------------------------------------------------------------------- 1 | sample data content 2 processed by algo-2 -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | simple_sagemaker 2 | ================ 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | simple_sagemaker 8 | -------------------------------------------------------------------------------- /examples/debugging/metrics_example.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiftan/simple_sagemaker/HEAD/examples/debugging/metrics_example.JPG -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Include the README 2 | include *.rst 3 | 4 | # Include the license file 5 | include LICENSE.txt 6 | 7 | graft tests 8 | graft examples -------------------------------------------------------------------------------- /examples/single_task/expected_output/output/algo-1/input_dir_copy/data/checkpoints-manifest: -------------------------------------------------------------------------------- 1 | tests/simple-sagemaker-example_2020-10-04-09-16-51_py37/Task1/state 2 | -------------------------------------------------------------------------------- /examples/medium/intro/ssm_ex2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | for i in range(torch.cuda.device_count()): 4 | print(f"-***- Device {i}: {torch.cuda.get_device_properties(i)}") 5 | -------------------------------------------------------------------------------- 
/examples/readme_examples/worker1.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | for i in range(torch.cuda.device_count()): 4 | print(f"-***- Device {i}: {torch.cuda.get_device_properties(i)}") 5 | -------------------------------------------------------------------------------- /examples/readme_examples/worker2.py: -------------------------------------------------------------------------------- 1 | from worker_toolkit import worker_lib 2 | 3 | worker_config = worker_lib.WorkerConfig(False) 4 | print("-***-", worker_config.hps["msg"]) 5 | -------------------------------------------------------------------------------- /examples/single_task/expected_output/output/algo-1/input_dir_copy/config/resourceconfig.json: -------------------------------------------------------------------------------- 1 | {"current_host": "algo-1", "hosts": ["algo-1", "algo-2"], "network_interface_name": "eth0"} -------------------------------------------------------------------------------- /examples/single_task/expected_output/output/algo-1/input_dir_copy/config/inputdataconfig.json: -------------------------------------------------------------------------------- 1 | {"data":{"TrainingInputMode":"File","S3DistributionType":"ShardedByS3Key","RecordWrapperType":"None"}} -------------------------------------------------------------------------------- /examples/readme_examples/example6/Dockerfile: -------------------------------------------------------------------------------- 1 | # __BASE_IMAGE__ is automatically replaced with the correct base image 2 | FROM __BASE_IMAGE__ 3 | RUN pip3 install pandas==0.25.3 scikit-learn==0.21.3 4 | -------------------------------------------------------------------------------- /examples/medium/intro/README.md: -------------------------------------------------------------------------------- 1 | Examples from the Medium blog post [Cloud processing is now easier and 
cheaper!](https://medium.com/@shiftan/a-very-simple-and-cheap-way-to-run-your-processing-job-on-the-cloud-c76af579f9e9) -------------------------------------------------------------------------------- /examples/medium/intro/run2.sh: -------------------------------------------------------------------------------- 1 | BASEDIR=$(dirname "$0") 2 | pushd . 3 | cd $BASEDIR 4 | 5 | ssm run -p ssm-ex -t ex2 -e ssm_ex2.py -o ./out2 --it ml.p3.2xlarge --ic 2 --force_running 6 | 7 | cat ./out2/logs/logs0 8 | 9 | popd -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/model_copy/model.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiftan/simple_sagemaker/HEAD/examples/multiple_tasks/expected_output/output2/output/algo-1/model_copy/model.tar.gz -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state3_copy/output.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiftan/simple_sagemaker/HEAD/examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state3_copy/output.tar.gz -------------------------------------------------------------------------------- /examples/single_task/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # __BASE_IMAGE__ is automatically replaced with the correct base image 2 | FROM __BASE_IMAGE__ 3 | #FROM python:3.7-slim-buster 4 | 5 | RUN pip3 install pandas==0.25.3 scikit-learn==0.21.3 6 | #ENV PYTHONUNBUFFERED=TRUE 7 | 8 | #ENTRYPOINT ["python3"] -------------------------------------------------------------------------------- /examples/single_task/expected_output/output/algo-1/input_dir_copy/config/tensorboardoutputconfig.json: 
-------------------------------------------------------------------------------- 1 | {"S3OutputPath":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example_2020-10-05-09-16-33_py37/Task1/Task1-2020-10-05-09-17-22-LlJvq4UU/tensorboard-output","LocalPath":"/opt/ml/output/tensorboard/"} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # general things to ignore 2 | build/ 3 | dist/ 4 | *.egg-info/ 5 | *.egg 6 | *.py[cod] 7 | __pycache__/ 8 | *.so 9 | *~ 10 | 11 | # due to using tox and pytest 12 | .tox 13 | .cache 14 | 15 | examples/*/output 16 | examples/**/*.extracted* 17 | htmlcov/* 18 | .coverage* 19 | examples/**/cifar-10-* -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools >= 40.6.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.isort] 6 | multi_line_output = 3 7 | include_trailing_comma = true 8 | force_grid_wrap = 0 9 | use_parentheses = true 10 | ensure_newline_before_comments = true 11 | line_length = 88 -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def pytest_addoption(parser): 5 | parser.addoption( 6 | "--cmdopt", action="store", default="type1", help="my option: type1 or type2" 7 | ) 8 | 9 | 10 | @pytest.fixture 11 | def cmdopt(request): 12 | return request.config.getoption("--cmdopt") 13 | -------------------------------------------------------------------------------- /examples/debugging/metrics.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | def main(): 5 | vals1 = [60, 6, 0.6] 6 | vals2 = 
[-10, 0, 10] 7 | for (val1, val2) in zip(vals1, vals2): 8 | print(f"Val1: {val1:.4e}") 9 | print(f"Val2: {val2}") 10 | time.sleep(60) 11 | 12 | 13 | if __name__ == "__main__": 14 | main() 15 | -------------------------------------------------------------------------------- /examples/single_task/expected_output/output/algo-1/input_dir_copy/data/data-manifest: -------------------------------------------------------------------------------- 1 | tests/simple-sagemaker-example_2020-10-04-09-16-51_py37/Task1/input 2 | s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example_2020-10-04-09-16-51_py37/Task1/input/test 3 | s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example_2020-10-04-09-16-51_py37/Task1/input/test2 4 | -------------------------------------------------------------------------------- /examples/processing_cli/ex3.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | echo "======= Starting Bash script ..." 4 | echo "-***- Args:" $@ 5 | echo "-- Env:", `env` 6 | echo "-***- Pwd:", `pwd` 7 | echo "*** START listing files in /opt" 8 | ls -laR /opt 9 | echo "*** END file listing /opt" 10 | cp -r /opt/ml/config $SSM_OUTPUT/config 11 | echo "output" > $SSM_OUTPUT/output_sh 12 | echo "state" > $SSM_STATE/state_sh -------------------------------------------------------------------------------- /examples/update_expected/update.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | BASEDIR=$(dirname "$0") 4 | cd $BASEDIR 5 | 6 | rm -rf output 7 | unzip $1 -d ./output 8 | cd output 9 | mv popen*/*0/* . 10 | rm -r popen* 11 | find . | grep "\.extracted" | xargs rm 12 | 13 | for file in *; do 14 | echo updating $file ... 
15 | rm -rf ../../$file/expected_output/* 16 | cp -r $file/output/* ../../$file/expected_output 17 | done -------------------------------------------------------------------------------- /examples/cli_multi/run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Params: [output] [prefix] [suffix] [additional ssm params...] 4 | BASEDIR=$(dirname "$0") 5 | ssm run --prefix ${2} -p simple-sagemaker-example-cli-multi -t task1${3} -e $BASEDIR/worker.py -o $1/output1 --task_type 1 -i $BASEDIR/../single_file/data ${@:4} 6 | ssm run --prefix ${2} -p simple-sagemaker-example-cli-multi -t task2${3} -e $BASEDIR/worker.py -o $1/output2 --task_type 2 --iit task2_data task1 model ${@:4} 7 | -------------------------------------------------------------------------------- /examples/single_task/expected_output/output/algo-1/input_dir_copy/config/hyperparameters.json: -------------------------------------------------------------------------------- 1 | {"arg2":"\"hello\"","sagemaker_container_log_level":"20","sagemaker_program":"\"algo.py\"","arg1":"5","sagemaker_region":"\"us-east-1\"","sagemaker_job_name":"\"Task1-2020-10-04-09-17-37-x6ux770b\"","sagemaker_submit_directory":"\"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example_2020-10-04-09-16-51_py37/Task1/Task1-2020-10-04-09-17-37-x6ux770b/source/sourcedir.tar.gz\""} -------------------------------------------------------------------------------- /examples/medium/intro/run1.sh: -------------------------------------------------------------------------------- 1 | BASEDIR=$(dirname "$0") 2 | pushd . 3 | cd $BASEDIR 4 | 5 | # Clean the current state to make sure the code runs again 6 | # Note: 1. 
It is done just for demonstration, by appending "--force_running" to the "ssm shell" command below 7 | ssm data -p ssm-ex -t ex1 --force_running 8 | # Run the task 9 | ssm shell -p ssm-ex -t ex1 -o ./out1 --it ml.p3.2xlarge --cmd_line "cat /proc/cpuinfo && nvidia-smi" 10 | 11 | cat ./out1/logs/logs0 12 | 13 | popd -------------------------------------------------------------------------------- /examples/readme_examples/run_smoke.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | set -e # fail if any test fails 3 | 4 | # Params: [output] [prefix] [suffix] [additional ssm params...] 5 | BASEDIR=$(dirname "$0") 6 | echo "Running with", $@ 7 | 8 | # Example 7 - local mode 9 | # --ks is used to avoid messing with state (not supported in local mode) 10 | ssm shell --prefix ${2} -p simple-sagemaker-example-cli -t shell-cli-local${3} \ 11 | --cmd_line "ps -elf >> \$SM_OUTPUT_DATA_DIR/ps__elf" \ 12 | -o $1/example7 --it 'local' --no_spot --download_output ${@:4} --ks -------------------------------------------------------------------------------- /examples/readme_examples/run_smoke.bat: -------------------------------------------------------------------------------- 1 | :: Params: [output] [prefix] [suffix] [additional ssm params...] 
2 | echo %0 3 | echo off 4 | for %%F in (%0) do set dirname=%%~dpF 5 | echo "Running with" 6 | echo %* 7 | echo %dirname% 8 | 9 | :: Example 7 - local mode 10 | :: --ks is used to avoid messing with state (not supported in local mode) 11 | ssm shell -p %2simple-sagemaker-example-cli%3 -t shell-cli-local ^ 12 | --cmd_line "ps -elf >> \$SM_OUTPUT_DATA_DIR/ps__elf" ^ 13 | -o %1/example7 --it 'local' --no_spot --download_output %4 %5 %6 %7 %8 %9 --ks -------------------------------------------------------------------------------- /examples/medium/distributed/cifar10/run_local.sh: -------------------------------------------------------------------------------- 1 | set -e # stop and fail if anything stops 2 | BASEDIR=$(dirname "$0") 3 | pushd . 4 | cd $BASEDIR 5 | 6 | # Download the data 7 | python cifar10.py --download_only --data_path ./data 8 | # Train on a single node 9 | python cifar10.py --data_path ./data \ 10 | --test_batch_size 100 --train_batch_size 256 --num_workers 2 11 | # Train distibuted 12 | python cifar10.py --data_path ./data \ 13 | --test_batch_size 100 --train_batch_size 256 --num_workers 2 \ 14 | --distributed --backend nccl 15 | 16 | popd -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. simple-sagemaker documentation master file, created by 2 | sphinx-quickstart on Mon Sep 21 00:15:01 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to simple-sagemaker's documentation! 7 | ============================================ 8 | 9 | .. 
toctree:: 10 | :maxdepth: 4 11 | :caption: Contents: 12 | 13 | 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | -------------------------------------------------------------------------------- /docs/source/simple_sagemaker.task_toolkit.rst: -------------------------------------------------------------------------------- 1 | simple\_sagemaker.task\_toolkit package 2 | ======================================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | simple\_sagemaker.task\_toolkit.algo\_lib module 8 | ------------------------------------------------ 9 | 10 | .. automodule:: simple_sagemaker.worker_toolkit.worker_lib 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: simple_sagemaker.worker_toolkit 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /examples/readme_examples/worker3.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from worker_toolkit import worker_lib 4 | 5 | worker_config = worker_lib.WorkerConfig(False) 6 | 7 | open(os.path.join(worker_config.output_data_dir, "output_data_dir"), "wt").write( 8 | "output_data_dir file" 9 | ) 10 | open(os.path.join(worker_config.model_dir, "model_dir"), "wt").write("model_dir file") 11 | open(os.path.join(worker_config.state, "state_dir"), "wt").write("state_dir file") 12 | 13 | # The task is marked as completed, to allow other tasks to use its output, 14 | # and to avoid re-running it (unless enforced) 15 | -------------------------------------------------------------------------------- /examples/debugging/run.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | set -ex # fail if any test fails 3 | 4 | cd `dirname "$0"` 5 | 6 | echo "Running $0 with ", -- $1 -- $2 -- $3 -- $4 -- $5 7 | OUTPUT=${1:-.} 8 | 9 | ssm run --prefix ${2} -p ssm-debugging -t metrics${3} -e ./metrics.py -o $OUTPUT/output1 ${@:4} \ 10 | --no_spot `#temporarily to accelerate iterations` & 11 | 12 | ssm run --prefix ${2} -p ssm-debugging -t tensorboard${3} -s ./tensorboard -e lightning.py -o $OUTPUT/output2 ${@:4} \ 13 | --no_spot `#temporarily to accelerate iterations` --force_running & 14 | 15 | 16 | wait # wait for all processes 17 | -------------------------------------------------------------------------------- /src/simple_sagemaker/constants.py: -------------------------------------------------------------------------------- 1 | LOCAL_STATE_PATH = "/state" 2 | 3 | DEFAULT_INSTANCE_TYPE_TRAINING = "ml.m5.large" 4 | DEFAULT_INSTANCE_TYPE_PROCESSING = "ml.t3.medium" 5 | DEFAULT_INSTANCE_COUNT = 1 6 | DEFAULT_VOLUME_SIZE = 30 # GB 7 | DEFAULT_USE_SPOT = True 8 | DEFAULT_MAX_RUN = 24 * 60 9 | DEFAULT_MAX_WAIT = 0 10 | 11 | DEFAULT_IAM_ROLE = "SageMakerIAMRole" 12 | DEFAULT_IAM_BUCKET_POLICY_SUFFIX = "Policy" 13 | 14 | DEFAULT_REPO_TAG = "latest" 15 | 16 | TEST_LOG_LINE_PREFIX = "-***-" 17 | TEST_LOG_LINE_BLOCK_PREFIX = "*** START " 18 | TEST_LOG_LINE_BLOCK_SUFFIX = "*** END " 19 | 20 | TASK_TYPE_TRAINING = "Training" 21 | TASK_TYPE_PROCESSING = "Processing" 22 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /examples/readme_examples/worker4.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import subprocess 3 | import sys 4 | 5 | from worker_toolkit import worker_lib 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | def listDir(path): 11 | logger.info(f"*** START listing files in {path}") 12 | logger.info( 13 | subprocess.run( 14 | ["ls", "-la", "-R", path], stdout=subprocess.PIPE, universal_newlines=True 15 | ).stdout 16 | ) 17 | logger.info(f"*** END file listing {path}") 18 | 19 | 20 | if __name__ == "__main__": 21 | logging.basicConfig(stream=sys.stdout) 22 | worker_config = worker_lib.WorkerConfig(False) 23 | listDir(worker_config.channel_data) 24 | listDir(worker_config.channel_bucket) 25 | -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_smoke/example7/output/data/ps__elf: -------------------------------------------------------------------------------- 1 | F S UID PID PPID C PRI NI ADDR SZ WCHAN STIME TTY TIME CMD 2 | 4 S root 1 0 1 80 0 - 4939 wait 10:03 pts/0 00:00:00 bash -m start_with_right_hostname.sh train 3 | 4 S root 14 1 54 80 0 - 56730 pipe_w 10:03 pts/0 00:00:00 /opt/conda/bin/python /opt/conda/bin/train 4 | 4 S root 25 14 0 80 0 - 7627 wait 10:03 pts/0 00:00:00 /opt/conda/bin/python shell_launcher.py --SSM_SHELL_CMD_LINE ps -elf >> $SM_OUTPUT_DATA_DIR/ps__elf 5 | 4 S root 26 25 0 80 0 - 5456 wait 10:03 pts/0 00:00:00 /bin/bash -c ps -elf >> $SM_OUTPUT_DATA_DIR/ps__elf 
6 | 4 R root 27 26 0 80 0 - 9040 - 10:03 pts/0 00:00:00 ps -elf 7 | -------------------------------------------------------------------------------- /examples/medium/intro/run3.sh: -------------------------------------------------------------------------------- 1 | BASEDIR=$(dirname "$0") 2 | pushd . 3 | cd $BASEDIR/example3 4 | 5 | ssm run -p ssm-ex -t ex3-1 -s ./code -e ssm_ex3_worker.py \ 6 | -i ./data ShardedByS3Key \ 7 | --iis persons s3://awsglue-datasets/examples/us-legislators/all/persons.json \ 8 | --df "RUN pip3 install pandas==0.25.3 scikit-learn==0.21.3" \ 9 | --repo_name "ex3_repo" --aws_repo_name "ex3_repo" --no_spot \ 10 | --ic 2 --task_type 1 -o ./out3/ex3_1 --force_running 11 | 12 | ssm run -p ssm-ex -t ex3-2 -s ./code -e ssm_ex3_worker.py \ 13 | -d ./external_dependency --iit ex3_1_model ex3-1 model \ 14 | --iit ex3_1_state ex3-1 state ShardedByS3Key \ 15 | -f tensorflow --md "Score" "Score=(.*?);" --tag "MyTag" "MyValue" \ 16 | --ic 2 --task_type 2 -o ./out3/ex3_2 --force_running 17 | 18 | popd -------------------------------------------------------------------------------- /docs/source/simple_sagemaker.rst: -------------------------------------------------------------------------------- 1 | simple\_sagemaker package 2 | ========================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | simple_sagemaker.worker_toolkit 11 | 12 | Submodules 13 | ---------- 14 | 15 | simple\_sagemaker.constants module 16 | ---------------------------------- 17 | 18 | .. automodule:: simple_sagemaker.constants 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | simple\_sagemaker.sm\_project module 24 | ------------------------------------ 25 | 26 | .. automodule:: simple_sagemaker.sm_project 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | Module contents 32 | --------------- 33 | 34 | .. 
automodule:: simple_sagemaker 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | -------------------------------------------------------------------------------- /examples/imagenet/code/extract.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # Expected to be launched with DATA_DIR as first argument 3 | 4 | set -ex # stop and fail if anything stops 5 | 6 | echo "Extracting all..." 7 | cd $1 8 | 9 | extract_and_delete() { 10 | filename=$1 11 | OUTDIR=${filename%.tar} 12 | tar -xf $filename --xform="s|^|$OUTDIR/|S" 13 | rm $filename 14 | } 15 | 16 | for filename in train/*.tar; do 17 | extract_and_delete $filename & 18 | done 19 | 20 | wait 21 | 22 | cd val 23 | # https://github.com/facebookarchive/fb.resnet.torch/blob/master/INSTALL.md 24 | tar -xf ILSVRC2012_img_val.tar 25 | wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash 26 | echo "Done!" 27 | cd .. 28 | echo "Extracted `find train | grep .JPEG | wc -l` train files and `find val | grep .JPEG | wc -l` validation files" -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/home/user/miniconda3/bin/python", 3 | "files.watcherExclude": { 4 | "**/.git/**": true, 5 | "**/.tox/**": true, 6 | "**/node_modules/*/**": true, 7 | "**/__pycache__/**": true, 8 | "**/runs/**": true, 9 | "**/wandbd/run*": true, 10 | }, 11 | "cSpell.words": [ 12 | "CPUS", 13 | "CUDA", 14 | "Posix", 15 | "Sharded", 16 | "Xeon", 17 | "algo", 18 | "awsglue", 19 | "conda", 20 | "cpuinfo", 21 | "cpython", 22 | "drwxr", 23 | "entrypoint", 24 | "hyperparameters", 25 | "pycache", 26 | "pytorch", 27 | "rglob", 28 | "scikit", 29 | "sourcedir", 30 | "tensorflow", 31 | "xlarge" 32 | ], 33 | "python.linting.enabled": true 34 | } 
-------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /examples/imagenet/run_local.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | set -e # stop and fail if anything stops 4 | BASEDIR=$(dirname "$0") 5 | DATA_DIR=${1:-~/proj/data/cv/imagenet} 6 | cd $BASEDIR 7 | 8 | EPOCHS=1 9 | 10 | # Download the code from PyTorch's examples repository 11 | [ -f code/main.py ] || wget -O code/main.py https://raw.githubusercontent.com/pytorch/examples/master/imagenet/main.py 12 | 13 | # Download and extract the data 14 | ./code/download.sh $DATA_DIR 15 | ./code/extract.sh $DATA_DIR 16 | 17 | # Train on a single GPU, $EPOCHS epochs 18 | echo ===== Training $EPOCHS epochs, a single GPU... 
19 | python ./code/main.py --epochs $EPOCHS $DATA_DIR 20 | 21 | # "Distributed training" on 1 GPU, $EPOCHS epochs 22 | echo ===== Training $EPOCHS epochs, distributed, a single GPU... 23 | export MASTER_PORT=8888 24 | export MASTER_ADDR=localhost 25 | python ./code/main.py --multiprocessing-distributed --dist-url env:// --world-size 1 --rank 0 --seed 123 --epochs $EPOCHS $DATA_DIR 26 | -------------------------------------------------------------------------------- /examples/single_task/expected_output/output/algo-1/input_dir_copy/config/init-config.json: -------------------------------------------------------------------------------- 1 | {"inputMode":"FILE","channels":{"data":{"s3DataSource":{"s3DataType":"S3_PREFIX","s3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example_2020-10-04-09-16-51_py37/Task1/input","s3DataDistributionType":"SHARDED_BY_S3_KEY","attributeNames":null},"fileSystemDataSource":null,"compressionType":"NONE","recordWrapper":"NONE","shuffleConfig":null,"inputMode":"FILE","sharded":true}},"checkpointChannel":{"name":"checkpoints","channel":{"s3DataSource":{"s3DataType":"S3_PREFIX","s3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example_2020-10-04-09-16-51_py37/Task1/state","s3DataDistributionType":null,"attributeNames":null},"fileSystemDataSource":null,"compressionType":null,"recordWrapper":null,"shuffleConfig":null,"inputMode":"FILE","sharded":false},"outputPath":"/opt/ml/checkpoints","allowEmpty":true},"hostConfig":{"clusterSize":2,"hostNumber":1},"enableAdditionalPlatformLoggingForCustomer":false,"jobRunInfo":{"jobRunNumber":1}} -------------------------------------------------------------------------------- /examples/imagenet/code/download.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | # Expected to be launched with DATA_DIR as first argument 3 | 4 | set -ex # stop and fail if anything stops 5 | 6 | mkdir -p $1 7 | cd $1 8 | 9 | [ -d ./train ] && rm -r ./train 10 | [ -d ./val ] && rm -r ./val 11 | 12 | apt-get update 13 | apt-get -y --allow-unauthenticated install aria2 14 | download () { 15 | aria2c --summary-interval=30 --conditional-get=true -x 16 -s 16 $1 16 | } 17 | 18 | ### From https://cloud.google.com/tpu/docs/imagenet-setup, please make sure you have the permission to download the files from [Imagenet](http://image-net.org) 19 | echo Downloading to `pwd` 20 | for FILENAME in ILSVRC2012_img_val.tar ILSVRC2012_img_train_t3.tar 21 | do 22 | download http://image-net.org/challenges/LSVRC/2012/dd31405981ef5f776aa17412e1f0c112/${FILENAME} 2>&1 && echo finished downloading $FILENAME & 23 | done 24 | wait 25 | echo "Download finished!" 26 | 27 | echo "Extracting first level..." 28 | tar -xf ILSVRC2012_img_train_t3.tar --xform="s|^|train/|S" & 29 | wait 30 | mv ILSVRC2012_img_val.tar val/ 31 | echo "Done!" 
32 | -------------------------------------------------------------------------------- /docs/high_level_flow.txt: -------------------------------------------------------------------------------- 1 | #https://sequencediagram.org/ asdf asdf asdf 2 | 3 | title High level flow 4 | actor "Client (**runner**)" as c 5 | 6 | database "ECS" as ecs #1da1f2 7 | fontawesome f0a0 "S3" as s3 #1da1f2 8 | control "SageMaker" as sm #1da1f2 9 | fontawesome f233 "EC2 instance 1" as s1 #1da1f2 10 | fontawesome f233 "EC2 instance 2" as s2 #1da1f2 11 | c->ecs: docker image 12 | c->s3: code, data 13 | c->sm: job params 14 | sm<->ecs: download image 15 | sm<->s3: download code, data, state 16 | sm->*s1: start (params, code, data, state) 17 | sm->*s2: start (params, code, data, state) 18 | parallel 19 | s1->s1: run docker image 20 | s2->s2: run docker image 21 | parallel off 22 | parallel 23 | activate s1 #blue 24 | activate s2 #blue 25 | parallel off 26 | parallel 27 | note over s1, s2: The (**worker**) job is running.\nCode, data and state get\n mounted into it. 28 | deactivateafter s1 29 | deactivateafter s2 30 | parallel off 31 | s1->sm: output, model, state 32 | destroysilent s1 33 | s2->sm: output, model, state 34 | destroysilent s2 35 | sm->sm: merge output, model, state 36 | sm->s3: merged output, model, state 37 | c<->s3: download merged output, model, state 38 | 39 | 40 | -------------------------------------------------------------------------------- /examples/imagenet/code/download-all.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | # Expected to be launched with DATA_DIR as first argument 3 | 4 | set -ex # stop and fail if anything stops 5 | 6 | mkdir -p $1 7 | cd $1 8 | 9 | [ -d ./train ] && rm -r ./train 10 | [ -d ./val ] && rm -r ./val 11 | 12 | apt-get update 13 | apt-get -y --allow-unauthenticated install aria2 14 | download () { 15 | aria2c --summary-interval=30 --conditional-get=true -x 16 -s 16 $1 16 | } 17 | 18 | ### From https://cloud.google.com/tpu/docs/imagenet-setup, please make sure you have the permission to download the files from [Imagenet](http://image-net.org) 19 | echo Downloading to `pwd` 20 | for FILENAME in ILSVRC2012_img_train.tar ILSVRC2012_img_val.tar ILSVRC2012_img_train_t3.tar 21 | do 22 | download http://image-net.org/challenges/LSVRC/2012/dd31405981ef5f776aa17412e1f0c112/${FILENAME} 2>&1 && echo finished downloading $FILENAME & 23 | done 24 | wait 25 | echo "Download finished!" 26 | 27 | echo "Extracting first level..." 28 | tar -xf ILSVRC2012_img_train.tar --xform="s|^|train/|S" & 29 | #tar -xf ILSVRC2012_img_train_t3.tar --xform="s|^|train/|S" & 30 | wait 31 | mv ILSVRC2012_img_val.tar val/ 32 | echo "Done!" 33 | -------------------------------------------------------------------------------- /examples/medium/distributed/cifar10/run_remote.sh: -------------------------------------------------------------------------------- 1 | set -e # stop and fail if anything stops 2 | BASEDIR=$(dirname "$0") 3 | pushd . 
import logging
import os
import subprocess
import sys

from dep import ex1_dep  # noqa: F401
from worker_toolkit import worker_lib

logger = logging.getLogger(__name__)


def listDir(path, recursive=True):
    """Log a detailed (`ls -la`) listing of `path`.

    Args:
        path: Directory to list.
        recursive: When True (default), also list subdirectories (`-R`).
    """
    logger.info(f"*** START listing files in {path}")
    cmd_args = ["ls", "-la", path]
    if recursive:
        cmd_args.append("-R")
    # Capture stdout as text so it can be routed through the logger.
    process = subprocess.run(cmd_args, stdout=subprocess.PIPE, universal_newlines=True)
    logger.info(process.stdout)
    logger.info(f"*** END file listing {path}")


if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    logger.info("======= Starting python script ...")

    # Initializes the worker environment; kept for its side effects.
    worker_config = worker_lib.WorkerConfig()

    print("Environ:", os.environ)
    print("Args:", sys.argv)

    listDir("/opt/")
    # Use context managers so the files are flushed and closed deterministically
    # (the original open(...).write(...) relied on GC to close the handles).
    with open(os.environ["SSM_STATE"] + "/state", "wt") as f:
        f.write("state")
    with open(os.environ["SSM_OUTPUT"] + "/output", "wt") as f:
        f.write("output")

    # just to show the final directory structure
    logger.info("finished!")
    # The task is marked as completed
-------------------------------------------------------------------------------- /examples/readme_examples/run_rest.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | #set -e # fail if any test fails 3 | 4 | # Params: [output] [prefix] [suffix] [additional ssm params...] 5 | BASEDIR=$(dirname "$0") 6 | 7 | # Example 2 - passing hyperparams as command line arguments 8 | ssm run --prefix ${2} -p simple-sagemaker-example-cli -t task2${3} -e $BASEDIR/worker2.py --msg "Hello, world!" -o $1/example2 ${@:4} --max_run_mins 15 & 9 | 10 | # Example 3 - outputs 11 | ssm run --prefix ${2} -p simple-sagemaker-example-cli -t task3${3} -e $BASEDIR/worker3.py -o $1/example3 ${@:4} --max_run_mins 15 & 12 | 13 | wait # wait for all processes, to avoid AWS resource limits... :( 14 | 15 | # Example 4 - Inputs, using a local data directory + s3 bucket 16 | ssm run --prefix ${2} -p simple-sagemaker-example-cli -t task4${3} -e $BASEDIR/worker4.py \ 17 | -i $BASEDIR/data --iis bucket s3://awsglue-datasets/examples/us-legislators/all/persons.json \ 18 | --max_run_mins 15 -o $1/example4 ${@:4} & 19 | 20 | # running task3 again 21 | ssm run --prefix ${2} -p simple-sagemaker-example-cli -t task3${3} -e $BASEDIR/worker3.py -o $1/example3_2 ${@:4} --ks > $1/example3_2_stdout --max_run_mins 15 & 22 | 23 | wait # wait for all processes 24 | 25 | # Example 5 - chaining data, using task3's output 26 | ssm run --prefix ${2} -p simple-sagemaker-example-cli -t task5${3} -e $BASEDIR/worker4.py --iit bucket task3 model -o $1/example5 ${@:4} --max_run_mins 15 & 27 | 28 | wait # wait for all processes 29 | -------------------------------------------------------------------------------- /examples/dogs_vs_cats/run_remote.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | set -ex # stop and fail if anything stops 4 | cd `dirname "$0"` 5 | 6 | # Download the code from PyTorch's examples repository 7 | [ -f code/main.py ] || wget -O main.py https://raw.githubusercontent.com/pytorch/examples/master/imagenet/main.py 8 | 9 | # The dogs vs cats DB can be downloaded from 10 | ## Kaggle - https://www.kaggle.com/c/dogs-vs-cats 11 | ## Microsoft - https://www.microsoft.com/en-us/download/details.aspx?id=54765 12 | ## Floyhub - https://www.floydhub.com/fastai/datasets/cats-vs-dogs 13 | 14 | # For simplicity, we currently just download a few sample images out of the full DB 15 | if [ ! -d ./data ]; then 16 | mkdir -p data && cd data 17 | wget -O sample_data.tar "https://www.floydhub.com/api/v1/download/artifacts/data/VbpRSQnFkQmYaBUtwt3aca?is_dir=true&path=sample" 18 | tar xf sample_data.tar && mv valid val && cd .. 19 | fi 20 | 21 | # Train on a single node 22 | # We're as the data set is small (sample data) -i switch makes sense here, other approaches may be better for larger sets. 23 | ssm shell -p cat-vs-dogs -t 1-node -o ./output/output_1node --download_state \ 24 | -i ./data --it ml.p3.2xlarge -d main.py \ 25 | --cmd_line "CODE_DIR=\`pwd\` && cd \$SSM_INSTANCE_STATE && \ 26 | python \$CODE_DIR/main.py --epochs 40 \$SM_CHANNEL_DATA --dist-url env:// --world-size \$SSM_NUM_NODES --rank \$SSM_HOST_RANK --seed 123" & 27 | 28 | # Train on 3 nodes 29 | ssm shell -p cat-vs-dogs -t 3-nodes -o ./output/output_3nodes --download_state \ 30 | -i ./data --it ml.p3.2xlarge -d main.py --ic 3 \ 31 | --cmd_line "CODE_DIR=\`pwd\` && cd \$SSM_INSTANCE_STATE && \ 32 | python \$CODE_DIR/main.py --epochs 40 \$SM_CHANNEL_DATA --dist-url env:// --world-size \$SSM_NUM_NODES --rank \$SSM_HOST_RANK --seed 123" & 33 | 34 | wait 35 | 36 | echo "FINISHED!" 
import logging
import sys
from pathlib import Path

from worker_toolkit import worker_lib

logger = logging.getLogger(__name__)


def task1(worker_config):
    """Task type 1: write per-instance state and process input files into the model dir.

    Each file under the `data` channel is copied to the model directory with a
    `_proc_by_<host>` suffix and an appended marker string.
    """
    # update the state per running instance
    # Path.write_text opens and closes the file for us (no leaked handle,
    # unlike the original open(...).write(...) pattern).
    Path(
        f"{worker_config.instance_state}/state_{worker_config.current_host}"
    ).write_text("state")
    # write to the model output directory
    for file in Path(worker_config.channel_data).rglob("*"):
        if file.is_file():
            relp = file.relative_to(worker_config.channel_data)
            path = Path(worker_config.model_dir) / (
                str(relp) + "_proc_by_" + worker_config.current_host
            )
            path.write_text(
                file.read_text() + " processed by " + worker_config.current_host
            )
    Path(f"{worker_config.model_dir}/output_{worker_config.current_host}").write_text(
        "output"
    )


def task2(worker_config):
    """Task type 2: just log the content of the `task2_data` input channel."""
    logger.info(
        f"Input task2_data: {list(Path(worker_config.channel_task2_data).rglob('*'))}"
    )


def main():
    """Entry point: parse the worker config and dispatch on the `task_type` hyperparam."""
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    logger.info("Starting worker...")
    # parse the arguments
    worker_config = worker_lib.WorkerConfig()

    logger.info(f"Hyperparams: {worker_config.hps}")
    logger.info(
        f"Input data files: {list(Path(worker_config.channel_data).rglob('*'))}"
    )
    logger.info(f"State files: { list(Path(worker_config.state).rglob('*'))}")

    # hyperparameter values arrive as strings; convert before comparing
    if int(worker_config.hps["task_type"]) == 1:
        task1(worker_config)
    elif int(worker_config.hps["task_type"]) == 2:
        task2(worker_config)

    logger.info("finished!")
    # The task is marked as completed


if __name__ == "__main__":
    main()
-------------------------------------------------------------------------------- /examples/retcode/run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | set -ex # fail if any test fails 3 | 4 | cd `dirname "$0"` 5 | 6 | # Args: expected actual msg 7 | assert_eq() { 8 | local expected="$1" 9 | local actual="$2" 10 | local msg 11 | 12 | if [ "$#" -ge 3 ]; then 13 | msg="$3" 14 | fi 15 | 16 | if [ "$expected" == "$actual" ]; then 17 | return 0 18 | else 19 | [ "${#msg}" -gt 0 ] && echo "$expected == $actual :: $msg" || true 20 | return 1 21 | fi 22 | } 23 | 24 | pids=() 25 | expected=() 26 | 27 | # Args: expected command arg1 arg2 ... 28 | run_and_append() { 29 | "${@:2}" & 30 | pids+=($!) 31 | expected+=($1) 32 | } 33 | 34 | 35 | run_and_append 0 ssm process -p exit-tests -t proc-cli-ret-0 --max_run_mins 15 \ 36 | --entrypoint "/bin/bash" -- -c "exit 0" 37 | 38 | run_and_append 1 ssm process -p exit-tests -t proc-cli-ret-1 --max_run_mins 15 \ 39 | --entrypoint "/bin/bash" -- -c "exit 1" 40 | 41 | run_and_append 1 ssm process -p exit-tests -t proc-cli-ret-0-msg --max_run_mins 15 \ 42 | --entrypoint "/bin/bash" -- -c "echo Message >> /opt/ml/output/message && exit 0" & 43 | 44 | run_and_append 1 ssm process -p exit-tests -t proc-cli-ret-1-msg --max_run_mins 15 \ 45 | --entrypoint "/bin/bash" -- -c "echo Message >> /opt/ml/output/message && exit 1" 46 | 47 | 48 | run_and_append 0 ssm shell -p exit-tests -t shel-cli-ret-0-0 \ 49 | --cmd_line "echo \$SSM_HOST_RANK && exit 0" --ic 2 --force_running 50 | 51 | run_and_append 1 ssm shell -p exit-tests -t shel-cli-ret-0-1 \ 52 | --cmd_line "echo \$SSM_HOST_RANK && exit \$SSM_HOST_RANK" --ic 2 --force_running 53 | 54 | run_and_append 1 ssm shell -p exit-tests -t shel-cli-ret-0-0-msg \ 55 | --cmd_line "echo \$SSM_HOST_RANK && echo Message >> /opt/ml/output/failure && exit 0" --ic 2 --force_running 56 | 57 | echo "PIDs" ${pids[@]} 58 | 59 | for i in ${!pids[@]} ;do 60 | wait 
import logging
import os
import shutil
import subprocess
import sys
import time
from pathlib import Path

from worker_toolkit import worker_lib

logger = logging.getLogger(__name__)


def worker():
    """Run the user-supplied command line and return its exit code.

    The command is taken from the hyperparameters: `SSM_CMD_LINE` is executed
    directly, `SSM_SHELL_CMD_LINE` is executed through /bin/bash.

    Returns:
        The subprocess return code, or 1 when no command hyperparameter is
        present (previously this path crashed with an unbound `shell_cmd`).
    """
    logging.basicConfig(stream=sys.stdout)

    # Parse the arguments + initialize state
    worker_config = worker_lib.WorkerConfig()

    # Delete the current file + toolkit as both got injected
    os.remove(__file__)
    shutil.rmtree("./worker_toolkit")

    # Run the shell / cmd line command
    if "SSM_CMD_LINE" in worker_config.hps:
        cmd_line = worker_config.hps["SSM_CMD_LINE"]
        logger.info(f"Launching: {cmd_line}")
        shell_cmd = subprocess.run(cmd_line)
    elif "SSM_SHELL_CMD_LINE" in worker_config.hps:
        cmd_line = worker_config.hps["SSM_SHELL_CMD_LINE"]
        logger.info(f"Launching a shell: {cmd_line}")
        shell_cmd = subprocess.run(cmd_line, shell=True, executable="/bin/bash")
    else:
        # Bug fix: without this branch `shell_cmd` was unbound below,
        # raising NameError instead of reporting a usable error.
        logger.error("No SSM_CMD_LINE / SSM_SHELL_CMD_LINE hyperparameter found!")
        return 1

    logger.info(f"finished with {shell_cmd.returncode} return code!")

    # wait_for_state_sync(worker_config)
    return shell_cmd.returncode


def wait_for_state_sync(worker_config):
    """Wait until the state directory's newest mtime stops changing.

    Polls every `wait_secs` seconds for up to `max_secs`, returning as soon
    as no file under the state dir has been modified since the last poll.
    Logs a warning if the timeout elapses while files are still changing.
    """
    max_secs = 60 * 5  # 5 mins max
    wait_secs = 5
    state_path = Path(worker_config.state)
    # default=0 guards an empty state dir: max() on an empty sequence
    # raises ValueError otherwise.
    max_change_time = max(map(os.path.getmtime, state_path.rglob("*")), default=0)
    for _ in range(max_secs // wait_secs):
        time.sleep(wait_secs)
        new_max = max(map(os.path.getmtime, state_path.rglob("*")), default=0)
        if new_max == max_change_time:
            return
        max_change_time = new_max
    logger.warning(
        f"It seems like sage maker is still uploading after {max_secs} secs..."
    )


if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    retcode = worker()
    sys.exit(retcode)
It is independent of forward 30 | x, y = batch 31 | x = x.view(x.size(0), -1) 32 | z = self.encoder(x) 33 | x_hat = self.decoder(z) 34 | loss = F.mse_loss(x_hat, x) 35 | self.log("train_loss", loss) 36 | return loss 37 | 38 | def configure_optimizers(self): 39 | optimizer = torch.optim.Adam(self.parameters(), lr=1e-3) 40 | return optimizer 41 | 42 | 43 | dataset = MNIST(os.getcwd(), download=True, transform=transforms.ToTensor()) 44 | dataset = torch.utils.data.Subset(dataset, range(1000)) 45 | train, val = random_split(dataset, [800, 200]) 46 | 47 | autoencoder = LitAutoEncoder() 48 | checkpoint_callback = ModelCheckpoint( 49 | monitor="train_loss", 50 | filepath="/state/checkpoints/sample-mnist-{epoch:02d}-{val_loss:.2f}", 51 | save_top_k=3, 52 | ) 53 | trainer = pl.Trainer( 54 | default_root_dir="/opt/ml/output/tensorboard", 55 | checkpoint_callback=checkpoint_callback, 56 | ) 57 | trainer.fit(autoencoder, DataLoader(train), DataLoader(val)) 58 | -------------------------------------------------------------------------------- /examples/processing_cli/run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | set -ex # fail if any test fails 3 | 4 | # Params: [output] [prefix] [suffix] [additional ssm params...] 5 | cd `dirname "$0"` 6 | echo "Running with", -- $1 -- $2 -- $3 -- $4 -- $5 7 | 8 | # Example 1 - a processing script + dependencies 9 | ssm process --prefix ${2} -p ssm-example-processing -t cli-code${3} -o $1/output1 \ 10 | --download_state --download_output --max_run_mins 15 \ 11 | --code ex1.py --dependencies ./dep ${@:4} \ 12 | -- arg1 -arg2 --arg3 "argument 4" & 13 | pid1=$! 
14 | 15 | # Example 2 - a raw entrypoint with arguments 16 | ssm process --prefix ${2} -p ssm-example-processing -t cli-shell${3} -o $1/output2 \ 17 | --download_state --download_output --max_run_mins 15 \ 18 | --entrypoint "/bin/bash" --dependencies ./dep --force_running \ 19 | -- -c "echo ==Bash && \ 20 | echo \"-***- Args:\"\$@ &&echo \"-- Env:\"\`env\`&& \ 21 | echo \"*** START listing files\"&&ls -laR /opt&&echo \"*** END \"&& \ 22 | cp -r /opt/ml/config \$SSM_OUTPUT/config&& \ 23 | echo output>\$SSM_OUTPUT/output&& \ 24 | echo state>\$SSM_STATE/state" & 25 | 26 | 27 | # Example 3 - a bash script that gets the output and state of cli-code as input 28 | wait $pid1 29 | ssm process --prefix ${2} -p ssm-example-processing -t cli-bash${3} -o $1/output3 \ 30 | --download_state --command bash --download_output --max_run_mins 15 \ 31 | -i ./data --iit cli_code_output cli-code${3} output --iit cli_code_state cli-code${3} state \ 32 | --code ex3.sh --dependencies ./dep ${@:4} \ 33 | -- arg1 -arg2 --arg3 "argument 4" & 34 | 35 | # Example 3 - a shell training ecript that gets the output and state of cli-code as input 36 | ssm shell --prefix ${2} -p ssm-example-processing -t shell-task${3} -o $1/output4 \ 37 | --iit cli_code_output cli-code${3} output --iit cli_code_state cli-code${3} state \ 38 | --cmd_line "echo '*** START listing files in /opt/ml' && ls -laR /opt/ml && echo '*** END file listing /opt/ml'" \ 39 | --max_run_mins 15 ${@:4} & 40 | 41 | # --it ml.t3.medium 42 | 43 | wait # wait for all processes 44 | 45 | # Run: 46 | # tox -e bash -- ./run.sh ./output " " " " --cs -------------------------------------------------------------------------------- /examples/processing_cli/expected_output/output2/output/config/processingjobconfig.json: -------------------------------------------------------------------------------- 1 | 
{"ProcessingJobArn":"arn:aws:sagemaker:us-east-1:XXXXXXXXXXXX:processing-job/cli-shell-2020-10-06-23-25-34-3bnesrat","ProcessingJobName":"cli-shell-2020-10-06-23-25-34-3bnESRAt","Environment":{"SSM_OUTPUT":"/opt/ml/processing/output","SSM_STATE":"/opt/ml/processing/state"},"AppSpecification":{"ImageUri":"683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:0.20.0-cpu-py3","ContainerEntrypoint":["/bin/bash"],"ContainerArguments":["-c","echo '======= Bash script ...' \u0026\u0026 echo 'Args:' $@ \u0026\u0026 echo Env: `env` \u0026\u0026 pwd \u0026\u0026 ls -laR /opt \u0026\u0026 cp -r /opt/ml/config $SSM_OUTPUT/config \u0026\u0026 echo 'output' \u003e $SSM_OUTPUT/output \u0026\u0026 echo 'state' \u003e $SSM_STATE/state"]},"ProcessingInputs":[{"InputName":"DEP_dep","S3Input":{"LocalPath":"/opt/ml/processing/input/code/dep","S3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/cli-shell-2020-10-06-23-25-34-3bnESRAt/input/DEP_dep","S3DataDistributionType":"FullyReplicated","S3DataType":"S3Prefix","S3InputMode":"File","S3CompressionType":"None","S3DownloadMode":"StartOfJob"}},{"InputName":"DEP_worker_toolkit","S3Input":{"LocalPath":"/opt/ml/processing/input/code/worker_toolkit","S3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/cli-shell-2020-10-06-23-25-34-3bnESRAt/input/DEP_worker_toolkit","S3DataDistributionType":"FullyReplicated","S3DataType":"S3Prefix","S3InputMode":"File","S3CompressionType":"None","S3DownloadMode":"StartOfJob"}}],"ProcessingOutputConfig":{"Outputs":[{"OutputName":"state","S3Output":{"LocalPath":"/opt/ml/processing/state","S3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/ssm-example-processing/cli-shell/state","S3UploadMode":"Continuous"}},{"OutputName":"output","S3Output":{"LocalPath":"/opt/ml/processing/output","S3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/ssm-example-processing/cli-shell/cli-shell-2020-10-06-23-25-34-3bnESRAt/output","S3UploadMode":"EndOfJob"}}],"KmsKeyId":null},"ProcessingResources":{"ClusterConfig":{"InstanceCount":1,"Instanc
eType":"ml.m5.large","VolumeSizeInGB":30,"VolumeKmsKeyId":null}},"RoleArn":"arn:aws:iam::XXXXXXXXXXXX:role/SageMakerIAMRole","StoppingCondition":{"MaxRuntimeInSeconds":900}} -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python: Current File", 9 | "type": "python", 10 | "request": "launch", 11 | "program": "${file}", 12 | "console": "integratedTerminal", 13 | "env": { 14 | "PYTHONPATH": "${workspaceFolder}/src" 15 | }, 16 | "cwd": "${fileDirname}" 17 | }, 18 | { 19 | "name": "Python: Cli", 20 | "type": "python", 21 | "request": "launch", 22 | "program": "${workspaceFolder}/src/cli_launcher.py", 23 | "console": "integratedTerminal", 24 | "cwd": "${fileDirname}", 25 | "justMyCode": false, 26 | "env": { 27 | "PYTHONPATH": "${workspaceFolder}/src" 28 | }, 29 | "args2": [ 30 | "shell", 31 | "-p", "tests/aaa", 32 | //"--instance_type", "ml.p3.2xlarge", 33 | "-t", "task2", 34 | "--cmd_line", "echo 222 333", 35 | "--dir_files", "${workspaceFolder}/examples/readme_examples/data", 36 | //"-e", "${workspaceFolder}/examples/readme_examples/worker3.py", 37 | "-o", "${workspaceFolder}/output", 38 | "--no_spot", 39 | "-f", "tensorflow", 40 | "-m", 41 | "--md", "Score", "Score=(.*?);", 42 | "--tag", "MyTag", "MyValue", 43 | //"--iis", "bucket", "s3://awsglue-datasets/examples/us-legislators/all/persons.json" 44 | //"--iit", "bucket", "task3", "model" 45 | //"--cs", 46 | "--ks", 47 | "--aa", "bb", 48 | "--cc", "dd", 49 | ], 50 | "args": ["shell", "-p", "shell-cli", "-t", "shell-cli-task22", "--cmd_line", "ls -la", "-o", "./output", "--local", "--it", "local", "--no_spot"], 51 | 
import logging
import os
import shutil
import subprocess
import sys

from worker_toolkit import worker_lib

logger = logging.getLogger(__name__)


def listDir(path, recursive=True):
    """Log a detailed (`ls -la`) listing of `path` (recursive by default)."""
    logger.info(f"*** START listing files in {path}")
    cmd_args = ["ls", "-la", path]
    if recursive:
        cmd_args.append("-R")
    # Capture stdout as text so it can be routed through the logger.
    process = subprocess.run(cmd_args, stdout=subprocess.PIPE, universal_newlines=True)
    logger.info(process.stdout)
    logger.info(f"*** END file listing {path}")


def logBefore(worker_config):
    """Log the arguments, environment and initial directory structure."""
    # show the given arguments and environment
    logger.info(f"Argv: {sys.argv}")
    logger.info(f"Env: {os.environ}")
    # just to show the initial directory structure
    listDir("/opt/ml")
    listDir(worker_config.state)


def logAfter(worker_config):
    """Log the final directory structure."""
    listDir("/opt/ml")
    listDir(worker_config.state)


if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout)
    logger.info("Starting algo...")

    # parse the arguments
    worker_config = worker_lib.WorkerConfig()
    logBefore(worker_config)

    output_data_dir = os.path.join(
        worker_config.output_data_dir, worker_config.current_host
    )

    # create some data in the state dir
    if worker_config.hps["stage"] == 1:
        # put some files in the state directory; `with` closes each file
        # deterministically (the original open(...).write(...) leaked handles)
        for i in range(10):
            with open(
                f"{worker_config.instance_state}/state_{worker_config.current_host}_{i+1}",
                "wt",
            ) as f:
                f.write("state")

        # put something in the model
        modelDir = os.path.join(worker_config.model_dir, worker_config.current_host)
        os.makedirs(modelDir, exist_ok=True)
        with open(f"{modelDir}/model_dir", "wt") as f:
            f.write("model_dir")

    elif worker_config.hps["stage"] == 2:
        logger.info("Doing nothing...")

    # copy all input channels to the output dir
    for channel_name in worker_config.channels:
        # use the getattr() builtin rather than invoking __getattr__ directly
        input_dir = getattr(worker_config, f"channel_{channel_name}")
        shutil.copytree(input_dir, f"{output_data_dir}/{channel_name}_copy")
    shutil.copytree(worker_config.state, f"{output_data_dir}/state_copy")

    logger.info("finished!")
    logAfter(worker_config)
    # The task is marked as completed
roject","Value":"tests/simple-sagemaker-example_2020-10-04-09-16-51_py37"},{"Key":"SimpleSagemakerCallingModule","Value":"/home/runner/work/simple_sagemaker/simple_sagemaker/examples/single_task/example.py"},{"Key":"SimpleSagemakerTask","Value":"Task1"},{"Key":"SimpleSagemakerVersion","Value":"0.9.19"}],"BaseTags":[{"Key":"SimpleSagemakerProject","Value":"tests/simple-sagemaker-example_2020-10-04-09-16-51_py37"},{"Key":"SimpleSagemakerCallingModule","Value":"/home/runner/work/simple_sagemaker/simple_sagemaker/examples/single_task/example.py"},{"Key":"SimpleSagemakerTask","Value":"Task1"},{"Key":"SimpleSagemakerVersion","Value":"0.9.19"}],"TrainingJobName":"Task1-2020-10-04-09-17-37-x6ux770b","StoppingCondition":{"MaxRuntimeInSeconds":900,"MaxWaitTimeInSeconds":900},"AlgorithmSpecification":{"MetricDefinitions":[],"TrainingImage":"XXXXXXXXXXXX.dkr.ecr.us-east-1.amazonaws.com/task_repo:latest","TrainingInputMode":"File"},"TrainingJobArn":"arn:aws:sagemaker:us-east-1:XXXXXXXXXXXX:training-job/task1-2020-10-04-09-17-37-x6ux770b","DebugHookConfig":{"S3OutputPath":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example_2020-10-04-09-16-51_py37/Task1/Task1-2020-10-04-09-17-37-x6ux770b/debug-output","LocalPath":"/opt/ml/output/tensors"},"CheckpointConfig":{"S3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example_2020-10-04-09-16-51_py37/Task1/state","LocalPath":"/state"}} -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
# -- Path setup --------------------------------------------------------------
# Make the package importable so autodoc can document it.
import os
import sys

sys.path.insert(0, os.path.abspath("../../src/simple_sagemaker/"))

import sphinx_rtd_theme  # noqa: E402,F401

# -- Project information -----------------------------------------------------

project = 'simple-sagemaker'
copyright = '2020, Ariel Shiftan'
author = 'Ariel Shiftan'

# The full version, including alpha/beta/rc tags.
# NOTE(review): other artifacts in the repo mention 0.9.19 — confirm this is
# kept in sync with the package version.
release = '0.9.11'


# -- General configuration ---------------------------------------------------

# Sphinx extension modules, built-in or custom.
extensions = [
    "sphinx_rtd_theme",
    'sphinx.ext.autodoc',
    "sphinx.ext.viewcode"
]

# Paths that contain templates, relative to this directory.
templates_path = ['_templates']

# Patterns, relative to source directory, to ignore when looking for sources.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages.
import logging
import os
import shutil
import subprocess
import sys

logger = logging.getLogger(__name__)


def listDir(path, recursive=True):
    """Log an `ls -la` (plus `-R` when *recursive*) listing of *path*."""
    logger.info(f"*** START listing files in {path}")
    cmd = ["ls", "-la", path]
    if recursive:
        cmd += ["-R"]
    listing = subprocess.run(cmd, stdout=subprocess.PIPE, universal_newlines=True)
    logger.info(listing.stdout)
    logger.info(f"*** END file listing {path}")


def logBefore(worker_config):
    """Log argv/env, a requirements-installed library, and the initial layout."""
    logger.info(f"Argv: {sys.argv}")
    logger.info(f"Env: {os.environ}")
    # `transformers` is imported at module level; logging it proves that
    # requirements.txt was installed in the container
    logger.info(f"transformers: {transformers}")
    listDir("/opt/ml")
    listDir(worker_config.state)


def logAfter(worker_config):
    """Log the final directory layout."""
    listDir("/opt/ml")
    listDir(worker_config.state)
if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout)
    logger.info("Starting algo...")

    # parse the arguments / environment prepared by simple_sagemaker
    worker_config = worker_lib.WorkerConfig()

    # importing internal and external dependencies
    from external_dependency import lib1  # noqa: F401
    from internal_dependency import lib2  # noqa: F401

    logBefore(worker_config)

    # copy the entire input dir to a per-host output dir
    output_data_dir = os.path.join(
        worker_config.output_data_dir, worker_config.current_host
    )
    shutil.copytree(worker_config.input_dir, f"{output_data_dir}/input_dir_copy")
    # copy state dir
    shutil.copytree(worker_config.state, f"{output_data_dir}/state_copy")
    # create a file (with `with` so the handle closes deterministically —
    # the original leaked the handles and had a "cteaye" typo here)
    with open(f"{output_data_dir}/output_data_dir", "wt") as f:
        f.write("output_data_dir")

    # create one file in the output dir
    output_dir = os.path.join(worker_config.output_dir, worker_config.current_host)
    os.makedirs(output_dir, exist_ok=True)
    with open(f"{output_dir}/output_dir", "wt") as f:
        f.write("output_dir")

    # create one file in the output model dir
    modelDir = os.path.join(worker_config.model_dir, worker_config.current_host)
    os.makedirs(modelDir, exist_ok=True)
    with open(f"{modelDir}/model_dir", "wt") as f:
        f.write("model_dir")

    # per-instance state file
    with open(
        f"{worker_config.instance_state}/state_{worker_config.current_host}", "wt"
    ) as f:
        f.write(f"state_{worker_config.current_host}")

    # just to show the final directory structure
    logger.info("finished!")
    logAfter(worker_config)
    # The task is marked as completed once the entry point exits successfully
| import sys 7 | from time import time 8 | 9 | import boto3 10 | 11 | from ..system.compare_outputs import isAsExpected 12 | 13 | file_path = os.path.split(__file__)[0] 14 | examples_path = os.path.abspath(os.path.join(file_path, "..", "..", "examples")) 15 | 16 | 17 | def test_project(caplog, tmp_path): 18 | caplog.set_level(logging.INFO) 19 | logging.info("test_project") 20 | 21 | from simple_sagemaker.sm_project import SageMakerProject 22 | 23 | sm_project = SageMakerProject(project_name="test") 24 | sm_project = sm_project 25 | 26 | 27 | def test_task(caplog, tmp_path): 28 | caplog.set_level(logging.INFO) 29 | logging.info("test_task") 30 | 31 | from simple_sagemaker.sm_task import SageMakerTask 32 | 33 | boto3_session = boto3.Session() 34 | image_uri = None 35 | smTask = SageMakerTask(boto3_session, "taskName", image_uri, prefix="tests/smoke") 36 | smTask = smTask 37 | 38 | 39 | def _testCliInternal(cmd): 40 | shell_cmd = subprocess.run(cmd, shell=True) 41 | print("**************", shell_cmd) 42 | assert shell_cmd.returncode == 0 43 | 44 | 45 | def test_cli_help(): 46 | _testCliInternal("ssm -h") 47 | 48 | 49 | def test_cli_run_help(): 50 | _testCliInternal("ssm run -h") 51 | 52 | 53 | def test_cli_shell_help(): 54 | _testCliInternal("ssm shell -h") 55 | 56 | 57 | def test_cli_data_help(): 58 | _testCliInternal("ssm data -h") 59 | 60 | 61 | def _internalTestCli(test_path, caplog, tmp_path): 62 | caplog.set_level(logging.INFO) 63 | print("Temp path:", tmp_path) 64 | print("Running cli:", test_path) 65 | 66 | output_path = os.path.join(tmp_path, test_path, "output_smoke") 67 | # remove current local output 68 | shutil.rmtree(output_path, ignore_errors=True) 69 | # prefix/suffix for project name 70 | py_version_string = f"py{sys.version_info.major}{sys.version_info.minor}" 71 | time_string = int(time()) 72 | postfix = f"-{os.name}-{time_string}-{py_version_string}" 73 | prefix = "tests_smoke/" 74 | 75 | if platform.system() == "Linux": 76 | run_shell = 
#! /bin/bash
# Arguments: [PARTIAL_DATA flag] — pass `true` to use the partial dataset.

set -e # stop and fail if anything stops
cd `dirname "$0"`
PARTIAL_DATA=$1
# Choose the partial ("download") or full ("download-all") data-source task.
data_source=$( [ "$PARTIAL_DATA" == true ] && echo download || echo download-all )
echo "*** Using data source: $data_source"

# Download the code from PyTorch's examples repository (only once)
[ -f code/main.py ] || wget -O code/main.py https://raw.githubusercontent.com/pytorch/examples/master/imagenet/main.py

# Download the data with a SageMaker processing task
ssm process -p ex-imagenet -t $data_source -v 400 \
    --entrypoint "/bin/bash" --dependencies ./code \
    -o ./output/$data_source \
    -- -c "bash /opt/ml/processing/input/code/code/$data_source.sh \$SSM_OUTPUT/data"

run_training () { # args: task_name, instance_type, additional_command_params, [description] [epochs] [additional_args]
    EPOCHS=${5:-10} # 10 epochs by default (BUG FIX: the comment used to claim 20)
    ADDITIONAL_ARGS=${6:-"--no_spot --force_running --cs"} #

    echo ===== Training $EPOCHS epochs, $4...
    ssm shell -p ex-imagenet -t $1 --dir_files ./code -o ./output/$1 -v 280 \
        --iit train $data_source output FullyReplicated data/train \
        --iit val $data_source output FullyReplicated data/val \
        --md "loss" "Epoch:.*Loss\s+([e\-+0-9\\.]*) \(" --md "acc1" "Epoch:.*Acc@1\s+([e\-+0-9\\.]*) \(" --md "acc5" "Epoch:.*Acc@5\s+([e\-+0-9\\.]*) \(" \
        --md "time" "Epoch:.*Time\s+([e\-+0-9\\.]*) \(" --md "data_time" "Epoch:.*Data\s+([e\-+0-9\\.]*) \(" \
        --md "test_loss" "Test:.*Loss\s+([e\-+0-9\\.]*) \(" --md "test_acc1" "Test:.*Acc@1\s+([e\-+0-9\\.]*) \(" --md "test_acc5" "Test:.*Acc@5\s+([e\-+0-9\\.]*) \(" \
        --download_model --download_output --download_state \
        --it $2 $ADDITIONAL_ARGS \
        --cmd_line "./extract.sh \$SM_CHANNEL_TRAIN/.. && \
            CODE_DIR=\`pwd\` && cd \$SSM_INSTANCE_STATE && START=\$SECONDS && \
            python \$CODE_DIR/main.py --epochs $EPOCHS --resume checkpoint.pth.tar --workers 8 \$SM_CHANNEL_TRAIN/.. $3 2>&1 && \
            echo Total time: \$(( SECONDS - START )) seconds"

    # NOTE(review): this runs in a backgrounded subshell, so `exit` only
    # terminates that subshell with ssm's status — confirm this is intended.
    exit $?
}

# Each configuration is trained in the background; `wait` joins them all.
DESC="a single GPU"
run_training train-1gpu ml.p3.2xlarge "" "$DESC" &
DESC="distributed training, a single GPU"
run_training train-dist-1gpu ml.p3.2xlarge "--multiprocessing-distributed --dist-url env:// --world-size 1 --rank 0 --seed 123" "$DESC" &
DESC="distributed training, 8 GPUs"
run_training train-dist-8gpus ml.p2.8xlarge "--multiprocessing-distributed --dist-url env:// --world-size 1 --rank 0 --seed 123" "$DESC" &
DESC="distributed training, 3 instances, total 3 GPUs"
run_training train-dist-3nodes-3gpus ml.p3.2xlarge '--multiprocessing-distributed --dist-url env:// --world-size $SSM_NUM_NODES --rank $SSM_HOST_RANK --seed 123' "$DESC" \
    "" "--no_spot --ic 3 --force_running --cs" &

wait
echo "FINISHED!"
#! /bin/bash
set -e # fail if any test fails

# Params: [output] [prefix] [suffix] [additional ssm params...]
BASEDIR=$(dirname "$0")
echo "Running with", $@

# Example 1 - hello world
ssm run --prefix ${2} -p simple-sagemaker-example-cli -t cli-task1${3} -e $BASEDIR/worker1.py -o $1/example1 --it ml.p3.2xlarge --no_spot --ic 2 ${@:4} --max_run_mins 15 &

# Example 6_1 - a complete example, part 1:
#   * local data folder as input, sharded across instances (-i, ShardedByS3Key)
#   * a public s3 bucket as an additional input (--iis)
#   * a custom docker image (--df, --repo_name, --aws_repo_name)
#   * a `task_type` hyperparameter, 2 instances (--ic), on-demand (--no_spot)
ssm run --prefix ${2} -p simple-sagemaker-example-cli -t cli-task6-1${3} -s $BASEDIR/example6/code -e worker6.py \
    -i $BASEDIR/example6/data ShardedByS3Key --iis persons s3://awsglue-datasets/examples/us-legislators/all/persons.json \
    --df $BASEDIR/example6 --repo_name "task6_repo" --aws_repo_name "task6_repo" --no_spot \
    --download_state --download_model --download_output --max_run_mins 15 \
    --ic 2 --task_type 1 -o $1/example6_1 ${@:4} &

wait # wait for all processes

# Shell example
ssm shell --prefix ${2} -p simple-sagemaker-example-cli -t shell-task${3} --cmd_line "cat /proc/cpuinfo && nvidia-smi" -o $1/example_cmd --it ml.p3.2xlarge ${@:4} --max_run_mins 15 &

# Example 6_2 - a complete example part 2.
30 | # - Uses outputs from part 1 (--iit) 31 | # - Uses additional local code dependencies (-d) 32 | # - Uses the tensorflow framework as pre-built image (-f) 33 | # - Tags the jobs (--tag) 34 | # - Defines sagemaker metrics (-m, --md) 35 | ssm run --prefix ${2} -p simple-sagemaker-example-cli -t cli-task6-2${3} -s $BASEDIR/example6/code -e worker6.py \ 36 | -d $BASEDIR/example6/external_dependency --iit task_6_1_model cli-task6-1${3} model --iit task_6_1_state cli-task6-1${3} state ShardedByS3Key \ 37 | -f tensorflow --md "Score" "Score=(.*?);" --tag "MyTag" "MyValue" \ 38 | --download_state --download_model --download_output --max_run_mins 15 \ 39 | --ic 2 --task_type 2 -o $1/example6_2 ${@:4} & 40 | 41 | wait # wait for all processes 42 | 43 | # Run task6_1 again 44 | # The rest of arguments ${@:4} (specifying --force_running) aren't passed here, to demonstrate that existing output is used, without running the task again 45 | ssm run --prefix ${2} -p simple-sagemaker-example-cli -t cli-task6-1${3} -s $BASEDIR/example6/code -e worker6.py \ 46 | -i $BASEDIR/example6/data ShardedByS3Key --iis persons s3://awsglue-datasets/examples/us-legislators/all/persons.json \ 47 | --df $BASEDIR/example6 --repo_name "task6_repo" --aws_repo_name "task6_repo" \ 48 | --download_state --download_model --download_output --max_run_mins 15 \ 49 | --ic 2 --task_type 1 -o $1/example6_1 > $1/example6_1_2_stdout & 50 | 51 | wait # wait for all processes 52 | -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_rest/example3_2_stdout: -------------------------------------------------------------------------------- 1 | INFO:simple_sagemaker.cli:Running ssm cli, args:['/home/user/proj/simple_sagemaker/.tox/single_proc/bin/ssm', '-p', 'tests/simple-sagemaker-example-cli_2020-09-14-14-41-14_py37', '-t', 'task3', '-e', '/home/user/proj/simple_sagemaker/examples/readme_examples/worker3.py', '-o', 
'/home/user/proj/simple_sagemaker/.tox/single_proc/tmp/test_readme_examples0/output/example3_2', '--cs', '--ks'] 2 | INFO:simple_sagemaker.cli:Parsed arguments:Namespace(aws_repo_name=None, bucket_name=None, clean_state=False, config_file=None, dependencies=None, docker_file=None, entry_point='/home/user/proj/simple_sagemaker/examples/readme_examples/worker3.py', image_tag='latest', input_path=None, input_s3=None, input_task=None, instance_count=1, instance_type='ml.m5.large', max_run_mins=86400, max_wait_mins=86400, output_path='/home/user/proj/simple_sagemaker/.tox/single_proc/tmp/test_readme_examples0/output/example3_2', project_name='tests/simple-sagemaker-example-cli_2020-09-14-14-41-14_py37', repo_name=None, source_dir=None, task_name='task3', use_spot_instances=True, volume_size=30) 3 | INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials 4 | INFO:simple_sagemaker.iam_utils:Creating SageMaker IAM Role: SageMakerIAMRole with an attached AmazonSageMakerFullAccess policy... 5 | INFO:root:Using a pre-built image None... 
6 | INFO:simple_sagemaker.sm_project:result: {'ResponseMetadata': {'RequestId': '9617559CA710A258', 'HostId': 'Q/gQPYgU189C1j0/zcbDa1fzwI7Q3v9ftSyNYJGUWLEYMAOmFiGVIaI378G00ubguE0yiB7PAdI=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'Q/gQPYgU189C1j0/zcbDa1fzwI7Q3v9ftSyNYJGUWLEYMAOmFiGVIaI378G00ubguE0yiB7PAdI=', 'x-amz-request-id': '9617559CA710A258', 'date': 'Mon, 14 Sep 2020 14:45:45 GMT', 'x-amz-bucket-region': 'us-east-1', 'content-type': 'application/xml', 'transfer-encoding': 'chunked', 'server': 'AmazonS3'}, 'RetryAttempts': 0}, 'IsTruncated': False, 'Marker': '', 'Contents': [{'Key': 'tests/simple-sagemaker-example-cli_2020-09-14-14-41-14_py37/task3/state/state_dir', 'LastModified': datetime.datetime(2020, 9, 14, 14, 44, 39, tzinfo=tzlocal()), 'ETag': '"4a5376727cd09607cd5b6ea1805c7e48"', 'Size': 14, 'StorageClass': 'STANDARD', 'Owner': {'DisplayName': 'ariel.shiftan', 'ID': '899dac056bb9e5cef98c1aca7f7ba6b6674a8bae0a2841f34e51043d93f9aa4b'}}], 'Name': 'sagemaker-us-east-1-XXXXXXXXXXXX', 'Prefix': 'tests/simple-sagemaker-example-cli_2020-09-14-14-41-14_py37/task3/state/', 'Delimiter': '/', 'MaxKeys': 1000, 'CommonPrefixes': [{'Prefix': 'tests/simple-sagemaker-example-cli_2020-09-14-14-41-14_py37/task3/state/algo-1/'}], 'EncodingType': 'url'} 7 | INFO:simple_sagemaker.sm_project:CommonPrefixes: [{'Prefix': 'tests/simple-sagemaker-example-cli_2020-09-14-14-41-14_py37/task3/state/algo-1/'}] 8 | INFO:simple_sagemaker.sm_project:subdirs: ['algo-1'] 9 | INFO:simple_sagemaker.sm_project:Task task3 is already completed by task3-2020-09-14-14-41-19-l7bXfIZg 10 | INFO:simple_sagemaker.sm_task:Downloading results to /home/user/proj/simple_sagemaker/.tox/single_proc/tmp/test_readme_examples0/output/example3_2 11 | -------------------------------------------------------------------------------- /src/simple_sagemaker/s3_sync.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | 
from hashlib import md5
from pathlib import Path

logger = logging.getLogger(__name__)


class S3Sync:
    """Incrementally upload a local folder to an S3 bucket/prefix.

    A file is skipped when an object with the same relative key, the same
    size and the same (single-part) MD5/ETag already exists.
    """

    def __init__(self, boto3_sessions):
        self.s3_client = boto3_sessions.client("s3")

    def syncFolderToS3(self, source: str, dest: str, prefix: str) -> None:
        """Upload files under *source* to s3://<dest>/<prefix>, skipping unchanged ones.

        BUG FIX: the return annotation used to be `[str]` (not a valid type,
        and the method returns nothing) — corrected to None.
        """
        paths = self.listFolderFiles(source)
        objects = self.listS3Bucket(dest, prefix)

        # Map relative key -> object metadata. A dict lookup is simpler and
        # safer than the previous parallel sorted-list + bisect approach,
        # which silently relied on the two lists staying index-aligned.
        objects_by_key = {obj["Key"][len(prefix) + 1 :]: obj for obj in objects}

        for path in paths:
            file_name = os.path.join(source, path)
            should_upload = True
            obj = objects_by_key.get(path)
            if obj is not None:
                # Check size first (cheap), then content.
                if os.stat(file_name).st_size == obj["Size"]:
                    # NOTE(review): an S3 ETag equals the MD5 only for
                    # single-part, non-KMS uploads; multipart objects will
                    # always be re-uploaded here.
                    with open(file_name, "rb") as f:
                        md = md5(f.read()).hexdigest()
                    if obj["ETag"].strip('"') == md:
                        should_upload = False

            if should_upload:
                logger.info(f"Uploading {file_name}")
                self.s3_client.upload_file(
                    str(Path(source).joinpath(path)),
                    Bucket=dest,
                    Key=prefix + "/" + path,
                )
            else:
                logger.info(f"Skipping {file_name}")

    def listS3Bucket(self, bucket, prefix):
        """Return all objects under *prefix*; [] when the bucket/prefix is empty."""
        res = []
        paginator = self.s3_client.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
            # A page without a "Contents" key just means no objects.
            # BUG FIX: the old KeyError handler returned [], discarding the
            # results accumulated from all earlier pages.
            res.extend(page.get("Contents", []))
        return res

    @staticmethod
    def listFolderFiles(folder_path):
        """Recursively list all files (as relative paths) within *folder_path*."""
        folder_path = folder_path.rstrip("/")
        return [
            str(x.relative_to(folder_path))
            for x in Path(folder_path).rglob("*")
            if not x.is_dir()
        ]


if __name__ == "__main__":
    # Ad-hoc smoke test. boto3 is imported lazily so the module itself stays
    # importable without it.
    import boto3

    boto3_session = boto3.Session()
    s = S3Sync(boto3_session)
    path = ".."
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    logger.info(f"listing {path}: {s.listFolderFiles(path)}")
f"state_{worker_config.current_host}" 36 | ) 37 | # "process" input data into model output 38 | for file in Path(worker_config.channel_data).rglob("*"): 39 | relp = file.relative_to(worker_config.channel_data) 40 | path = Path(worker_config.model_dir) / ( 41 | f"{relp}_proc_by_{worker_config.current_host}" 42 | ) 43 | path.write_text(f"{file.read_text()} processed by {worker_config.current_host}") 44 | # write to output dir 45 | ( 46 | Path(worker_config.output_data_dir) / f"output_{worker_config.current_host}" 47 | ).write_text(f"output_{worker_config.current_host}") 48 | 49 | 50 | def worker2(worker_config): 51 | # importing an external dependency 52 | from external_dependency import lib1 # noqa: F401 53 | 54 | logger.info("Score=10;") 55 | time.sleep(60) # sleep to be able to see the two scores 56 | logger.info("Score=20;") 57 | 58 | 59 | def show_inputs(worker_config): 60 | # just to show the initial directory structue 61 | for channel_name in worker_config.channels: 62 | input_path = worker_config.__getattr__(f"channel_{channel_name}") 63 | logger.info(f"input channel {channel_name} is at {input_path}") 64 | 65 | listDir("/opt/ml", ["__pycache__"]) 66 | listDir(worker_config.state) 67 | 68 | 69 | def show_output(worker_config): 70 | # show the final directory structue 71 | listDir("/opt/ml", ["/opt/ml/input", "/opt/ml/code", "__pycache__"]) 72 | listDir(worker_config.state) 73 | 74 | 75 | def worker(): 76 | logging.basicConfig(stream=sys.stdout) 77 | # parse the arguments 78 | worker_config = worker_lib.WorkerConfig() 79 | # get the instance specific state path 80 | show_inputs(worker_config) 81 | 82 | if int(worker_config.hps["task_type"]) == 1: 83 | worker1(worker_config) 84 | elif int(worker_config.hps["task_type"]) == 2: 85 | worker2(worker_config) 86 | 87 | show_output(worker_config) 88 | 89 | logger.info("finished!") 90 | # The task is marked as completed 91 | 92 | 93 | if __name__ == "__main__": 94 | worker() 95 | 
file_path = os.path.split(__file__)[0]
examples_path = os.path.abspath(os.path.join(file_path, "..", "..", "examples"))
sys.path.append(examples_path)


def _internalTestExample(caplog, tmp_path, runner):
    """Invoke an example's `runner` and compare its output to expected_output."""
    caplog.set_level(logging.INFO)
    print("Temp path:", tmp_path)
    print("Running", runner, runner.__name__, runner.__module__)

    example_path = os.path.dirname(runner.__code__.co_filename)
    output_path = os.path.join(tmp_path, os.path.split(example_path)[-1], "output")
    # start from a clean local output folder
    shutil.rmtree(output_path, ignore_errors=True)

    # unique-ish project name parts: timestamp + python version
    py_version_string = f"py{sys.version_info.major}{sys.version_info.minor}"
    time_string = int(time())
    postfix = f"-{time_string}-{py_version_string}"
    prefix = "tests/"

    sm_project = runner(postfix=postfix, prefix=prefix, output_path=output_path)
    sm_project = sm_project  # keep linters quiet
    # sm_project.cleanFolder()

    expected_path = os.path.join(example_path, "expected_output")
    # fall back to an expected_output folder one level up
    if not os.path.isdir(expected_path):
        expected_path = os.path.join(os.path.dirname(example_path), "expected_output")

    assert isAsExpected(output_path, expected_path)


def _internalTestCli(test_path, caplog, tmp_path):
    """Run examples/<test_path>/run.sh and compare its output to expected_output."""
    caplog.set_level(logging.INFO)
    print("Temp path:", tmp_path)
    print("Running cli:", test_path)

    output_path = os.path.join(tmp_path, test_path, "output")
    # start from a clean local output folder
    shutil.rmtree(output_path, ignore_errors=True)

    # unique-ish project name parts: timestamp + python version
    py_version_string = f"py{sys.version_info.major}{sys.version_info.minor}"
    time_string = int(time())
    postfix = f"-{time_string}-{py_version_string}"
    prefix = "tests/"

    run_shell = os.path.join(examples_path, test_path, "run.sh")
    subprocess.run(
        [run_shell, output_path, prefix, postfix, "--cs --force_running"], check=True
    )

    expected_path = os.path.join(examples_path, test_path, "expected_output")
    assert isAsExpected(output_path, expected_path)
def skip_test_cli_multi(caplog, tmp_path):
    """Disabled (rename to test_... to enable): multi-worker CLI example."""
    _internalTestCli("cli_multi", caplog, tmp_path)


def test_readme_examples(caplog, tmp_path):
    """End-to-end run of examples/readme_examples/run.sh."""
    _internalTestCli("readme_examples", caplog, tmp_path)


def test_processing_cli_examples(caplog, tmp_path):
    """End-to-end run of examples/processing_cli/run.sh."""
    _internalTestCli("processing_cli", caplog, tmp_path)


def test_multiple_tasks(caplog, tmp_path):
    """Programmatic example: multiple_tasks."""
    from multiple_tasks.example import runner

    _internalTestExample(caplog, tmp_path, runner)


def test_single_file_tasks(caplog, tmp_path):
    """Programmatic example: single_file."""
    from single_file.example import runner

    _internalTestExample(caplog, tmp_path, runner)


def test_single_task(caplog, tmp_path):
    """Programmatic example: single_task."""
    from single_task.example import runner

    _internalTestExample(caplog, tmp_path, runner)
def listDir(path, ignore_patterns=()):
    """Recursively log every file/directory under ``path``.

    Args:
        path: root directory to list.
        ignore_patterns: substrings; entries whose path contains any of them
            are skipped. Default changed from a mutable ``[]`` to an immutable
            ``()`` (same truthiness/iteration semantics, avoids the shared
            mutable-default pitfall).
    """
    logger.info(f"*** START listing files in {path}")
    for file in sorted(Path(path).rglob("*")):
        # keep the entry only if no ignore pattern matches its path
        # (generator instead of a redundant list inside all())
        if not ignore_patterns or all(
            pattern not in str(file) for pattern in ignore_patterns
        ):
            logger.info(f"[{['Dir ', 'File'][file.is_file()]}] {file}")
    logger.info(f"*** END file listing {path}")


def worker1(worker_config):
    """'Process' the data channel: write per-host state, model and output files."""
    # libraries that were pre-installed in the docker image, as defined in the Dockerfile
    import pandas  # noqa: F401
    import sklearn  # noqa: F401

    # f-string prefix added: the original logged the literal text "{pandas}"
    logger.info(f"{pandas} is pre-installed in this image")

    # update the per-instance state
    (Path(worker_config.instance_state) / worker_config.current_host).write_text(
        f"state_{worker_config.current_host}"
    )
    # "process" input data into model output
    # NOTE(review): assumes the channel holds only files — read_text() would
    # raise on a subdirectory; confirm against the example's input layout.
    for file in Path(worker_config.channel_data).rglob("*"):
        relp = file.relative_to(worker_config.channel_data)
        path = Path(worker_config.model_dir) / (
            f"{relp}_proc_by_{worker_config.current_host}"
        )
        path.write_text(f"{file.read_text()} processed by {worker_config.current_host}")
    # write to output dir
    (
        Path(worker_config.output_data_dir) / f"output_{worker_config.current_host}"
    ).write_text(f"output_{worker_config.current_host}")


def worker2(worker_config):
    """Emit two 'Score' log lines, spaced a minute apart (metric-extraction demo)."""
    # importing an external dependency
    from external_dependency import lib1  # noqa: F401

    logger.info("Score=10;")
    time.sleep(60)  # sleep to be able to see the two scores
    logger.info("Score=20;")


def show_inputs(worker_config):
    """Log each input channel's mount point and the initial directory structure."""
    for channel_name in worker_config.channels:
        # getattr() instead of calling __getattr__ directly
        input_path = getattr(worker_config, f"channel_{channel_name}")
        logger.info(f"input channel {channel_name} is at {input_path}")

    listDir("/opt/ml", ["__pycache__"])
    listDir(worker_config.state)
def show_output(worker_config):
    """Log the final directory structure, excluding inputs, code and caches."""
    listDir("/opt/ml", ["/opt/ml/input", "/opt/ml/code", "__pycache__"])
    listDir(worker_config.state)


def worker():
    """Entry point: dispatch to worker1/worker2 based on the 'task_type' hyperparameter."""
    logging.basicConfig(stream=sys.stdout)
    # parse the arguments / environment into a config object
    worker_config = worker_lib.WorkerConfig()
    # log the inputs before doing any work
    show_inputs(worker_config)

    # hyperparameter values arrive as strings, hence the int() conversion
    if int(worker_config.hps["task_type"]) == 1:
        worker1(worker_config)
    elif int(worker_config.hps["task_type"]) == 2:
        worker2(worker_config)
    # NOTE(review): any other task_type value silently runs neither worker

    show_output(worker_config)

    logger.info("finished!")
    # Returning without raising marks the task as completed


if __name__ == "__main__":
    worker()
- TBD: Tutorial / post on this
Mixed precision - it is now [built in with PyTorch 1.6](https://pytorch.org/blog/accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision/) 53 | 54 | Notes: 55 | 1. Make sure to save checkpoints to the state folder 56 | 2. TensorBoard is active, save logs to /opt/ml/output/tensorboard/, e.g. writer = SummaryWriter('/opt/ml/output/tensorboard/') and writer.add_scalar('Loss/test', np.random.random(), n_iter) 57 | 3. Syncing many files from S3 is slow, it's better to split the DB into e.g. 1000 tars. -------------------------------------------------------------------------------- /examples/processing_cli/expected_output/output3/output/config/processingjobconfig.json: -------------------------------------------------------------------------------- 1 | {"ProcessingJobArn":"arn:aws:sagemaker:us-east-1:XXXXXXXXXXXX:processing-job/cli-bash-2020-10-06-23-30-46-ulq8rrv0","ProcessingJobName":"cli-bash-2020-10-06-23-30-46-ULQ8RRV0","Environment":{"SM_CHANNEL_CLI_CODE_OUTPUT":"/opt/ml/processing/input/data/cli_code_output","SM_CHANNEL_CLI_CODE_STATE":"/opt/ml/processing/input/data/cli_code_state","SM_CHANNEL_DATA":"/opt/ml/processing/data","SSM_OUTPUT":"/opt/ml/processing/output","SSM_STATE":"/opt/ml/processing/state"},"AppSpecification":{"ImageUri":"683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:0.20.0-cpu-py3","ContainerEntrypoint":["bash","/opt/ml/processing/input/code/ex3.sh"],"ContainerArguments":["arg1","-arg2","--arg3","argument 
4"]},"ProcessingInputs":[{"InputName":"cli_code_output","S3Input":{"LocalPath":"/opt/ml/processing/input/data/cli_code_output","S3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/ssm-example-processing/cli-code/cli-code-2020-10-06-23-25-36-bE5AcbXg/output","S3DataDistributionType":"FullyReplicated","S3DataType":"S3Prefix","S3InputMode":"File","S3CompressionType":"None","S3DownloadMode":"StartOfJob"}},{"InputName":"cli_code_state","S3Input":{"LocalPath":"/opt/ml/processing/input/data/cli_code_state","S3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/ssm-example-processing/cli-code/state","S3DataDistributionType":"FullyReplicated","S3DataType":"S3Prefix","S3InputMode":"File","S3CompressionType":"None","S3DownloadMode":"StartOfJob"}},{"InputName":"DEP_dep","S3Input":{"LocalPath":"/opt/ml/processing/input/code/dep","S3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/cli-bash-2020-10-06-23-30-46-ULQ8RRV0/input/DEP_dep","S3DataDistributionType":"FullyReplicated","S3DataType":"S3Prefix","S3InputMode":"File","S3CompressionType":"None","S3DownloadMode":"StartOfJob"}},{"InputName":"DEP_worker_toolkit","S3Input":{"LocalPath":"/opt/ml/processing/input/code/worker_toolkit","S3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/cli-bash-2020-10-06-23-30-46-ULQ8RRV0/input/DEP_worker_toolkit","S3DataDistributionType":"FullyReplicated","S3DataType":"S3Prefix","S3InputMode":"File","S3CompressionType":"None","S3DownloadMode":"StartOfJob"}},{"InputName":"data","S3Input":{"LocalPath":"/opt/ml/processing/data","S3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/ssm-example-processing/cli-bash/input","S3DataDistributionType":"FullyReplicated","S3DataType":"S3Prefix","S3InputMode":"File","S3CompressionType":"None","S3DownloadMode":"StartOfJob"}},{"InputName":"code","S3Input":{"LocalPath":"/opt/ml/processing/input/code","S3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/cli-bash-2020-10-06-23-30-46-ULQ8RRV0/input/code/ex3.sh","S3DataDistributionType":"FullyReplicated","S3DataType":"S3Prefix","S3InputMode":"File","S3Compr
def setDefaultParams(sm_project):
    """Configure the project's default image, code and instance parameters.

    Args:
        sm_project: a SageMakerProject instance, configured in place.
    """
    # docker image params
    aws_repo_name = "task_repo"  # remote (ECR) repository name
    repo_name = "task_repo"  # local repository name
    image_tag = "latest"  # tag for local & remote images
    docker_file_path = os.path.join(file_path, "docker")  # path of the local Dockerfile
    sm_project.setDefaultImageParams(
        aws_repo_name, repo_name, image_tag, docker_file_path
    )

    # job code path, entry point and dependencies
    source_dir = os.path.join(file_path, "code")
    entry_point = "algo.py"
    dependencies = [os.path.join(file_path, "external_dependency")]
    sm_project.setDefaultCodeParams(source_dir, entry_point, dependencies)

    # instance type and count
    instance_type = "ml.m5.large"
    training_instance_count = 2
    volume_size = (
        30  # Size in GB of the EBS volume to use for storing input data during training
    )
    use_spot_instances = True  # False
    max_run_mins = 15
    sm_project.setDefaultInstanceParams(
        instance_type,
        training_instance_count,
        volume_size,
        use_spot_instances,
        max_run_mins,
    )


def runner(
    project_name="simple-sagemaker-example", prefix="", postfix="", output_path=None
):
    """Build (or fetch) the docker image, run the example task, download results.

    Args:
        project_name: SageMaker project name.
        prefix: string prepended to names (e.g. "tests/" for test isolation).
        postfix: string appended to the task name (e.g. timestamp + py version).
        output_path: where to download results; defaults to <this dir>/output.

    Returns:
        The configured SageMakerProject instance.
    """
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    sm_project = SageMakerProject(project_name, prefix=prefix)

    setDefaultParams(sm_project)

    image_uri = sm_project.buildOrGetImage(
        instance_type=sm_project.defaultInstanceParams.instance_type
    )

    # task name
    task_name = (
        "task1"
        + postfix  # must satisfy regular expression pattern: ^[a-zA-Z0-9](-*[a-zA-Z0-9])*
    )
    # input data params
    input_data_path = os.path.join(
        file_path, "input_data"
    )  # Can also provide a URI to an S3 bucket, e.g. next commented line
    # input_data_path = sagemaker.s3.s3_path_join("s3://", "sagemaker-us-east-1-XXXXXXXXXXXX", "task3", "input")
    distribution = "ShardedByS3Key"  # or "FullyReplicated" which is the default
    model_uri = (
        None  # Can be used to supply model data as an additional input, local/s3
    )
    hyperparameters = {"arg1": 5, "arg2": "hello"}

    sm_project.runTask(
        task_name,
        image_uri,
        hyperparameters,
        input_data_path,
        model_uri=model_uri,
        input_distribution=distribution,
        clean_state=True,
    )

    # delete the output directory before downloading fresh results
    if not output_path:
        output_path = os.path.join(file_path, "output")
    shutil.rmtree(output_path, ignore_errors=True)
    sm_project.downloadResults(task_name, output_path)

    return sm_project


if __name__ == "__main__":
    py_version_string = f"py{sys.version_info.major}{sys.version_info.minor}"
    time_string = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
    sm_project = runner(postfix=f"_{time_string}_{py_version_string}", prefix="tests/")
sagemaker[local] 33 | 34 | commands = 35 | ssm {posargs} 36 | 37 | [testenv:bash] # run ssm cli 38 | whitelist_externals = 39 | /usr/bin/bash 40 | /bin/bash 41 | commands = 42 | bash {posargs} 43 | 44 | 45 | [testenv:no-coverage] # no coverage 46 | commands = 47 | pytest -n 4 --basetemp="{envtmpdir}" {posargs} 48 | deps = 49 | pytest-xdist 50 | 51 | [testenv:report] # generate a coverage report 52 | skip_install = true 53 | deps = coverage 54 | commands = 55 | coverage html -i --include="*simple_sagemaker*" --omit="*worker_toolkit*","*shell_launcher.py" 56 | coverage report --include="*simple_sagemaker*" --omit="*worker_toolkit*","*shell_launcher.py" --fail-under=85 57 | coverage report --help 58 | 59 | [testenv:clean] # clean up coverage data 60 | skip_install = true 61 | deps = coverage 62 | commands = coverage erase 63 | 64 | [tool:pytest] 65 | testpaths = tests 66 | 67 | ### Formatting & linting 68 | [flake8] 69 | max-line-length = 127 70 | extend-ignore = E203 71 | 72 | [testenv:lint] 73 | skip_install = true 74 | setenv = 75 | deps = 76 | flake8 77 | black 78 | isort 79 | commands = 80 | flake8 ./src ./tests ./examples --count --statistics 81 | isort --check-only ./src ./tests ./examples 82 | black --check ./src ./tests ./examples 83 | 84 | [testenv:cf] # Code Format 85 | skip_install = true 86 | deps = 87 | black 88 | isort 89 | commands = 90 | isort ./src ./tests ./examples 91 | black ./src ./tests ./examples 92 | 93 | [testenv:publish] # build & publish the code 94 | skip_install = true 95 | basepython = python3.7 96 | setenv = 97 | deps = 98 | setuptools 99 | wheel 100 | twine 101 | commands = 102 | python setup.py sdist bdist_wheel 103 | twine upload dist/* 104 | 105 | [testenv:docs] 106 | description = invoke sphinx-build to build the HTML docs 107 | basepython = python3.7 108 | changedir = docs 109 | deps = 110 | sphinx 111 | sphinx-rtd-theme 112 | whitelist_externals = 113 | /usr/bin/make 114 | commands = 115 | sphinx-apidoc -f -o ./source 
# Execute a shell command
# tox -e bash -- ./examples/cli_simple/run.sh ./output

# Execute a shell job
| key: ${{ matrix.os }}-pip-${{ hashFiles('**/setup.cfg') }} 36 | restore-keys: | 37 | ${{ matrix.os }}-pip- 38 | 39 | - name: Checkout 40 | uses: actions/checkout@v2 41 | - name: Install dependencies 42 | run: | 43 | python -m pip install --upgrade pip 44 | pip install -r requirements.txt 45 | - name: Lint 46 | run: | 47 | tox -e lint 48 | - name: Configure AWS credentials 49 | uses: aws-actions/configure-aws-credentials@v1 50 | with: 51 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 52 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 53 | aws-region: us-east-1 54 | - name: Smoke tests 55 | run: | 56 | tox -e py -- --capture=no --log-cli-level=INFO tests/smoke/ 57 | - name: System tests & check coverage 58 | if: matrix.python-version == '3.7' && matrix.os == 'ubuntu-latest' && (startsWith(github.ref, 'refs/heads/master') || startsWith(github.ref, 'refs/tags') || github.base_ref == 'master') 59 | run: | 60 | tox -e py -- --capture=no --log-cli-level=INFO tests/system/ 61 | tox -e report 62 | - name: Upload coverage report 63 | if: (!cancelled()) && matrix.python-version == '3.7' && matrix.os == 'ubuntu-latest' && (startsWith(github.ref, 'refs/heads/master') || startsWith(github.ref, 'refs/tags') || github.base_ref == 'master') 64 | uses: actions/upload-artifact@v2 65 | with: 66 | name: coverage 67 | path: htmlcov 68 | - name: Upload examples output 69 | if: (!cancelled()) && matrix.python-version == '3.7' && matrix.os == 'ubuntu-latest' && (startsWith(github.ref, 'refs/heads/master') || startsWith(github.ref, 'refs/tags') || github.base_ref == 'master') 70 | uses: actions/upload-artifact@v2 71 | with: 72 | name: examples_output 73 | path: | 74 | .tox/py/tmp/*/*/*/output/ 75 | !.tox/py/tmp/*/*current/*/output/ 76 | - name: Build the package 77 | if: matrix.python-version == '3.7' && matrix.os == 'ubuntu-latest' && (startsWith(github.ref, 'refs/heads/master') || startsWith(github.ref, 'refs/tags') || github.base_ref == 'master') 78 | run: | 79 | 
pip install setuptools wheel twine 80 | python setup.py sdist bdist_wheel 81 | - name: Upload dist files 82 | if: matrix.python-version == '3.7' && matrix.os == 'ubuntu-latest' && (startsWith(github.ref, 'refs/heads/master') || startsWith(github.ref, 'refs/tags') || github.base_ref == 'master') 83 | uses: actions/upload-artifact@v2 84 | with: 85 | name: dist 86 | path: dist 87 | 88 | publish: 89 | 90 | runs-on: ubuntu-latest 91 | needs: build 92 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/heads/master') 93 | 94 | steps: 95 | - name: Download dist package 96 | uses: actions/download-artifact@v2 97 | with: 98 | name: dist 99 | path: dist 100 | - name: Publish 101 | uses: pypa/gh-action-pypi-publish@master 102 | with: 103 | user: __token__ 104 | password: ${{ secrets.pypi_password }} 105 | -------------------------------------------------------------------------------- /examples/readme_examples/expected_output/example1/logs/logs0: -------------------------------------------------------------------------------- 1 | bash: cannot set terminal process group (-1): Inappropriate ioctl for device 2 | bash: no job control in this shell 3 | 2020-10-04 09:21:07,561 sagemaker-training-toolkit INFO Imported framework sagemaker_pytorch_container.training 4 | 2020-10-04 09:21:07,584 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed. 5 | 2020-10-04 09:21:10,639 sagemaker_pytorch_container.training INFO Invoking user training script. 
6 | 2020-10-04 09:21:11,064 sagemaker-training-toolkit INFO Invoking user script 7 | 8 | Training Env: 9 | 10 | { 11 | "additional_framework_parameters": {}, 12 | "channel_input_dirs": {}, 13 | "current_host": "algo-1", 14 | "framework_module": "sagemaker_pytorch_container.training:main", 15 | "hosts": [ 16 | "algo-1", 17 | "algo-2" 18 | ], 19 | "hyperparameters": {}, 20 | "input_config_dir": "/opt/ml/input/config", 21 | "input_data_config": {}, 22 | "input_dir": "/opt/ml/input", 23 | "is_master": true, 24 | "job_name": "cli-task1-2020-10-04-09-16-52-zNRvwXqG", 25 | "log_level": 20, 26 | "master_hostname": "algo-1", 27 | "model_dir": "/opt/ml/model", 28 | "module_dir": "s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-10-04-09-16-49_py37/cli-task1/cli-task1-2020-10-04-09-16-52-zNRvwXqG/source/sourcedir.tar.gz", 29 | "module_name": "worker1", 30 | "network_interface_name": "eth0", 31 | "num_cpus": 8, 32 | "num_gpus": 1, 33 | "output_data_dir": "/opt/ml/output/data", 34 | "output_dir": "/opt/ml/output", 35 | "output_intermediate_dir": "/opt/ml/output/intermediate", 36 | "resource_config": { 37 | "current_host": "algo-1", 38 | "hosts": [ 39 | "algo-1", 40 | "algo-2" 41 | ], 42 | "network_interface_name": "eth0" 43 | }, 44 | "user_entry_point": "worker1.py" 45 | } 46 | 47 | Environment variables: 48 | 49 | SM_HOSTS=["algo-1","algo-2"] 50 | SM_NETWORK_INTERFACE_NAME=eth0 51 | SM_HPS={} 52 | SM_USER_ENTRY_POINT=worker1.py 53 | SM_FRAMEWORK_PARAMS={} 54 | SM_RESOURCE_CONFIG={"current_host":"algo-1","hosts":["algo-1","algo-2"],"network_interface_name":"eth0"} 55 | SM_INPUT_DATA_CONFIG={} 56 | SM_OUTPUT_DATA_DIR=/opt/ml/output/data 57 | SM_CHANNELS=[] 58 | SM_CURRENT_HOST=algo-1 59 | SM_MODULE_NAME=worker1 60 | SM_LOG_LEVEL=20 61 | SM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main 62 | SM_INPUT_DIR=/opt/ml/input 63 | SM_INPUT_CONFIG_DIR=/opt/ml/input/config 64 | SM_OUTPUT_DIR=/opt/ml/output 65 | SM_NUM_CPUS=8 66 | SM_NUM_GPUS=1 67 | 
SM_MODEL_DIR=/opt/ml/model 68 | SM_MODULE_DIR=s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-10-04-09-16-49_py37/cli-task1/cli-task1-2020-10-04-09-16-52-zNRvwXqG/source/sourcedir.tar.gz 69 | SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{},"current_host":"algo-1","framework_module":"sagemaker_pytorch_container.training:main","hosts":["algo-1","algo-2"],"hyperparameters":{},"input_config_dir":"/opt/ml/input/config","input_data_config":{},"input_dir":"/opt/ml/input","is_master":true,"job_name":"cli-task1-2020-10-04-09-16-52-zNRvwXqG","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-10-04-09-16-49_py37/cli-task1/cli-task1-2020-10-04-09-16-52-zNRvwXqG/source/sourcedir.tar.gz","module_name":"worker1","network_interface_name":"eth0","num_cpus":8,"num_gpus":1,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-1","hosts":["algo-1","algo-2"],"network_interface_name":"eth0"},"user_entry_point":"worker1.py"} 70 | SM_USER_ARGS=[] 71 | SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate 72 | PYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python36.zip:/opt/conda/lib/python3.6:/opt/conda/lib/python3.6/lib-dynload:/opt/conda/lib/python3.6/site-packages 73 | 74 | Invoking script with the following command: 75 | 76 | /opt/conda/bin/python worker1.py 77 | 78 | 79 | -***- Device 0: _CudaDeviceProperties(name='Tesla V100-SXM2-16GB', major=7, minor=0, total_memory=16160MB, multi_processor_count=80) 80 | 2020-10-04 09:21:14,490 sagemaker-training-toolkit INFO Reporting training SUCCESS 81 | -------------------------------------------------------------------------------- /examples/readme_examples/expected_output/example1/logs/logs1: 
-------------------------------------------------------------------------------- 1 | bash: cannot set terminal process group (-1): Inappropriate ioctl for device 2 | bash: no job control in this shell 3 | 2020-10-04 09:21:02,371 sagemaker-training-toolkit INFO Imported framework sagemaker_pytorch_container.training 4 | 2020-10-04 09:21:02,394 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed. 5 | 2020-10-04 09:21:15,062 sagemaker_pytorch_container.training INFO Invoking user training script. 6 | 2020-10-04 09:21:15,415 sagemaker-training-toolkit INFO Invoking user script 7 | 8 | Training Env: 9 | 10 | { 11 | "additional_framework_parameters": {}, 12 | "channel_input_dirs": {}, 13 | "current_host": "algo-2", 14 | "framework_module": "sagemaker_pytorch_container.training:main", 15 | "hosts": [ 16 | "algo-1", 17 | "algo-2" 18 | ], 19 | "hyperparameters": {}, 20 | "input_config_dir": "/opt/ml/input/config", 21 | "input_data_config": {}, 22 | "input_dir": "/opt/ml/input", 23 | "is_master": false, 24 | "job_name": "cli-task1-2020-10-04-09-16-52-zNRvwXqG", 25 | "log_level": 20, 26 | "master_hostname": "algo-1", 27 | "model_dir": "/opt/ml/model", 28 | "module_dir": "s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-10-04-09-16-49_py37/cli-task1/cli-task1-2020-10-04-09-16-52-zNRvwXqG/source/sourcedir.tar.gz", 29 | "module_name": "worker1", 30 | "network_interface_name": "eth0", 31 | "num_cpus": 8, 32 | "num_gpus": 1, 33 | "output_data_dir": "/opt/ml/output/data", 34 | "output_dir": "/opt/ml/output", 35 | "output_intermediate_dir": "/opt/ml/output/intermediate", 36 | "resource_config": { 37 | "current_host": "algo-2", 38 | "hosts": [ 39 | "algo-1", 40 | "algo-2" 41 | ], 42 | "network_interface_name": "eth0" 43 | }, 44 | "user_entry_point": "worker1.py" 45 | } 46 | 47 | Environment variables: 48 | 49 | SM_HOSTS=["algo-1","algo-2"] 50 | SM_NETWORK_INTERFACE_NAME=eth0 51 | SM_HPS={} 52 | SM_USER_ENTRY_POINT=worker1.py 
53 | SM_FRAMEWORK_PARAMS={} 54 | SM_RESOURCE_CONFIG={"current_host":"algo-2","hosts":["algo-1","algo-2"],"network_interface_name":"eth0"} 55 | SM_INPUT_DATA_CONFIG={} 56 | SM_OUTPUT_DATA_DIR=/opt/ml/output/data 57 | SM_CHANNELS=[] 58 | SM_CURRENT_HOST=algo-2 59 | SM_MODULE_NAME=worker1 60 | SM_LOG_LEVEL=20 61 | SM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main 62 | SM_INPUT_DIR=/opt/ml/input 63 | SM_INPUT_CONFIG_DIR=/opt/ml/input/config 64 | SM_OUTPUT_DIR=/opt/ml/output 65 | SM_NUM_CPUS=8 66 | SM_NUM_GPUS=1 67 | SM_MODEL_DIR=/opt/ml/model 68 | SM_MODULE_DIR=s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-10-04-09-16-49_py37/cli-task1/cli-task1-2020-10-04-09-16-52-zNRvwXqG/source/sourcedir.tar.gz 69 | SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{},"current_host":"algo-2","framework_module":"sagemaker_pytorch_container.training:main","hosts":["algo-1","algo-2"],"hyperparameters":{},"input_config_dir":"/opt/ml/input/config","input_data_config":{},"input_dir":"/opt/ml/input","is_master":false,"job_name":"cli-task1-2020-10-04-09-16-52-zNRvwXqG","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-10-04-09-16-49_py37/cli-task1/cli-task1-2020-10-04-09-16-52-zNRvwXqG/source/sourcedir.tar.gz","module_name":"worker1","network_interface_name":"eth0","num_cpus":8,"num_gpus":1,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-2","hosts":["algo-1","algo-2"],"network_interface_name":"eth0"},"user_entry_point":"worker1.py"} 70 | SM_USER_ARGS=[] 71 | SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate 72 | PYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python36.zip:/opt/conda/lib/python3.6:/opt/conda/lib/python3.6/lib-dynload:/opt/conda/lib/python3.6/site-packages 73 | 74 | 
import json
import logging

logger = logging.getLogger(__name__)


def createSageMakerIAMRole(boto3_session, role_name):
    """Create (if missing) an IAM role assumable by SageMaker and attach the
    AmazonSageMakerFullAccess managed policy to it.

    Args:
        boto3_session: a boto3.Session used to build the IAM client.
        role_name: name of the IAM role to get or create.

    Raises:
        AssertionError: if attaching the managed policy does not return HTTP 200.
    """
    logger.debug(
        f"Creating SageMaker IAM Role: {role_name} with an attached AmazonSageMakerFullAccess policy..."
    )

    # Trust policy letting the SageMaker service assume this role
    trustRelationship = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Sid": "",
                "Effect": "Allow",
                "Principal": {"Service": "sagemaker.amazonaws.com"},
                "Action": "sts:AssumeRole",
            }
        ],
    }
    client = boto3_session.client("iam")
    try:
        client.get_role(RoleName=role_name)
    except client.exceptions.NoSuchEntityException:
        # Narrowed from a bare `except:`: only create the role when it really
        # doesn't exist; other failures (e.g. permissions) now propagate.
        client.create_role(
            RoleName=role_name,
            AssumeRolePolicyDocument=json.dumps(trustRelationship),
        )
    response = client.attach_role_policy(
        RoleName=role_name,
        PolicyArn="arn:aws:iam::aws:policy/AmazonSageMakerFullAccess",
    )
    assert (
        response["ResponseMetadata"]["HTTPStatusCode"] == 200
    ), f"Couldn't attach AmazonSageMakerFullAccess policy to role {role_name}"


def getOrCreatePolicy(iam_client, boto3_session, policy_name, policyString):
    """Get (and extend) or create a customer-managed IAM policy.

    If no local policy named ``policy_name`` exists, one is created from
    ``policyString``. Otherwise, the first statement of ``policyString`` is
    appended to the existing policy's default version (unless already present),
    a new default version is created, and the old one deleted (IAM allows a
    limited number of versions per policy).

    Args:
        iam_client: boto3 IAM client.
        boto3_session: a boto3.Session used to build the IAM resource.
        policy_name: name of the customer-managed policy.
        policyString: policy document (dict) whose first statement is needed.

    Returns:
        The policy's ARN.
    """
    listed_policies = iam_client.list_policies(Scope="Local")
    # NOTE(review): assumes the account has few enough local policies to fit
    # in one page; a paginator would be needed otherwise.
    assert listed_policies["IsTruncated"] is False
    filtered_policy = [
        policy
        for policy in listed_policies["Policies"]
        if policy["PolicyName"] == policy_name
    ]
    if not filtered_policy:
        response = iam_client.create_policy(
            PolicyName=policy_name, PolicyDocument=json.dumps(policyString)
        )
        assert (
            response["ResponseMetadata"]["HTTPStatusCode"] == 200
        ), f"Couldn't create policy {policy_name}"  # typo fix: was "polict"
        policy = response["Policy"]
        policy_arn = policy["Arn"]
    else:
        policy = filtered_policy[0]
        policy_arn = policy["Arn"]
        iam = boto3_session.resource("iam")
        policy_obj = iam.Policy(policy_arn)
        # Crude containment check: compare the JSON-serialized statement
        # against the serialized existing statements
        if json.dumps(policyString["Statement"][0]) in json.dumps(
            policy_obj.default_version.document["Statement"]
        ):
            logger.debug(f"Statement already exists in {policy_name}")
        else:
            logger.debug(f"Adding the statement to policy {policy_name}")
            policy_json = policy_obj.default_version.document
            policy_json["Statement"].append(policyString["Statement"][0])
            response = iam_client.create_policy_version(
                PolicyArn=policy_arn,
                PolicyDocument=json.dumps(policy_json),
                SetAsDefault=True,
            )
            assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
            # Delete the superseded version to stay under IAM's version limit
            response = iam_client.delete_policy_version(
                PolicyArn=policy_arn, VersionId=policy_obj.default_version.version_id
            )
            assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
    return policy_arn
86 | ) 87 | 88 | client = boto3_session.client("iam") 89 | policyString = { 90 | "Version": "2012-10-17", 91 | "Statement": [ 92 | { 93 | "Sid": "", 94 | "Effect": "Allow", 95 | "Action": ["s3:*"], 96 | "Resource": [ 97 | f"arn:aws:s3:::{bucket_name}", 98 | f"arn:aws:s3:::{bucket_name}/*", 99 | ], 100 | } 101 | ], 102 | } 103 | policy_arn = getOrCreatePolicy(client, boto3_session, policy_name, policyString) 104 | 105 | response = client.attach_role_policy( 106 | RoleName=role_name, 107 | PolicyArn=policy_arn, 108 | ) 109 | assert ( 110 | response["ResponseMetadata"]["HTTPStatusCode"] == 200 111 | ), f"Couldn't attach {policy_name} policy to role {role_name}" 112 | -------------------------------------------------------------------------------- /examples/processing_cli/expected_output/output2/logs/logs0: -------------------------------------------------------------------------------- 1 | ==Bash 2 | -***- Args: 3 | -- Env:SSM_OUTPUT=/opt/ml/processing/output HOSTNAME=ip-10-0-235-103.ec2.internal SAGEMAKER_SERVING_MODULE=sagemaker_sklearn_container.serving:main SAGEMAKER_TRAINING_MODULE=sagemaker_sklearn_container.training:main AWS_CONTAINER_CREDENTIALS_RELATIVE_URI=/v2/credentials/AeW1bfcO2AJo1dQj5NBqhzLbpvn21RX1yr6lvJnO0lU PYTHONUNBUFFERED=1 SSM_STATE=/opt/ml/processing/state LC_ALL=C.UTF-8 PYTHONIOENCODING=UTF-8 PATH=/miniconda3/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin PWD=/ LANG=C.UTF-8 AWS_REGION=us-east-1 PYTHONDONTWRITEBYTECODE=1 SHLVL=1 HOME=/root _=/usr/bin/env 4 | *** START listing files 5 | /opt: 6 | total 12 7 | drwxr-xr-x 1 root root 4096 Oct 11 12:46 . 8 | drwxr-xr-x 1 root root 4096 Oct 11 12:46 .. 9 | drwxr-xr-x 5 root root 4096 Oct 11 12:46 ml 10 | 11 | /opt/ml: 12 | total 20 13 | drwxr-xr-x 5 root root 4096 Oct 11 12:46 . 14 | drwxr-xr-x 1 root root 4096 Oct 11 12:46 .. 
15 | drw-r--r-- 2 root root 4096 Oct 11 12:41 config 16 | drwxr-xr-x 4 root root 4096 Oct 11 12:46 output 17 | drwxr-xr-x 5 root root 4096 Oct 11 12:46 processing 18 | 19 | /opt/ml/config: 20 | total 16 21 | drw-r--r-- 2 root root 4096 Oct 11 12:41 . 22 | drwxr-xr-x 5 root root 4096 Oct 11 12:46 .. 23 | -rw-r--r-- 1 root root 2358 Oct 11 12:41 processingjobconfig.json 24 | -rw-r--r-- 1 root root 44 Oct 11 12:41 resourceconfig.json 25 | 26 | /opt/ml/output: 27 | total 16 28 | drwxr-xr-x 4 root root 4096 Oct 11 12:46 . 29 | drwxr-xr-x 5 root root 4096 Oct 11 12:46 .. 30 | drwxr-xr-x 4 root root 4096 Oct 11 12:46 data 31 | drwxr-xr-x 3 root root 4096 Oct 11 12:46 metrics 32 | 33 | /opt/ml/output/data: 34 | total 16 35 | drwxr-xr-x 4 root root 4096 Oct 11 12:46 . 36 | drwxr-xr-x 4 root root 4096 Oct 11 12:46 .. 37 | drwxr-xr-x 2 root root 4096 Oct 11 12:46 output 38 | drwxr-xr-x 2 root root 4096 Oct 11 12:42 state 39 | 40 | /opt/ml/output/data/output: 41 | total 8 42 | drwxr-xr-x 2 root root 4096 Oct 11 12:46 . 43 | drwxr-xr-x 4 root root 4096 Oct 11 12:46 .. 44 | 45 | /opt/ml/output/data/state: 46 | total 8 47 | drwxr-xr-x 2 root root 4096 Oct 11 12:42 . 48 | drwxr-xr-x 4 root root 4096 Oct 11 12:46 .. 49 | 50 | /opt/ml/output/metrics: 51 | total 12 52 | drwxr-xr-x 3 root root 4096 Oct 11 12:46 . 53 | drwxr-xr-x 4 root root 4096 Oct 11 12:46 .. 54 | drwxr-xr-x 2 root root 4096 Oct 11 12:46 cloudwatch 55 | 56 | /opt/ml/output/metrics/cloudwatch: 57 | total 8 58 | drwxr-xr-x 2 root root 4096 Oct 11 12:46 . 59 | drwxr-xr-x 3 root root 4096 Oct 11 12:46 .. 60 | 61 | /opt/ml/processing: 62 | total 20 63 | drwxr-xr-x 5 root root 4096 Oct 11 12:46 . 64 | drwxr-xr-x 5 root root 4096 Oct 11 12:46 .. 65 | drwxr-xr-x 3 root root 4096 Oct 11 12:46 input 66 | drwxr-xr-x 2 root root 4096 Oct 11 12:46 output 67 | drwxr-xr-x 2 root root 4096 Oct 11 12:42 state 68 | 69 | /opt/ml/processing/input: 70 | total 12 71 | drwxr-xr-x 3 root root 4096 Oct 11 12:46 . 
72 | drwxr-xr-x 5 root root 4096 Oct 11 12:46 .. 73 | drwxr-xr-x 4 root root 4096 Oct 11 12:46 code 74 | 75 | /opt/ml/processing/input/code: 76 | total 16 77 | drwxr-xr-x 4 root root 4096 Oct 11 12:46 . 78 | drwxr-xr-x 3 root root 4096 Oct 11 12:46 .. 79 | drwxr-xr-x 2 root root 4096 Oct 11 12:46 dep 80 | drwxr-xr-x 3 root root 4096 Oct 11 12:46 worker_toolkit 81 | 82 | /opt/ml/processing/input/code/dep: 83 | total 12 84 | drwxr-xr-x 2 root root 4096 Oct 11 12:46 . 85 | drwxr-xr-x 4 root root 4096 Oct 11 12:46 .. 86 | -rw-r--r-- 1 root root 23 Oct 11 12:46 ex1_dep.py 87 | 88 | /opt/ml/processing/input/code/worker_toolkit: 89 | total 24 90 | drwxr-xr-x 3 root root 4096 Oct 11 12:46 . 91 | drwxr-xr-x 4 root root 4096 Oct 11 12:46 .. 92 | -rw-r--r-- 1 root root 0 Oct 11 12:46 __init__.py 93 | drwxr-xr-x 2 root root 4096 Oct 11 12:46 __pycache__ 94 | -rw-r--r-- 1 root root 9763 Oct 11 12:46 worker_lib.py 95 | 96 | /opt/ml/processing/input/code/worker_toolkit/__pycache__: 97 | total 20 98 | drwxr-xr-x 2 root root 4096 Oct 11 12:46 . 99 | drwxr-xr-x 3 root root 4096 Oct 11 12:46 .. 100 | -rw-r--r-- 1 root root 218 Oct 11 12:46 __init__.cpython-37.pyc 101 | -rw-r--r-- 1 root root 7548 Oct 11 12:46 worker_lib.cpython-37.pyc 102 | 103 | /opt/ml/processing/output: 104 | total 8 105 | drwxr-xr-x 2 root root 4096 Oct 11 12:46 . 106 | drwxr-xr-x 5 root root 4096 Oct 11 12:46 .. 107 | 108 | /opt/ml/processing/state: 109 | total 8 110 | drwxr-xr-x 2 root root 4096 Oct 11 12:42 . 111 | drwxr-xr-x 5 root root 4096 Oct 11 12:46 .. 
112 | *** END 113 | -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_rest/example3/logs/logs0: -------------------------------------------------------------------------------- 1 | bash: cannot set terminal process group (-1): Inappropriate ioctl for device 2 | bash: no job control in this shell 3 | 2020-09-13 13:19:33,550 sagemaker-training-toolkit INFO Imported framework sagemaker_pytorch_container.training 4 | 2020-09-13 13:19:33,552 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 5 | 2020-09-13 13:19:33,561 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed. 6 | 2020-09-13 13:19:36,593 sagemaker_pytorch_container.training INFO Invoking user training script. 7 | 2020-09-13 13:19:51,068 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 8 | 2020-09-13 13:19:51,080 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 9 | 2020-09-13 13:19:51,092 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 10 | 2020-09-13 13:19:51,102 sagemaker-training-toolkit INFO Invoking user script 11 | 12 | Training Env: 13 | 14 | { 15 | "additional_framework_parameters": {}, 16 | "channel_input_dirs": {}, 17 | "current_host": "algo-1", 18 | "framework_module": "sagemaker_pytorch_container.training:main", 19 | "hosts": [ 20 | "algo-1" 21 | ], 22 | "hyperparameters": {}, 23 | "input_config_dir": "/opt/ml/input/config", 24 | "channel_data_config": {}, 25 | "input_dir": "/opt/ml/input", 26 | "is_master": true, 27 | "job_name": "task3-2020-09-13-13-16-15-tj2lK7rZ", 28 | "log_level": 20, 29 | "master_hostname": "algo-1", 30 | "model_dir": "/opt/ml/model", 31 | "module_dir": "s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-09-13-13-16-10_py37/task3/task3-2020-09-13-13-16-15-tj2lK7rZ/source/sourcedir.tar.gz", 32 | "module_name": "worker3", 33 | 
"network_interface_name": "eth0", 34 | "num_cpus": 2, 35 | "num_gpus": 0, 36 | "output_data_dir": "/opt/ml/output/data", 37 | "output_dir": "/opt/ml/output", 38 | "output_intermediate_dir": "/opt/ml/output/intermediate", 39 | "resource_config": { 40 | "current_host": "algo-1", 41 | "hosts": [ 42 | "algo-1" 43 | ], 44 | "network_interface_name": "eth0" 45 | }, 46 | "user_entry_point": "worker3.py" 47 | } 48 | 49 | Environment variables: 50 | 51 | SM_HOSTS=["algo-1"] 52 | SM_NETWORK_INTERFACE_NAME=eth0 53 | SM_HPS={} 54 | SM_USER_ENTRY_POINT=worker3.py 55 | SM_FRAMEWORK_PARAMS={} 56 | SM_RESOURCE_CONFIG={"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"} 57 | SM_channel_data_CONFIG={} 58 | SM_OUTPUT_DATA_DIR=/opt/ml/output/data 59 | SM_CHANNELS=[] 60 | SM_CURRENT_HOST=algo-1 61 | SM_MODULE_NAME=worker3 62 | SM_LOG_LEVEL=20 63 | SM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main 64 | SM_INPUT_DIR=/opt/ml/input 65 | SM_INPUT_CONFIG_DIR=/opt/ml/input/config 66 | SM_OUTPUT_DIR=/opt/ml/output 67 | SM_NUM_CPUS=2 68 | SM_NUM_GPUS=0 69 | SM_MODEL_DIR=/opt/ml/model 70 | SM_MODULE_DIR=s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-09-13-13-16-10_py37/task3/task3-2020-09-13-13-16-15-tj2lK7rZ/source/sourcedir.tar.gz 71 | 
SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{},"current_host":"algo-1","framework_module":"sagemaker_pytorch_container.training:main","hosts":["algo-1"],"hyperparameters":{},"input_config_dir":"/opt/ml/input/config","channel_data_config":{},"input_dir":"/opt/ml/input","is_master":true,"job_name":"task3-2020-09-13-13-16-15-tj2lK7rZ","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-09-13-13-16-10_py37/task3/task3-2020-09-13-13-16-15-tj2lK7rZ/source/sourcedir.tar.gz","module_name":"worker3","network_interface_name":"eth0","num_cpus":2,"num_gpus":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"},"user_entry_point":"worker3.py"} 72 | SM_USER_ARGS=[] 73 | SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate 74 | PYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python36.zip:/opt/conda/lib/python3.6:/opt/conda/lib/python3.6/lib-dynload:/opt/conda/lib/python3.6/site-packages 75 | 76 | Invoking script with the following command: 77 | 78 | /opt/conda/bin/python worker3.py 79 | 80 | 81 | 2020-09-13 13:19:51,169 sagemaker-training-toolkit INFO Reporting training SUCCESS 82 | -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_rest/example3_2/logs/logs0: -------------------------------------------------------------------------------- 1 | bash: cannot set terminal process group (-1): Inappropriate ioctl for device 2 | bash: no job control in this shell 3 | 2020-09-13 13:23:18,591 sagemaker-training-toolkit INFO Imported framework sagemaker_pytorch_container.training 4 | 2020-09-13 13:23:18,595 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 5 | 2020-09-13 
13:23:18,612 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed. 6 | 2020-09-13 13:23:20,041 sagemaker_pytorch_container.training INFO Invoking user training script. 7 | 2020-09-13 13:23:51,904 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 8 | 2020-09-13 13:23:51,916 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 9 | 2020-09-13 13:23:51,928 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 10 | 2020-09-13 13:23:51,938 sagemaker-training-toolkit INFO Invoking user script 11 | 12 | Training Env: 13 | 14 | { 15 | "additional_framework_parameters": {}, 16 | "channel_input_dirs": {}, 17 | "current_host": "algo-1", 18 | "framework_module": "sagemaker_pytorch_container.training:main", 19 | "hosts": [ 20 | "algo-1" 21 | ], 22 | "hyperparameters": {}, 23 | "input_config_dir": "/opt/ml/input/config", 24 | "channel_data_config": {}, 25 | "input_dir": "/opt/ml/input", 26 | "is_master": true, 27 | "job_name": "task3-2020-09-13-13-20-31-f6osgaSU", 28 | "log_level": 20, 29 | "master_hostname": "algo-1", 30 | "model_dir": "/opt/ml/model", 31 | "module_dir": "s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-09-13-13-16-10_py37/task3/task3-2020-09-13-13-20-31-f6osgaSU/source/sourcedir.tar.gz", 32 | "module_name": "worker3", 33 | "network_interface_name": "eth0", 34 | "num_cpus": 2, 35 | "num_gpus": 0, 36 | "output_data_dir": "/opt/ml/output/data", 37 | "output_dir": "/opt/ml/output", 38 | "output_intermediate_dir": "/opt/ml/output/intermediate", 39 | "resource_config": { 40 | "current_host": "algo-1", 41 | "hosts": [ 42 | "algo-1" 43 | ], 44 | "network_interface_name": "eth0" 45 | }, 46 | "user_entry_point": "worker3.py" 47 | } 48 | 49 | Environment variables: 50 | 51 | SM_HOSTS=["algo-1"] 52 | SM_NETWORK_INTERFACE_NAME=eth0 53 | SM_HPS={} 54 | SM_USER_ENTRY_POINT=worker3.py 55 | SM_FRAMEWORK_PARAMS={} 56 | 
SM_RESOURCE_CONFIG={"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"} 57 | SM_channel_data_CONFIG={} 58 | SM_OUTPUT_DATA_DIR=/opt/ml/output/data 59 | SM_CHANNELS=[] 60 | SM_CURRENT_HOST=algo-1 61 | SM_MODULE_NAME=worker3 62 | SM_LOG_LEVEL=20 63 | SM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main 64 | SM_INPUT_DIR=/opt/ml/input 65 | SM_INPUT_CONFIG_DIR=/opt/ml/input/config 66 | SM_OUTPUT_DIR=/opt/ml/output 67 | SM_NUM_CPUS=2 68 | SM_NUM_GPUS=0 69 | SM_MODEL_DIR=/opt/ml/model 70 | SM_MODULE_DIR=s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-09-13-13-16-10_py37/task3/task3-2020-09-13-13-20-31-f6osgaSU/source/sourcedir.tar.gz 71 | SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{},"current_host":"algo-1","framework_module":"sagemaker_pytorch_container.training:main","hosts":["algo-1"],"hyperparameters":{},"input_config_dir":"/opt/ml/input/config","channel_data_config":{},"input_dir":"/opt/ml/input","is_master":true,"job_name":"task3-2020-09-13-13-20-31-f6osgaSU","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-09-13-13-16-10_py37/task3/task3-2020-09-13-13-20-31-f6osgaSU/source/sourcedir.tar.gz","module_name":"worker3","network_interface_name":"eth0","num_cpus":2,"num_gpus":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"},"user_entry_point":"worker3.py"} 72 | SM_USER_ARGS=[] 73 | SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate 74 | PYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python36.zip:/opt/conda/lib/python3.6:/opt/conda/lib/python3.6/lib-dynload:/opt/conda/lib/python3.6/site-packages 75 | 76 | Invoking script with the following command: 77 | 78 | /opt/conda/bin/python 
worker3.py 79 | 80 | 81 | 2020-09-13 13:23:52,003 sagemaker-training-toolkit INFO Reporting training SUCCESS 82 | -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_rest/example2/logs/logs0: -------------------------------------------------------------------------------- 1 | bash: cannot set terminal process group (-1): Inappropriate ioctl for device 2 | bash: no job control in this shell 3 | 2020-09-14 21:46:30,898 sagemaker-training-toolkit INFO Imported framework sagemaker_pytorch_container.training 4 | 2020-09-14 21:46:30,901 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 5 | 2020-09-14 21:46:30,910 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed. 6 | 2020-09-14 21:46:32,341 sagemaker_pytorch_container.training INFO Invoking user training script. 7 | 2020-09-14 21:46:32,613 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 8 | 2020-09-14 21:46:32,625 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 9 | 2020-09-14 21:46:32,638 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 10 | 2020-09-14 21:46:32,648 sagemaker-training-toolkit INFO Invoking user script 11 | 12 | Training Env: 13 | 14 | { 15 | "additional_framework_parameters": {}, 16 | "channel_input_dirs": {}, 17 | "current_host": "algo-1", 18 | "framework_module": "sagemaker_pytorch_container.training:main", 19 | "hosts": [ 20 | "algo-1" 21 | ], 22 | "hyperparameters": { 23 | "msg": "Hello, world!" 
24 | }, 25 | "input_config_dir": "/opt/ml/input/config", 26 | "channel_data_config": {}, 27 | "input_dir": "/opt/ml/input", 28 | "is_master": true, 29 | "job_name": "task2-2020-09-14-21-43-32-oKDGLvk6", 30 | "log_level": 20, 31 | "master_hostname": "algo-1", 32 | "model_dir": "/opt/ml/model", 33 | "module_dir": "s3://sagemaker-us-east-1-XXXXXXXXXXXX/readme/simple-sagemaker-example-cli/task2/task2-2020-09-14-21-43-32-oKDGLvk6/source/sourcedir.tar.gz", 34 | "module_name": "worker2", 35 | "network_interface_name": "eth0", 36 | "num_cpus": 2, 37 | "num_gpus": 0, 38 | "output_data_dir": "/opt/ml/output/data", 39 | "output_dir": "/opt/ml/output", 40 | "output_intermediate_dir": "/opt/ml/output/intermediate", 41 | "resource_config": { 42 | "current_host": "algo-1", 43 | "hosts": [ 44 | "algo-1" 45 | ], 46 | "network_interface_name": "eth0" 47 | }, 48 | "user_entry_point": "worker2.py" 49 | } 50 | 51 | Environment variables: 52 | 53 | SM_HOSTS=["algo-1"] 54 | SM_NETWORK_INTERFACE_NAME=eth0 55 | SM_HPS={"msg":"Hello, world!"} 56 | SM_USER_ENTRY_POINT=worker2.py 57 | SM_FRAMEWORK_PARAMS={} 58 | SM_RESOURCE_CONFIG={"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"} 59 | SM_channel_data_CONFIG={} 60 | SM_OUTPUT_DATA_DIR=/opt/ml/output/data 61 | SM_CHANNELS=[] 62 | SM_CURRENT_HOST=algo-1 63 | SM_MODULE_NAME=worker2 64 | SM_LOG_LEVEL=20 65 | SM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main 66 | SM_INPUT_DIR=/opt/ml/input 67 | SM_INPUT_CONFIG_DIR=/opt/ml/input/config 68 | SM_OUTPUT_DIR=/opt/ml/output 69 | SM_NUM_CPUS=2 70 | SM_NUM_GPUS=0 71 | SM_MODEL_DIR=/opt/ml/model 72 | SM_MODULE_DIR=s3://sagemaker-us-east-1-XXXXXXXXXXXX/readme/simple-sagemaker-example-cli/task2/task2-2020-09-14-21-43-32-oKDGLvk6/source/sourcedir.tar.gz 73 | SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{},"current_host":"algo-1","framework_module":"sagemaker_pytorch_container.training:main","hosts":["algo-1"],"hyperparameters":{"msg":"Hello, 
world!"},"input_config_dir":"/opt/ml/input/config","channel_data_config":{},"input_dir":"/opt/ml/input","is_master":true,"job_name":"task2-2020-09-14-21-43-32-oKDGLvk6","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/readme/simple-sagemaker-example-cli/task2/task2-2020-09-14-21-43-32-oKDGLvk6/source/sourcedir.tar.gz","module_name":"worker2","network_interface_name":"eth0","num_cpus":2,"num_gpus":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"},"user_entry_point":"worker2.py"} 74 | SM_USER_ARGS=["--msg","Hello, world!"] 75 | SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate 76 | SM_HP_MSG=Hello, world! 77 | PYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python36.zip:/opt/conda/lib/python3.6:/opt/conda/lib/python3.6/lib-dynload:/opt/conda/lib/python3.6/site-packages 78 | 79 | Invoking script with the following command: 80 | 81 | /opt/conda/bin/python worker2.py --msg Hello, world! 82 | 83 | 84 | -***- Hello, world! 
85 | 2020-09-14 21:46:32,715 sagemaker-training-toolkit INFO Reporting training SUCCESS 86 | -------------------------------------------------------------------------------- /examples/multiple_tasks/example.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import shutil 4 | import sys 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | file_path = os.path.split(__file__)[0] 9 | if "TOX_ENV_NAME" not in os.environ: 10 | srcPath = os.path.abspath(os.path.join(file_path, "..", "..", "src")) 11 | sys.path.append(srcPath) 12 | from simple_sagemaker.sm_project import SageMakerProject # noqa: E402 13 | 14 | 15 | def setDefaultParams(sm_project): 16 | # docker image params 17 | aws_repo_name = "task_repo" # remote (ECR) rpository name 18 | repo_name = "task_repo" # local repository name 19 | image_tag = "latest" # tag for local & remote images 20 | docker_file_path = os.path.join( 21 | file_path, "..", "single_task", "docker" 22 | ) # path of the local Dockerfile 23 | sm_project.setDefaultImageParams( 24 | aws_repo_name, repo_name, image_tag, docker_file_path 25 | ) 26 | 27 | # job code path, entrypoint and params 28 | source_dir = os.path.join(file_path, "code") 29 | entry_point = "algo_multi.py" 30 | dependencies = [] 31 | sm_project.setDefaultCodeParams(source_dir, entry_point, dependencies) 32 | 33 | # instances type an count 34 | instance_type = "ml.m5.large" 35 | training_instance_count = 2 36 | volume_size = ( 37 | 30 # Size in GB of the EBS volume to use for storing input data during training 38 | ) 39 | use_spot_instances = True # False 40 | max_run_mins = 15 41 | sm_project.setDefaultInstanceParams( 42 | instance_type, 43 | training_instance_count, 44 | volume_size, 45 | use_spot_instances, 46 | max_run_mins, 47 | ) 48 | 49 | 50 | def runner( 51 | project_name="simple-sagemaker-example-multi", 52 | prefix="", 53 | postfix="", 54 | output_path=None, 55 | ): 56 | 
logging.basicConfig(stream=sys.stdout, level=logging.INFO) 57 | 58 | sm_project = SageMakerProject(project_name, prefix=prefix) 59 | setDefaultParams(sm_project) 60 | image_uri = sm_project.buildOrGetImage( 61 | instance_type=sm_project.defaultInstanceParams.instance_type 62 | ) 63 | 64 | # task name 65 | task_name = ( 66 | "multi-task1" + postfix 67 | ) # must satisfy regular expression pattern: ^[a-zA-Z0-9](-*[a-zA-Z0-9])* 68 | # input data params 69 | input_data_path = os.path.join( 70 | file_path, "..", "single_task", "input_data" 71 | ) # Can also provide a URI to an S3 bucket, e.g. next commented line 72 | # input_data_path = sagemaker.s3.s3_path_join("s3://", "sagemaker-us-east-1-XXXXXXXXXXXX", "task3", "input") 73 | distribution = "ShardedByS3Key" # or "FullyReplicated" which is the default 74 | model_uri = ( 75 | None # Can be used to supply model data as an additional input, local/s3 76 | ) 77 | hyperparameters = {"stage": 1} 78 | sm_project.runTask( 79 | task_name, 80 | image_uri, 81 | hyperparameters, 82 | input_data_path, 83 | model_uri=model_uri, 84 | input_distribution=distribution, 85 | clean_state=True, 86 | ) 87 | 88 | if not output_path: 89 | output_path = os.path.join(file_path, "output") 90 | # delete the output directory 91 | outputDir1 = os.path.join(output_path, "output1") 92 | shutil.rmtree(outputDir1, ignore_errors=True) 93 | sm_project.downloadResults(task_name, outputDir1) 94 | 95 | task_name2 = "multi-task2" 96 | hyperparameters = {"stage": 2} 97 | additional_inputs = dict() 98 | additional_inputs["task1_state1"] = sm_project.getInputConfig(task_name, "state") 99 | additional_inputs["task1_state2"] = sm_project.getInputConfig( 100 | task_name, "state", distribution="ShardedByS3Key" 101 | ) 102 | additional_inputs["task1_state3"] = sm_project.getInputConfig( 103 | task_name, "output", distribution="ShardedByS3Key" 104 | ) 105 | model_uri = sm_project.tasks[task_name].getOutputTargetUri(model=True) 106 | sm_project.runTask( 107 | 
task_name2, 108 | image_uri, 109 | hyperparameters, 110 | input_data_path, 111 | model_uri=model_uri, 112 | input_distribution=distribution, 113 | additional_inputs=additional_inputs, 114 | clean_state=True, 115 | ) 116 | 117 | # delete the output directory 118 | output_dir2 = os.path.join(output_path, "output2") 119 | shutil.rmtree(output_dir2, ignore_errors=True) 120 | sm_project.downloadResults(task_name2, output_dir2) 121 | 122 | return sm_project 123 | 124 | 125 | if __name__ == "__main__": 126 | runner() 127 | -------------------------------------------------------------------------------- /examples/single_file/example.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import shutil 3 | import sys 4 | from pathlib import Path 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | dockerFileContent = """ 9 | # __BASE_IMAGE__ is automatically replaced with the correct base image 10 | FROM __BASE_IMAGE__ 11 | RUN pip3 install pandas==1.1 scikit-learn==0.21.3 12 | """ 13 | file_path = Path(__file__).parent 14 | 15 | 16 | def runner(project_name="simple-sagemaker-sf", prefix="", postfix="", output_path=None): 17 | from simple_sagemaker.sm_project import SageMakerProject 18 | 19 | sm_project = SageMakerProject(project_name, prefix=prefix) 20 | # define the code parameters 21 | sm_project.setDefaultCodeParams( 22 | source_dir=None, entry_point=__file__, dependencies=[] 23 | ) 24 | # define the instance parameters 25 | sm_project.setDefaultInstanceParams(instance_count=2, max_run_mins=15) 26 | # docker image 27 | sm_project.setDefaultImageParams( 28 | aws_repo_name="task_repo", 29 | repo_name="task_repo", 30 | image_tag="latest", 31 | docker_file_path_or_content=dockerFileContent, 32 | ) 33 | image_uri = sm_project.buildOrGetImage( 34 | instance_type=sm_project.defaultInstanceParams.instance_type 35 | ) 36 | 37 | # *** Task 1 - process input data 38 | task1_name = "single-file-task1" + postfix 39 | # set the 
input data 40 | input_data_path = file_path / "data" 41 | # run the task 42 | sm_project.runTask( 43 | task1_name, 44 | image_uri, 45 | input_distribution="ShardedByS3Key", # distribute the input files among the workers 46 | hyperparameters={"worker": 1, "arg": "hello world!", "task": 1}, 47 | input_data_path=str(input_data_path) if input_data_path.is_dir() else None, 48 | clean_state=True, # clean the current state, also forces re-running 49 | ) 50 | # download the results 51 | if not output_path: 52 | output_path = file_path / "output" 53 | shutil.rmtree(output_path, ignore_errors=True) 54 | sm_project.downloadResults(task1_name, Path(output_path) / "output1") 55 | 56 | # *** Task 2 - process the results of Task 1 57 | task2_name = "single-file-task2" + postfix 58 | # set the input 59 | additional_inputs = { 60 | "task2_data": sm_project.getInputConfig(task1_name, "model"), 61 | "task2_data_dist": sm_project.getInputConfig( 62 | task1_name, "model", distribution="ShardedByS3Key" 63 | ), 64 | } 65 | # run the task 66 | sm_project.runTask( 67 | task2_name, 68 | image_uri, 69 | hyperparameters={"worker": 1, "arg": "hello world!", "task": 2}, 70 | clean_state=True, # clean the current state, also forces re-running 71 | additional_inputs=additional_inputs, 72 | ) 73 | # download the results 74 | sm_project.downloadResults(task2_name, Path(output_path) / "output2") 75 | 76 | return sm_project 77 | 78 | 79 | def worker(): 80 | from worker_toolkit import worker_lib 81 | 82 | logger.info("Starting worker...") 83 | # parse the arguments 84 | worker_config = worker_lib.WorkerConfig() 85 | 86 | logger.info(f"Hyperparams: {worker_config.hps}") 87 | logger.info( 88 | f"Input data files: {list(Path(worker_config.channel_data).rglob('*'))}" 89 | ) 90 | logger.info(f"State files: { list(Path(worker_config.state).rglob('*'))}") 91 | 92 | if int(worker_config.hps["task"]) == 1: 93 | # update the state per running instance 94 | open( 95 | 
f"{worker_config.instance_state}/state_{worker_config.current_host}", "wt" 96 | ).write("state") 97 | # write to the model output directory 98 | for file in Path(worker_config.channel_data).rglob("*"): 99 | if file.is_file(): 100 | relp = file.relative_to(worker_config.channel_data) 101 | path = Path(worker_config.model_dir) / ( 102 | str(relp) + "_proc_by_" + worker_config.current_host 103 | ) 104 | path.write_text( 105 | file.read_text() + " processed by " + worker_config.current_host 106 | ) 107 | open( 108 | f"{worker_config.model_dir}/output_{worker_config.current_host}", "wt" 109 | ).write("output") 110 | elif int(worker_config.hps["task"]) == 2: 111 | logger.info( 112 | f"Input task2_data: {list(Path(worker_config.channel_task2_data).rglob('*'))}" 113 | ) 114 | logger.info( 115 | f"Input task2_data_dist: {list(Path(worker_config.channel_task2_data_dist).rglob('*'))}" 116 | ) 117 | 118 | logger.info("finished!") 119 | # The task is marked as completed 120 | 121 | 122 | def main(): 123 | logging.basicConfig(stream=sys.stdout, level=logging.INFO) 124 | if "--worker" in sys.argv: 125 | worker() 126 | else: 127 | runner() 128 | 129 | 130 | if __name__ == "__main__": 131 | main() 132 | -------------------------------------------------------------------------------- /examples/processing_cli/expected_output/output3/logs/logs0: -------------------------------------------------------------------------------- 1 | ======= Starting Bash script ... 
2 | -***- Args: arg1 -arg2 --arg3 argument 4 3 | -- Env:, SSM_OUTPUT=/opt/ml/processing/output HOSTNAME=ip-10-0-189-78.ec2.internal SAGEMAKER_SERVING_MODULE=sagemaker_sklearn_container.serving:main SAGEMAKER_TRAINING_MODULE=sagemaker_sklearn_container.training:main AWS_CONTAINER_CREDENTIALS_RELATIVE_URI=/v2/credentials/f_w_W_i_iOPc3jxTvlBDwhlypkVF7JT0tnKUxHe_kCY PYTHONUNBUFFERED=1 SSM_STATE=/opt/ml/processing/state LC_ALL=C.UTF-8 PYTHONIOENCODING=UTF-8 PATH=/miniconda3/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin PWD=/ LANG=C.UTF-8 AWS_REGION=us-east-1 PYTHONDONTWRITEBYTECODE=1 SHLVL=1 HOME=/root SM_CHANNEL_CLI_CODE_STATE=/opt/ml/processing/input/data/cli_code_state SM_CHANNEL_CLI_CODE_OUTPUT=/opt/ml/processing/input/data/cli_code_output SM_CHANNEL_DATA=/opt/ml/processing/data _=/usr/bin/env 4 | -***- Pwd:, / 5 | *** START listing files in /opt 6 | /opt: 7 | total 12 8 | drwxr-xr-x 1 root root 4096 Oct 9 05:55 . 9 | drwxr-xr-x 1 root root 4096 Oct 9 05:55 .. 10 | drwxr-xr-x 5 root root 4096 Oct 9 05:55 ml 11 | 12 | /opt/ml: 13 | total 20 14 | drwxr-xr-x 5 root root 4096 Oct 9 05:55 . 15 | drwxr-xr-x 1 root root 4096 Oct 9 05:55 .. 16 | drw-r--r-- 2 root root 4096 Oct 9 05:53 config 17 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 output 18 | drwxr-xr-x 6 root root 4096 Oct 9 05:55 processing 19 | 20 | /opt/ml/config: 21 | total 16 22 | drw-r--r-- 2 root root 4096 Oct 9 05:53 . 23 | drwxr-xr-x 5 root root 4096 Oct 9 05:55 .. 24 | -rw-r--r-- 1 root root 3628 Oct 9 05:53 processingjobconfig.json 25 | -rw-r--r-- 1 root root 44 Oct 9 05:53 resourceconfig.json 26 | 27 | /opt/ml/output: 28 | total 16 29 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 . 30 | drwxr-xr-x 5 root root 4096 Oct 9 05:55 .. 31 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 data 32 | drwxr-xr-x 3 root root 4096 Oct 9 05:55 metrics 33 | 34 | /opt/ml/output/data: 35 | total 16 36 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 . 37 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 .. 
38 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 output 39 | drwxr-xr-x 2 root root 4096 Oct 9 05:53 state 40 | 41 | /opt/ml/output/data/output: 42 | total 8 43 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 . 44 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 .. 45 | 46 | /opt/ml/output/data/state: 47 | total 8 48 | drwxr-xr-x 2 root root 4096 Oct 9 05:53 . 49 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 .. 50 | 51 | /opt/ml/output/metrics: 52 | total 12 53 | drwxr-xr-x 3 root root 4096 Oct 9 05:55 . 54 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 .. 55 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 cloudwatch 56 | 57 | /opt/ml/output/metrics/cloudwatch: 58 | total 8 59 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 . 60 | drwxr-xr-x 3 root root 4096 Oct 9 05:55 .. 61 | 62 | /opt/ml/processing: 63 | total 24 64 | drwxr-xr-x 6 root root 4096 Oct 9 05:55 . 65 | drwxr-xr-x 5 root root 4096 Oct 9 05:55 .. 66 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 data 67 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 input 68 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 output 69 | drwxr-xr-x 2 root root 4096 Oct 9 05:53 state 70 | 71 | /opt/ml/processing/data: 72 | total 8 73 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 . 74 | drwxr-xr-x 6 root root 4096 Oct 9 05:55 .. 75 | -rw-r--r-- 1 root root 0 Oct 9 05:55 sample_data.txt 76 | 77 | /opt/ml/processing/input: 78 | total 16 79 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 . 80 | drwxr-xr-x 6 root root 4096 Oct 9 05:55 .. 81 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 code 82 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 data 83 | 84 | /opt/ml/processing/input/code: 85 | total 20 86 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 . 87 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 .. 88 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 dep 89 | -rw-r--r-- 1 root root 325 Oct 9 05:55 ex3.sh 90 | drwxr-xr-x 3 root root 4096 Oct 9 05:55 worker_toolkit 91 | 92 | /opt/ml/processing/input/code/dep: 93 | total 12 94 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 . 95 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 .. 
96 | -rw-r--r-- 1 root root 23 Oct 9 05:55 ex1_dep.py 97 | 98 | /opt/ml/processing/input/code/worker_toolkit: 99 | total 24 100 | drwxr-xr-x 3 root root 4096 Oct 9 05:55 . 101 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 .. 102 | -rw-r--r-- 1 root root 0 Oct 9 05:55 __init__.py 103 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 __pycache__ 104 | -rw-r--r-- 1 root root 9637 Oct 9 05:55 worker_lib.py 105 | 106 | /opt/ml/processing/input/code/worker_toolkit/__pycache__: 107 | total 20 108 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 . 109 | drwxr-xr-x 3 root root 4096 Oct 9 05:55 .. 110 | -rw-r--r-- 1 root root 218 Oct 9 05:55 __init__.cpython-37.pyc 111 | -rw-r--r-- 1 root root 7460 Oct 9 05:55 worker_lib.cpython-37.pyc 112 | 113 | /opt/ml/processing/input/data: 114 | total 16 115 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 . 116 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 .. 117 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 cli_code_output 118 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 cli_code_state 119 | 120 | /opt/ml/processing/input/data/cli_code_output: 121 | total 12 122 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 . 123 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 .. 124 | -rw-r--r-- 1 root root 6 Oct 9 05:55 output 125 | 126 | /opt/ml/processing/input/data/cli_code_state: 127 | total 12 128 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 . 129 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 .. 130 | -rw-r--r-- 1 root root 5 Oct 9 05:55 state 131 | 132 | /opt/ml/processing/output: 133 | total 8 134 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 . 135 | drwxr-xr-x 6 root root 4096 Oct 9 05:55 .. 136 | 137 | /opt/ml/processing/state: 138 | total 8 139 | drwxr-xr-x 2 root root 4096 Oct 9 05:53 . 140 | drwxr-xr-x 6 root root 4096 Oct 9 05:55 .. 
141 | *** END file listing /opt -------------------------------------------------------------------------------- /examples/processing_cli/expected_output/output1/logs/logs0: -------------------------------------------------------------------------------- 1 | Dependency!!! 2 | INFO:__main__:======= Starting python script ... 3 | INFO:worker_toolkit.worker_lib:Deleting other instances' state 4 | INFO:worker_toolkit.worker_lib:Creating state dir 5 | INFO:worker_toolkit.worker_lib:Worker config: Namespace(channel_data='', channel_model='', channels=[], current_host='algo-1', host_rank=0, hosts=['algo-1'], hps=[], input_config_dir='', input_data_config='', input_dir='', instance_state='/opt/ml/processing/state/algo-1', job_name='cli-code-2020-10-06-23-25-36-bE5AcbXg', model_dir='', network_interface_name='', num_cpus=2, num_gpus=-1, num_nodes=1, output_data_dir='', output_dir='', resource_config='', state='/opt/ml/processing/state') 6 | Environ: environ({'PATH': '/miniconda3/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin', 'HOSTNAME': 'ip-10-0-228-147.ec2.internal', 'SSM_STATE': '/opt/ml/processing/state', 'SSM_OUTPUT': '/opt/ml/processing/output', 'AWS_REGION': 'us-east-1', 'AWS_CONTAINER_CREDENTIALS_RELATIVE_URI': '/v2/credentials/2ucF5alLTY-9jFs9Hp_ZIv8OpNqr93rvz-omNZVcG6E', 'PYTHONDONTWRITEBYTECODE': '1', 'PYTHONUNBUFFERED': '1', 'PYTHONIOENCODING': 'UTF-8', 'LANG': 'C.UTF-8', 'LC_ALL': 'C.UTF-8', 'SAGEMAKER_TRAINING_MODULE': 'sagemaker_sklearn_container.training:main', 'SAGEMAKER_SERVING_MODULE': 'sagemaker_sklearn_container.serving:main', 'HOME': '/root', 'SAGEMAKER_JOB_NAME': 'cli-code-2020-10-06-23-25-36-bE5AcbXg', 'SM_HOSTS': '["algo-1"]', 'SM_CURRENT_HOST': 'algo-1', 'SSM_NUM_NODES': '1', 'SSM_HOST_RANK': '0', 'SSM_INSTANCE_STATE': '/opt/ml/processing/state/algo-1', 'SMDEBUG_LOG_LEVEL': 'warning'}) 7 | Args: ['/opt/ml/processing/input/code/ex1.py', 'arg1', '-arg2', '--arg3', 'argument 4'] 8 | INFO:__main__:*** START listing files in /opt/ 9 | 
INFO:__main__:/opt/: 10 | total 12 11 | drwxr-xr-x 1 root root 4096 Oct 6 23:29 . 12 | drwxr-xr-x 1 root root 4096 Oct 6 23:29 .. 13 | drwxr-xr-x 5 root root 4096 Oct 6 23:29 ml 14 | 15 | /opt/ml: 16 | total 20 17 | drwxr-xr-x 5 root root 4096 Oct 6 23:29 . 18 | drwxr-xr-x 1 root root 4096 Oct 6 23:29 .. 19 | drw-r--r-- 2 root root 4096 Oct 6 23:27 config 20 | drwxr-xr-x 4 root root 4096 Oct 6 23:29 output 21 | drwxr-xr-x 5 root root 4096 Oct 6 23:29 processing 22 | 23 | /opt/ml/config: 24 | total 16 25 | drw-r--r-- 2 root root 4096 Oct 6 23:27 . 26 | drwxr-xr-x 5 root root 4096 Oct 6 23:29 .. 27 | -rw-r--r-- 1 root root 2271 Oct 6 23:27 processingjobconfig.json 28 | -rw-r--r-- 1 root root 44 Oct 6 23:27 resourceconfig.json 29 | 30 | /opt/ml/output: 31 | total 16 32 | drwxr-xr-x 4 root root 4096 Oct 6 23:29 . 33 | drwxr-xr-x 5 root root 4096 Oct 6 23:29 .. 34 | drwxr-xr-x 4 root root 4096 Oct 6 23:29 data 35 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 metrics 36 | 37 | /opt/ml/output/data: 38 | total 16 39 | drwxr-xr-x 4 root root 4096 Oct 6 23:29 . 40 | drwxr-xr-x 4 root root 4096 Oct 6 23:29 .. 41 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 output 42 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 state 43 | 44 | /opt/ml/output/data/output: 45 | total 8 46 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 . 47 | drwxr-xr-x 4 root root 4096 Oct 6 23:29 .. 48 | 49 | /opt/ml/output/data/state: 50 | total 12 51 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 . 52 | drwxr-xr-x 4 root root 4096 Oct 6 23:29 .. 53 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 algo-1 54 | 55 | /opt/ml/output/data/state/algo-1: 56 | total 8 57 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 . 58 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 .. 59 | 60 | /opt/ml/output/metrics: 61 | total 12 62 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 . 63 | drwxr-xr-x 4 root root 4096 Oct 6 23:29 .. 
64 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 cloudwatch 65 | 66 | /opt/ml/output/metrics/cloudwatch: 67 | total 8 68 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 . 69 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 .. 70 | 71 | /opt/ml/processing: 72 | total 20 73 | drwxr-xr-x 5 root root 4096 Oct 6 23:29 . 74 | drwxr-xr-x 5 root root 4096 Oct 6 23:29 .. 75 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 input 76 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 output 77 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 state 78 | 79 | /opt/ml/processing/input: 80 | total 12 81 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 . 82 | drwxr-xr-x 5 root root 4096 Oct 6 23:29 .. 83 | drwxr-xr-x 4 root root 4096 Oct 6 23:29 code 84 | 85 | /opt/ml/processing/input/code: 86 | total 20 87 | drwxr-xr-x 4 root root 4096 Oct 6 23:29 . 88 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 .. 89 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 dep 90 | -rw-r--r-- 1 root root 1037 Oct 6 23:29 ex1.py 91 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 worker_toolkit 92 | 93 | /opt/ml/processing/input/code/dep: 94 | total 12 95 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 . 96 | drwxr-xr-x 4 root root 4096 Oct 6 23:29 .. 97 | -rw-r--r-- 1 root root 23 Oct 6 23:29 ex1_dep.py 98 | 99 | /opt/ml/processing/input/code/worker_toolkit: 100 | total 24 101 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 . 102 | drwxr-xr-x 4 root root 4096 Oct 6 23:29 .. 103 | -rw-r--r-- 1 root root 0 Oct 6 23:29 __init__.py 104 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 __pycache__ 105 | -rw-r--r-- 1 root root 10325 Oct 6 23:29 worker_lib.py 106 | 107 | /opt/ml/processing/input/code/worker_toolkit/__pycache__: 108 | total 20 109 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 . 110 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 .. 111 | -rw-r--r-- 1 root root 201 Oct 6 23:29 __init__.cpython-37.pyc 112 | -rw-r--r-- 1 root root 8111 Oct 6 23:29 worker_lib.cpython-37.pyc 113 | 114 | /opt/ml/processing/output: 115 | total 8 116 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 . 
117 | drwxr-xr-x 5 root root 4096 Oct 6 23:29 .. 118 | 119 | /opt/ml/processing/state: 120 | total 12 121 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 . 122 | drwxr-xr-x 5 root root 4096 Oct 6 23:29 .. 123 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 algo-1 124 | 125 | /opt/ml/processing/state/algo-1: 126 | total 8 127 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 . 128 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 .. 129 | 130 | INFO:__main__:*** END file listing /opt/ 131 | INFO:__main__:finished! 132 | -------------------------------------------------------------------------------- /src/simple_sagemaker/ecr_sync.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import logging 3 | import os 4 | from io import BytesIO 5 | 6 | import docker 7 | from sagemaker import image_uris 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class ECRSync: 13 | def __init__(self, boto3_session): 14 | self.boto3_session = boto3_session 15 | self.ecrClient = self.boto3_session.client("ecr") 16 | 17 | def getRpoUri(self, aws_repo_name): 18 | repo_uri = None 19 | for repo in self.ecrClient.describe_repositories()["repositories"]: 20 | if repo["repositoryName"] == aws_repo_name: 21 | repo_uri = repo["repositoryUri"] 22 | return repo_uri 23 | 24 | def getOrCreateRepo(self, aws_repo_name): 25 | repo_uri = self.getRpoUri(aws_repo_name) 26 | if repo_uri is None: 27 | logging.info(f"Creating ECR repository: {aws_repo_name}") 28 | repo = self.ecrClient.create_repository(repositoryName=aws_repo_name) 29 | repo_uri = repo["repository"]["repositoryUri"] 30 | return repo_uri 31 | 32 | def getPrebuiltImage( 33 | self, 34 | instance_type, 35 | framework, 36 | framework_version, 37 | py_version, 38 | image_scope="training", 39 | ): 40 | assert framework, "Framework has to be specified" 41 | defaults = { 42 | "pytorch": ("1.6.0", "py3"), 43 | "tensorflow": ("2.3.0", "py37"), 44 | "sklearn": ("0.20.0", None), 45 | } 46 | 47 | if framework in 
defaults: 48 | if framework_version is None or py_version is None: 49 | framework_version, py_version = defaults[framework] 50 | 51 | logger.debug( 52 | f"Getting the image for {framework}, framework_version {framework_version}, python version {py_version}" 53 | ) 54 | 55 | region_name = self.boto3_session.region_name 56 | 57 | # Get the base image name, validate Dockerfile is based on it (TODO: replace in file) 58 | baseimage_uri = image_uris.retrieve( 59 | framework, 60 | region=region_name, 61 | version=framework_version, 62 | py_version=py_version, 63 | image_scope=image_scope, 64 | instance_type=instance_type, 65 | ) 66 | return baseimage_uri 67 | 68 | def buildAndPushDockerImage( 69 | self, 70 | docker_file_path_or_content, 71 | aws_repo_name, 72 | repo_name, 73 | image_tag, 74 | instance_type, 75 | framework, 76 | framework_version, 77 | py_version, 78 | ): 79 | baseimage_uri = self.getPrebuiltImage( 80 | instance_type, framework, framework_version, py_version 81 | ) 82 | 83 | if not docker_file_path_or_content: 84 | logger.debug(f"Using a pre-built image {baseimage_uri}...") 85 | return baseimage_uri 86 | 87 | repo_uri = self.getOrCreateRepo(aws_repo_name) 88 | 89 | build_args = dict() 90 | build_args["tag"] = repo_name + ":" + image_tag 91 | 92 | if os.path.isdir(docker_file_path_or_content): 93 | docker_file_path_or_content = open( 94 | os.path.join(docker_file_path_or_content, "Dockerfile"), "rt" 95 | ).read() 96 | elif os.path.isfile(docker_file_path_or_content): 97 | docker_file_path_or_content = open(docker_file_path_or_content, "rt").read() 98 | 99 | # If it's not there -> add it :) 100 | if "__BASE_IMAGE__" not in docker_file_path_or_content: 101 | logger.warning( 102 | "__BASE_IMAGE__ couln't be found in docker_file_path_or_content, it was added on the beginning!" 
103 | ) 104 | docker_file_path_or_content = ( 105 | f"FROM {baseimage_uri}\n" + docker_file_path_or_content 106 | ) 107 | else: 108 | docker_file_path_or_content = docker_file_path_or_content.replace( 109 | "__BASE_IMAGE__", baseimage_uri 110 | ) 111 | 112 | logging.info( 113 | f"Building {docker_file_path_or_content} to {repo_name}:{image_tag} and pushing to {aws_repo_name}..." 114 | ) 115 | 116 | fileObj = BytesIO(docker_file_path_or_content.encode("utf-8")) 117 | build_args["fileobj"] = fileObj 118 | 119 | # Create auth config 120 | resp = self.ecrClient.get_authorization_token() 121 | token = resp["authorizationData"][0]["authorizationToken"] 122 | token = base64.b64decode(token).decode() 123 | username, password = token.split(":") 124 | auth_config = {"username": username, "password": password} 125 | 126 | client = docker.from_env() 127 | # pull the base image 128 | client.images.pull(baseimage_uri, auth_config=auth_config) 129 | # build and tag the image 130 | image = client.images.build(**build_args) 131 | 132 | images = self.ecrClient.describe_images(repositoryName=aws_repo_name) 133 | images_digests = [x["imageDigest"] for x in images["imageDetails"]] 134 | build_repo_digests = image[0].attrs["RepoDigests"] 135 | if build_repo_digests: 136 | builtImageDigest = build_repo_digests[0].split("@")[1] 137 | if not build_repo_digests or (builtImageDigest not in images_digests): 138 | logging.info("Tagging and pushing the image...") 139 | res = image[0].tag(repo_uri, image_tag) 140 | assert res 141 | 142 | # push the image to ECR 143 | for line in client.images.push( 144 | repo_uri, image_tag, auth_config=auth_config, stream=True, decode=True 145 | ): 146 | logging.info(line) 147 | image_uri = f"{repo_uri}:{image_tag}" 148 | else: 149 | logging.info("Image already exists!") 150 | image_idx = images_digests.index(builtImageDigest) 151 | image_details = images["imageDetails"][image_idx] 152 | # see 
https://docs.aws.amazon.com/AmazonECR/latest/userguide/docker-pull-ecr-image.html 153 | image_uri = f'{repo_uri}@{image_details["imageDigest"]}' 154 | logging.info(f"Image uri: {image_uri}") 155 | return image_uri 156 | -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_rest/example5/logs/logs0: -------------------------------------------------------------------------------- 1 | bash: cannot set terminal process group (-1): Inappropriate ioctl for device 2 | bash: no job control in this shell 3 | 2020-09-13 13:23:30,697 sagemaker-training-toolkit INFO Imported framework sagemaker_pytorch_container.training 4 | 2020-09-13 13:23:30,702 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 5 | 2020-09-13 13:23:30,711 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed. 6 | 2020-09-13 13:23:33,730 sagemaker_pytorch_container.training INFO Invoking user training script. 
7 | 2020-09-13 13:23:33,996 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 8 | 2020-09-13 13:23:34,008 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 9 | 2020-09-13 13:23:34,020 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 10 | 2020-09-13 13:23:34,030 sagemaker-training-toolkit INFO Invoking user script 11 | 12 | Training Env: 13 | 14 | { 15 | "additional_framework_parameters": {}, 16 | "channel_input_dirs": { 17 | "bucket": "/opt/ml/input/data/bucket" 18 | }, 19 | "current_host": "algo-1", 20 | "framework_module": "sagemaker_pytorch_container.training:main", 21 | "hosts": [ 22 | "algo-1" 23 | ], 24 | "hyperparameters": {}, 25 | "input_config_dir": "/opt/ml/input/config", 26 | "channel_data_config": { 27 | "bucket": { 28 | "TrainingInputMode": "File", 29 | "S3DistributionType": "FullyReplicated", 30 | "RecordWrapperType": "None" 31 | } 32 | }, 33 | "input_dir": "/opt/ml/input", 34 | "is_master": true, 35 | "job_name": "task5-2020-09-13-13-20-31-Cz53I5to", 36 | "log_level": 20, 37 | "master_hostname": "algo-1", 38 | "model_dir": "/opt/ml/model", 39 | "module_dir": "s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-09-13-13-16-10_py37/task5/task5-2020-09-13-13-20-31-Cz53I5to/source/sourcedir.tar.gz", 40 | "module_name": "worker4", 41 | "network_interface_name": "eth0", 42 | "num_cpus": 2, 43 | "num_gpus": 0, 44 | "output_data_dir": "/opt/ml/output/data", 45 | "output_dir": "/opt/ml/output", 46 | "output_intermediate_dir": "/opt/ml/output/intermediate", 47 | "resource_config": { 48 | "current_host": "algo-1", 49 | "hosts": [ 50 | "algo-1" 51 | ], 52 | "network_interface_name": "eth0" 53 | }, 54 | "user_entry_point": "worker4.py" 55 | } 56 | 57 | Environment variables: 58 | 59 | SM_HOSTS=["algo-1"] 60 | SM_NETWORK_INTERFACE_NAME=eth0 61 | SM_HPS={} 62 | SM_USER_ENTRY_POINT=worker4.py 63 | SM_FRAMEWORK_PARAMS={} 64 | 
SM_RESOURCE_CONFIG={"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"} 65 | SM_channel_data_CONFIG={"bucket":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}} 66 | SM_OUTPUT_DATA_DIR=/opt/ml/output/data 67 | SM_CHANNELS=["bucket"] 68 | SM_CURRENT_HOST=algo-1 69 | SM_MODULE_NAME=worker4 70 | SM_LOG_LEVEL=20 71 | SM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main 72 | SM_INPUT_DIR=/opt/ml/input 73 | SM_INPUT_CONFIG_DIR=/opt/ml/input/config 74 | SM_OUTPUT_DIR=/opt/ml/output 75 | SM_NUM_CPUS=2 76 | SM_NUM_GPUS=0 77 | SM_MODEL_DIR=/opt/ml/model 78 | SM_MODULE_DIR=s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-09-13-13-16-10_py37/task5/task5-2020-09-13-13-20-31-Cz53I5to/source/sourcedir.tar.gz 79 | SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{"bucket":"/opt/ml/input/data/bucket"},"current_host":"algo-1","framework_module":"sagemaker_pytorch_container.training:main","hosts":["algo-1"],"hyperparameters":{},"input_config_dir":"/opt/ml/input/config","channel_data_config":{"bucket":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}},"input_dir":"/opt/ml/input","is_master":true,"job_name":"task5-2020-09-13-13-20-31-Cz53I5to","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-09-13-13-16-10_py37/task5/task5-2020-09-13-13-20-31-Cz53I5to/source/sourcedir.tar.gz","module_name":"worker4","network_interface_name":"eth0","num_cpus":2,"num_gpus":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"},"user_entry_point":"worker4.py"} 80 | SM_USER_ARGS=[] 81 | SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate 82 | 
SM_CHANNEL_BUCKET=/opt/ml/input/data/bucket 83 | PYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python36.zip:/opt/conda/lib/python3.6:/opt/conda/lib/python3.6/lib-dynload:/opt/conda/lib/python3.6/site-packages 84 | 85 | Invoking script with the following command: 86 | 87 | /opt/conda/bin/python worker4.py 88 | 89 | 90 | INFO:worker_toolkit.worker_lib:Args: Namespace(batch_size=64, channel_names=['bucket'], current_host='algo-1', epochs=50, hosts=['algo-1'], hps={}, channel_bucket='/opt/ml/input/data/bucket', input_config_dir='/opt/ml/input/config', channel_data='', channel_data_config='{"bucket":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}}', input_dir='/opt/ml/input', channel_model='', job_name='task5-2020-09-13-13-20-31-Cz53I5to', learning_rate=0.05, model_dir='/opt/ml/model', network_interface='eth0', num_cpus=2, num_gpus=0, output_data_dir='/opt/ml/output/data', output_dir='/opt/ml/output', resource_config='{"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"}', state='/state', use_cuda=False) 91 | INFO:worker_toolkit.worker_lib:Unmatched: [] 92 | INFO:__main__:*** START listing files in 93 | INFO:__main__: 94 | INFO:__main__:*** END file listing 95 | INFO:__main__:*** START listing files in /opt/ml/input/data/bucket 96 | INFO:__main__:/opt/ml/input/data/bucket: 97 | total 12 98 | drwxr-xr-x 2 root root 4096 Sep 13 13:23 . 99 | drwxr-xr-x 3 root root 4096 Sep 13 13:23 .. 
100 | -rw-r--r-- 1 root root 127 Sep 13 13:23 model.tar.gz 101 | 102 | INFO:__main__:*** END file listing /opt/ml/input/data/bucket 103 | 2020-09-13 13:23:34,107 sagemaker-training-toolkit INFO Reporting training SUCCESS 104 | -------------------------------------------------------------------------------- /examples/cli_multi/expected_output/output1/logs/logs0: -------------------------------------------------------------------------------- 1 | bash: cannot set terminal process group (-1): Inappropriate ioctl for device 2 | bash: no job control in this shell 3 | 2020-09-13 13:20:23,180 sagemaker-training-toolkit INFO Imported framework sagemaker_pytorch_container.training 4 | 2020-09-13 13:20:23,182 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 5 | 2020-09-13 13:20:23,192 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed. 6 | 2020-09-13 13:20:26,236 sagemaker_pytorch_container.training INFO Invoking user training script. 7 | 2020-09-13 13:20:26,500 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 8 | 2020-09-13 13:20:26,512 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 9 | 2020-09-13 13:20:26,524 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 10 | 2020-09-13 13:20:26,533 sagemaker-training-toolkit INFO Invoking user script 11 | 12 | Training Env: 13 | 14 | { 15 | "additional_framework_parameters": {}, 16 | "channel_input_dirs": { 17 | "data": "/opt/ml/input/data/data" 18 | }, 19 | "current_host": "algo-1", 20 | "framework_module": "sagemaker_pytorch_container.training:main", 21 | "hosts": [ 22 | "algo-1" 23 | ], 24 | "hyperparameters": { 25 | "task_type": "1" 26 | }, 27 | "input_config_dir": "/opt/ml/input/config", 28 | "channel_data_config": { 29 | "data": { 30 | "TrainingInputMode": "File", 31 | "S3DistributionType": "ShardedByS3Key", 32 | "RecordWrapperType": "None" 33 | } 34 | }, 35 | "input_dir": 
"/opt/ml/input", 36 | "is_master": true, 37 | "job_name": "task1-2020-09-13-13-16-15-3uM65148", 38 | "log_level": 20, 39 | "master_hostname": "algo-1", 40 | "model_dir": "/opt/ml/model", 41 | "module_dir": "s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli-multi_2020-09-13-13-16-10_py37/task1/task1-2020-09-13-13-16-15-3uM65148/source/sourcedir.tar.gz", 42 | "module_name": "worker", 43 | "network_interface_name": "eth0", 44 | "num_cpus": 2, 45 | "num_gpus": 0, 46 | "output_data_dir": "/opt/ml/output/data", 47 | "output_dir": "/opt/ml/output", 48 | "output_intermediate_dir": "/opt/ml/output/intermediate", 49 | "resource_config": { 50 | "current_host": "algo-1", 51 | "hosts": [ 52 | "algo-1" 53 | ], 54 | "network_interface_name": "eth0" 55 | }, 56 | "user_entry_point": "worker.py" 57 | } 58 | 59 | Environment variables: 60 | 61 | SM_HOSTS=["algo-1"] 62 | SM_NETWORK_INTERFACE_NAME=eth0 63 | SM_HPS={"task_type":"1"} 64 | SM_USER_ENTRY_POINT=worker.py 65 | SM_FRAMEWORK_PARAMS={} 66 | SM_RESOURCE_CONFIG={"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"} 67 | SM_channel_data_CONFIG={"data":{"RecordWrapperType":"None","S3DistributionType":"ShardedByS3Key","TrainingInputMode":"File"}} 68 | SM_OUTPUT_DATA_DIR=/opt/ml/output/data 69 | SM_CHANNELS=["data"] 70 | SM_CURRENT_HOST=algo-1 71 | SM_MODULE_NAME=worker 72 | SM_LOG_LEVEL=20 73 | SM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main 74 | SM_INPUT_DIR=/opt/ml/input 75 | SM_INPUT_CONFIG_DIR=/opt/ml/input/config 76 | SM_OUTPUT_DIR=/opt/ml/output 77 | SM_NUM_CPUS=2 78 | SM_NUM_GPUS=0 79 | SM_MODEL_DIR=/opt/ml/model 80 | SM_MODULE_DIR=s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli-multi_2020-09-13-13-16-10_py37/task1/task1-2020-09-13-13-16-15-3uM65148/source/sourcedir.tar.gz 81 | 
SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{"data":"/opt/ml/input/data/data"},"current_host":"algo-1","framework_module":"sagemaker_pytorch_container.training:main","hosts":["algo-1"],"hyperparameters":{"task_type":"1"},"input_config_dir":"/opt/ml/input/config","channel_data_config":{"data":{"RecordWrapperType":"None","S3DistributionType":"ShardedByS3Key","TrainingInputMode":"File"}},"input_dir":"/opt/ml/input","is_master":true,"job_name":"task1-2020-09-13-13-16-15-3uM65148","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli-multi_2020-09-13-13-16-10_py37/task1/task1-2020-09-13-13-16-15-3uM65148/source/sourcedir.tar.gz","module_name":"worker","network_interface_name":"eth0","num_cpus":2,"num_gpus":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"},"user_entry_point":"worker.py"} 82 | SM_USER_ARGS=["--task_type","1"] 83 | SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate 84 | SM_CHANNEL_DATA=/opt/ml/input/data/data 85 | SM_HP_TASK_TYPE=1 86 | PYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python36.zip:/opt/conda/lib/python3.6:/opt/conda/lib/python3.6/lib-dynload:/opt/conda/lib/python3.6/site-packages 87 | 88 | Invoking script with the following command: 89 | 90 | /opt/conda/bin/python worker.py --task_type 1 91 | 92 | 93 | INFO:__main__:Starting worker... 
94 | INFO:worker_toolkit.worker_lib:Args: Namespace(batch_size=64, channel_names=['data'], current_host='algo-1', epochs=50, hosts=['algo-1'], hps={'task_type': '1'}, input_config_dir='/opt/ml/input/config', channel_data='/opt/ml/input/data/data', channel_data_config='{"data":{"RecordWrapperType":"None","S3DistributionType":"ShardedByS3Key","TrainingInputMode":"File"}}', input_dir='/opt/ml/input', channel_model='', job_name='task1-2020-09-13-13-16-15-3uM65148', learning_rate=0.05, model_dir='/opt/ml/model', network_interface='eth0', num_cpus=2, num_gpus=0, output_data_dir='/opt/ml/output/data', output_dir='/opt/ml/output', resource_config='{"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"}', state='/state', use_cuda=False) 95 | INFO:worker_toolkit.worker_lib:Unmatched: ['--task_type', '1'] 96 | INFO:worker_toolkit.worker_lib:Deleting other instances' state 97 | INFO:worker_toolkit.worker_lib:Creating instance specific state dir 98 | INFO:__main__:Hyperparams: {'task_type': '1'} 99 | INFO:__main__:Input data files: [PosixPath('/opt/ml/input/data/data/sample_data1.txt'), PosixPath('/opt/ml/input/data/data/sample_data2.txt')] 100 | INFO:__main__:State files: [PosixPath('/state/algo-1')] 101 | INFO:__main__:finished! 
102 | 2020-09-13 13:20:26,601 sagemaker-training-toolkit INFO Reporting training SUCCESS 103 | -------------------------------------------------------------------------------- /examples/single_file/expected_output/output1/logs/logs0: -------------------------------------------------------------------------------- 1 | bash: cannot set terminal process group (-1): Inappropriate ioctl for device 2 | bash: no job control in this shell 3 | 2020-10-04 09:21:03,620 sagemaker-training-toolkit INFO Imported framework sagemaker_pytorch_container.training 4 | 2020-10-04 09:21:03,623 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 5 | 2020-10-04 09:21:03,632 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed. 6 | 2020-10-04 09:21:16,308 sagemaker_pytorch_container.training INFO Invoking user training script. 7 | 2020-10-04 09:21:16,553 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 8 | 2020-10-04 09:21:16,564 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 9 | 2020-10-04 09:21:16,575 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 10 | 2020-10-04 09:21:16,585 sagemaker-training-toolkit INFO Invoking user script 11 | 12 | Training Env: 13 | 14 | { 15 | "additional_framework_parameters": {}, 16 | "channel_input_dirs": { 17 | "data": "/opt/ml/input/data/data" 18 | }, 19 | "current_host": "algo-1", 20 | "framework_module": "sagemaker_pytorch_container.training:main", 21 | "hosts": [ 22 | "algo-1", 23 | "algo-2" 24 | ], 25 | "hyperparameters": { 26 | "task": 1, 27 | "arg": "hello world!", 28 | "worker": 1 29 | }, 30 | "input_config_dir": "/opt/ml/input/config", 31 | "input_data_config": { 32 | "data": { 33 | "TrainingInputMode": "File", 34 | "S3DistributionType": "ShardedByS3Key", 35 | "RecordWrapperType": "None" 36 | } 37 | }, 38 | "input_dir": "/opt/ml/input", 39 | "is_master": true, 40 | "job_name": 
"single-file-task1-2020-10-04-09-17-17-PMGHWPsv", 41 | "log_level": 20, 42 | "master_hostname": "algo-1", 43 | "model_dir": "/opt/ml/model", 44 | "module_dir": "s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-sf_2020-10-04-09-16-49_py37/single-file-task1/single-file-task1-2020-10-04-09-17-17-PMGHWPsv/source/sourcedir.tar.gz", 45 | "module_name": "example", 46 | "network_interface_name": "eth0", 47 | "num_cpus": 2, 48 | "num_gpus": 0, 49 | "output_data_dir": "/opt/ml/output/data", 50 | "output_dir": "/opt/ml/output", 51 | "output_intermediate_dir": "/opt/ml/output/intermediate", 52 | "resource_config": { 53 | "current_host": "algo-1", 54 | "hosts": [ 55 | "algo-1", 56 | "algo-2" 57 | ], 58 | "network_interface_name": "eth0" 59 | }, 60 | "user_entry_point": "example.py" 61 | } 62 | 63 | Environment variables: 64 | 65 | SM_HOSTS=["algo-1","algo-2"] 66 | SM_NETWORK_INTERFACE_NAME=eth0 67 | SM_HPS={"arg":"hello world!","task":1,"worker":1} 68 | SM_USER_ENTRY_POINT=example.py 69 | SM_FRAMEWORK_PARAMS={} 70 | SM_RESOURCE_CONFIG={"current_host":"algo-1","hosts":["algo-1","algo-2"],"network_interface_name":"eth0"} 71 | SM_INPUT_DATA_CONFIG={"data":{"RecordWrapperType":"None","S3DistributionType":"ShardedByS3Key","TrainingInputMode":"File"}} 72 | SM_OUTPUT_DATA_DIR=/opt/ml/output/data 73 | SM_CHANNELS=["data"] 74 | SM_CURRENT_HOST=algo-1 75 | SM_MODULE_NAME=example 76 | SM_LOG_LEVEL=20 77 | SM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main 78 | SM_INPUT_DIR=/opt/ml/input 79 | SM_INPUT_CONFIG_DIR=/opt/ml/input/config 80 | SM_OUTPUT_DIR=/opt/ml/output 81 | SM_NUM_CPUS=2 82 | SM_NUM_GPUS=0 83 | SM_MODEL_DIR=/opt/ml/model 84 | SM_MODULE_DIR=s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-sf_2020-10-04-09-16-49_py37/single-file-task1/single-file-task1-2020-10-04-09-17-17-PMGHWPsv/source/sourcedir.tar.gz 85 | 
SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{"data":"/opt/ml/input/data/data"},"current_host":"algo-1","framework_module":"sagemaker_pytorch_container.training:main","hosts":["algo-1","algo-2"],"hyperparameters":{"arg":"hello world!","task":1,"worker":1},"input_config_dir":"/opt/ml/input/config","input_data_config":{"data":{"RecordWrapperType":"None","S3DistributionType":"ShardedByS3Key","TrainingInputMode":"File"}},"input_dir":"/opt/ml/input","is_master":true,"job_name":"single-file-task1-2020-10-04-09-17-17-PMGHWPsv","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-sf_2020-10-04-09-16-49_py37/single-file-task1/single-file-task1-2020-10-04-09-17-17-PMGHWPsv/source/sourcedir.tar.gz","module_name":"example","network_interface_name":"eth0","num_cpus":2,"num_gpus":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-1","hosts":["algo-1","algo-2"],"network_interface_name":"eth0"},"user_entry_point":"example.py"} 86 | SM_USER_ARGS=["--arg","hello world!","--task","1","--worker","1"] 87 | SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate 88 | SM_CHANNEL_DATA=/opt/ml/input/data/data 89 | SM_HP_TASK=1 90 | SM_HP_ARG=hello world! 91 | SM_HP_WORKER=1 92 | PYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python36.zip:/opt/conda/lib/python3.6:/opt/conda/lib/python3.6/lib-dynload:/opt/conda/lib/python3.6/site-packages 93 | 94 | Invoking script with the following command: 95 | 96 | /opt/conda/bin/python example.py --arg hello world! --task 1 --worker 1 97 | 98 | 99 | INFO:__main__:Starting worker... 
100 | INFO:worker_toolkit.worker_lib:Deleting other instances' state 101 | INFO:worker_toolkit.worker_lib:Creating state dir 102 | INFO:worker_toolkit.worker_lib:Worker config: Namespace(channel_data='/opt/ml/input/data/data', channel_model='', channels=['data'], current_host='algo-1', host_rank=0, hosts=['algo-1', 'algo-2'], hps={'arg': 'hello world!', 'task': 1, 'worker': 1}, input_config_dir='/opt/ml/input/config', input_data_config='{"data":{"RecordWrapperType":"None","S3DistributionType":"ShardedByS3Key","TrainingInputMode":"File"}}', input_dir='/opt/ml/input', instance_state='/state/algo-1', job_name='single-file-task1-2020-10-04-09-17-17-PMGHWPsv', model_dir='/opt/ml/model', network_interface_name='eth0', num_cpus=2, num_gpus=0, num_nodes=2, output_data_dir='/opt/ml/output/data', output_dir='/opt/ml/output', resource_config='{"current_host":"algo-1","hosts":["algo-1","algo-2"],"network_interface_name":"eth0"}', state='/state') 103 | INFO:__main__:Hyperparams: {'arg': 'hello world!', 'task': 1, 'worker': 1} 104 | INFO:__main__:Input data files: [PosixPath('/opt/ml/input/data/data/sample_data1.txt')] 105 | INFO:__main__:State files: [PosixPath('/state/algo-1')] 106 | INFO:__main__:finished! 107 | 2020-10-04 09:21:16,653 sagemaker-training-toolkit INFO Reporting training SUCCESS 108 | -------------------------------------------------------------------------------- /examples/single_file/expected_output/output1/logs/logs1: -------------------------------------------------------------------------------- 1 | bash: cannot set terminal process group (-1): Inappropriate ioctl for device 2 | bash: no job control in this shell 3 | 2020-10-04 09:21:07,312 sagemaker-training-toolkit INFO Imported framework sagemaker_pytorch_container.training 4 | 2020-10-04 09:21:07,314 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 5 | 2020-10-04 09:21:07,324 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed. 
6 | 2020-10-04 09:21:10,395 sagemaker_pytorch_container.training INFO Invoking user training script. 7 | 2020-10-04 09:21:10,647 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 8 | 2020-10-04 09:21:10,659 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 9 | 2020-10-04 09:21:10,671 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 10 | 2020-10-04 09:21:10,680 sagemaker-training-toolkit INFO Invoking user script 11 | 12 | Training Env: 13 | 14 | { 15 | "additional_framework_parameters": {}, 16 | "channel_input_dirs": { 17 | "data": "/opt/ml/input/data/data" 18 | }, 19 | "current_host": "algo-2", 20 | "framework_module": "sagemaker_pytorch_container.training:main", 21 | "hosts": [ 22 | "algo-1", 23 | "algo-2" 24 | ], 25 | "hyperparameters": { 26 | "task": 1, 27 | "arg": "hello world!", 28 | "worker": 1 29 | }, 30 | "input_config_dir": "/opt/ml/input/config", 31 | "input_data_config": { 32 | "data": { 33 | "TrainingInputMode": "File", 34 | "S3DistributionType": "ShardedByS3Key", 35 | "RecordWrapperType": "None" 36 | } 37 | }, 38 | "input_dir": "/opt/ml/input", 39 | "is_master": false, 40 | "job_name": "single-file-task1-2020-10-04-09-17-17-PMGHWPsv", 41 | "log_level": 20, 42 | "master_hostname": "algo-1", 43 | "model_dir": "/opt/ml/model", 44 | "module_dir": "s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-sf_2020-10-04-09-16-49_py37/single-file-task1/single-file-task1-2020-10-04-09-17-17-PMGHWPsv/source/sourcedir.tar.gz", 45 | "module_name": "example", 46 | "network_interface_name": "eth0", 47 | "num_cpus": 2, 48 | "num_gpus": 0, 49 | "output_data_dir": "/opt/ml/output/data", 50 | "output_dir": "/opt/ml/output", 51 | "output_intermediate_dir": "/opt/ml/output/intermediate", 52 | "resource_config": { 53 | "current_host": "algo-2", 54 | "hosts": [ 55 | "algo-1", 56 | "algo-2" 57 | ], 58 | "network_interface_name": "eth0" 59 | }, 60 | "user_entry_point": "example.py" 61 
| } 62 | 63 | Environment variables: 64 | 65 | SM_HOSTS=["algo-1","algo-2"] 66 | SM_NETWORK_INTERFACE_NAME=eth0 67 | SM_HPS={"arg":"hello world!","task":1,"worker":1} 68 | SM_USER_ENTRY_POINT=example.py 69 | SM_FRAMEWORK_PARAMS={} 70 | SM_RESOURCE_CONFIG={"current_host":"algo-2","hosts":["algo-1","algo-2"],"network_interface_name":"eth0"} 71 | SM_INPUT_DATA_CONFIG={"data":{"RecordWrapperType":"None","S3DistributionType":"ShardedByS3Key","TrainingInputMode":"File"}} 72 | SM_OUTPUT_DATA_DIR=/opt/ml/output/data 73 | SM_CHANNELS=["data"] 74 | SM_CURRENT_HOST=algo-2 75 | SM_MODULE_NAME=example 76 | SM_LOG_LEVEL=20 77 | SM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main 78 | SM_INPUT_DIR=/opt/ml/input 79 | SM_INPUT_CONFIG_DIR=/opt/ml/input/config 80 | SM_OUTPUT_DIR=/opt/ml/output 81 | SM_NUM_CPUS=2 82 | SM_NUM_GPUS=0 83 | SM_MODEL_DIR=/opt/ml/model 84 | SM_MODULE_DIR=s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-sf_2020-10-04-09-16-49_py37/single-file-task1/single-file-task1-2020-10-04-09-17-17-PMGHWPsv/source/sourcedir.tar.gz 85 | SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{"data":"/opt/ml/input/data/data"},"current_host":"algo-2","framework_module":"sagemaker_pytorch_container.training:main","hosts":["algo-1","algo-2"],"hyperparameters":{"arg":"hello 
world!","task":1,"worker":1},"input_config_dir":"/opt/ml/input/config","input_data_config":{"data":{"RecordWrapperType":"None","S3DistributionType":"ShardedByS3Key","TrainingInputMode":"File"}},"input_dir":"/opt/ml/input","is_master":false,"job_name":"single-file-task1-2020-10-04-09-17-17-PMGHWPsv","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-sf_2020-10-04-09-16-49_py37/single-file-task1/single-file-task1-2020-10-04-09-17-17-PMGHWPsv/source/sourcedir.tar.gz","module_name":"example","network_interface_name":"eth0","num_cpus":2,"num_gpus":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-2","hosts":["algo-1","algo-2"],"network_interface_name":"eth0"},"user_entry_point":"example.py"} 86 | SM_USER_ARGS=["--arg","hello world!","--task","1","--worker","1"] 87 | SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate 88 | SM_CHANNEL_DATA=/opt/ml/input/data/data 89 | SM_HP_TASK=1 90 | SM_HP_ARG=hello world! 91 | SM_HP_WORKER=1 92 | PYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python36.zip:/opt/conda/lib/python3.6:/opt/conda/lib/python3.6/lib-dynload:/opt/conda/lib/python3.6/site-packages 93 | 94 | Invoking script with the following command: 95 | 96 | /opt/conda/bin/python example.py --arg hello world! --task 1 --worker 1 97 | 98 | 99 | INFO:__main__:Starting worker... 
100 | INFO:worker_toolkit.worker_lib:Deleting other instances' state 101 | INFO:worker_toolkit.worker_lib:Creating state dir 102 | INFO:worker_toolkit.worker_lib:Worker config: Namespace(channel_data='/opt/ml/input/data/data', channel_model='', channels=['data'], current_host='algo-2', host_rank=1, hosts=['algo-1', 'algo-2'], hps={'arg': 'hello world!', 'task': 1, 'worker': 1}, input_config_dir='/opt/ml/input/config', input_data_config='{"data":{"RecordWrapperType":"None","S3DistributionType":"ShardedByS3Key","TrainingInputMode":"File"}}', input_dir='/opt/ml/input', instance_state='/state/algo-2', job_name='single-file-task1-2020-10-04-09-17-17-PMGHWPsv', model_dir='/opt/ml/model', network_interface_name='eth0', num_cpus=2, num_gpus=0, num_nodes=2, output_data_dir='/opt/ml/output/data', output_dir='/opt/ml/output', resource_config='{"current_host":"algo-2","hosts":["algo-1","algo-2"],"network_interface_name":"eth0"}', state='/state') 103 | INFO:__main__:Hyperparams: {'arg': 'hello world!', 'task': 1, 'worker': 1} 104 | INFO:__main__:Input data files: [PosixPath('/opt/ml/input/data/data/sample_data2.txt')] 105 | INFO:__main__:State files: [PosixPath('/state/algo-2')] 106 | INFO:__main__:finished! 107 | 2020-10-04 09:21:10,751 sagemaker-training-toolkit INFO Reporting training SUCCESS 108 | --------------------------------------------------------------------------------