├── tests ├── __init__.py ├── smoke │ ├── __init__.py │ └── test_basics.py ├── system │ ├── __init__.py │ └── test_examples.py └── conftest.py ├── requirements.txt ├── src ├── data │ └── .gitignore ├── simple_sagemaker │ ├── worker_toolkit │ │ └── __init__.py │ ├── __init__.py │ ├── constants.py │ ├── shell_launcher.py │ ├── s3_sync.py │ ├── iam_utils.py │ └── ecr_sync.py └── cli_launcher.py ├── examples ├── single_task │ ├── input_data │ │ ├── test2 │ │ └── test │ ├── code │ │ ├── requirements.txt │ │ ├── internal_dependency │ │ │ └── lib2.py │ │ └── algo.py │ ├── expected_output │ │ ├── model │ │ │ ├── algo-1 │ │ │ │ └── model_dir │ │ │ └── algo-2 │ │ │ │ └── model_dir │ │ ├── state │ │ │ ├── algo-1 │ │ │ │ └── state_algo-1 │ │ │ └── algo-2 │ │ │ │ └── state_algo-2 │ │ └── output │ │ │ └── algo-1 │ │ │ ├── output_data_dir │ │ │ └── input_dir_copy │ │ │ ├── data │ │ │ ├── data │ │ │ │ └── test │ │ │ ├── checkpoints-manifest │ │ │ └── data-manifest │ │ │ └── config │ │ │ ├── metric-definition-regex.json │ │ │ ├── upstreamoutputdataconfig.json │ │ │ ├── checkpointconfig.json │ │ │ ├── resourceconfig.json │ │ │ ├── inputdataconfig.json │ │ │ ├── tensorboardoutputconfig.json │ │ │ ├── hyperparameters.json │ │ │ ├── init-config.json │ │ │ └── trainingjobconfig.json │ ├── external_dependency │ │ └── lib1.py │ ├── docker │ │ └── Dockerfile │ └── example.py ├── imagenet │ ├── code │ │ ├── .gitignore │ │ ├── extract.sh │ │ ├── download.sh │ │ └── download-all.sh │ ├── run_local.sh │ └── run_remote.sh ├── processing_cli │ ├── data │ │ └── sample_data.txt │ ├── dep │ │ └── ex1_dep.py │ ├── expected_output │ │ ├── output1 │ │ │ ├── output │ │ │ │ └── output │ │ │ ├── state │ │ │ │ └── state │ │ │ └── logs │ │ │ │ └── logs0 │ │ ├── output2 │ │ │ ├── output │ │ │ │ ├── output │ │ │ │ └── config │ │ │ │ │ ├── resourceconfig.json │ │ │ │ │ └── processingjobconfig.json │ │ │ ├── state │ │ │ │ └── state │ │ │ └── logs │ │ │ │ └── logs0 │ │ └── output3 │ │ │ ├── state │ │ │ └── 
state_sh │ │ │ ├── output │ │ │ ├── output_sh │ │ │ └── config │ │ │ │ ├── resourceconfig.json │ │ │ │ └── processingjobconfig.json │ │ │ └── logs │ │ │ └── logs0 │ ├── ex3.sh │ ├── ex1.py │ └── run.sh ├── dogs_vs_cats │ ├── .gitignore │ └── run_remote.sh ├── cli_multi │ ├── expected_output │ │ └── output1 │ │ │ ├── model │ │ │ ├── output_algo-1 │ │ │ ├── sample_data1.txt_proc_by_algo-1 │ │ │ └── sample_data2.txt_proc_by_algo-1 │ │ │ ├── state │ │ │ └── algo-1 │ │ │ │ └── state_algo-1 │ │ │ └── logs │ │ │ └── logs0 │ ├── run.sh │ └── worker.py ├── readme_examples │ ├── data │ │ └── sample_data.txt │ ├── expected_output_smoke │ │ └── example7 │ │ │ └── output │ │ │ ├── success │ │ │ └── data │ │ │ └── ps__elf │ ├── example6 │ │ ├── data │ │ │ ├── sample_data1.txt │ │ │ └── sample_data2.txt │ │ ├── code │ │ │ ├── requirements.txt │ │ │ ├── internal_dependency │ │ │ │ └── lib2.py │ │ │ └── worker6.py │ │ ├── external_dependency │ │ │ └── lib1.py │ │ └── Dockerfile │ ├── expected_output │ │ ├── example6_1 │ │ │ ├── state │ │ │ │ ├── algo-1 │ │ │ │ │ └── algo-1 │ │ │ │ └── algo-2 │ │ │ │ │ └── algo-2 │ │ │ ├── output │ │ │ │ └── output_algo-1 │ │ │ └── model │ │ │ │ ├── sample_data1.txt_proc_by_algo-1 │ │ │ │ ├── sample_data1.txt_proc_by_algo-2 │ │ │ │ ├── sample_data2.txt_proc_by_algo-1 │ │ │ │ └── sample_data2.txt_proc_by_algo-2 │ │ └── example1 │ │ │ └── logs │ │ │ ├── logs0 │ │ │ └── logs1 │ ├── expected_output_rest │ │ ├── example3 │ │ │ ├── model │ │ │ │ └── model_dir │ │ │ ├── state │ │ │ │ └── state_dir │ │ │ ├── output │ │ │ │ └── output_data_dir │ │ │ └── logs │ │ │ │ └── logs0 │ │ ├── example3_2 │ │ │ ├── model │ │ │ │ └── model_dir │ │ │ ├── state │ │ │ │ └── state_dir │ │ │ ├── output │ │ │ │ └── output_data_dir │ │ │ └── logs │ │ │ │ └── logs0 │ │ ├── example3_2_stdout │ │ ├── example2 │ │ │ └── logs │ │ │ │ └── logs0 │ │ └── example5 │ │ │ └── logs │ │ │ └── logs0 │ ├── worker1.py │ ├── worker2.py │ ├── run_smoke.sh │ ├── run_smoke.bat │ ├── worker3.py │ 
├── worker4.py │ ├── run_rest.sh │ └── run.sh ├── single_file │ ├── data │ │ ├── sample_data1.txt │ │ └── sample_data2.txt │ ├── expected_output │ │ └── output1 │ │ │ ├── model │ │ │ ├── output_algo-1 │ │ │ ├── output_algo-2 │ │ │ ├── sample_data1.txt_proc_by_algo-1 │ │ │ └── sample_data2.txt_proc_by_algo-2 │ │ │ ├── state │ │ │ ├── algo-1 │ │ │ │ └── state_algo-1 │ │ │ └── algo-2 │ │ │ │ └── state_algo-2 │ │ │ └── logs │ │ │ ├── logs0 │ │ │ └── logs1 │ └── example.py ├── debugging │ ├── tensorboard │ │ ├── requirements.txt │ │ └── lightning.py │ ├── metrics_example.JPG │ ├── metrics.py │ └── run.sh ├── medium │ ├── intro │ │ ├── example3 │ │ │ ├── data │ │ │ │ ├── sample_data1.txt │ │ │ │ └── sample_data2.txt │ │ │ ├── code │ │ │ │ ├── requirements.txt │ │ │ │ ├── internal_dependency │ │ │ │ │ └── lib2.py │ │ │ │ └── ssm_ex3_worker.py │ │ │ └── external_dependency │ │ │ │ └── lib1.py │ │ ├── ssm_ex2.py │ │ ├── README.md │ │ ├── run2.sh │ │ ├── run1.sh │ │ └── run3.sh │ └── distributed │ │ ├── cifar10 │ │ ├── run_local.sh │ │ └── run_remote.sh │ │ └── README.md ├── multiple_tasks │ ├── expected_output │ │ ├── output1 │ │ │ ├── model │ │ │ │ ├── algo-1 │ │ │ │ │ └── model_dir │ │ │ │ └── algo-2 │ │ │ │ │ └── model_dir │ │ │ ├── state │ │ │ │ ├── algo-1 │ │ │ │ │ ├── state_algo-1_1 │ │ │ │ │ ├── state_algo-1_10 │ │ │ │ │ ├── state_algo-1_2 │ │ │ │ │ ├── state_algo-1_3 │ │ │ │ │ ├── state_algo-1_4 │ │ │ │ │ ├── state_algo-1_5 │ │ │ │ │ ├── state_algo-1_6 │ │ │ │ │ ├── state_algo-1_7 │ │ │ │ │ ├── state_algo-1_8 │ │ │ │ │ └── state_algo-1_9 │ │ │ │ └── algo-2 │ │ │ │ │ ├── state_algo-2_1 │ │ │ │ │ ├── state_algo-2_10 │ │ │ │ │ ├── state_algo-2_2 │ │ │ │ │ ├── state_algo-2_3 │ │ │ │ │ ├── state_algo-2_4 │ │ │ │ │ ├── state_algo-2_5 │ │ │ │ │ ├── state_algo-2_6 │ │ │ │ │ ├── state_algo-2_7 │ │ │ │ │ ├── state_algo-2_8 │ │ │ │ │ └── state_algo-2_9 │ │ │ └── output │ │ │ │ └── algo-1 │ │ │ │ ├── data_copy │ │ │ │ └── test │ │ │ │ └── state_copy │ │ │ │ └── algo-1 │ │ │ │ 
├── state_algo-1_1 │ │ │ │ ├── state_algo-1_10 │ │ │ │ ├── state_algo-1_2 │ │ │ │ ├── state_algo-1_3 │ │ │ │ ├── state_algo-1_4 │ │ │ │ ├── state_algo-1_5 │ │ │ │ ├── state_algo-1_6 │ │ │ │ ├── state_algo-1_7 │ │ │ │ ├── state_algo-1_8 │ │ │ │ └── state_algo-1_9 │ │ └── output2 │ │ │ └── output │ │ │ └── algo-1 │ │ │ ├── data_copy │ │ │ └── test │ │ │ ├── task1_state1_copy │ │ │ ├── algo-1 │ │ │ │ ├── state_algo-1_1 │ │ │ │ ├── state_algo-1_10 │ │ │ │ ├── state_algo-1_2 │ │ │ │ ├── state_algo-1_3 │ │ │ │ ├── state_algo-1_4 │ │ │ │ ├── state_algo-1_5 │ │ │ │ ├── state_algo-1_6 │ │ │ │ ├── state_algo-1_7 │ │ │ │ ├── state_algo-1_8 │ │ │ │ └── state_algo-1_9 │ │ │ └── algo-2 │ │ │ │ ├── state_algo-2_1 │ │ │ │ ├── state_algo-2_10 │ │ │ │ ├── state_algo-2_2 │ │ │ │ ├── state_algo-2_3 │ │ │ │ ├── state_algo-2_4 │ │ │ │ ├── state_algo-2_5 │ │ │ │ ├── state_algo-2_6 │ │ │ │ ├── state_algo-2_7 │ │ │ │ ├── state_algo-2_8 │ │ │ │ └── state_algo-2_9 │ │ │ ├── task1_state2_copy │ │ │ ├── algo-1 │ │ │ │ ├── state_algo-1_1 │ │ │ │ ├── state_algo-1_2 │ │ │ │ ├── state_algo-1_4 │ │ │ │ ├── state_algo-1_6 │ │ │ │ └── state_algo-1_8 │ │ │ └── algo-2 │ │ │ │ ├── state_algo-2_1 │ │ │ │ ├── state_algo-2_2 │ │ │ │ ├── state_algo-2_4 │ │ │ │ ├── state_algo-2_6 │ │ │ │ └── state_algo-2_8 │ │ │ ├── model_copy │ │ │ └── model.tar.gz │ │ │ └── task1_state3_copy │ │ │ └── output.tar.gz │ ├── code │ │ └── algo_multi.py │ └── example.py ├── update_expected │ └── update.sh └── retcode │ └── run.sh ├── docs ├── metric_example.jpg ├── source │ ├── modules.rst │ ├── index.rst │ ├── simple_sagemaker.task_toolkit.rst │ ├── simple_sagemaker.rst │ └── conf.py ├── Makefile ├── make.bat └── high_level_flow.txt ├── setup.py ├── MANIFEST.in ├── .gitignore ├── pyproject.toml ├── .vscode ├── settings.json └── launch.json ├── setup.cfg ├── tox.ini └── .github └── workflows └── build.yaml /tests/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tox -------------------------------------------------------------------------------- /src/data/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/smoke/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/system/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/single_task/input_data/test2: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/imagenet/code/.gitignore: -------------------------------------------------------------------------------- 1 | main.py -------------------------------------------------------------------------------- /examples/processing_cli/data/sample_data.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/single_task/input_data/test: -------------------------------------------------------------------------------- 1 | hello world! 
-------------------------------------------------------------------------------- /src/simple_sagemaker/worker_toolkit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/simple_sagemaker/__init__.py: -------------------------------------------------------------------------------- 1 | VERSION = "0.9.24" 2 | -------------------------------------------------------------------------------- /examples/dogs_vs_cats/.gitignore: -------------------------------------------------------------------------------- 1 | main.py 2 | data/* 3 | output/* -------------------------------------------------------------------------------- /examples/cli_multi/expected_output/output1/model/output_algo-1: -------------------------------------------------------------------------------- 1 | output -------------------------------------------------------------------------------- /examples/processing_cli/dep/ex1_dep.py: -------------------------------------------------------------------------------- 1 | print("Dependency!!!") 2 | -------------------------------------------------------------------------------- /examples/processing_cli/expected_output/output1/output/output: -------------------------------------------------------------------------------- 1 | output -------------------------------------------------------------------------------- /examples/processing_cli/expected_output/output1/state/state: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/readme_examples/data/sample_data.txt: -------------------------------------------------------------------------------- 1 | sample data content -------------------------------------------------------------------------------- /examples/single_file/data/sample_data1.txt: 
-------------------------------------------------------------------------------- 1 | Single file sample data 1 -------------------------------------------------------------------------------- /examples/single_file/data/sample_data2.txt: -------------------------------------------------------------------------------- 1 | Single file sample data 2 -------------------------------------------------------------------------------- /examples/single_task/code/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==3.0.2 2 | -------------------------------------------------------------------------------- /examples/single_task/expected_output/model/algo-1/model_dir: -------------------------------------------------------------------------------- 1 | model_dir -------------------------------------------------------------------------------- /examples/single_task/expected_output/model/algo-2/model_dir: -------------------------------------------------------------------------------- 1 | model_dir -------------------------------------------------------------------------------- /examples/cli_multi/expected_output/output1/state/algo-1/state_algo-1: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/debugging/tensorboard/requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch-lightning 2 | -------------------------------------------------------------------------------- /examples/medium/intro/example3/data/sample_data1.txt: -------------------------------------------------------------------------------- 1 | sample data content 1 -------------------------------------------------------------------------------- /examples/medium/intro/example3/data/sample_data2.txt: -------------------------------------------------------------------------------- 
1 | sample data content 2 -------------------------------------------------------------------------------- /examples/processing_cli/expected_output/output2/output/output: -------------------------------------------------------------------------------- 1 | output 2 | -------------------------------------------------------------------------------- /examples/processing_cli/expected_output/output2/state/state: -------------------------------------------------------------------------------- 1 | state 2 | -------------------------------------------------------------------------------- /examples/processing_cli/expected_output/output3/state/state_sh: -------------------------------------------------------------------------------- 1 | state 2 | -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_smoke/example7/output/success: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/single_file/expected_output/output1/model/output_algo-1: -------------------------------------------------------------------------------- 1 | output -------------------------------------------------------------------------------- /examples/single_file/expected_output/output1/model/output_algo-2: -------------------------------------------------------------------------------- 1 | output -------------------------------------------------------------------------------- /examples/medium/intro/example3/code/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==3.0.2 2 | -------------------------------------------------------------------------------- /examples/processing_cli/expected_output/output3/output/output_sh: -------------------------------------------------------------------------------- 1 | output 2 | 
-------------------------------------------------------------------------------- /examples/readme_examples/example6/data/sample_data1.txt: -------------------------------------------------------------------------------- 1 | sample data content 1 -------------------------------------------------------------------------------- /examples/readme_examples/example6/data/sample_data2.txt: -------------------------------------------------------------------------------- 1 | sample data content 2 -------------------------------------------------------------------------------- /examples/single_file/expected_output/output1/state/algo-1/state_algo-1: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/single_file/expected_output/output1/state/algo-2/state_algo-2: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/single_task/expected_output/state/algo-1/state_algo-1: -------------------------------------------------------------------------------- 1 | state_algo-1 -------------------------------------------------------------------------------- /examples/single_task/expected_output/state/algo-2/state_algo-2: -------------------------------------------------------------------------------- 1 | state_algo-2 -------------------------------------------------------------------------------- /src/cli_launcher.py: -------------------------------------------------------------------------------- 1 | from simple_sagemaker import cli 2 | 3 | cli.main() 4 | -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/model/algo-1/model_dir: -------------------------------------------------------------------------------- 1 | model_dir 
-------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/model/algo-2/model_dir: -------------------------------------------------------------------------------- 1 | model_dir -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-1/state_algo-1_1: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-1/state_algo-1_10: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-1/state_algo-1_2: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-1/state_algo-1_3: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-1/state_algo-1_4: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-1/state_algo-1_5: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-1/state_algo-1_6: -------------------------------------------------------------------------------- 1 | state 
-------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-1/state_algo-1_7: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-1/state_algo-1_8: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-1/state_algo-1_9: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-2/state_algo-2_1: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-2/state_algo-2_10: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-2/state_algo-2_2: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-2/state_algo-2_3: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-2/state_algo-2_4: -------------------------------------------------------------------------------- 1 | 
state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-2/state_algo-2_5: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-2/state_algo-2_6: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-2/state_algo-2_7: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-2/state_algo-2_8: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/state/algo-2/state_algo-2_9: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/readme_examples/example6/code/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==3.0.2 2 | -------------------------------------------------------------------------------- /examples/readme_examples/expected_output/example6_1/state/algo-1/algo-1: -------------------------------------------------------------------------------- 1 | state_algo-1 -------------------------------------------------------------------------------- /examples/readme_examples/expected_output/example6_1/state/algo-2/algo-2: -------------------------------------------------------------------------------- 1 | 
state_algo-2 -------------------------------------------------------------------------------- /examples/single_task/expected_output/output/algo-1/output_data_dir: -------------------------------------------------------------------------------- 1 | output_data_dir -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/output/algo-1/data_copy/test: -------------------------------------------------------------------------------- 1 | hello world! -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/data_copy/test: -------------------------------------------------------------------------------- 1 | hello world! -------------------------------------------------------------------------------- /examples/readme_examples/expected_output/example6_1/output/output_algo-1: -------------------------------------------------------------------------------- 1 | output_algo-1 -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_rest/example3/model/model_dir: -------------------------------------------------------------------------------- 1 | model_dir file -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_rest/example3/state/state_dir: -------------------------------------------------------------------------------- 1 | state_dir file -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_rest/example3_2/model/model_dir: -------------------------------------------------------------------------------- 1 | model_dir file -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_rest/example3_2/state/state_dir: 
-------------------------------------------------------------------------------- 1 | state_dir file -------------------------------------------------------------------------------- /examples/single_task/external_dependency/lib1.py: -------------------------------------------------------------------------------- 1 | print("-- External Lib1 imported!") 2 | -------------------------------------------------------------------------------- /examples/single_task/code/internal_dependency/lib2.py: -------------------------------------------------------------------------------- 1 | print("-- Internal Lib2 imported!") 2 | -------------------------------------------------------------------------------- /examples/single_task/expected_output/output/algo-1/input_dir_copy/data/data/test: -------------------------------------------------------------------------------- 1 | hello world! -------------------------------------------------------------------------------- /examples/medium/intro/example3/external_dependency/lib1.py: -------------------------------------------------------------------------------- 1 | print("-- External Lib1 imported!") 2 | -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_rest/example3/output/output_data_dir: -------------------------------------------------------------------------------- 1 | output_data_dir file -------------------------------------------------------------------------------- /examples/medium/intro/example3/code/internal_dependency/lib2.py: -------------------------------------------------------------------------------- 1 | print("-- Internal Lib2 imported!") 2 | -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/output/algo-1/state_copy/algo-1/state_algo-1_1: -------------------------------------------------------------------------------- 1 | state 
-------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/output/algo-1/state_copy/algo-1/state_algo-1_10: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/output/algo-1/state_copy/algo-1/state_algo-1_2: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/output/algo-1/state_copy/algo-1/state_algo-1_3: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/output/algo-1/state_copy/algo-1/state_algo-1_4: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/output/algo-1/state_copy/algo-1/state_algo-1_5: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/output/algo-1/state_copy/algo-1/state_algo-1_6: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/output/algo-1/state_copy/algo-1/state_algo-1_7: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- 
/examples/multiple_tasks/expected_output/output1/output/algo-1/state_copy/algo-1/state_algo-1_8: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output1/output/algo-1/state_copy/algo-1/state_algo-1_9: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/readme_examples/example6/external_dependency/lib1.py: -------------------------------------------------------------------------------- 1 | print("-- External Lib1 imported!") 2 | -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_rest/example3_2/output/output_data_dir: -------------------------------------------------------------------------------- 1 | output_data_dir file -------------------------------------------------------------------------------- /examples/single_task/expected_output/output/algo-1/input_dir_copy/config/metric-definition-regex.json: -------------------------------------------------------------------------------- 1 | [] -------------------------------------------------------------------------------- /docs/metric_example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiftan/simple_sagemaker/HEAD/docs/metric_example.jpg -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-1/state_algo-1_1: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- 
/examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-1/state_algo-1_10: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-1/state_algo-1_2: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-1/state_algo-1_3: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-1/state_algo-1_4: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-1/state_algo-1_5: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-1/state_algo-1_6: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-1/state_algo-1_7: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- 
/examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-1/state_algo-1_8: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-1/state_algo-1_9: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-2/state_algo-2_1: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-2/state_algo-2_10: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-2/state_algo-2_2: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-2/state_algo-2_3: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-2/state_algo-2_4: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- 
/examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-2/state_algo-2_5: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-2/state_algo-2_6: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-2/state_algo-2_7: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-2/state_algo-2_8: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state1_copy/algo-2/state_algo-2_9: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state2_copy/algo-1/state_algo-1_1: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state2_copy/algo-1/state_algo-1_2: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- 
/examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state2_copy/algo-1/state_algo-1_4: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state2_copy/algo-1/state_algo-1_6: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state2_copy/algo-1/state_algo-1_8: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state2_copy/algo-2/state_algo-2_1: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state2_copy/algo-2/state_algo-2_2: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state2_copy/algo-2/state_algo-2_4: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state2_copy/algo-2/state_algo-2_6: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- 
/examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state2_copy/algo-2/state_algo-2_8: -------------------------------------------------------------------------------- 1 | state -------------------------------------------------------------------------------- /examples/readme_examples/example6/code/internal_dependency/lib2.py: -------------------------------------------------------------------------------- 1 | print("-- Internal Lib2 imported!") 2 | -------------------------------------------------------------------------------- /examples/single_task/expected_output/output/algo-1/input_dir_copy/config/upstreamoutputdataconfig.json: -------------------------------------------------------------------------------- 1 | [] -------------------------------------------------------------------------------- /examples/single_task/expected_output/output/algo-1/input_dir_copy/config/checkpointconfig.json: -------------------------------------------------------------------------------- 1 | {"LocalPath":"/state"} -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import setuptools 4 | 5 | if __name__ == "__main__": 6 | setuptools.setup() -------------------------------------------------------------------------------- /examples/cli_multi/expected_output/output1/model/sample_data1.txt_proc_by_algo-1: -------------------------------------------------------------------------------- 1 | Single file sample data 1 processed by algo-1 -------------------------------------------------------------------------------- /examples/cli_multi/expected_output/output1/model/sample_data2.txt_proc_by_algo-1: -------------------------------------------------------------------------------- 1 | Single file sample data 2 processed by algo-1 -------------------------------------------------------------------------------- 
/examples/processing_cli/expected_output/output2/output/config/resourceconfig.json: -------------------------------------------------------------------------------- 1 | {"current_host":"algo-1","hosts":["algo-1"]} -------------------------------------------------------------------------------- /examples/processing_cli/expected_output/output3/output/config/resourceconfig.json: -------------------------------------------------------------------------------- 1 | {"current_host":"algo-1","hosts":["algo-1"]} -------------------------------------------------------------------------------- /examples/single_file/expected_output/output1/model/sample_data1.txt_proc_by_algo-1: -------------------------------------------------------------------------------- 1 | Single file sample data 1 processed by algo-1 -------------------------------------------------------------------------------- /examples/single_file/expected_output/output1/model/sample_data2.txt_proc_by_algo-2: -------------------------------------------------------------------------------- 1 | Single file sample data 2 processed by algo-2 -------------------------------------------------------------------------------- /examples/readme_examples/expected_output/example6_1/model/sample_data1.txt_proc_by_algo-1: -------------------------------------------------------------------------------- 1 | sample data content 1 processed by algo-1 -------------------------------------------------------------------------------- /examples/readme_examples/expected_output/example6_1/model/sample_data1.txt_proc_by_algo-2: -------------------------------------------------------------------------------- 1 | sample data content 1 processed by algo-2 -------------------------------------------------------------------------------- /examples/readme_examples/expected_output/example6_1/model/sample_data2.txt_proc_by_algo-1: -------------------------------------------------------------------------------- 1 | sample data content 2 processed by 
algo-1 -------------------------------------------------------------------------------- /examples/readme_examples/expected_output/example6_1/model/sample_data2.txt_proc_by_algo-2: -------------------------------------------------------------------------------- 1 | sample data content 2 processed by algo-2 -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | simple_sagemaker 2 | ================ 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | simple_sagemaker 8 | -------------------------------------------------------------------------------- /examples/debugging/metrics_example.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiftan/simple_sagemaker/HEAD/examples/debugging/metrics_example.JPG -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Include the README 2 | include *.rst 3 | 4 | # Include the license file 5 | include LICENSE.txt 6 | 7 | graft tests 8 | graft examples -------------------------------------------------------------------------------- /examples/single_task/expected_output/output/algo-1/input_dir_copy/data/checkpoints-manifest: -------------------------------------------------------------------------------- 1 | tests/simple-sagemaker-example_2020-10-04-09-16-51_py37/Task1/state 2 | -------------------------------------------------------------------------------- /examples/medium/intro/ssm_ex2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | for i in range(torch.cuda.device_count()): 4 | print(f"-***- Device {i}: {torch.cuda.get_device_properties(i)}") 5 | -------------------------------------------------------------------------------- 
/examples/readme_examples/worker1.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | for i in range(torch.cuda.device_count()): 4 | print(f"-***- Device {i}: {torch.cuda.get_device_properties(i)}") 5 | -------------------------------------------------------------------------------- /examples/readme_examples/worker2.py: -------------------------------------------------------------------------------- 1 | from worker_toolkit import worker_lib 2 | 3 | worker_config = worker_lib.WorkerConfig(False) 4 | print("-***-", worker_config.hps["msg"]) 5 | -------------------------------------------------------------------------------- /examples/single_task/expected_output/output/algo-1/input_dir_copy/config/resourceconfig.json: -------------------------------------------------------------------------------- 1 | {"current_host": "algo-1", "hosts": ["algo-1", "algo-2"], "network_interface_name": "eth0"} -------------------------------------------------------------------------------- /examples/single_task/expected_output/output/algo-1/input_dir_copy/config/inputdataconfig.json: -------------------------------------------------------------------------------- 1 | {"data":{"TrainingInputMode":"File","S3DistributionType":"ShardedByS3Key","RecordWrapperType":"None"}} -------------------------------------------------------------------------------- /examples/readme_examples/example6/Dockerfile: -------------------------------------------------------------------------------- 1 | # __BASE_IMAGE__ is automatically replaced with the correct base image 2 | FROM __BASE_IMAGE__ 3 | RUN pip3 install pandas==0.25.3 scikit-learn==0.21.3 4 | -------------------------------------------------------------------------------- /examples/medium/intro/README.md: -------------------------------------------------------------------------------- 1 | Examples from the Medium blog post [Cloud processing is now easier and 
cheaper!](https://medium.com/@shiftan/a-very-simple-and-cheap-way-to-run-your-processing-job-on-the-cloud-c76af579f9e9) -------------------------------------------------------------------------------- /examples/medium/intro/run2.sh: -------------------------------------------------------------------------------- 1 | BASEDIR=$(dirname "$0") 2 | pushd . 3 | cd $BASEDIR 4 | 5 | ssm run -p ssm-ex -t ex2 -e ssm_ex2.py -o ./out2 --it ml.p3.2xlarge --ic 2 --force_running 6 | 7 | cat ./out2/logs/logs0 8 | 9 | popd -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/model_copy/model.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiftan/simple_sagemaker/HEAD/examples/multiple_tasks/expected_output/output2/output/algo-1/model_copy/model.tar.gz -------------------------------------------------------------------------------- /examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state3_copy/output.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiftan/simple_sagemaker/HEAD/examples/multiple_tasks/expected_output/output2/output/algo-1/task1_state3_copy/output.tar.gz -------------------------------------------------------------------------------- /examples/single_task/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # __BASE_IMAGE__ is automatically replaced with the correct base image 2 | FROM __BASE_IMAGE__ 3 | #FROM python:3.7-slim-buster 4 | 5 | RUN pip3 install pandas==0.25.3 scikit-learn==0.21.3 6 | #ENV PYTHONUNBUFFERED=TRUE 7 | 8 | #ENTRYPOINT ["python3"] -------------------------------------------------------------------------------- /examples/single_task/expected_output/output/algo-1/input_dir_copy/config/tensorboardoutputconfig.json: 
-------------------------------------------------------------------------------- 1 | {"S3OutputPath":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example_2020-10-05-09-16-33_py37/Task1/Task1-2020-10-05-09-17-22-LlJvq4UU/tensorboard-output","LocalPath":"/opt/ml/output/tensorboard/"} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # general things to ignore 2 | build/ 3 | dist/ 4 | *.egg-info/ 5 | *.egg 6 | *.py[cod] 7 | __pycache__/ 8 | *.so 9 | *~ 10 | 11 | # due to using tox and pytest 12 | .tox 13 | .cache 14 | 15 | examples/*/output 16 | examples/**/*.extracted* 17 | htmlcov/* 18 | .coverage* 19 | examples/**/cifar-10-* -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools >= 40.6.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.isort] 6 | multi_line_output = 3 7 | include_trailing_comma = true 8 | force_grid_wrap = 0 9 | use_parentheses = true 10 | ensure_newline_before_comments = true 11 | line_length = 88 -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def pytest_addoption(parser): 5 | parser.addoption( 6 | "--cmdopt", action="store", default="type1", help="my option: type1 or type2" 7 | ) 8 | 9 | 10 | @pytest.fixture 11 | def cmdopt(request): 12 | return request.config.getoption("--cmdopt") 13 | -------------------------------------------------------------------------------- /examples/debugging/metrics.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | def main(): 5 | vals1 = [60, 6, 0.6] 6 | vals2 = 
[-10, 0, 10] 7 | for (val1, val2) in zip(vals1, vals2): 8 | print(f"Val1: {val1:.4e}") 9 | print(f"Val2: {val2}") 10 | time.sleep(60) 11 | 12 | 13 | if __name__ == "__main__": 14 | main() 15 | -------------------------------------------------------------------------------- /examples/single_task/expected_output/output/algo-1/input_dir_copy/data/data-manifest: -------------------------------------------------------------------------------- 1 | tests/simple-sagemaker-example_2020-10-04-09-16-51_py37/Task1/input 2 | s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example_2020-10-04-09-16-51_py37/Task1/input/test 3 | s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example_2020-10-04-09-16-51_py37/Task1/input/test2 4 | -------------------------------------------------------------------------------- /examples/processing_cli/ex3.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | echo "======= Starting Bash script ..." 4 | echo "-***- Args:" $@ 5 | echo "-- Env:", `env` 6 | echo "-***- Pwd:", `pwd` 7 | echo "*** START listing files in /opt" 8 | ls -laR /opt 9 | echo "*** END file listing /opt" 10 | cp -r /opt/ml/config $SSM_OUTPUT/config 11 | echo "output" > $SSM_OUTPUT/output_sh 12 | echo "state" > $SSM_STATE/state_sh -------------------------------------------------------------------------------- /examples/update_expected/update.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | BASEDIR=$(dirname "$0") 4 | cd $BASEDIR 5 | 6 | rm -rf output 7 | unzip $1 -d ./output 8 | cd output 9 | mv popen*/*0/* . 10 | rm -r popen* 11 | find . | grep "\.extracted" | xargs rm 12 | 13 | for file in *; do 14 | echo updating $file ... 
15 | rm -rf ../../$file/expected_output/* 16 | cp -r $file/output/* ../../$file/expected_output 17 | done -------------------------------------------------------------------------------- /examples/cli_multi/run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Params: [output] [prefix] [suffix] [additional ssm params...] 4 | BASEDIR=$(dirname "$0") 5 | ssm run --prefix ${2} -p simple-sagemaker-example-cli-multi -t task1${3} -e $BASEDIR/worker.py -o $1/output1 --task_type 1 -i $BASEDIR/../single_file/data ${@:4} 6 | ssm run --prefix ${2} -p simple-sagemaker-example-cli-multi -t task2${3} -e $BASEDIR/worker.py -o $1/output2 --task_type 2 --iit task2_data task1 model ${@:4} 7 | -------------------------------------------------------------------------------- /examples/single_task/expected_output/output/algo-1/input_dir_copy/config/hyperparameters.json: -------------------------------------------------------------------------------- 1 | {"arg2":"\"hello\"","sagemaker_container_log_level":"20","sagemaker_program":"\"algo.py\"","arg1":"5","sagemaker_region":"\"us-east-1\"","sagemaker_job_name":"\"Task1-2020-10-04-09-17-37-x6ux770b\"","sagemaker_submit_directory":"\"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example_2020-10-04-09-16-51_py37/Task1/Task1-2020-10-04-09-17-37-x6ux770b/source/sourcedir.tar.gz\""} -------------------------------------------------------------------------------- /examples/medium/intro/run1.sh: -------------------------------------------------------------------------------- 1 | BASEDIR=$(dirname "$0") 2 | pushd . 3 | cd $BASEDIR 4 | 5 | # Clean the current state to make sure the code runs again 6 | # Note: 1. 
It is done just for demonstration, by appending "--force_running" to the "ssm shell" command below 7 | ssm data -p ssm-ex -t ex1 --force_running 8 | # Run the task 9 | ssm shell -p ssm-ex -t ex1 -o ./out1 --it ml.p3.2xlarge --cmd_line "cat /proc/cpuinfo && nvidia-smi" 10 | 11 | cat ./out1/logs/logs0 12 | 13 | popd -------------------------------------------------------------------------------- /examples/readme_examples/run_smoke.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | set -e # fail if any test fails 3 | 4 | # Params: [output] [prefix] [suffix] [additional ssm params...] 5 | BASEDIR=$(dirname "$0") 6 | echo "Running with", $@ 7 | 8 | # Example 7 - local mode 9 | # --ks is used to avoid messing with state (not supported in local mode) 10 | ssm shell --prefix ${2} -p simple-sagemaker-example-cli -t shell-cli-local${3} \ 11 | --cmd_line "ps -elf >> \$SM_OUTPUT_DATA_DIR/ps__elf" \ 12 | -o $1/example7 --it 'local' --no_spot --download_output ${@:4} --ks -------------------------------------------------------------------------------- /examples/readme_examples/run_smoke.bat: -------------------------------------------------------------------------------- 1 | :: Params: [output] [prefix] [suffix] [additional ssm params...] 
2 | echo %0 3 | echo off 4 | for %%F in (%0) do set dirname=%%~dpF 5 | echo "Running with" 6 | echo %* 7 | echo %dirname% 8 | 9 | :: Example 7 - local mode 10 | :: --ks is used to avoid messing with state (not supported in local mode) 11 | ssm shell -p %2simple-sagemaker-example-cli%3 -t shell-cli-local ^ 12 | --cmd_line "ps -elf >> \$SM_OUTPUT_DATA_DIR/ps__elf" ^ 13 | -o %1/example7 --it 'local' --no_spot --download_output %4 %5 %6 %7 %8 %9 --ks -------------------------------------------------------------------------------- /examples/medium/distributed/cifar10/run_local.sh: -------------------------------------------------------------------------------- 1 | set -e # stop and fail if anything stops 2 | BASEDIR=$(dirname "$0") 3 | pushd . 4 | cd $BASEDIR 5 | 6 | # Download the data 7 | python cifar10.py --download_only --data_path ./data 8 | # Train on a single node 9 | python cifar10.py --data_path ./data \ 10 | --test_batch_size 100 --train_batch_size 256 --num_workers 2 11 | # Train distibuted 12 | python cifar10.py --data_path ./data \ 13 | --test_batch_size 100 --train_batch_size 256 --num_workers 2 \ 14 | --distributed --backend nccl 15 | 16 | popd -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. simple-sagemaker documentation master file, created by 2 | sphinx-quickstart on Mon Sep 21 00:15:01 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to simple-sagemaker's documentation! 7 | ============================================ 8 | 9 | .. 
toctree:: 10 | :maxdepth: 4 11 | :caption: Contents: 12 | 13 | 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | -------------------------------------------------------------------------------- /docs/source/simple_sagemaker.task_toolkit.rst: -------------------------------------------------------------------------------- 1 | simple\_sagemaker.task\_toolkit package 2 | ======================================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | simple\_sagemaker.task\_toolkit.algo\_lib module 8 | ------------------------------------------------ 9 | 10 | .. automodule:: simple_sagemaker.worker_toolkit.worker_lib 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: simple_sagemaker.worker_toolkit 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /examples/readme_examples/worker3.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from worker_toolkit import worker_lib 4 | 5 | worker_config = worker_lib.WorkerConfig(False) 6 | 7 | open(os.path.join(worker_config.output_data_dir, "output_data_dir"), "wt").write( 8 | "output_data_dir file" 9 | ) 10 | open(os.path.join(worker_config.model_dir, "model_dir"), "wt").write("model_dir file") 11 | open(os.path.join(worker_config.state, "state_dir"), "wt").write("state_dir file") 12 | 13 | # The task is marked as completed, to allow other tasks to use its output, 14 | # and to avoid re-running it (unless enforced) 15 | -------------------------------------------------------------------------------- /examples/debugging/run.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | set -ex # fail if any test fails 3 | 4 | cd `dirname "$0"` 5 | 6 | echo "Running $0 with ", -- $1 -- $2 -- $3 -- $4 -- $5 7 | OUTPUT=${1:-.} 8 | 9 | ssm run --prefix ${2} -p ssm-debugging -t metrics${3} -e ./metrics.py -o $OUTPUT/output1 ${@:4} \ 10 | --no_spot `#temporarily to accelerate iterations` & 11 | 12 | ssm run --prefix ${2} -p ssm-debugging -t tensorboard${3} -s ./tensorboard -e lightning.py -o $OUTPUT/output2 ${@:4} \ 13 | --no_spot `#temporarily to accelerate iterations` --force_running & 14 | 15 | 16 | wait # wait for all processes 17 | -------------------------------------------------------------------------------- /src/simple_sagemaker/constants.py: -------------------------------------------------------------------------------- 1 | LOCAL_STATE_PATH = "/state" 2 | 3 | DEFAULT_INSTANCE_TYPE_TRAINING = "ml.m5.large" 4 | DEFAULT_INSTANCE_TYPE_PROCESSING = "ml.t3.medium" 5 | DEFAULT_INSTANCE_COUNT = 1 6 | DEFAULT_VOLUME_SIZE = 30 # GB 7 | DEFAULT_USE_SPOT = True 8 | DEFAULT_MAX_RUN = 24 * 60 9 | DEFAULT_MAX_WAIT = 0 10 | 11 | DEFAULT_IAM_ROLE = "SageMakerIAMRole" 12 | DEFAULT_IAM_BUCKET_POLICY_SUFFIX = "Policy" 13 | 14 | DEFAULT_REPO_TAG = "latest" 15 | 16 | TEST_LOG_LINE_PREFIX = "-***-" 17 | TEST_LOG_LINE_BLOCK_PREFIX = "*** START " 18 | TEST_LOG_LINE_BLOCK_SUFFIX = "*** END " 19 | 20 | TASK_TYPE_TRAINING = "Training" 21 | TASK_TYPE_PROCESSING = "Processing" 22 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /examples/readme_examples/worker4.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import subprocess 3 | import sys 4 | 5 | from worker_toolkit import worker_lib 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | def listDir(path): 11 | logger.info(f"*** START listing files in {path}") 12 | logger.info( 13 | subprocess.run( 14 | ["ls", "-la", "-R", path], stdout=subprocess.PIPE, universal_newlines=True 15 | ).stdout 16 | ) 17 | logger.info(f"*** END file listing {path}") 18 | 19 | 20 | if __name__ == "__main__": 21 | logging.basicConfig(stream=sys.stdout) 22 | worker_config = worker_lib.WorkerConfig(False) 23 | listDir(worker_config.channel_data) 24 | listDir(worker_config.channel_bucket) 25 | -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_smoke/example7/output/data/ps__elf: -------------------------------------------------------------------------------- 1 | F S UID PID PPID C PRI NI ADDR SZ WCHAN STIME TTY TIME CMD 2 | 4 S root 1 0 1 80 0 - 4939 wait 10:03 pts/0 00:00:00 bash -m start_with_right_hostname.sh train 3 | 4 S root 14 1 54 80 0 - 56730 pipe_w 10:03 pts/0 00:00:00 /opt/conda/bin/python /opt/conda/bin/train 4 | 4 S root 25 14 0 80 0 - 7627 wait 10:03 pts/0 00:00:00 /opt/conda/bin/python shell_launcher.py --SSM_SHELL_CMD_LINE ps -elf >> $SM_OUTPUT_DATA_DIR/ps__elf 5 | 4 S root 26 25 0 80 0 - 5456 wait 10:03 pts/0 00:00:00 /bin/bash -c ps -elf >> $SM_OUTPUT_DATA_DIR/ps__elf 
6 | 4 R root 27 26 0 80 0 - 9040 - 10:03 pts/0 00:00:00 ps -elf 7 | -------------------------------------------------------------------------------- /examples/medium/intro/run3.sh: -------------------------------------------------------------------------------- 1 | BASEDIR=$(dirname "$0") 2 | pushd . 3 | cd $BASEDIR/example3 4 | 5 | ssm run -p ssm-ex -t ex3-1 -s ./code -e ssm_ex3_worker.py \ 6 | -i ./data ShardedByS3Key \ 7 | --iis persons s3://awsglue-datasets/examples/us-legislators/all/persons.json \ 8 | --df "RUN pip3 install pandas==0.25.3 scikit-learn==0.21.3" \ 9 | --repo_name "ex3_repo" --aws_repo_name "ex3_repo" --no_spot \ 10 | --ic 2 --task_type 1 -o ./out3/ex3_1 --force_running 11 | 12 | ssm run -p ssm-ex -t ex3-2 -s ./code -e ssm_ex3_worker.py \ 13 | -d ./external_dependency --iit ex3_1_model ex3-1 model \ 14 | --iit ex3_1_state ex3-1 state ShardedByS3Key \ 15 | -f tensorflow --md "Score" "Score=(.*?);" --tag "MyTag" "MyValue" \ 16 | --ic 2 --task_type 2 -o ./out3/ex3_2 --force_running 17 | 18 | popd -------------------------------------------------------------------------------- /docs/source/simple_sagemaker.rst: -------------------------------------------------------------------------------- 1 | simple\_sagemaker package 2 | ========================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | simple_sagemaker.worker_toolkit 11 | 12 | Submodules 13 | ---------- 14 | 15 | simple\_sagemaker.constants module 16 | ---------------------------------- 17 | 18 | .. automodule:: simple_sagemaker.constants 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | simple\_sagemaker.sm\_project module 24 | ------------------------------------ 25 | 26 | .. automodule:: simple_sagemaker.sm_project 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | Module contents 32 | --------------- 33 | 34 | .. 
automodule:: simple_sagemaker 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | -------------------------------------------------------------------------------- /examples/imagenet/code/extract.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # Expected to be launched with DATA_DIR as first argument 3 | 4 | set -ex # stop and fail if anything stops 5 | 6 | echo "Extracting all..." 7 | cd $1 8 | 9 | extract_and_delete() { 10 | filename=$1 11 | OUTDIR=${filename%.tar} 12 | tar -xf $filename --xform="s|^|$OUTDIR/|S" 13 | rm $filename 14 | } 15 | 16 | for filename in train/*.tar; do 17 | extract_and_delete $filename & 18 | done 19 | 20 | wait 21 | 22 | cd val 23 | # https://github.com/facebookarchive/fb.resnet.torch/blob/master/INSTALL.md 24 | tar -xf ILSVRC2012_img_val.tar 25 | wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash 26 | echo "Done!" 27 | cd .. 28 | echo "Extracted `find train | grep .JPEG | wc -l` train files and `find val | grep .JPEG | wc -l` validation files" -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/home/user/miniconda3/bin/python", 3 | "files.watcherExclude": { 4 | "**/.git/**": true, 5 | "**/.tox/**": true, 6 | "**/node_modules/*/**": true, 7 | "**/__pycache__/**": true, 8 | "**/runs/**": true, 9 | "**/wandbd/run*": true, 10 | }, 11 | "cSpell.words": [ 12 | "CPUS", 13 | "CUDA", 14 | "Posix", 15 | "Sharded", 16 | "Xeon", 17 | "algo", 18 | "awsglue", 19 | "conda", 20 | "cpuinfo", 21 | "cpython", 22 | "drwxr", 23 | "entrypoint", 24 | "hyperparameters", 25 | "pycache", 26 | "pytorch", 27 | "rglob", 28 | "scikit", 29 | "sourcedir", 30 | "tensorflow", 31 | "xlarge" 32 | ], 33 | "python.linting.enabled": true 34 | } 
-------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /examples/imagenet/run_local.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | set -e # stop and fail if anything stops 4 | BASEDIR=$(dirname "$0") 5 | DATA_DIR=${1:-~/proj/data/cv/imagenet} 6 | cd $BASEDIR 7 | 8 | EPOCHS=1 9 | 10 | # Download the code from PyTorch's examples repository 11 | [ -f code/main.py ] || wget -O code/main.py https://raw.githubusercontent.com/pytorch/examples/master/imagenet/main.py 12 | 13 | # Download and extract the data 14 | ./code/download.sh $DATA_DIR 15 | ./code/extract.sh $DATA_DIR 16 | 17 | # Train on a single GPU, $EPOCHS epochs 18 | echo ===== Training $EPOCHS epochs, a single GPU... 
19 | python ./code/main.py --epochs $EPOCHS $DATA_DIR 20 | 21 | # "Distributed training" on 1 GPU, $EPOCHS epochs 22 | echo ===== Training $EPOCHS epochs, distributed, a single GPU... 23 | export MASTER_PORT=8888 24 | export MASTER_ADDR=localhost 25 | python ./code/main.py --multiprocessing-distributed --dist-url env:// --world-size 1 --rank 0 --seed 123 --epochs $EPOCHS $DATA_DIR 26 | -------------------------------------------------------------------------------- /examples/single_task/expected_output/output/algo-1/input_dir_copy/config/init-config.json: -------------------------------------------------------------------------------- 1 | {"inputMode":"FILE","channels":{"data":{"s3DataSource":{"s3DataType":"S3_PREFIX","s3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example_2020-10-04-09-16-51_py37/Task1/input","s3DataDistributionType":"SHARDED_BY_S3_KEY","attributeNames":null},"fileSystemDataSource":null,"compressionType":"NONE","recordWrapper":"NONE","shuffleConfig":null,"inputMode":"FILE","sharded":true}},"checkpointChannel":{"name":"checkpoints","channel":{"s3DataSource":{"s3DataType":"S3_PREFIX","s3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example_2020-10-04-09-16-51_py37/Task1/state","s3DataDistributionType":null,"attributeNames":null},"fileSystemDataSource":null,"compressionType":null,"recordWrapper":null,"shuffleConfig":null,"inputMode":"FILE","sharded":false},"outputPath":"/opt/ml/checkpoints","allowEmpty":true},"hostConfig":{"clusterSize":2,"hostNumber":1},"enableAdditionalPlatformLoggingForCustomer":false,"jobRunInfo":{"jobRunNumber":1}} -------------------------------------------------------------------------------- /examples/imagenet/code/download.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | # Expected to be launched with DATA_DIR as first argument 3 | 4 | set -ex # stop and fail if anything stops 5 | 6 | mkdir -p $1 7 | cd $1 8 | 9 | [ -d ./train ] && rm -r ./train 10 | [ -d ./val ] && rm -r ./val 11 | 12 | apt-get update 13 | apt-get -y --allow-unauthenticated install aria2 14 | download () { 15 | aria2c --summary-interval=30 --conditional-get=true -x 16 -s 16 $1 16 | } 17 | 18 | ### From https://cloud.google.com/tpu/docs/imagenet-setup, please make sure you have the permission to download the files from [Imagenet](http://image-net.org) 19 | echo Downloading to `pwd` 20 | for FILENAME in ILSVRC2012_img_val.tar ILSVRC2012_img_train_t3.tar 21 | do 22 | download http://image-net.org/challenges/LSVRC/2012/dd31405981ef5f776aa17412e1f0c112/${FILENAME} 2>&1 && echo finished downloading $FILENAME & 23 | done 24 | wait 25 | echo "Download finished!" 26 | 27 | echo "Extracting first level..." 28 | tar -xf ILSVRC2012_img_train_t3.tar --xform="s|^|train/|S" & 29 | wait 30 | mv ILSVRC2012_img_val.tar val/ 31 | echo "Done!" 
32 | -------------------------------------------------------------------------------- /docs/high_level_flow.txt: -------------------------------------------------------------------------------- 1 | #https://sequencediagram.org/ asdf asdf asdf 2 | 3 | title High level flow 4 | actor "Client (**runner**)" as c 5 | 6 | database "ECS" as ecs #1da1f2 7 | fontawesome f0a0 "S3" as s3 #1da1f2 8 | control "SageMaker" as sm #1da1f2 9 | fontawesome f233 "EC2 instance 1" as s1 #1da1f2 10 | fontawesome f233 "EC2 instance 2" as s2 #1da1f2 11 | c->ecs: docker image 12 | c->s3: code, data 13 | c->sm: job params 14 | sm<->ecs: download image 15 | sm<->s3: download code, data, state 16 | sm->*s1: start (params, code, data, state) 17 | sm->*s2: start (params, code, data, state) 18 | parallel 19 | s1->s1: run docker image 20 | s2->s2: run docker image 21 | parallel off 22 | parallel 23 | activate s1 #blue 24 | activate s2 #blue 25 | parallel off 26 | parallel 27 | note over s1, s2: The (**worker**) job is running.\nCode, data and state get\n mounted into it. 28 | deactivateafter s1 29 | deactivateafter s2 30 | parallel off 31 | s1->sm: output, model, state 32 | destroysilent s1 33 | s2->sm: output, model, state 34 | destroysilent s2 35 | sm->sm: merge output, model, state 36 | sm->s3: merged output, model, state 37 | c<->s3: download merged output, model, state 38 | 39 | 40 | -------------------------------------------------------------------------------- /examples/imagenet/code/download-all.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | # Expected to be launched with DATA_DIR as first argument 3 | 4 | set -ex # stop and fail if anything stops 5 | 6 | mkdir -p $1 7 | cd $1 8 | 9 | [ -d ./train ] && rm -r ./train 10 | [ -d ./val ] && rm -r ./val 11 | 12 | apt-get update 13 | apt-get -y --allow-unauthenticated install aria2 14 | download () { 15 | aria2c --summary-interval=30 --conditional-get=true -x 16 -s 16 $1 16 | } 17 | 18 | ### From https://cloud.google.com/tpu/docs/imagenet-setup, please make sure you have the permission to download the files from [Imagenet](http://image-net.org) 19 | echo Downloading to `pwd` 20 | for FILENAME in ILSVRC2012_img_train.tar ILSVRC2012_img_val.tar ILSVRC2012_img_train_t3.tar 21 | do 22 | download http://image-net.org/challenges/LSVRC/2012/dd31405981ef5f776aa17412e1f0c112/${FILENAME} 2>&1 && echo finished downloading $FILENAME & 23 | done 24 | wait 25 | echo "Download finished!" 26 | 27 | echo "Extracting first level..." 28 | tar -xf ILSVRC2012_img_train.tar --xform="s|^|train/|S" & 29 | #tar -xf ILSVRC2012_img_train_t3.tar --xform="s|^|train/|S" & 30 | wait 31 | mv ILSVRC2012_img_val.tar val/ 32 | echo "Done!" 33 | -------------------------------------------------------------------------------- /examples/medium/distributed/cifar10/run_remote.sh: -------------------------------------------------------------------------------- 1 | set -e # stop and fail if anything stops 2 | BASEDIR=$(dirname "$0") 3 | pushd . 
import logging
import os
import subprocess
import sys

from dep import ex1_dep  # noqa: F401
from worker_toolkit import worker_lib

logger = logging.getLogger(__name__)


def listDir(path, recursive=True):
    """Log a detailed (`ls -la`) listing of `path`.

    Args:
        path: Directory to list.
        recursive: When True (default), also list subdirectories (`-R`).
    """
    logger.info(f"*** START listing files in {path}")
    cmd_args = ["ls", "-la", path]
    if recursive:
        cmd_args.append("-R")
    # Capture stdout as text so it can be routed through the logger.
    process = subprocess.run(cmd_args, stdout=subprocess.PIPE, universal_newlines=True)
    logger.info(process.stdout)
    logger.info(f"*** END file listing {path}")


if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    logger.info("======= Starting python script ...")

    # Initializes the worker environment; kept for its side effects.
    worker_config = worker_lib.WorkerConfig()

    print("Environ:", os.environ)
    print("Args:", sys.argv)

    listDir("/opt/")
    # Use context managers so the files are flushed and closed deterministically
    # (the original open(...).write(...) relied on GC to close the handles).
    with open(os.environ["SSM_STATE"] + "/state", "wt") as f:
        f.write("state")
    with open(os.environ["SSM_OUTPUT"] + "/output", "wt") as f:
        f.write("output")

    # just to show the final directory structure
    logger.info("finished!")
    # The task is marked as completed
-------------------------------------------------------------------------------- /examples/readme_examples/run_rest.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | #set -e # fail if any test fails 3 | 4 | # Params: [output] [prefix] [suffix] [additional ssm params...] 5 | BASEDIR=$(dirname "$0") 6 | 7 | # Example 2 - passing hyperparams as command line arguments 8 | ssm run --prefix ${2} -p simple-sagemaker-example-cli -t task2${3} -e $BASEDIR/worker2.py --msg "Hello, world!" -o $1/example2 ${@:4} --max_run_mins 15 & 9 | 10 | # Example 3 - outputs 11 | ssm run --prefix ${2} -p simple-sagemaker-example-cli -t task3${3} -e $BASEDIR/worker3.py -o $1/example3 ${@:4} --max_run_mins 15 & 12 | 13 | wait # wait for all processes, to avoid AWS resource limits... :( 14 | 15 | # Example 4 - Inputs, using a local data directory + s3 bucket 16 | ssm run --prefix ${2} -p simple-sagemaker-example-cli -t task4${3} -e $BASEDIR/worker4.py \ 17 | -i $BASEDIR/data --iis bucket s3://awsglue-datasets/examples/us-legislators/all/persons.json \ 18 | --max_run_mins 15 -o $1/example4 ${@:4} & 19 | 20 | # running task3 again 21 | ssm run --prefix ${2} -p simple-sagemaker-example-cli -t task3${3} -e $BASEDIR/worker3.py -o $1/example3_2 ${@:4} --ks > $1/example3_2_stdout --max_run_mins 15 & 22 | 23 | wait # wait for all processes 24 | 25 | # Example 5 - chaining data, using task3's output 26 | ssm run --prefix ${2} -p simple-sagemaker-example-cli -t task5${3} -e $BASEDIR/worker4.py --iit bucket task3 model -o $1/example5 ${@:4} --max_run_mins 15 & 27 | 28 | wait # wait for all processes 29 | -------------------------------------------------------------------------------- /examples/dogs_vs_cats/run_remote.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | set -ex # stop and fail if anything stops 4 | cd `dirname "$0"` 5 | 6 | # Download the code from PyTorch's examples repository 7 | [ -f code/main.py ] || wget -O main.py https://raw.githubusercontent.com/pytorch/examples/master/imagenet/main.py 8 | 9 | # The dogs vs cats DB can be downloaded from 10 | ## Kaggle - https://www.kaggle.com/c/dogs-vs-cats 11 | ## Microsoft - https://www.microsoft.com/en-us/download/details.aspx?id=54765 12 | ## Floyhub - https://www.floydhub.com/fastai/datasets/cats-vs-dogs 13 | 14 | # For simplicity, we currently just download a few sample images out of the full DB 15 | if [ ! -d ./data ]; then 16 | mkdir -p data && cd data 17 | wget -O sample_data.tar "https://www.floydhub.com/api/v1/download/artifacts/data/VbpRSQnFkQmYaBUtwt3aca?is_dir=true&path=sample" 18 | tar xf sample_data.tar && mv valid val && cd .. 19 | fi 20 | 21 | # Train on a single node 22 | # We're as the data set is small (sample data) -i switch makes sense here, other approaches may be better for larger sets. 23 | ssm shell -p cat-vs-dogs -t 1-node -o ./output/output_1node --download_state \ 24 | -i ./data --it ml.p3.2xlarge -d main.py \ 25 | --cmd_line "CODE_DIR=\`pwd\` && cd \$SSM_INSTANCE_STATE && \ 26 | python \$CODE_DIR/main.py --epochs 40 \$SM_CHANNEL_DATA --dist-url env:// --world-size \$SSM_NUM_NODES --rank \$SSM_HOST_RANK --seed 123" & 27 | 28 | # Train on 3 nodes 29 | ssm shell -p cat-vs-dogs -t 3-nodes -o ./output/output_3nodes --download_state \ 30 | -i ./data --it ml.p3.2xlarge -d main.py --ic 3 \ 31 | --cmd_line "CODE_DIR=\`pwd\` && cd \$SSM_INSTANCE_STATE && \ 32 | python \$CODE_DIR/main.py --epochs 40 \$SM_CHANNEL_DATA --dist-url env:// --world-size \$SSM_NUM_NODES --rank \$SSM_HOST_RANK --seed 123" & 33 | 34 | wait 35 | 36 | echo "FINISHED!" 
import logging
import sys
from pathlib import Path

from worker_toolkit import worker_lib

logger = logging.getLogger(__name__)


def task1(worker_config):
    """Task type 1: write per-instance state and process input files into the model dir.

    Each file under the `data` channel is copied to the model directory with a
    `_proc_by_<host>` suffix and an appended marker string.
    """
    # update the state per running instance
    # Path.write_text opens and closes the file for us (no leaked handle,
    # unlike the original open(...).write(...) pattern).
    Path(
        f"{worker_config.instance_state}/state_{worker_config.current_host}"
    ).write_text("state")
    # write to the model output directory
    for file in Path(worker_config.channel_data).rglob("*"):
        if file.is_file():
            relp = file.relative_to(worker_config.channel_data)
            path = Path(worker_config.model_dir) / (
                str(relp) + "_proc_by_" + worker_config.current_host
            )
            path.write_text(
                file.read_text() + " processed by " + worker_config.current_host
            )
    Path(f"{worker_config.model_dir}/output_{worker_config.current_host}").write_text(
        "output"
    )


def task2(worker_config):
    """Task type 2: just log the content of the `task2_data` input channel."""
    logger.info(
        f"Input task2_data: {list(Path(worker_config.channel_task2_data).rglob('*'))}"
    )


def main():
    """Entry point: parse the worker config and dispatch on the `task_type` hyperparam."""
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    logger.info("Starting worker...")
    # parse the arguments
    worker_config = worker_lib.WorkerConfig()

    logger.info(f"Hyperparams: {worker_config.hps}")
    logger.info(
        f"Input data files: {list(Path(worker_config.channel_data).rglob('*'))}"
    )
    logger.info(f"State files: { list(Path(worker_config.state).rglob('*'))}")

    # hyperparameter values arrive as strings; convert before comparing
    if int(worker_config.hps["task_type"]) == 1:
        task1(worker_config)
    elif int(worker_config.hps["task_type"]) == 2:
        task2(worker_config)

    logger.info("finished!")
    # The task is marked as completed


if __name__ == "__main__":
    main()
-------------------------------------------------------------------------------- /examples/retcode/run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | set -ex # fail if any test fails 3 | 4 | cd `dirname "$0"` 5 | 6 | # Args: expected actual msg 7 | assert_eq() { 8 | local expected="$1" 9 | local actual="$2" 10 | local msg 11 | 12 | if [ "$#" -ge 3 ]; then 13 | msg="$3" 14 | fi 15 | 16 | if [ "$expected" == "$actual" ]; then 17 | return 0 18 | else 19 | [ "${#msg}" -gt 0 ] && echo "$expected == $actual :: $msg" || true 20 | return 1 21 | fi 22 | } 23 | 24 | pids=() 25 | expected=() 26 | 27 | # Args: expected command arg1 arg2 ... 28 | run_and_append() { 29 | "${@:2}" & 30 | pids+=($!) 31 | expected+=($1) 32 | } 33 | 34 | 35 | run_and_append 0 ssm process -p exit-tests -t proc-cli-ret-0 --max_run_mins 15 \ 36 | --entrypoint "/bin/bash" -- -c "exit 0" 37 | 38 | run_and_append 1 ssm process -p exit-tests -t proc-cli-ret-1 --max_run_mins 15 \ 39 | --entrypoint "/bin/bash" -- -c "exit 1" 40 | 41 | run_and_append 1 ssm process -p exit-tests -t proc-cli-ret-0-msg --max_run_mins 15 \ 42 | --entrypoint "/bin/bash" -- -c "echo Message >> /opt/ml/output/message && exit 0" & 43 | 44 | run_and_append 1 ssm process -p exit-tests -t proc-cli-ret-1-msg --max_run_mins 15 \ 45 | --entrypoint "/bin/bash" -- -c "echo Message >> /opt/ml/output/message && exit 1" 46 | 47 | 48 | run_and_append 0 ssm shell -p exit-tests -t shel-cli-ret-0-0 \ 49 | --cmd_line "echo \$SSM_HOST_RANK && exit 0" --ic 2 --force_running 50 | 51 | run_and_append 1 ssm shell -p exit-tests -t shel-cli-ret-0-1 \ 52 | --cmd_line "echo \$SSM_HOST_RANK && exit \$SSM_HOST_RANK" --ic 2 --force_running 53 | 54 | run_and_append 1 ssm shell -p exit-tests -t shel-cli-ret-0-0-msg \ 55 | --cmd_line "echo \$SSM_HOST_RANK && echo Message >> /opt/ml/output/failure && exit 0" --ic 2 --force_running 56 | 57 | echo "PIDs" ${pids[@]} 58 | 59 | for i in ${!pids[@]} ;do 60 | wait 
import logging
import os
import shutil
import subprocess
import sys
import time
from pathlib import Path

from worker_toolkit import worker_lib

logger = logging.getLogger(__name__)


def worker():
    """Run the user-supplied command line and return its exit code.

    The command is taken from the hyperparameters: `SSM_CMD_LINE` is executed
    directly, `SSM_SHELL_CMD_LINE` is executed through /bin/bash.

    Returns:
        The subprocess return code, or 1 when no command hyperparameter is
        present (previously this path crashed with an unbound `shell_cmd`).
    """
    logging.basicConfig(stream=sys.stdout)

    # Parse the arguments + initialize state
    worker_config = worker_lib.WorkerConfig()

    # Delete the current file + toolkit as both got injected
    os.remove(__file__)
    shutil.rmtree("./worker_toolkit")

    # Run the shell / cmd line command
    if "SSM_CMD_LINE" in worker_config.hps:
        cmd_line = worker_config.hps["SSM_CMD_LINE"]
        logger.info(f"Launching: {cmd_line}")
        shell_cmd = subprocess.run(cmd_line)
    elif "SSM_SHELL_CMD_LINE" in worker_config.hps:
        cmd_line = worker_config.hps["SSM_SHELL_CMD_LINE"]
        logger.info(f"Launching a shell: {cmd_line}")
        shell_cmd = subprocess.run(cmd_line, shell=True, executable="/bin/bash")
    else:
        # Bug fix: without this branch `shell_cmd` was unbound below,
        # raising NameError instead of reporting a usable error.
        logger.error("No SSM_CMD_LINE / SSM_SHELL_CMD_LINE hyperparameter found!")
        return 1

    logger.info(f"finished with {shell_cmd.returncode} return code!")

    # wait_for_state_sync(worker_config)
    return shell_cmd.returncode


def wait_for_state_sync(worker_config):
    """Wait until the state directory's newest mtime stops changing.

    Polls every `wait_secs` seconds for up to `max_secs`, returning as soon
    as no file under the state dir has been modified since the last poll.
    Logs a warning if the timeout elapses while files are still changing.
    """
    max_secs = 60 * 5  # 5 mins max
    wait_secs = 5
    state_path = Path(worker_config.state)
    # default=0 guards an empty state dir: max() on an empty sequence
    # raises ValueError otherwise.
    max_change_time = max(map(os.path.getmtime, state_path.rglob("*")), default=0)
    for _ in range(max_secs // wait_secs):
        time.sleep(wait_secs)
        new_max = max(map(os.path.getmtime, state_path.rglob("*")), default=0)
        if new_max == max_change_time:
            return
        max_change_time = new_max
    logger.warning(
        f"It seems like sage maker is still uploading after {max_secs} secs..."
    )


if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    retcode = worker()
    sys.exit(retcode)
It is independent of forward 30 | x, y = batch 31 | x = x.view(x.size(0), -1) 32 | z = self.encoder(x) 33 | x_hat = self.decoder(z) 34 | loss = F.mse_loss(x_hat, x) 35 | self.log("train_loss", loss) 36 | return loss 37 | 38 | def configure_optimizers(self): 39 | optimizer = torch.optim.Adam(self.parameters(), lr=1e-3) 40 | return optimizer 41 | 42 | 43 | dataset = MNIST(os.getcwd(), download=True, transform=transforms.ToTensor()) 44 | dataset = torch.utils.data.Subset(dataset, range(1000)) 45 | train, val = random_split(dataset, [800, 200]) 46 | 47 | autoencoder = LitAutoEncoder() 48 | checkpoint_callback = ModelCheckpoint( 49 | monitor="train_loss", 50 | filepath="/state/checkpoints/sample-mnist-{epoch:02d}-{val_loss:.2f}", 51 | save_top_k=3, 52 | ) 53 | trainer = pl.Trainer( 54 | default_root_dir="/opt/ml/output/tensorboard", 55 | checkpoint_callback=checkpoint_callback, 56 | ) 57 | trainer.fit(autoencoder, DataLoader(train), DataLoader(val)) 58 | -------------------------------------------------------------------------------- /examples/processing_cli/run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | set -ex # fail if any test fails 3 | 4 | # Params: [output] [prefix] [suffix] [additional ssm params...] 5 | cd `dirname "$0"` 6 | echo "Running with", -- $1 -- $2 -- $3 -- $4 -- $5 7 | 8 | # Example 1 - a processing script + dependencies 9 | ssm process --prefix ${2} -p ssm-example-processing -t cli-code${3} -o $1/output1 \ 10 | --download_state --download_output --max_run_mins 15 \ 11 | --code ex1.py --dependencies ./dep ${@:4} \ 12 | -- arg1 -arg2 --arg3 "argument 4" & 13 | pid1=$! 
14 | 15 | # Example 2 - a raw entrypoint with arguments 16 | ssm process --prefix ${2} -p ssm-example-processing -t cli-shell${3} -o $1/output2 \ 17 | --download_state --download_output --max_run_mins 15 \ 18 | --entrypoint "/bin/bash" --dependencies ./dep --force_running \ 19 | -- -c "echo ==Bash && \ 20 | echo \"-***- Args:\"\$@ &&echo \"-- Env:\"\`env\`&& \ 21 | echo \"*** START listing files\"&&ls -laR /opt&&echo \"*** END \"&& \ 22 | cp -r /opt/ml/config \$SSM_OUTPUT/config&& \ 23 | echo output>\$SSM_OUTPUT/output&& \ 24 | echo state>\$SSM_STATE/state" & 25 | 26 | 27 | # Example 3 - a bash script that gets the output and state of cli-code as input 28 | wait $pid1 29 | ssm process --prefix ${2} -p ssm-example-processing -t cli-bash${3} -o $1/output3 \ 30 | --download_state --command bash --download_output --max_run_mins 15 \ 31 | -i ./data --iit cli_code_output cli-code${3} output --iit cli_code_state cli-code${3} state \ 32 | --code ex3.sh --dependencies ./dep ${@:4} \ 33 | -- arg1 -arg2 --arg3 "argument 4" & 34 | 35 | # Example 3 - a shell training ecript that gets the output and state of cli-code as input 36 | ssm shell --prefix ${2} -p ssm-example-processing -t shell-task${3} -o $1/output4 \ 37 | --iit cli_code_output cli-code${3} output --iit cli_code_state cli-code${3} state \ 38 | --cmd_line "echo '*** START listing files in /opt/ml' && ls -laR /opt/ml && echo '*** END file listing /opt/ml'" \ 39 | --max_run_mins 15 ${@:4} & 40 | 41 | # --it ml.t3.medium 42 | 43 | wait # wait for all processes 44 | 45 | # Run: 46 | # tox -e bash -- ./run.sh ./output " " " " --cs -------------------------------------------------------------------------------- /examples/processing_cli/expected_output/output2/output/config/processingjobconfig.json: -------------------------------------------------------------------------------- 1 | 
{"ProcessingJobArn":"arn:aws:sagemaker:us-east-1:XXXXXXXXXXXX:processing-job/cli-shell-2020-10-06-23-25-34-3bnesrat","ProcessingJobName":"cli-shell-2020-10-06-23-25-34-3bnESRAt","Environment":{"SSM_OUTPUT":"/opt/ml/processing/output","SSM_STATE":"/opt/ml/processing/state"},"AppSpecification":{"ImageUri":"683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:0.20.0-cpu-py3","ContainerEntrypoint":["/bin/bash"],"ContainerArguments":["-c","echo '======= Bash script ...' \u0026\u0026 echo 'Args:' $@ \u0026\u0026 echo Env: `env` \u0026\u0026 pwd \u0026\u0026 ls -laR /opt \u0026\u0026 cp -r /opt/ml/config $SSM_OUTPUT/config \u0026\u0026 echo 'output' \u003e $SSM_OUTPUT/output \u0026\u0026 echo 'state' \u003e $SSM_STATE/state"]},"ProcessingInputs":[{"InputName":"DEP_dep","S3Input":{"LocalPath":"/opt/ml/processing/input/code/dep","S3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/cli-shell-2020-10-06-23-25-34-3bnESRAt/input/DEP_dep","S3DataDistributionType":"FullyReplicated","S3DataType":"S3Prefix","S3InputMode":"File","S3CompressionType":"None","S3DownloadMode":"StartOfJob"}},{"InputName":"DEP_worker_toolkit","S3Input":{"LocalPath":"/opt/ml/processing/input/code/worker_toolkit","S3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/cli-shell-2020-10-06-23-25-34-3bnESRAt/input/DEP_worker_toolkit","S3DataDistributionType":"FullyReplicated","S3DataType":"S3Prefix","S3InputMode":"File","S3CompressionType":"None","S3DownloadMode":"StartOfJob"}}],"ProcessingOutputConfig":{"Outputs":[{"OutputName":"state","S3Output":{"LocalPath":"/opt/ml/processing/state","S3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/ssm-example-processing/cli-shell/state","S3UploadMode":"Continuous"}},{"OutputName":"output","S3Output":{"LocalPath":"/opt/ml/processing/output","S3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/ssm-example-processing/cli-shell/cli-shell-2020-10-06-23-25-34-3bnESRAt/output","S3UploadMode":"EndOfJob"}}],"KmsKeyId":null},"ProcessingResources":{"ClusterConfig":{"InstanceCount":1,"Instanc
eType":"ml.m5.large","VolumeSizeInGB":30,"VolumeKmsKeyId":null}},"RoleArn":"arn:aws:iam::XXXXXXXXXXXX:role/SageMakerIAMRole","StoppingCondition":{"MaxRuntimeInSeconds":900}} -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python: Current File", 9 | "type": "python", 10 | "request": "launch", 11 | "program": "${file}", 12 | "console": "integratedTerminal", 13 | "env": { 14 | "PYTHONPATH": "${workspaceFolder}/src" 15 | }, 16 | "cwd": "${fileDirname}" 17 | }, 18 | { 19 | "name": "Python: Cli", 20 | "type": "python", 21 | "request": "launch", 22 | "program": "${workspaceFolder}/src/cli_launcher.py", 23 | "console": "integratedTerminal", 24 | "cwd": "${fileDirname}", 25 | "justMyCode": false, 26 | "env": { 27 | "PYTHONPATH": "${workspaceFolder}/src" 28 | }, 29 | "args2": [ 30 | "shell", 31 | "-p", "tests/aaa", 32 | //"--instance_type", "ml.p3.2xlarge", 33 | "-t", "task2", 34 | "--cmd_line", "echo 222 333", 35 | "--dir_files", "${workspaceFolder}/examples/readme_examples/data", 36 | //"-e", "${workspaceFolder}/examples/readme_examples/worker3.py", 37 | "-o", "${workspaceFolder}/output", 38 | "--no_spot", 39 | "-f", "tensorflow", 40 | "-m", 41 | "--md", "Score", "Score=(.*?);", 42 | "--tag", "MyTag", "MyValue", 43 | //"--iis", "bucket", "s3://awsglue-datasets/examples/us-legislators/all/persons.json" 44 | //"--iit", "bucket", "task3", "model" 45 | //"--cs", 46 | "--ks", 47 | "--aa", "bb", 48 | "--cc", "dd", 49 | ], 50 | "args": ["shell", "-p", "shell-cli", "-t", "shell-cli-task22", "--cmd_line", "ls -la", "-o", "./output", "--local", "--it", "local", "--no_spot"], 51 | 
import logging
import os
import shutil
import subprocess
import sys

from worker_toolkit import worker_lib

logger = logging.getLogger(__name__)


def listDir(path, recursive=True):
    """Log a detailed (`ls -la`) listing of `path` (recursive by default)."""
    logger.info(f"*** START listing files in {path}")
    cmd_args = ["ls", "-la", path]
    if recursive:
        cmd_args.append("-R")
    # Capture stdout as text so it can be routed through the logger.
    process = subprocess.run(cmd_args, stdout=subprocess.PIPE, universal_newlines=True)
    logger.info(process.stdout)
    logger.info(f"*** END file listing {path}")


def logBefore(worker_config):
    """Log the arguments, environment and initial directory structure."""
    # show the given arguments and environment
    logger.info(f"Argv: {sys.argv}")
    logger.info(f"Env: {os.environ}")
    # just to show the initial directory structure
    listDir("/opt/ml")
    listDir(worker_config.state)


def logAfter(worker_config):
    """Log the final directory structure."""
    listDir("/opt/ml")
    listDir(worker_config.state)


if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout)
    logger.info("Starting algo...")

    # parse the arguments
    worker_config = worker_lib.WorkerConfig()
    logBefore(worker_config)

    output_data_dir = os.path.join(
        worker_config.output_data_dir, worker_config.current_host
    )

    # create some data in the state dir
    if worker_config.hps["stage"] == 1:
        # put some files in the state directory; `with` closes each file
        # deterministically (the original open(...).write(...) leaked handles)
        for i in range(10):
            with open(
                f"{worker_config.instance_state}/state_{worker_config.current_host}_{i+1}",
                "wt",
            ) as f:
                f.write("state")

        # put something in the model
        modelDir = os.path.join(worker_config.model_dir, worker_config.current_host)
        os.makedirs(modelDir, exist_ok=True)
        with open(f"{modelDir}/model_dir", "wt") as f:
            f.write("model_dir")

    elif worker_config.hps["stage"] == 2:
        logger.info("Doing nothing...")

    # copy all input channels to the output dir
    for channel_name in worker_config.channels:
        # use the getattr() builtin rather than invoking __getattr__ directly
        input_dir = getattr(worker_config, f"channel_{channel_name}")
        shutil.copytree(input_dir, f"{output_data_dir}/{channel_name}_copy")
    shutil.copytree(worker_config.state, f"{output_data_dir}/state_copy")

    logger.info("finished!")
    logAfter(worker_config)
    # The task is marked as completed
roject","Value":"tests/simple-sagemaker-example_2020-10-04-09-16-51_py37"},{"Key":"SimpleSagemakerCallingModule","Value":"/home/runner/work/simple_sagemaker/simple_sagemaker/examples/single_task/example.py"},{"Key":"SimpleSagemakerTask","Value":"Task1"},{"Key":"SimpleSagemakerVersion","Value":"0.9.19"}],"BaseTags":[{"Key":"SimpleSagemakerProject","Value":"tests/simple-sagemaker-example_2020-10-04-09-16-51_py37"},{"Key":"SimpleSagemakerCallingModule","Value":"/home/runner/work/simple_sagemaker/simple_sagemaker/examples/single_task/example.py"},{"Key":"SimpleSagemakerTask","Value":"Task1"},{"Key":"SimpleSagemakerVersion","Value":"0.9.19"}],"TrainingJobName":"Task1-2020-10-04-09-17-37-x6ux770b","StoppingCondition":{"MaxRuntimeInSeconds":900,"MaxWaitTimeInSeconds":900},"AlgorithmSpecification":{"MetricDefinitions":[],"TrainingImage":"XXXXXXXXXXXX.dkr.ecr.us-east-1.amazonaws.com/task_repo:latest","TrainingInputMode":"File"},"TrainingJobArn":"arn:aws:sagemaker:us-east-1:XXXXXXXXXXXX:training-job/task1-2020-10-04-09-17-37-x6ux770b","DebugHookConfig":{"S3OutputPath":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example_2020-10-04-09-16-51_py37/Task1/Task1-2020-10-04-09-17-37-x6ux770b/debug-output","LocalPath":"/opt/ml/output/tensors"},"CheckpointConfig":{"S3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example_2020-10-04-09-16-51_py37/Task1/state","LocalPath":"/state"}} -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
# -- Path setup --------------------------------------------------------------
# Make the package importable so autodoc can document it.
import os
import sys

sys.path.insert(0, os.path.abspath("../../src/simple_sagemaker/"))

import sphinx_rtd_theme  # noqa: E402,F401

# -- Project information -----------------------------------------------------

project = 'simple-sagemaker'
copyright = '2020, Ariel Shiftan'
author = 'Ariel Shiftan'

# The full version, including alpha/beta/rc tags.
# NOTE(review): other artifacts in the repo mention 0.9.19 — confirm this is
# kept in sync with the package version.
release = '0.9.11'


# -- General configuration ---------------------------------------------------

# Sphinx extension modules, built-in or custom.
extensions = [
    "sphinx_rtd_theme",
    'sphinx.ext.autodoc',
    "sphinx.ext.viewcode"
]

# Paths that contain templates, relative to this directory.
templates_path = ['_templates']

# Patterns, relative to source directory, to ignore when looking for sources.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages.
import logging
import os
import shutil
import subprocess
import sys

logger = logging.getLogger(__name__)


def listDir(path, recursive=True):
    """Log an `ls -la` (plus `-R` when *recursive*) listing of *path*."""
    logger.info(f"*** START listing files in {path}")
    cmd = ["ls", "-la", path]
    if recursive:
        cmd += ["-R"]
    listing = subprocess.run(cmd, stdout=subprocess.PIPE, universal_newlines=True)
    logger.info(listing.stdout)
    logger.info(f"*** END file listing {path}")


def logBefore(worker_config):
    """Log argv/env, a requirements-installed library, and the initial layout."""
    logger.info(f"Argv: {sys.argv}")
    logger.info(f"Env: {os.environ}")
    # `transformers` is imported at module level; logging it proves that
    # requirements.txt was installed in the container
    logger.info(f"transformers: {transformers}")
    listDir("/opt/ml")
    listDir(worker_config.state)


def logAfter(worker_config):
    """Log the final directory layout."""
    listDir("/opt/ml")
    listDir(worker_config.state)
if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout)
    logger.info("Starting algo...")

    # parse the arguments / environment prepared by simple_sagemaker
    worker_config = worker_lib.WorkerConfig()

    # importing internal and external dependencies
    from external_dependency import lib1  # noqa: F401
    from internal_dependency import lib2  # noqa: F401

    logBefore(worker_config)

    # copy the entire input dir to a per-host output dir
    output_data_dir = os.path.join(
        worker_config.output_data_dir, worker_config.current_host
    )
    shutil.copytree(worker_config.input_dir, f"{output_data_dir}/input_dir_copy")
    # copy state dir
    shutil.copytree(worker_config.state, f"{output_data_dir}/state_copy")
    # create a file (with `with` so the handle closes deterministically —
    # the original leaked the handles and had a "cteaye" typo here)
    with open(f"{output_data_dir}/output_data_dir", "wt") as f:
        f.write("output_data_dir")

    # create one file in the output dir
    output_dir = os.path.join(worker_config.output_dir, worker_config.current_host)
    os.makedirs(output_dir, exist_ok=True)
    with open(f"{output_dir}/output_dir", "wt") as f:
        f.write("output_dir")

    # create one file in the output model dir
    modelDir = os.path.join(worker_config.model_dir, worker_config.current_host)
    os.makedirs(modelDir, exist_ok=True)
    with open(f"{modelDir}/model_dir", "wt") as f:
        f.write("model_dir")

    # per-instance state file
    with open(
        f"{worker_config.instance_state}/state_{worker_config.current_host}", "wt"
    ) as f:
        f.write(f"state_{worker_config.current_host}")

    # just to show the final directory structure
    logger.info("finished!")
    logAfter(worker_config)
    # The task is marked as completed once the entry point exits successfully
| import sys 7 | from time import time 8 | 9 | import boto3 10 | 11 | from ..system.compare_outputs import isAsExpected 12 | 13 | file_path = os.path.split(__file__)[0] 14 | examples_path = os.path.abspath(os.path.join(file_path, "..", "..", "examples")) 15 | 16 | 17 | def test_project(caplog, tmp_path): 18 | caplog.set_level(logging.INFO) 19 | logging.info("test_project") 20 | 21 | from simple_sagemaker.sm_project import SageMakerProject 22 | 23 | sm_project = SageMakerProject(project_name="test") 24 | sm_project = sm_project 25 | 26 | 27 | def test_task(caplog, tmp_path): 28 | caplog.set_level(logging.INFO) 29 | logging.info("test_task") 30 | 31 | from simple_sagemaker.sm_task import SageMakerTask 32 | 33 | boto3_session = boto3.Session() 34 | image_uri = None 35 | smTask = SageMakerTask(boto3_session, "taskName", image_uri, prefix="tests/smoke") 36 | smTask = smTask 37 | 38 | 39 | def _testCliInternal(cmd): 40 | shell_cmd = subprocess.run(cmd, shell=True) 41 | print("**************", shell_cmd) 42 | assert shell_cmd.returncode == 0 43 | 44 | 45 | def test_cli_help(): 46 | _testCliInternal("ssm -h") 47 | 48 | 49 | def test_cli_run_help(): 50 | _testCliInternal("ssm run -h") 51 | 52 | 53 | def test_cli_shell_help(): 54 | _testCliInternal("ssm shell -h") 55 | 56 | 57 | def test_cli_data_help(): 58 | _testCliInternal("ssm data -h") 59 | 60 | 61 | def _internalTestCli(test_path, caplog, tmp_path): 62 | caplog.set_level(logging.INFO) 63 | print("Temp path:", tmp_path) 64 | print("Running cli:", test_path) 65 | 66 | output_path = os.path.join(tmp_path, test_path, "output_smoke") 67 | # remove current local output 68 | shutil.rmtree(output_path, ignore_errors=True) 69 | # prefix/suffix for project name 70 | py_version_string = f"py{sys.version_info.major}{sys.version_info.minor}" 71 | time_string = int(time()) 72 | postfix = f"-{os.name}-{time_string}-{py_version_string}" 73 | prefix = "tests_smoke/" 74 | 75 | if platform.system() == "Linux": 76 | run_shell = 
#! /bin/bash
# Arguments: [PARTIAL_DATA flag] — pass `true` to use the partial dataset.

set -e # stop and fail if anything stops
cd `dirname "$0"`
PARTIAL_DATA=$1
# Choose the partial ("download") or full ("download-all") data-source task.
data_source=$( [ "$PARTIAL_DATA" == true ] && echo download || echo download-all )
echo "*** Using data source: $data_source"

# Download the code from PyTorch's examples repository (only once)
[ -f code/main.py ] || wget -O code/main.py https://raw.githubusercontent.com/pytorch/examples/master/imagenet/main.py

# Download the data with a SageMaker processing task
ssm process -p ex-imagenet -t $data_source -v 400 \
    --entrypoint "/bin/bash" --dependencies ./code \
    -o ./output/$data_source \
    -- -c "bash /opt/ml/processing/input/code/code/$data_source.sh \$SSM_OUTPUT/data"

run_training () { # args: task_name, instance_type, additional_command_params, [description] [epochs] [additional_args]
    EPOCHS=${5:-10} # 10 epochs by default (BUG FIX: the comment used to claim 20)
    ADDITIONAL_ARGS=${6:-"--no_spot --force_running --cs"} #

    echo ===== Training $EPOCHS epochs, $4...
    ssm shell -p ex-imagenet -t $1 --dir_files ./code -o ./output/$1 -v 280 \
        --iit train $data_source output FullyReplicated data/train \
        --iit val $data_source output FullyReplicated data/val \
        --md "loss" "Epoch:.*Loss\s+([e\-+0-9\\.]*) \(" --md "acc1" "Epoch:.*Acc@1\s+([e\-+0-9\\.]*) \(" --md "acc5" "Epoch:.*Acc@5\s+([e\-+0-9\\.]*) \(" \
        --md "time" "Epoch:.*Time\s+([e\-+0-9\\.]*) \(" --md "data_time" "Epoch:.*Data\s+([e\-+0-9\\.]*) \(" \
        --md "test_loss" "Test:.*Loss\s+([e\-+0-9\\.]*) \(" --md "test_acc1" "Test:.*Acc@1\s+([e\-+0-9\\.]*) \(" --md "test_acc5" "Test:.*Acc@5\s+([e\-+0-9\\.]*) \(" \
        --download_model --download_output --download_state \
        --it $2 $ADDITIONAL_ARGS \
        --cmd_line "./extract.sh \$SM_CHANNEL_TRAIN/.. && \
            CODE_DIR=\`pwd\` && cd \$SSM_INSTANCE_STATE && START=\$SECONDS && \
            python \$CODE_DIR/main.py --epochs $EPOCHS --resume checkpoint.pth.tar --workers 8 \$SM_CHANNEL_TRAIN/.. $3 2>&1 && \
            echo Total time: \$(( SECONDS - START )) seconds"

    # NOTE(review): this runs in a backgrounded subshell, so `exit` only
    # terminates that subshell with ssm's status — confirm this is intended.
    exit $?
}

# Each configuration is trained in the background; `wait` joins them all.
DESC="a single GPU"
run_training train-1gpu ml.p3.2xlarge "" "$DESC" &
DESC="distributed training, a single GPU"
run_training train-dist-1gpu ml.p3.2xlarge "--multiprocessing-distributed --dist-url env:// --world-size 1 --rank 0 --seed 123" "$DESC" &
DESC="distributed training, 8 GPUs"
run_training train-dist-8gpus ml.p2.8xlarge "--multiprocessing-distributed --dist-url env:// --world-size 1 --rank 0 --seed 123" "$DESC" &
DESC="distributed training, 3 instances, total 3 GPUs"
run_training train-dist-3nodes-3gpus ml.p3.2xlarge '--multiprocessing-distributed --dist-url env:// --world-size $SSM_NUM_NODES --rank $SSM_HOST_RANK --seed 123' "$DESC" \
    "" "--no_spot --ic 3 --force_running --cs" &

wait
echo "FINISHED!"
#! /bin/bash
set -e # fail if any test fails

# Params: [output] [prefix] [suffix] [additional ssm params...]
BASEDIR=$(dirname "$0")
echo "Running with", $@

# Example 1 - hello world
ssm run --prefix ${2} -p simple-sagemaker-example-cli -t cli-task1${3} -e $BASEDIR/worker1.py -o $1/example1 --it ml.p3.2xlarge --no_spot --ic 2 ${@:4} --max_run_mins 15 &

# Example 6_1 - a complete example, part 1:
#   * local data folder as input, sharded across instances (-i, ShardedByS3Key)
#   * a public s3 bucket as an additional input (--iis)
#   * a custom docker image (--df, --repo_name, --aws_repo_name)
#   * a `task_type` hyperparameter, 2 instances (--ic), on-demand (--no_spot)
ssm run --prefix ${2} -p simple-sagemaker-example-cli -t cli-task6-1${3} -s $BASEDIR/example6/code -e worker6.py \
    -i $BASEDIR/example6/data ShardedByS3Key --iis persons s3://awsglue-datasets/examples/us-legislators/all/persons.json \
    --df $BASEDIR/example6 --repo_name "task6_repo" --aws_repo_name "task6_repo" --no_spot \
    --download_state --download_model --download_output --max_run_mins 15 \
    --ic 2 --task_type 1 -o $1/example6_1 ${@:4} &

wait # wait for all processes

# Shell example
ssm shell --prefix ${2} -p simple-sagemaker-example-cli -t shell-task${3} --cmd_line "cat /proc/cpuinfo && nvidia-smi" -o $1/example_cmd --it ml.p3.2xlarge ${@:4} --max_run_mins 15 &

# Example 6_2 - a complete example part 2.
30 | # - Uses outputs from part 1 (--iit) 31 | # - Uses additional local code dependencies (-d) 32 | # - Uses the tensorflow framework as pre-built image (-f) 33 | # - Tags the jobs (--tag) 34 | # - Defines sagemaker metrics (-m, --md) 35 | ssm run --prefix ${2} -p simple-sagemaker-example-cli -t cli-task6-2${3} -s $BASEDIR/example6/code -e worker6.py \ 36 | -d $BASEDIR/example6/external_dependency --iit task_6_1_model cli-task6-1${3} model --iit task_6_1_state cli-task6-1${3} state ShardedByS3Key \ 37 | -f tensorflow --md "Score" "Score=(.*?);" --tag "MyTag" "MyValue" \ 38 | --download_state --download_model --download_output --max_run_mins 15 \ 39 | --ic 2 --task_type 2 -o $1/example6_2 ${@:4} & 40 | 41 | wait # wait for all processes 42 | 43 | # Run task6_1 again 44 | # The rest of arguments ${@:4} (specifying --force_running) aren't passed here, to demonstrate that existing output is used, without running the task again 45 | ssm run --prefix ${2} -p simple-sagemaker-example-cli -t cli-task6-1${3} -s $BASEDIR/example6/code -e worker6.py \ 46 | -i $BASEDIR/example6/data ShardedByS3Key --iis persons s3://awsglue-datasets/examples/us-legislators/all/persons.json \ 47 | --df $BASEDIR/example6 --repo_name "task6_repo" --aws_repo_name "task6_repo" \ 48 | --download_state --download_model --download_output --max_run_mins 15 \ 49 | --ic 2 --task_type 1 -o $1/example6_1 > $1/example6_1_2_stdout & 50 | 51 | wait # wait for all processes 52 | -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_rest/example3_2_stdout: -------------------------------------------------------------------------------- 1 | INFO:simple_sagemaker.cli:Running ssm cli, args:['/home/user/proj/simple_sagemaker/.tox/single_proc/bin/ssm', '-p', 'tests/simple-sagemaker-example-cli_2020-09-14-14-41-14_py37', '-t', 'task3', '-e', '/home/user/proj/simple_sagemaker/examples/readme_examples/worker3.py', '-o', 
'/home/user/proj/simple_sagemaker/.tox/single_proc/tmp/test_readme_examples0/output/example3_2', '--cs', '--ks'] 2 | INFO:simple_sagemaker.cli:Parsed arguments:Namespace(aws_repo_name=None, bucket_name=None, clean_state=False, config_file=None, dependencies=None, docker_file=None, entry_point='/home/user/proj/simple_sagemaker/examples/readme_examples/worker3.py', image_tag='latest', input_path=None, input_s3=None, input_task=None, instance_count=1, instance_type='ml.m5.large', max_run_mins=86400, max_wait_mins=86400, output_path='/home/user/proj/simple_sagemaker/.tox/single_proc/tmp/test_readme_examples0/output/example3_2', project_name='tests/simple-sagemaker-example-cli_2020-09-14-14-41-14_py37', repo_name=None, source_dir=None, task_name='task3', use_spot_instances=True, volume_size=30) 3 | INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials 4 | INFO:simple_sagemaker.iam_utils:Creating SageMaker IAM Role: SageMakerIAMRole with an attached AmazonSageMakerFullAccess policy... 5 | INFO:root:Using a pre-built image None... 
6 | INFO:simple_sagemaker.sm_project:result: {'ResponseMetadata': {'RequestId': '9617559CA710A258', 'HostId': 'Q/gQPYgU189C1j0/zcbDa1fzwI7Q3v9ftSyNYJGUWLEYMAOmFiGVIaI378G00ubguE0yiB7PAdI=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'Q/gQPYgU189C1j0/zcbDa1fzwI7Q3v9ftSyNYJGUWLEYMAOmFiGVIaI378G00ubguE0yiB7PAdI=', 'x-amz-request-id': '9617559CA710A258', 'date': 'Mon, 14 Sep 2020 14:45:45 GMT', 'x-amz-bucket-region': 'us-east-1', 'content-type': 'application/xml', 'transfer-encoding': 'chunked', 'server': 'AmazonS3'}, 'RetryAttempts': 0}, 'IsTruncated': False, 'Marker': '', 'Contents': [{'Key': 'tests/simple-sagemaker-example-cli_2020-09-14-14-41-14_py37/task3/state/state_dir', 'LastModified': datetime.datetime(2020, 9, 14, 14, 44, 39, tzinfo=tzlocal()), 'ETag': '"4a5376727cd09607cd5b6ea1805c7e48"', 'Size': 14, 'StorageClass': 'STANDARD', 'Owner': {'DisplayName': 'ariel.shiftan', 'ID': '899dac056bb9e5cef98c1aca7f7ba6b6674a8bae0a2841f34e51043d93f9aa4b'}}], 'Name': 'sagemaker-us-east-1-XXXXXXXXXXXX', 'Prefix': 'tests/simple-sagemaker-example-cli_2020-09-14-14-41-14_py37/task3/state/', 'Delimiter': '/', 'MaxKeys': 1000, 'CommonPrefixes': [{'Prefix': 'tests/simple-sagemaker-example-cli_2020-09-14-14-41-14_py37/task3/state/algo-1/'}], 'EncodingType': 'url'} 7 | INFO:simple_sagemaker.sm_project:CommonPrefixes: [{'Prefix': 'tests/simple-sagemaker-example-cli_2020-09-14-14-41-14_py37/task3/state/algo-1/'}] 8 | INFO:simple_sagemaker.sm_project:subdirs: ['algo-1'] 9 | INFO:simple_sagemaker.sm_project:Task task3 is already completed by task3-2020-09-14-14-41-19-l7bXfIZg 10 | INFO:simple_sagemaker.sm_task:Downloading results to /home/user/proj/simple_sagemaker/.tox/single_proc/tmp/test_readme_examples0/output/example3_2 11 | -------------------------------------------------------------------------------- /src/simple_sagemaker/s3_sync.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | 
from hashlib import md5
from pathlib import Path

logger = logging.getLogger(__name__)


class S3Sync:
    """Incrementally upload a local folder to an S3 bucket/prefix.

    A file is skipped when an object with the same relative key, the same
    size and the same (single-part) MD5/ETag already exists.
    """

    def __init__(self, boto3_sessions):
        self.s3_client = boto3_sessions.client("s3")

    def syncFolderToS3(self, source: str, dest: str, prefix: str) -> None:
        """Upload files under *source* to s3://<dest>/<prefix>, skipping unchanged ones.

        BUG FIX: the return annotation used to be `[str]` (not a valid type,
        and the method returns nothing) — corrected to None.
        """
        paths = self.listFolderFiles(source)
        objects = self.listS3Bucket(dest, prefix)

        # Map relative key -> object metadata. A dict lookup is simpler and
        # safer than the previous parallel sorted-list + bisect approach,
        # which silently relied on the two lists staying index-aligned.
        objects_by_key = {obj["Key"][len(prefix) + 1 :]: obj for obj in objects}

        for path in paths:
            file_name = os.path.join(source, path)
            should_upload = True
            obj = objects_by_key.get(path)
            if obj is not None:
                # Check size first (cheap), then content.
                if os.stat(file_name).st_size == obj["Size"]:
                    # NOTE(review): an S3 ETag equals the MD5 only for
                    # single-part, non-KMS uploads; multipart objects will
                    # always be re-uploaded here.
                    with open(file_name, "rb") as f:
                        md = md5(f.read()).hexdigest()
                    if obj["ETag"].strip('"') == md:
                        should_upload = False

            if should_upload:
                logger.info(f"Uploading {file_name}")
                self.s3_client.upload_file(
                    str(Path(source).joinpath(path)),
                    Bucket=dest,
                    Key=prefix + "/" + path,
                )
            else:
                logger.info(f"Skipping {file_name}")

    def listS3Bucket(self, bucket, prefix):
        """Return all objects under *prefix*; [] when the bucket/prefix is empty."""
        res = []
        paginator = self.s3_client.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
            # A page without a "Contents" key just means no objects.
            # BUG FIX: the old KeyError handler returned [], discarding the
            # results accumulated from all earlier pages.
            res.extend(page.get("Contents", []))
        return res

    @staticmethod
    def listFolderFiles(folder_path):
        """Recursively list all files (as relative paths) within *folder_path*."""
        folder_path = folder_path.rstrip("/")
        return [
            str(x.relative_to(folder_path))
            for x in Path(folder_path).rglob("*")
            if not x.is_dir()
        ]


if __name__ == "__main__":
    # Ad-hoc smoke test. boto3 is imported lazily so the module itself stays
    # importable without it.
    import boto3

    boto3_session = boto3.Session()
    s = S3Sync(boto3_session)
    path = ".."
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    logger.info(f"listing {path}: {s.listFolderFiles(path)}")
f"state_{worker_config.current_host}" 36 | ) 37 | # "process" input data into model output 38 | for file in Path(worker_config.channel_data).rglob("*"): 39 | relp = file.relative_to(worker_config.channel_data) 40 | path = Path(worker_config.model_dir) / ( 41 | f"{relp}_proc_by_{worker_config.current_host}" 42 | ) 43 | path.write_text(f"{file.read_text()} processed by {worker_config.current_host}") 44 | # write to output dir 45 | ( 46 | Path(worker_config.output_data_dir) / f"output_{worker_config.current_host}" 47 | ).write_text(f"output_{worker_config.current_host}") 48 | 49 | 50 | def worker2(worker_config): 51 | # importing an external dependency 52 | from external_dependency import lib1 # noqa: F401 53 | 54 | logger.info("Score=10;") 55 | time.sleep(60) # sleep to be able to see the two scores 56 | logger.info("Score=20;") 57 | 58 | 59 | def show_inputs(worker_config): 60 | # just to show the initial directory structue 61 | for channel_name in worker_config.channels: 62 | input_path = worker_config.__getattr__(f"channel_{channel_name}") 63 | logger.info(f"input channel {channel_name} is at {input_path}") 64 | 65 | listDir("/opt/ml", ["__pycache__"]) 66 | listDir(worker_config.state) 67 | 68 | 69 | def show_output(worker_config): 70 | # show the final directory structue 71 | listDir("/opt/ml", ["/opt/ml/input", "/opt/ml/code", "__pycache__"]) 72 | listDir(worker_config.state) 73 | 74 | 75 | def worker(): 76 | logging.basicConfig(stream=sys.stdout) 77 | # parse the arguments 78 | worker_config = worker_lib.WorkerConfig() 79 | # get the instance specific state path 80 | show_inputs(worker_config) 81 | 82 | if int(worker_config.hps["task_type"]) == 1: 83 | worker1(worker_config) 84 | elif int(worker_config.hps["task_type"]) == 2: 85 | worker2(worker_config) 86 | 87 | show_output(worker_config) 88 | 89 | logger.info("finished!") 90 | # The task is marked as completed 91 | 92 | 93 | if __name__ == "__main__": 94 | worker() 95 | 
file_path = os.path.split(__file__)[0]
examples_path = os.path.abspath(os.path.join(file_path, "..", "..", "examples"))
sys.path.append(examples_path)


def _internalTestExample(caplog, tmp_path, runner):
    """Invoke an example's `runner` and compare its output to expected_output."""
    caplog.set_level(logging.INFO)
    print("Temp path:", tmp_path)
    print("Running", runner, runner.__name__, runner.__module__)

    example_path = os.path.dirname(runner.__code__.co_filename)
    output_path = os.path.join(tmp_path, os.path.split(example_path)[-1], "output")
    # start from a clean local output folder
    shutil.rmtree(output_path, ignore_errors=True)

    # unique-ish project name parts: timestamp + python version
    py_version_string = f"py{sys.version_info.major}{sys.version_info.minor}"
    time_string = int(time())
    postfix = f"-{time_string}-{py_version_string}"
    prefix = "tests/"

    sm_project = runner(postfix=postfix, prefix=prefix, output_path=output_path)
    sm_project = sm_project  # keep linters quiet
    # sm_project.cleanFolder()

    expected_path = os.path.join(example_path, "expected_output")
    # fall back to an expected_output folder one level up
    if not os.path.isdir(expected_path):
        expected_path = os.path.join(os.path.dirname(example_path), "expected_output")

    assert isAsExpected(output_path, expected_path)


def _internalTestCli(test_path, caplog, tmp_path):
    """Run examples/<test_path>/run.sh and compare its output to expected_output."""
    caplog.set_level(logging.INFO)
    print("Temp path:", tmp_path)
    print("Running cli:", test_path)

    output_path = os.path.join(tmp_path, test_path, "output")
    # start from a clean local output folder
    shutil.rmtree(output_path, ignore_errors=True)

    # unique-ish project name parts: timestamp + python version
    py_version_string = f"py{sys.version_info.major}{sys.version_info.minor}"
    time_string = int(time())
    postfix = f"-{time_string}-{py_version_string}"
    prefix = "tests/"

    run_shell = os.path.join(examples_path, test_path, "run.sh")
    subprocess.run(
        [run_shell, output_path, prefix, postfix, "--cs --force_running"], check=True
    )

    expected_path = os.path.join(examples_path, test_path, "expected_output")
    assert isAsExpected(output_path, expected_path)
def skip_test_cli_multi(caplog, tmp_path):
    """Disabled (rename to test_... to enable): multi-worker CLI example."""
    _internalTestCli("cli_multi", caplog, tmp_path)


def test_readme_examples(caplog, tmp_path):
    """End-to-end run of examples/readme_examples/run.sh."""
    _internalTestCli("readme_examples", caplog, tmp_path)


def test_processing_cli_examples(caplog, tmp_path):
    """End-to-end run of examples/processing_cli/run.sh."""
    _internalTestCli("processing_cli", caplog, tmp_path)


def test_multiple_tasks(caplog, tmp_path):
    """Programmatic example: multiple_tasks."""
    from multiple_tasks.example import runner

    _internalTestExample(caplog, tmp_path, runner)


def test_single_file_tasks(caplog, tmp_path):
    """Programmatic example: single_file."""
    from single_file.example import runner

    _internalTestExample(caplog, tmp_path, runner)


def test_single_task(caplog, tmp_path):
    """Programmatic example: single_task."""
    from single_task.example import runner

    _internalTestExample(caplog, tmp_path, runner)
def listDir(path, ignore_patterns=()):
    """Recursively log every file/directory under ``path``.

    Args:
        path: root directory to list.
        ignore_patterns: substrings; entries whose path contains any of them
            are skipped. Default changed from a mutable ``[]`` to an immutable
            ``()`` (same truthiness/iteration semantics, avoids the shared
            mutable-default pitfall).
    """
    logger.info(f"*** START listing files in {path}")
    for file in sorted(Path(path).rglob("*")):
        # keep the entry only if no ignore pattern matches its path
        # (generator instead of a redundant list inside all())
        if not ignore_patterns or all(
            pattern not in str(file) for pattern in ignore_patterns
        ):
            logger.info(f"[{['Dir ', 'File'][file.is_file()]}] {file}")
    logger.info(f"*** END file listing {path}")


def worker1(worker_config):
    """'Process' the data channel: write per-host state, model and output files."""
    # libraries that were pre-installed in the docker image, as defined in the Dockerfile
    import pandas  # noqa: F401
    import sklearn  # noqa: F401

    # f-string prefix added: the original logged the literal text "{pandas}"
    logger.info(f"{pandas} is pre-installed in this image")

    # update the per-instance state
    (Path(worker_config.instance_state) / worker_config.current_host).write_text(
        f"state_{worker_config.current_host}"
    )
    # "process" input data into model output
    # NOTE(review): assumes the channel holds only files — read_text() would
    # raise on a subdirectory; confirm against the example's input layout.
    for file in Path(worker_config.channel_data).rglob("*"):
        relp = file.relative_to(worker_config.channel_data)
        path = Path(worker_config.model_dir) / (
            f"{relp}_proc_by_{worker_config.current_host}"
        )
        path.write_text(f"{file.read_text()} processed by {worker_config.current_host}")
    # write to output dir
    (
        Path(worker_config.output_data_dir) / f"output_{worker_config.current_host}"
    ).write_text(f"output_{worker_config.current_host}")


def worker2(worker_config):
    """Emit two 'Score' log lines, spaced a minute apart (metric-extraction demo)."""
    # importing an external dependency
    from external_dependency import lib1  # noqa: F401

    logger.info("Score=10;")
    time.sleep(60)  # sleep to be able to see the two scores
    logger.info("Score=20;")


def show_inputs(worker_config):
    """Log each input channel's mount point and the initial directory structure."""
    for channel_name in worker_config.channels:
        # getattr() instead of calling __getattr__ directly
        input_path = getattr(worker_config, f"channel_{channel_name}")
        logger.info(f"input channel {channel_name} is at {input_path}")

    listDir("/opt/ml", ["__pycache__"])
    listDir(worker_config.state)
def show_output(worker_config):
    """Log the final directory structure, excluding inputs, code and caches."""
    listDir("/opt/ml", ["/opt/ml/input", "/opt/ml/code", "__pycache__"])
    listDir(worker_config.state)


def worker():
    """Entry point: dispatch to worker1/worker2 based on the 'task_type' hyperparameter."""
    logging.basicConfig(stream=sys.stdout)
    # parse the arguments / environment into a config object
    worker_config = worker_lib.WorkerConfig()
    # log the inputs before doing any work
    show_inputs(worker_config)

    # hyperparameter values arrive as strings, hence the int() conversion
    if int(worker_config.hps["task_type"]) == 1:
        worker1(worker_config)
    elif int(worker_config.hps["task_type"]) == 2:
        worker2(worker_config)
    # NOTE(review): any other task_type value silently runs neither worker

    show_output(worker_config)

    logger.info("finished!")
    # Returning without raising marks the task as completed


if __name__ == "__main__":
    worker()
- TBD: Tutorial / post on this
Mixed precision - it is now [built in with PyTorch 1.6](https://pytorch.org/blog/accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision/) 53 | 54 | Notes: 55 | 1. Make sure to save checkpoints to the state folder 56 | 2. TensorBoard is active, save logs to /opt/ml/output/tensorboard/, e.g. writer = SummaryWriter('/opt/ml/output/tensorboard/') and writer.add_scalar('Loss/test', np.random.random(), n_iter) 57 | 3. Syncing many files from S3 is slow, it's better to split the DB into e.g. 1000 tars. -------------------------------------------------------------------------------- /examples/processing_cli/expected_output/output3/output/config/processingjobconfig.json: -------------------------------------------------------------------------------- 1 | {"ProcessingJobArn":"arn:aws:sagemaker:us-east-1:XXXXXXXXXXXX:processing-job/cli-bash-2020-10-06-23-30-46-ulq8rrv0","ProcessingJobName":"cli-bash-2020-10-06-23-30-46-ULQ8RRV0","Environment":{"SM_CHANNEL_CLI_CODE_OUTPUT":"/opt/ml/processing/input/data/cli_code_output","SM_CHANNEL_CLI_CODE_STATE":"/opt/ml/processing/input/data/cli_code_state","SM_CHANNEL_DATA":"/opt/ml/processing/data","SSM_OUTPUT":"/opt/ml/processing/output","SSM_STATE":"/opt/ml/processing/state"},"AppSpecification":{"ImageUri":"683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:0.20.0-cpu-py3","ContainerEntrypoint":["bash","/opt/ml/processing/input/code/ex3.sh"],"ContainerArguments":["arg1","-arg2","--arg3","argument 
4"]},"ProcessingInputs":[{"InputName":"cli_code_output","S3Input":{"LocalPath":"/opt/ml/processing/input/data/cli_code_output","S3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/ssm-example-processing/cli-code/cli-code-2020-10-06-23-25-36-bE5AcbXg/output","S3DataDistributionType":"FullyReplicated","S3DataType":"S3Prefix","S3InputMode":"File","S3CompressionType":"None","S3DownloadMode":"StartOfJob"}},{"InputName":"cli_code_state","S3Input":{"LocalPath":"/opt/ml/processing/input/data/cli_code_state","S3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/ssm-example-processing/cli-code/state","S3DataDistributionType":"FullyReplicated","S3DataType":"S3Prefix","S3InputMode":"File","S3CompressionType":"None","S3DownloadMode":"StartOfJob"}},{"InputName":"DEP_dep","S3Input":{"LocalPath":"/opt/ml/processing/input/code/dep","S3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/cli-bash-2020-10-06-23-30-46-ULQ8RRV0/input/DEP_dep","S3DataDistributionType":"FullyReplicated","S3DataType":"S3Prefix","S3InputMode":"File","S3CompressionType":"None","S3DownloadMode":"StartOfJob"}},{"InputName":"DEP_worker_toolkit","S3Input":{"LocalPath":"/opt/ml/processing/input/code/worker_toolkit","S3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/cli-bash-2020-10-06-23-30-46-ULQ8RRV0/input/DEP_worker_toolkit","S3DataDistributionType":"FullyReplicated","S3DataType":"S3Prefix","S3InputMode":"File","S3CompressionType":"None","S3DownloadMode":"StartOfJob"}},{"InputName":"data","S3Input":{"LocalPath":"/opt/ml/processing/data","S3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/ssm-example-processing/cli-bash/input","S3DataDistributionType":"FullyReplicated","S3DataType":"S3Prefix","S3InputMode":"File","S3CompressionType":"None","S3DownloadMode":"StartOfJob"}},{"InputName":"code","S3Input":{"LocalPath":"/opt/ml/processing/input/code","S3Uri":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/cli-bash-2020-10-06-23-30-46-ULQ8RRV0/input/code/ex3.sh","S3DataDistributionType":"FullyReplicated","S3DataType":"S3Prefix","S3InputMode":"File","S3Compr
def setDefaultParams(sm_project):
    """Configure the project's default image, code and instance parameters.

    Args:
        sm_project: a SageMakerProject instance, configured in place.
    """
    # docker image params
    aws_repo_name = "task_repo"  # remote (ECR) repository name
    repo_name = "task_repo"  # local repository name
    image_tag = "latest"  # tag for local & remote images
    docker_file_path = os.path.join(file_path, "docker")  # path of the local Dockerfile
    sm_project.setDefaultImageParams(
        aws_repo_name, repo_name, image_tag, docker_file_path
    )

    # job code path, entry point and dependencies
    source_dir = os.path.join(file_path, "code")
    entry_point = "algo.py"
    dependencies = [os.path.join(file_path, "external_dependency")]
    sm_project.setDefaultCodeParams(source_dir, entry_point, dependencies)

    # instance type and count
    instance_type = "ml.m5.large"
    training_instance_count = 2
    volume_size = (
        30  # Size in GB of the EBS volume to use for storing input data during training
    )
    use_spot_instances = True  # False
    max_run_mins = 15
    sm_project.setDefaultInstanceParams(
        instance_type,
        training_instance_count,
        volume_size,
        use_spot_instances,
        max_run_mins,
    )


def runner(
    project_name="simple-sagemaker-example", prefix="", postfix="", output_path=None
):
    """Build (or fetch) the docker image, run the example task, download results.

    Args:
        project_name: SageMaker project name.
        prefix: string prepended to names (e.g. "tests/" for test isolation).
        postfix: string appended to the task name (e.g. timestamp + py version).
        output_path: where to download results; defaults to <this dir>/output.

    Returns:
        The configured SageMakerProject instance.
    """
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    sm_project = SageMakerProject(project_name, prefix=prefix)

    setDefaultParams(sm_project)

    image_uri = sm_project.buildOrGetImage(
        instance_type=sm_project.defaultInstanceParams.instance_type
    )

    # task name
    task_name = (
        "task1"
        + postfix  # must satisfy regular expression pattern: ^[a-zA-Z0-9](-*[a-zA-Z0-9])*
    )
    # input data params
    input_data_path = os.path.join(
        file_path, "input_data"
    )  # Can also provide a URI to an S3 bucket, e.g. next commented line
    # input_data_path = sagemaker.s3.s3_path_join("s3://", "sagemaker-us-east-1-XXXXXXXXXXXX", "task3", "input")
    distribution = "ShardedByS3Key"  # or "FullyReplicated" which is the default
    model_uri = (
        None  # Can be used to supply model data as an additional input, local/s3
    )
    hyperparameters = {"arg1": 5, "arg2": "hello"}

    sm_project.runTask(
        task_name,
        image_uri,
        hyperparameters,
        input_data_path,
        model_uri=model_uri,
        input_distribution=distribution,
        clean_state=True,
    )

    # delete the output directory before downloading fresh results
    if not output_path:
        output_path = os.path.join(file_path, "output")
    shutil.rmtree(output_path, ignore_errors=True)
    sm_project.downloadResults(task_name, output_path)

    return sm_project


if __name__ == "__main__":
    py_version_string = f"py{sys.version_info.major}{sys.version_info.minor}"
    time_string = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
    sm_project = runner(postfix=f"_{time_string}_{py_version_string}", prefix="tests/")
sagemaker[local] 33 | 34 | commands = 35 | ssm {posargs} 36 | 37 | [testenv:bash] # run ssm cli 38 | whitelist_externals = 39 | /usr/bin/bash 40 | /bin/bash 41 | commands = 42 | bash {posargs} 43 | 44 | 45 | [testenv:no-coverage] # no coverage 46 | commands = 47 | pytest -n 4 --basetemp="{envtmpdir}" {posargs} 48 | deps = 49 | pytest-xdist 50 | 51 | [testenv:report] # generate a coverage report 52 | skip_install = true 53 | deps = coverage 54 | commands = 55 | coverage html -i --include="*simple_sagemaker*" --omit="*worker_toolkit*","*shell_launcher.py" 56 | coverage report --include="*simple_sagemaker*" --omit="*worker_toolkit*","*shell_launcher.py" --fail-under=85 57 | coverage report --help 58 | 59 | [testenv:clean] # clean up coverage data 60 | skip_install = true 61 | deps = coverage 62 | commands = coverage erase 63 | 64 | [tool:pytest] 65 | testpaths = tests 66 | 67 | ### Formatting & linting 68 | [flake8] 69 | max-line-length = 127 70 | extend-ignore = E203 71 | 72 | [testenv:lint] 73 | skip_install = true 74 | setenv = 75 | deps = 76 | flake8 77 | black 78 | isort 79 | commands = 80 | flake8 ./src ./tests ./examples --count --statistics 81 | isort --check-only ./src ./tests ./examples 82 | black --check ./src ./tests ./examples 83 | 84 | [testenv:cf] # Code Format 85 | skip_install = true 86 | deps = 87 | black 88 | isort 89 | commands = 90 | isort ./src ./tests ./examples 91 | black ./src ./tests ./examples 92 | 93 | [testenv:publish] # build & publish the code 94 | skip_install = true 95 | basepython = python3.7 96 | setenv = 97 | deps = 98 | setuptools 99 | wheel 100 | twine 101 | commands = 102 | python setup.py sdist bdist_wheel 103 | twine upload dist/* 104 | 105 | [testenv:docs] 106 | description = invoke sphinx-build to build the HTML docs 107 | basepython = python3.7 108 | changedir = docs 109 | deps = 110 | sphinx 111 | sphinx-rtd-theme 112 | whitelist_externals = 113 | /usr/bin/make 114 | commands = 115 | sphinx-apidoc -f -o ./source 
# Execute a shell command
# tox -e bash -- ./examples/cli_simple/run.sh ./output

# Execute a shell job
| key: ${{ matrix.os }}-pip-${{ hashFiles('**/setup.cfg') }} 36 | restore-keys: | 37 | ${{ matrix.os }}-pip- 38 | 39 | - name: Checkout 40 | uses: actions/checkout@v2 41 | - name: Install dependencies 42 | run: | 43 | python -m pip install --upgrade pip 44 | pip install -r requirements.txt 45 | - name: Lint 46 | run: | 47 | tox -e lint 48 | - name: Configure AWS credentials 49 | uses: aws-actions/configure-aws-credentials@v1 50 | with: 51 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 52 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 53 | aws-region: us-east-1 54 | - name: Smoke tests 55 | run: | 56 | tox -e py -- --capture=no --log-cli-level=INFO tests/smoke/ 57 | - name: System tests & check coverage 58 | if: matrix.python-version == '3.7' && matrix.os == 'ubuntu-latest' && (startsWith(github.ref, 'refs/heads/master') || startsWith(github.ref, 'refs/tags') || github.base_ref == 'master') 59 | run: | 60 | tox -e py -- --capture=no --log-cli-level=INFO tests/system/ 61 | tox -e report 62 | - name: Upload coverage report 63 | if: (!cancelled()) && matrix.python-version == '3.7' && matrix.os == 'ubuntu-latest' && (startsWith(github.ref, 'refs/heads/master') || startsWith(github.ref, 'refs/tags') || github.base_ref == 'master') 64 | uses: actions/upload-artifact@v2 65 | with: 66 | name: coverage 67 | path: htmlcov 68 | - name: Upload examples output 69 | if: (!cancelled()) && matrix.python-version == '3.7' && matrix.os == 'ubuntu-latest' && (startsWith(github.ref, 'refs/heads/master') || startsWith(github.ref, 'refs/tags') || github.base_ref == 'master') 70 | uses: actions/upload-artifact@v2 71 | with: 72 | name: examples_output 73 | path: | 74 | .tox/py/tmp/*/*/*/output/ 75 | !.tox/py/tmp/*/*current/*/output/ 76 | - name: Build the package 77 | if: matrix.python-version == '3.7' && matrix.os == 'ubuntu-latest' && (startsWith(github.ref, 'refs/heads/master') || startsWith(github.ref, 'refs/tags') || github.base_ref == 'master') 78 | run: | 79 | 
pip install setuptools wheel twine 80 | python setup.py sdist bdist_wheel 81 | - name: Upload dist files 82 | if: matrix.python-version == '3.7' && matrix.os == 'ubuntu-latest' && (startsWith(github.ref, 'refs/heads/master') || startsWith(github.ref, 'refs/tags') || github.base_ref == 'master') 83 | uses: actions/upload-artifact@v2 84 | with: 85 | name: dist 86 | path: dist 87 | 88 | publish: 89 | 90 | runs-on: ubuntu-latest 91 | needs: build 92 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/heads/master') 93 | 94 | steps: 95 | - name: Download dist package 96 | uses: actions/download-artifact@v2 97 | with: 98 | name: dist 99 | path: dist 100 | - name: Publish 101 | uses: pypa/gh-action-pypi-publish@master 102 | with: 103 | user: __token__ 104 | password: ${{ secrets.pypi_password }} 105 | -------------------------------------------------------------------------------- /examples/readme_examples/expected_output/example1/logs/logs0: -------------------------------------------------------------------------------- 1 | bash: cannot set terminal process group (-1): Inappropriate ioctl for device 2 | bash: no job control in this shell 3 | 2020-10-04 09:21:07,561 sagemaker-training-toolkit INFO Imported framework sagemaker_pytorch_container.training 4 | 2020-10-04 09:21:07,584 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed. 5 | 2020-10-04 09:21:10,639 sagemaker_pytorch_container.training INFO Invoking user training script. 
6 | 2020-10-04 09:21:11,064 sagemaker-training-toolkit INFO Invoking user script 7 | 8 | Training Env: 9 | 10 | { 11 | "additional_framework_parameters": {}, 12 | "channel_input_dirs": {}, 13 | "current_host": "algo-1", 14 | "framework_module": "sagemaker_pytorch_container.training:main", 15 | "hosts": [ 16 | "algo-1", 17 | "algo-2" 18 | ], 19 | "hyperparameters": {}, 20 | "input_config_dir": "/opt/ml/input/config", 21 | "input_data_config": {}, 22 | "input_dir": "/opt/ml/input", 23 | "is_master": true, 24 | "job_name": "cli-task1-2020-10-04-09-16-52-zNRvwXqG", 25 | "log_level": 20, 26 | "master_hostname": "algo-1", 27 | "model_dir": "/opt/ml/model", 28 | "module_dir": "s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-10-04-09-16-49_py37/cli-task1/cli-task1-2020-10-04-09-16-52-zNRvwXqG/source/sourcedir.tar.gz", 29 | "module_name": "worker1", 30 | "network_interface_name": "eth0", 31 | "num_cpus": 8, 32 | "num_gpus": 1, 33 | "output_data_dir": "/opt/ml/output/data", 34 | "output_dir": "/opt/ml/output", 35 | "output_intermediate_dir": "/opt/ml/output/intermediate", 36 | "resource_config": { 37 | "current_host": "algo-1", 38 | "hosts": [ 39 | "algo-1", 40 | "algo-2" 41 | ], 42 | "network_interface_name": "eth0" 43 | }, 44 | "user_entry_point": "worker1.py" 45 | } 46 | 47 | Environment variables: 48 | 49 | SM_HOSTS=["algo-1","algo-2"] 50 | SM_NETWORK_INTERFACE_NAME=eth0 51 | SM_HPS={} 52 | SM_USER_ENTRY_POINT=worker1.py 53 | SM_FRAMEWORK_PARAMS={} 54 | SM_RESOURCE_CONFIG={"current_host":"algo-1","hosts":["algo-1","algo-2"],"network_interface_name":"eth0"} 55 | SM_INPUT_DATA_CONFIG={} 56 | SM_OUTPUT_DATA_DIR=/opt/ml/output/data 57 | SM_CHANNELS=[] 58 | SM_CURRENT_HOST=algo-1 59 | SM_MODULE_NAME=worker1 60 | SM_LOG_LEVEL=20 61 | SM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main 62 | SM_INPUT_DIR=/opt/ml/input 63 | SM_INPUT_CONFIG_DIR=/opt/ml/input/config 64 | SM_OUTPUT_DIR=/opt/ml/output 65 | SM_NUM_CPUS=8 66 | SM_NUM_GPUS=1 67 | 
SM_MODEL_DIR=/opt/ml/model 68 | SM_MODULE_DIR=s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-10-04-09-16-49_py37/cli-task1/cli-task1-2020-10-04-09-16-52-zNRvwXqG/source/sourcedir.tar.gz 69 | SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{},"current_host":"algo-1","framework_module":"sagemaker_pytorch_container.training:main","hosts":["algo-1","algo-2"],"hyperparameters":{},"input_config_dir":"/opt/ml/input/config","input_data_config":{},"input_dir":"/opt/ml/input","is_master":true,"job_name":"cli-task1-2020-10-04-09-16-52-zNRvwXqG","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-10-04-09-16-49_py37/cli-task1/cli-task1-2020-10-04-09-16-52-zNRvwXqG/source/sourcedir.tar.gz","module_name":"worker1","network_interface_name":"eth0","num_cpus":8,"num_gpus":1,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-1","hosts":["algo-1","algo-2"],"network_interface_name":"eth0"},"user_entry_point":"worker1.py"} 70 | SM_USER_ARGS=[] 71 | SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate 72 | PYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python36.zip:/opt/conda/lib/python3.6:/opt/conda/lib/python3.6/lib-dynload:/opt/conda/lib/python3.6/site-packages 73 | 74 | Invoking script with the following command: 75 | 76 | /opt/conda/bin/python worker1.py 77 | 78 | 79 | -***- Device 0: _CudaDeviceProperties(name='Tesla V100-SXM2-16GB', major=7, minor=0, total_memory=16160MB, multi_processor_count=80) 80 | 2020-10-04 09:21:14,490 sagemaker-training-toolkit INFO Reporting training SUCCESS 81 | -------------------------------------------------------------------------------- /examples/readme_examples/expected_output/example1/logs/logs1: 
-------------------------------------------------------------------------------- 1 | bash: cannot set terminal process group (-1): Inappropriate ioctl for device 2 | bash: no job control in this shell 3 | 2020-10-04 09:21:02,371 sagemaker-training-toolkit INFO Imported framework sagemaker_pytorch_container.training 4 | 2020-10-04 09:21:02,394 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed. 5 | 2020-10-04 09:21:15,062 sagemaker_pytorch_container.training INFO Invoking user training script. 6 | 2020-10-04 09:21:15,415 sagemaker-training-toolkit INFO Invoking user script 7 | 8 | Training Env: 9 | 10 | { 11 | "additional_framework_parameters": {}, 12 | "channel_input_dirs": {}, 13 | "current_host": "algo-2", 14 | "framework_module": "sagemaker_pytorch_container.training:main", 15 | "hosts": [ 16 | "algo-1", 17 | "algo-2" 18 | ], 19 | "hyperparameters": {}, 20 | "input_config_dir": "/opt/ml/input/config", 21 | "input_data_config": {}, 22 | "input_dir": "/opt/ml/input", 23 | "is_master": false, 24 | "job_name": "cli-task1-2020-10-04-09-16-52-zNRvwXqG", 25 | "log_level": 20, 26 | "master_hostname": "algo-1", 27 | "model_dir": "/opt/ml/model", 28 | "module_dir": "s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-10-04-09-16-49_py37/cli-task1/cli-task1-2020-10-04-09-16-52-zNRvwXqG/source/sourcedir.tar.gz", 29 | "module_name": "worker1", 30 | "network_interface_name": "eth0", 31 | "num_cpus": 8, 32 | "num_gpus": 1, 33 | "output_data_dir": "/opt/ml/output/data", 34 | "output_dir": "/opt/ml/output", 35 | "output_intermediate_dir": "/opt/ml/output/intermediate", 36 | "resource_config": { 37 | "current_host": "algo-2", 38 | "hosts": [ 39 | "algo-1", 40 | "algo-2" 41 | ], 42 | "network_interface_name": "eth0" 43 | }, 44 | "user_entry_point": "worker1.py" 45 | } 46 | 47 | Environment variables: 48 | 49 | SM_HOSTS=["algo-1","algo-2"] 50 | SM_NETWORK_INTERFACE_NAME=eth0 51 | SM_HPS={} 52 | SM_USER_ENTRY_POINT=worker1.py 
53 | SM_FRAMEWORK_PARAMS={} 54 | SM_RESOURCE_CONFIG={"current_host":"algo-2","hosts":["algo-1","algo-2"],"network_interface_name":"eth0"} 55 | SM_INPUT_DATA_CONFIG={} 56 | SM_OUTPUT_DATA_DIR=/opt/ml/output/data 57 | SM_CHANNELS=[] 58 | SM_CURRENT_HOST=algo-2 59 | SM_MODULE_NAME=worker1 60 | SM_LOG_LEVEL=20 61 | SM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main 62 | SM_INPUT_DIR=/opt/ml/input 63 | SM_INPUT_CONFIG_DIR=/opt/ml/input/config 64 | SM_OUTPUT_DIR=/opt/ml/output 65 | SM_NUM_CPUS=8 66 | SM_NUM_GPUS=1 67 | SM_MODEL_DIR=/opt/ml/model 68 | SM_MODULE_DIR=s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-10-04-09-16-49_py37/cli-task1/cli-task1-2020-10-04-09-16-52-zNRvwXqG/source/sourcedir.tar.gz 69 | SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{},"current_host":"algo-2","framework_module":"sagemaker_pytorch_container.training:main","hosts":["algo-1","algo-2"],"hyperparameters":{},"input_config_dir":"/opt/ml/input/config","input_data_config":{},"input_dir":"/opt/ml/input","is_master":false,"job_name":"cli-task1-2020-10-04-09-16-52-zNRvwXqG","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-10-04-09-16-49_py37/cli-task1/cli-task1-2020-10-04-09-16-52-zNRvwXqG/source/sourcedir.tar.gz","module_name":"worker1","network_interface_name":"eth0","num_cpus":8,"num_gpus":1,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-2","hosts":["algo-1","algo-2"],"network_interface_name":"eth0"},"user_entry_point":"worker1.py"} 70 | SM_USER_ARGS=[] 71 | SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate 72 | PYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python36.zip:/opt/conda/lib/python3.6:/opt/conda/lib/python3.6/lib-dynload:/opt/conda/lib/python3.6/site-packages 73 | 74 | 
import json
import logging

logger = logging.getLogger(__name__)


def createSageMakerIAMRole(boto3_session, role_name):
    """Create (if missing) an IAM role assumable by SageMaker and attach the
    AmazonSageMakerFullAccess managed policy to it.

    Args:
        boto3_session: a boto3.Session used to build the IAM client.
        role_name: name of the IAM role to get or create.

    Raises:
        AssertionError: if attaching the managed policy does not return HTTP 200.
    """
    logger.debug(
        f"Creating SageMaker IAM Role: {role_name} with an attached AmazonSageMakerFullAccess policy..."
    )

    # Trust policy letting the SageMaker service assume this role
    trustRelationship = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Sid": "",
                "Effect": "Allow",
                "Principal": {"Service": "sagemaker.amazonaws.com"},
                "Action": "sts:AssumeRole",
            }
        ],
    }
    client = boto3_session.client("iam")
    try:
        client.get_role(RoleName=role_name)
    except client.exceptions.NoSuchEntityException:
        # Narrowed from a bare `except:`: only create the role when it really
        # doesn't exist; other failures (e.g. permissions) now propagate.
        client.create_role(
            RoleName=role_name,
            AssumeRolePolicyDocument=json.dumps(trustRelationship),
        )
    response = client.attach_role_policy(
        RoleName=role_name,
        PolicyArn="arn:aws:iam::aws:policy/AmazonSageMakerFullAccess",
    )
    assert (
        response["ResponseMetadata"]["HTTPStatusCode"] == 200
    ), f"Couldn't attach AmazonSageMakerFullAccess policy to role {role_name}"


def getOrCreatePolicy(iam_client, boto3_session, policy_name, policyString):
    """Get (and extend) or create a customer-managed IAM policy.

    If no local policy named ``policy_name`` exists, one is created from
    ``policyString``. Otherwise, the first statement of ``policyString`` is
    appended to the existing policy's default version (unless already present),
    a new default version is created, and the old one deleted (IAM allows a
    limited number of versions per policy).

    Args:
        iam_client: boto3 IAM client.
        boto3_session: a boto3.Session used to build the IAM resource.
        policy_name: name of the customer-managed policy.
        policyString: policy document (dict) whose first statement is needed.

    Returns:
        The policy's ARN.
    """
    listed_policies = iam_client.list_policies(Scope="Local")
    # NOTE(review): assumes the account has few enough local policies to fit
    # in one page; a paginator would be needed otherwise.
    assert listed_policies["IsTruncated"] is False
    filtered_policy = [
        policy
        for policy in listed_policies["Policies"]
        if policy["PolicyName"] == policy_name
    ]
    if not filtered_policy:
        response = iam_client.create_policy(
            PolicyName=policy_name, PolicyDocument=json.dumps(policyString)
        )
        assert (
            response["ResponseMetadata"]["HTTPStatusCode"] == 200
        ), f"Couldn't create policy {policy_name}"  # typo fix: was "polict"
        policy = response["Policy"]
        policy_arn = policy["Arn"]
    else:
        policy = filtered_policy[0]
        policy_arn = policy["Arn"]
        iam = boto3_session.resource("iam")
        policy_obj = iam.Policy(policy_arn)
        # Crude containment check: compare the JSON-serialized statement
        # against the serialized existing statements
        if json.dumps(policyString["Statement"][0]) in json.dumps(
            policy_obj.default_version.document["Statement"]
        ):
            logger.debug(f"Statement already exists in {policy_name}")
        else:
            logger.debug(f"Adding the statement to policy {policy_name}")
            policy_json = policy_obj.default_version.document
            policy_json["Statement"].append(policyString["Statement"][0])
            response = iam_client.create_policy_version(
                PolicyArn=policy_arn,
                PolicyDocument=json.dumps(policy_json),
                SetAsDefault=True,
            )
            assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
            # Delete the superseded version to stay under IAM's version limit
            response = iam_client.delete_policy_version(
                PolicyArn=policy_arn, VersionId=policy_obj.default_version.version_id
            )
            assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
    return policy_arn
86 | ) 87 | 88 | client = boto3_session.client("iam") 89 | policyString = { 90 | "Version": "2012-10-17", 91 | "Statement": [ 92 | { 93 | "Sid": "", 94 | "Effect": "Allow", 95 | "Action": ["s3:*"], 96 | "Resource": [ 97 | f"arn:aws:s3:::{bucket_name}", 98 | f"arn:aws:s3:::{bucket_name}/*", 99 | ], 100 | } 101 | ], 102 | } 103 | policy_arn = getOrCreatePolicy(client, boto3_session, policy_name, policyString) 104 | 105 | response = client.attach_role_policy( 106 | RoleName=role_name, 107 | PolicyArn=policy_arn, 108 | ) 109 | assert ( 110 | response["ResponseMetadata"]["HTTPStatusCode"] == 200 111 | ), f"Couldn't attach {policy_name} policy to role {role_name}" 112 | -------------------------------------------------------------------------------- /examples/processing_cli/expected_output/output2/logs/logs0: -------------------------------------------------------------------------------- 1 | ==Bash 2 | -***- Args: 3 | -- Env:SSM_OUTPUT=/opt/ml/processing/output HOSTNAME=ip-10-0-235-103.ec2.internal SAGEMAKER_SERVING_MODULE=sagemaker_sklearn_container.serving:main SAGEMAKER_TRAINING_MODULE=sagemaker_sklearn_container.training:main AWS_CONTAINER_CREDENTIALS_RELATIVE_URI=/v2/credentials/AeW1bfcO2AJo1dQj5NBqhzLbpvn21RX1yr6lvJnO0lU PYTHONUNBUFFERED=1 SSM_STATE=/opt/ml/processing/state LC_ALL=C.UTF-8 PYTHONIOENCODING=UTF-8 PATH=/miniconda3/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin PWD=/ LANG=C.UTF-8 AWS_REGION=us-east-1 PYTHONDONTWRITEBYTECODE=1 SHLVL=1 HOME=/root _=/usr/bin/env 4 | *** START listing files 5 | /opt: 6 | total 12 7 | drwxr-xr-x 1 root root 4096 Oct 11 12:46 . 8 | drwxr-xr-x 1 root root 4096 Oct 11 12:46 .. 9 | drwxr-xr-x 5 root root 4096 Oct 11 12:46 ml 10 | 11 | /opt/ml: 12 | total 20 13 | drwxr-xr-x 5 root root 4096 Oct 11 12:46 . 14 | drwxr-xr-x 1 root root 4096 Oct 11 12:46 .. 
15 | drw-r--r-- 2 root root 4096 Oct 11 12:41 config 16 | drwxr-xr-x 4 root root 4096 Oct 11 12:46 output 17 | drwxr-xr-x 5 root root 4096 Oct 11 12:46 processing 18 | 19 | /opt/ml/config: 20 | total 16 21 | drw-r--r-- 2 root root 4096 Oct 11 12:41 . 22 | drwxr-xr-x 5 root root 4096 Oct 11 12:46 .. 23 | -rw-r--r-- 1 root root 2358 Oct 11 12:41 processingjobconfig.json 24 | -rw-r--r-- 1 root root 44 Oct 11 12:41 resourceconfig.json 25 | 26 | /opt/ml/output: 27 | total 16 28 | drwxr-xr-x 4 root root 4096 Oct 11 12:46 . 29 | drwxr-xr-x 5 root root 4096 Oct 11 12:46 .. 30 | drwxr-xr-x 4 root root 4096 Oct 11 12:46 data 31 | drwxr-xr-x 3 root root 4096 Oct 11 12:46 metrics 32 | 33 | /opt/ml/output/data: 34 | total 16 35 | drwxr-xr-x 4 root root 4096 Oct 11 12:46 . 36 | drwxr-xr-x 4 root root 4096 Oct 11 12:46 .. 37 | drwxr-xr-x 2 root root 4096 Oct 11 12:46 output 38 | drwxr-xr-x 2 root root 4096 Oct 11 12:42 state 39 | 40 | /opt/ml/output/data/output: 41 | total 8 42 | drwxr-xr-x 2 root root 4096 Oct 11 12:46 . 43 | drwxr-xr-x 4 root root 4096 Oct 11 12:46 .. 44 | 45 | /opt/ml/output/data/state: 46 | total 8 47 | drwxr-xr-x 2 root root 4096 Oct 11 12:42 . 48 | drwxr-xr-x 4 root root 4096 Oct 11 12:46 .. 49 | 50 | /opt/ml/output/metrics: 51 | total 12 52 | drwxr-xr-x 3 root root 4096 Oct 11 12:46 . 53 | drwxr-xr-x 4 root root 4096 Oct 11 12:46 .. 54 | drwxr-xr-x 2 root root 4096 Oct 11 12:46 cloudwatch 55 | 56 | /opt/ml/output/metrics/cloudwatch: 57 | total 8 58 | drwxr-xr-x 2 root root 4096 Oct 11 12:46 . 59 | drwxr-xr-x 3 root root 4096 Oct 11 12:46 .. 60 | 61 | /opt/ml/processing: 62 | total 20 63 | drwxr-xr-x 5 root root 4096 Oct 11 12:46 . 64 | drwxr-xr-x 5 root root 4096 Oct 11 12:46 .. 65 | drwxr-xr-x 3 root root 4096 Oct 11 12:46 input 66 | drwxr-xr-x 2 root root 4096 Oct 11 12:46 output 67 | drwxr-xr-x 2 root root 4096 Oct 11 12:42 state 68 | 69 | /opt/ml/processing/input: 70 | total 12 71 | drwxr-xr-x 3 root root 4096 Oct 11 12:46 . 
72 | drwxr-xr-x 5 root root 4096 Oct 11 12:46 .. 73 | drwxr-xr-x 4 root root 4096 Oct 11 12:46 code 74 | 75 | /opt/ml/processing/input/code: 76 | total 16 77 | drwxr-xr-x 4 root root 4096 Oct 11 12:46 . 78 | drwxr-xr-x 3 root root 4096 Oct 11 12:46 .. 79 | drwxr-xr-x 2 root root 4096 Oct 11 12:46 dep 80 | drwxr-xr-x 3 root root 4096 Oct 11 12:46 worker_toolkit 81 | 82 | /opt/ml/processing/input/code/dep: 83 | total 12 84 | drwxr-xr-x 2 root root 4096 Oct 11 12:46 . 85 | drwxr-xr-x 4 root root 4096 Oct 11 12:46 .. 86 | -rw-r--r-- 1 root root 23 Oct 11 12:46 ex1_dep.py 87 | 88 | /opt/ml/processing/input/code/worker_toolkit: 89 | total 24 90 | drwxr-xr-x 3 root root 4096 Oct 11 12:46 . 91 | drwxr-xr-x 4 root root 4096 Oct 11 12:46 .. 92 | -rw-r--r-- 1 root root 0 Oct 11 12:46 __init__.py 93 | drwxr-xr-x 2 root root 4096 Oct 11 12:46 __pycache__ 94 | -rw-r--r-- 1 root root 9763 Oct 11 12:46 worker_lib.py 95 | 96 | /opt/ml/processing/input/code/worker_toolkit/__pycache__: 97 | total 20 98 | drwxr-xr-x 2 root root 4096 Oct 11 12:46 . 99 | drwxr-xr-x 3 root root 4096 Oct 11 12:46 .. 100 | -rw-r--r-- 1 root root 218 Oct 11 12:46 __init__.cpython-37.pyc 101 | -rw-r--r-- 1 root root 7548 Oct 11 12:46 worker_lib.cpython-37.pyc 102 | 103 | /opt/ml/processing/output: 104 | total 8 105 | drwxr-xr-x 2 root root 4096 Oct 11 12:46 . 106 | drwxr-xr-x 5 root root 4096 Oct 11 12:46 .. 107 | 108 | /opt/ml/processing/state: 109 | total 8 110 | drwxr-xr-x 2 root root 4096 Oct 11 12:42 . 111 | drwxr-xr-x 5 root root 4096 Oct 11 12:46 .. 
112 | *** END 113 | -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_rest/example3/logs/logs0: -------------------------------------------------------------------------------- 1 | bash: cannot set terminal process group (-1): Inappropriate ioctl for device 2 | bash: no job control in this shell 3 | 2020-09-13 13:19:33,550 sagemaker-training-toolkit INFO Imported framework sagemaker_pytorch_container.training 4 | 2020-09-13 13:19:33,552 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 5 | 2020-09-13 13:19:33,561 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed. 6 | 2020-09-13 13:19:36,593 sagemaker_pytorch_container.training INFO Invoking user training script. 7 | 2020-09-13 13:19:51,068 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 8 | 2020-09-13 13:19:51,080 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 9 | 2020-09-13 13:19:51,092 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 10 | 2020-09-13 13:19:51,102 sagemaker-training-toolkit INFO Invoking user script 11 | 12 | Training Env: 13 | 14 | { 15 | "additional_framework_parameters": {}, 16 | "channel_input_dirs": {}, 17 | "current_host": "algo-1", 18 | "framework_module": "sagemaker_pytorch_container.training:main", 19 | "hosts": [ 20 | "algo-1" 21 | ], 22 | "hyperparameters": {}, 23 | "input_config_dir": "/opt/ml/input/config", 24 | "channel_data_config": {}, 25 | "input_dir": "/opt/ml/input", 26 | "is_master": true, 27 | "job_name": "task3-2020-09-13-13-16-15-tj2lK7rZ", 28 | "log_level": 20, 29 | "master_hostname": "algo-1", 30 | "model_dir": "/opt/ml/model", 31 | "module_dir": "s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-09-13-13-16-10_py37/task3/task3-2020-09-13-13-16-15-tj2lK7rZ/source/sourcedir.tar.gz", 32 | "module_name": "worker3", 33 | 
"network_interface_name": "eth0", 34 | "num_cpus": 2, 35 | "num_gpus": 0, 36 | "output_data_dir": "/opt/ml/output/data", 37 | "output_dir": "/opt/ml/output", 38 | "output_intermediate_dir": "/opt/ml/output/intermediate", 39 | "resource_config": { 40 | "current_host": "algo-1", 41 | "hosts": [ 42 | "algo-1" 43 | ], 44 | "network_interface_name": "eth0" 45 | }, 46 | "user_entry_point": "worker3.py" 47 | } 48 | 49 | Environment variables: 50 | 51 | SM_HOSTS=["algo-1"] 52 | SM_NETWORK_INTERFACE_NAME=eth0 53 | SM_HPS={} 54 | SM_USER_ENTRY_POINT=worker3.py 55 | SM_FRAMEWORK_PARAMS={} 56 | SM_RESOURCE_CONFIG={"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"} 57 | SM_channel_data_CONFIG={} 58 | SM_OUTPUT_DATA_DIR=/opt/ml/output/data 59 | SM_CHANNELS=[] 60 | SM_CURRENT_HOST=algo-1 61 | SM_MODULE_NAME=worker3 62 | SM_LOG_LEVEL=20 63 | SM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main 64 | SM_INPUT_DIR=/opt/ml/input 65 | SM_INPUT_CONFIG_DIR=/opt/ml/input/config 66 | SM_OUTPUT_DIR=/opt/ml/output 67 | SM_NUM_CPUS=2 68 | SM_NUM_GPUS=0 69 | SM_MODEL_DIR=/opt/ml/model 70 | SM_MODULE_DIR=s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-09-13-13-16-10_py37/task3/task3-2020-09-13-13-16-15-tj2lK7rZ/source/sourcedir.tar.gz 71 | 
SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{},"current_host":"algo-1","framework_module":"sagemaker_pytorch_container.training:main","hosts":["algo-1"],"hyperparameters":{},"input_config_dir":"/opt/ml/input/config","channel_data_config":{},"input_dir":"/opt/ml/input","is_master":true,"job_name":"task3-2020-09-13-13-16-15-tj2lK7rZ","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-09-13-13-16-10_py37/task3/task3-2020-09-13-13-16-15-tj2lK7rZ/source/sourcedir.tar.gz","module_name":"worker3","network_interface_name":"eth0","num_cpus":2,"num_gpus":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"},"user_entry_point":"worker3.py"} 72 | SM_USER_ARGS=[] 73 | SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate 74 | PYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python36.zip:/opt/conda/lib/python3.6:/opt/conda/lib/python3.6/lib-dynload:/opt/conda/lib/python3.6/site-packages 75 | 76 | Invoking script with the following command: 77 | 78 | /opt/conda/bin/python worker3.py 79 | 80 | 81 | 2020-09-13 13:19:51,169 sagemaker-training-toolkit INFO Reporting training SUCCESS 82 | -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_rest/example3_2/logs/logs0: -------------------------------------------------------------------------------- 1 | bash: cannot set terminal process group (-1): Inappropriate ioctl for device 2 | bash: no job control in this shell 3 | 2020-09-13 13:23:18,591 sagemaker-training-toolkit INFO Imported framework sagemaker_pytorch_container.training 4 | 2020-09-13 13:23:18,595 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 5 | 2020-09-13 
13:23:18,612 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed. 6 | 2020-09-13 13:23:20,041 sagemaker_pytorch_container.training INFO Invoking user training script. 7 | 2020-09-13 13:23:51,904 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 8 | 2020-09-13 13:23:51,916 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 9 | 2020-09-13 13:23:51,928 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 10 | 2020-09-13 13:23:51,938 sagemaker-training-toolkit INFO Invoking user script 11 | 12 | Training Env: 13 | 14 | { 15 | "additional_framework_parameters": {}, 16 | "channel_input_dirs": {}, 17 | "current_host": "algo-1", 18 | "framework_module": "sagemaker_pytorch_container.training:main", 19 | "hosts": [ 20 | "algo-1" 21 | ], 22 | "hyperparameters": {}, 23 | "input_config_dir": "/opt/ml/input/config", 24 | "channel_data_config": {}, 25 | "input_dir": "/opt/ml/input", 26 | "is_master": true, 27 | "job_name": "task3-2020-09-13-13-20-31-f6osgaSU", 28 | "log_level": 20, 29 | "master_hostname": "algo-1", 30 | "model_dir": "/opt/ml/model", 31 | "module_dir": "s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-09-13-13-16-10_py37/task3/task3-2020-09-13-13-20-31-f6osgaSU/source/sourcedir.tar.gz", 32 | "module_name": "worker3", 33 | "network_interface_name": "eth0", 34 | "num_cpus": 2, 35 | "num_gpus": 0, 36 | "output_data_dir": "/opt/ml/output/data", 37 | "output_dir": "/opt/ml/output", 38 | "output_intermediate_dir": "/opt/ml/output/intermediate", 39 | "resource_config": { 40 | "current_host": "algo-1", 41 | "hosts": [ 42 | "algo-1" 43 | ], 44 | "network_interface_name": "eth0" 45 | }, 46 | "user_entry_point": "worker3.py" 47 | } 48 | 49 | Environment variables: 50 | 51 | SM_HOSTS=["algo-1"] 52 | SM_NETWORK_INTERFACE_NAME=eth0 53 | SM_HPS={} 54 | SM_USER_ENTRY_POINT=worker3.py 55 | SM_FRAMEWORK_PARAMS={} 56 | 
SM_RESOURCE_CONFIG={"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"} 57 | SM_channel_data_CONFIG={} 58 | SM_OUTPUT_DATA_DIR=/opt/ml/output/data 59 | SM_CHANNELS=[] 60 | SM_CURRENT_HOST=algo-1 61 | SM_MODULE_NAME=worker3 62 | SM_LOG_LEVEL=20 63 | SM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main 64 | SM_INPUT_DIR=/opt/ml/input 65 | SM_INPUT_CONFIG_DIR=/opt/ml/input/config 66 | SM_OUTPUT_DIR=/opt/ml/output 67 | SM_NUM_CPUS=2 68 | SM_NUM_GPUS=0 69 | SM_MODEL_DIR=/opt/ml/model 70 | SM_MODULE_DIR=s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-09-13-13-16-10_py37/task3/task3-2020-09-13-13-20-31-f6osgaSU/source/sourcedir.tar.gz 71 | SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{},"current_host":"algo-1","framework_module":"sagemaker_pytorch_container.training:main","hosts":["algo-1"],"hyperparameters":{},"input_config_dir":"/opt/ml/input/config","channel_data_config":{},"input_dir":"/opt/ml/input","is_master":true,"job_name":"task3-2020-09-13-13-20-31-f6osgaSU","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-09-13-13-16-10_py37/task3/task3-2020-09-13-13-20-31-f6osgaSU/source/sourcedir.tar.gz","module_name":"worker3","network_interface_name":"eth0","num_cpus":2,"num_gpus":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"},"user_entry_point":"worker3.py"} 72 | SM_USER_ARGS=[] 73 | SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate 74 | PYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python36.zip:/opt/conda/lib/python3.6:/opt/conda/lib/python3.6/lib-dynload:/opt/conda/lib/python3.6/site-packages 75 | 76 | Invoking script with the following command: 77 | 78 | /opt/conda/bin/python 
worker3.py 79 | 80 | 81 | 2020-09-13 13:23:52,003 sagemaker-training-toolkit INFO Reporting training SUCCESS 82 | -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_rest/example2/logs/logs0: -------------------------------------------------------------------------------- 1 | bash: cannot set terminal process group (-1): Inappropriate ioctl for device 2 | bash: no job control in this shell 3 | 2020-09-14 21:46:30,898 sagemaker-training-toolkit INFO Imported framework sagemaker_pytorch_container.training 4 | 2020-09-14 21:46:30,901 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 5 | 2020-09-14 21:46:30,910 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed. 6 | 2020-09-14 21:46:32,341 sagemaker_pytorch_container.training INFO Invoking user training script. 7 | 2020-09-14 21:46:32,613 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 8 | 2020-09-14 21:46:32,625 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 9 | 2020-09-14 21:46:32,638 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 10 | 2020-09-14 21:46:32,648 sagemaker-training-toolkit INFO Invoking user script 11 | 12 | Training Env: 13 | 14 | { 15 | "additional_framework_parameters": {}, 16 | "channel_input_dirs": {}, 17 | "current_host": "algo-1", 18 | "framework_module": "sagemaker_pytorch_container.training:main", 19 | "hosts": [ 20 | "algo-1" 21 | ], 22 | "hyperparameters": { 23 | "msg": "Hello, world!" 
24 | }, 25 | "input_config_dir": "/opt/ml/input/config", 26 | "channel_data_config": {}, 27 | "input_dir": "/opt/ml/input", 28 | "is_master": true, 29 | "job_name": "task2-2020-09-14-21-43-32-oKDGLvk6", 30 | "log_level": 20, 31 | "master_hostname": "algo-1", 32 | "model_dir": "/opt/ml/model", 33 | "module_dir": "s3://sagemaker-us-east-1-XXXXXXXXXXXX/readme/simple-sagemaker-example-cli/task2/task2-2020-09-14-21-43-32-oKDGLvk6/source/sourcedir.tar.gz", 34 | "module_name": "worker2", 35 | "network_interface_name": "eth0", 36 | "num_cpus": 2, 37 | "num_gpus": 0, 38 | "output_data_dir": "/opt/ml/output/data", 39 | "output_dir": "/opt/ml/output", 40 | "output_intermediate_dir": "/opt/ml/output/intermediate", 41 | "resource_config": { 42 | "current_host": "algo-1", 43 | "hosts": [ 44 | "algo-1" 45 | ], 46 | "network_interface_name": "eth0" 47 | }, 48 | "user_entry_point": "worker2.py" 49 | } 50 | 51 | Environment variables: 52 | 53 | SM_HOSTS=["algo-1"] 54 | SM_NETWORK_INTERFACE_NAME=eth0 55 | SM_HPS={"msg":"Hello, world!"} 56 | SM_USER_ENTRY_POINT=worker2.py 57 | SM_FRAMEWORK_PARAMS={} 58 | SM_RESOURCE_CONFIG={"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"} 59 | SM_channel_data_CONFIG={} 60 | SM_OUTPUT_DATA_DIR=/opt/ml/output/data 61 | SM_CHANNELS=[] 62 | SM_CURRENT_HOST=algo-1 63 | SM_MODULE_NAME=worker2 64 | SM_LOG_LEVEL=20 65 | SM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main 66 | SM_INPUT_DIR=/opt/ml/input 67 | SM_INPUT_CONFIG_DIR=/opt/ml/input/config 68 | SM_OUTPUT_DIR=/opt/ml/output 69 | SM_NUM_CPUS=2 70 | SM_NUM_GPUS=0 71 | SM_MODEL_DIR=/opt/ml/model 72 | SM_MODULE_DIR=s3://sagemaker-us-east-1-XXXXXXXXXXXX/readme/simple-sagemaker-example-cli/task2/task2-2020-09-14-21-43-32-oKDGLvk6/source/sourcedir.tar.gz 73 | SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{},"current_host":"algo-1","framework_module":"sagemaker_pytorch_container.training:main","hosts":["algo-1"],"hyperparameters":{"msg":"Hello, 
world!"},"input_config_dir":"/opt/ml/input/config","channel_data_config":{},"input_dir":"/opt/ml/input","is_master":true,"job_name":"task2-2020-09-14-21-43-32-oKDGLvk6","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/readme/simple-sagemaker-example-cli/task2/task2-2020-09-14-21-43-32-oKDGLvk6/source/sourcedir.tar.gz","module_name":"worker2","network_interface_name":"eth0","num_cpus":2,"num_gpus":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"},"user_entry_point":"worker2.py"} 74 | SM_USER_ARGS=["--msg","Hello, world!"] 75 | SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate 76 | SM_HP_MSG=Hello, world! 77 | PYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python36.zip:/opt/conda/lib/python3.6:/opt/conda/lib/python3.6/lib-dynload:/opt/conda/lib/python3.6/site-packages 78 | 79 | Invoking script with the following command: 80 | 81 | /opt/conda/bin/python worker2.py --msg Hello, world! 82 | 83 | 84 | -***- Hello, world! 
85 | 2020-09-14 21:46:32,715 sagemaker-training-toolkit INFO Reporting training SUCCESS 86 | -------------------------------------------------------------------------------- /examples/multiple_tasks/example.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import shutil 4 | import sys 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | file_path = os.path.split(__file__)[0] 9 | if "TOX_ENV_NAME" not in os.environ: 10 | srcPath = os.path.abspath(os.path.join(file_path, "..", "..", "src")) 11 | sys.path.append(srcPath) 12 | from simple_sagemaker.sm_project import SageMakerProject # noqa: E402 13 | 14 | 15 | def setDefaultParams(sm_project): 16 | # docker image params 17 | aws_repo_name = "task_repo" # remote (ECR) rpository name 18 | repo_name = "task_repo" # local repository name 19 | image_tag = "latest" # tag for local & remote images 20 | docker_file_path = os.path.join( 21 | file_path, "..", "single_task", "docker" 22 | ) # path of the local Dockerfile 23 | sm_project.setDefaultImageParams( 24 | aws_repo_name, repo_name, image_tag, docker_file_path 25 | ) 26 | 27 | # job code path, entrypoint and params 28 | source_dir = os.path.join(file_path, "code") 29 | entry_point = "algo_multi.py" 30 | dependencies = [] 31 | sm_project.setDefaultCodeParams(source_dir, entry_point, dependencies) 32 | 33 | # instances type an count 34 | instance_type = "ml.m5.large" 35 | training_instance_count = 2 36 | volume_size = ( 37 | 30 # Size in GB of the EBS volume to use for storing input data during training 38 | ) 39 | use_spot_instances = True # False 40 | max_run_mins = 15 41 | sm_project.setDefaultInstanceParams( 42 | instance_type, 43 | training_instance_count, 44 | volume_size, 45 | use_spot_instances, 46 | max_run_mins, 47 | ) 48 | 49 | 50 | def runner( 51 | project_name="simple-sagemaker-example-multi", 52 | prefix="", 53 | postfix="", 54 | output_path=None, 55 | ): 56 | 
logging.basicConfig(stream=sys.stdout, level=logging.INFO) 57 | 58 | sm_project = SageMakerProject(project_name, prefix=prefix) 59 | setDefaultParams(sm_project) 60 | image_uri = sm_project.buildOrGetImage( 61 | instance_type=sm_project.defaultInstanceParams.instance_type 62 | ) 63 | 64 | # task name 65 | task_name = ( 66 | "multi-task1" + postfix 67 | ) # must satisfy regular expression pattern: ^[a-zA-Z0-9](-*[a-zA-Z0-9])* 68 | # input data params 69 | input_data_path = os.path.join( 70 | file_path, "..", "single_task", "input_data" 71 | ) # Can also provide a URI to an S3 bucket, e.g. next commented line 72 | # input_data_path = sagemaker.s3.s3_path_join("s3://", "sagemaker-us-east-1-XXXXXXXXXXXX", "task3", "input") 73 | distribution = "ShardedByS3Key" # or "FullyReplicated" which is the default 74 | model_uri = ( 75 | None # Can be used to supply model data as an additional input, local/s3 76 | ) 77 | hyperparameters = {"stage": 1} 78 | sm_project.runTask( 79 | task_name, 80 | image_uri, 81 | hyperparameters, 82 | input_data_path, 83 | model_uri=model_uri, 84 | input_distribution=distribution, 85 | clean_state=True, 86 | ) 87 | 88 | if not output_path: 89 | output_path = os.path.join(file_path, "output") 90 | # delete the output directory 91 | outputDir1 = os.path.join(output_path, "output1") 92 | shutil.rmtree(outputDir1, ignore_errors=True) 93 | sm_project.downloadResults(task_name, outputDir1) 94 | 95 | task_name2 = "multi-task2" 96 | hyperparameters = {"stage": 2} 97 | additional_inputs = dict() 98 | additional_inputs["task1_state1"] = sm_project.getInputConfig(task_name, "state") 99 | additional_inputs["task1_state2"] = sm_project.getInputConfig( 100 | task_name, "state", distribution="ShardedByS3Key" 101 | ) 102 | additional_inputs["task1_state3"] = sm_project.getInputConfig( 103 | task_name, "output", distribution="ShardedByS3Key" 104 | ) 105 | model_uri = sm_project.tasks[task_name].getOutputTargetUri(model=True) 106 | sm_project.runTask( 107 | 
task_name2, 108 | image_uri, 109 | hyperparameters, 110 | input_data_path, 111 | model_uri=model_uri, 112 | input_distribution=distribution, 113 | additional_inputs=additional_inputs, 114 | clean_state=True, 115 | ) 116 | 117 | # delete the output directory 118 | output_dir2 = os.path.join(output_path, "output2") 119 | shutil.rmtree(output_dir2, ignore_errors=True) 120 | sm_project.downloadResults(task_name2, output_dir2) 121 | 122 | return sm_project 123 | 124 | 125 | if __name__ == "__main__": 126 | runner() 127 | -------------------------------------------------------------------------------- /examples/single_file/example.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import shutil 3 | import sys 4 | from pathlib import Path 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | dockerFileContent = """ 9 | # __BASE_IMAGE__ is automatically replaced with the correct base image 10 | FROM __BASE_IMAGE__ 11 | RUN pip3 install pandas==1.1 scikit-learn==0.21.3 12 | """ 13 | file_path = Path(__file__).parent 14 | 15 | 16 | def runner(project_name="simple-sagemaker-sf", prefix="", postfix="", output_path=None): 17 | from simple_sagemaker.sm_project import SageMakerProject 18 | 19 | sm_project = SageMakerProject(project_name, prefix=prefix) 20 | # define the code parameters 21 | sm_project.setDefaultCodeParams( 22 | source_dir=None, entry_point=__file__, dependencies=[] 23 | ) 24 | # define the instance parameters 25 | sm_project.setDefaultInstanceParams(instance_count=2, max_run_mins=15) 26 | # docker image 27 | sm_project.setDefaultImageParams( 28 | aws_repo_name="task_repo", 29 | repo_name="task_repo", 30 | image_tag="latest", 31 | docker_file_path_or_content=dockerFileContent, 32 | ) 33 | image_uri = sm_project.buildOrGetImage( 34 | instance_type=sm_project.defaultInstanceParams.instance_type 35 | ) 36 | 37 | # *** Task 1 - process input data 38 | task1_name = "single-file-task1" + postfix 39 | # set the 
input data 40 | input_data_path = file_path / "data" 41 | # run the task 42 | sm_project.runTask( 43 | task1_name, 44 | image_uri, 45 | input_distribution="ShardedByS3Key", # distribute the input files among the workers 46 | hyperparameters={"worker": 1, "arg": "hello world!", "task": 1}, 47 | input_data_path=str(input_data_path) if input_data_path.is_dir() else None, 48 | clean_state=True, # clean the current state, also forces re-running 49 | ) 50 | # download the results 51 | if not output_path: 52 | output_path = file_path / "output" 53 | shutil.rmtree(output_path, ignore_errors=True) 54 | sm_project.downloadResults(task1_name, Path(output_path) / "output1") 55 | 56 | # *** Task 2 - process the results of Task 1 57 | task2_name = "single-file-task2" + postfix 58 | # set the input 59 | additional_inputs = { 60 | "task2_data": sm_project.getInputConfig(task1_name, "model"), 61 | "task2_data_dist": sm_project.getInputConfig( 62 | task1_name, "model", distribution="ShardedByS3Key" 63 | ), 64 | } 65 | # run the task 66 | sm_project.runTask( 67 | task2_name, 68 | image_uri, 69 | hyperparameters={"worker": 1, "arg": "hello world!", "task": 2}, 70 | clean_state=True, # clean the current state, also forces re-running 71 | additional_inputs=additional_inputs, 72 | ) 73 | # download the results 74 | sm_project.downloadResults(task2_name, Path(output_path) / "output2") 75 | 76 | return sm_project 77 | 78 | 79 | def worker(): 80 | from worker_toolkit import worker_lib 81 | 82 | logger.info("Starting worker...") 83 | # parse the arguments 84 | worker_config = worker_lib.WorkerConfig() 85 | 86 | logger.info(f"Hyperparams: {worker_config.hps}") 87 | logger.info( 88 | f"Input data files: {list(Path(worker_config.channel_data).rglob('*'))}" 89 | ) 90 | logger.info(f"State files: { list(Path(worker_config.state).rglob('*'))}") 91 | 92 | if int(worker_config.hps["task"]) == 1: 93 | # update the state per running instance 94 | open( 95 | 
f"{worker_config.instance_state}/state_{worker_config.current_host}", "wt" 96 | ).write("state") 97 | # write to the model output directory 98 | for file in Path(worker_config.channel_data).rglob("*"): 99 | if file.is_file(): 100 | relp = file.relative_to(worker_config.channel_data) 101 | path = Path(worker_config.model_dir) / ( 102 | str(relp) + "_proc_by_" + worker_config.current_host 103 | ) 104 | path.write_text( 105 | file.read_text() + " processed by " + worker_config.current_host 106 | ) 107 | open( 108 | f"{worker_config.model_dir}/output_{worker_config.current_host}", "wt" 109 | ).write("output") 110 | elif int(worker_config.hps["task"]) == 2: 111 | logger.info( 112 | f"Input task2_data: {list(Path(worker_config.channel_task2_data).rglob('*'))}" 113 | ) 114 | logger.info( 115 | f"Input task2_data_dist: {list(Path(worker_config.channel_task2_data_dist).rglob('*'))}" 116 | ) 117 | 118 | logger.info("finished!") 119 | # The task is marked as completed 120 | 121 | 122 | def main(): 123 | logging.basicConfig(stream=sys.stdout, level=logging.INFO) 124 | if "--worker" in sys.argv: 125 | worker() 126 | else: 127 | runner() 128 | 129 | 130 | if __name__ == "__main__": 131 | main() 132 | -------------------------------------------------------------------------------- /examples/processing_cli/expected_output/output3/logs/logs0: -------------------------------------------------------------------------------- 1 | ======= Starting Bash script ... 
2 | -***- Args: arg1 -arg2 --arg3 argument 4 3 | -- Env:, SSM_OUTPUT=/opt/ml/processing/output HOSTNAME=ip-10-0-189-78.ec2.internal SAGEMAKER_SERVING_MODULE=sagemaker_sklearn_container.serving:main SAGEMAKER_TRAINING_MODULE=sagemaker_sklearn_container.training:main AWS_CONTAINER_CREDENTIALS_RELATIVE_URI=/v2/credentials/f_w_W_i_iOPc3jxTvlBDwhlypkVF7JT0tnKUxHe_kCY PYTHONUNBUFFERED=1 SSM_STATE=/opt/ml/processing/state LC_ALL=C.UTF-8 PYTHONIOENCODING=UTF-8 PATH=/miniconda3/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin PWD=/ LANG=C.UTF-8 AWS_REGION=us-east-1 PYTHONDONTWRITEBYTECODE=1 SHLVL=1 HOME=/root SM_CHANNEL_CLI_CODE_STATE=/opt/ml/processing/input/data/cli_code_state SM_CHANNEL_CLI_CODE_OUTPUT=/opt/ml/processing/input/data/cli_code_output SM_CHANNEL_DATA=/opt/ml/processing/data _=/usr/bin/env 4 | -***- Pwd:, / 5 | *** START listing files in /opt 6 | /opt: 7 | total 12 8 | drwxr-xr-x 1 root root 4096 Oct 9 05:55 . 9 | drwxr-xr-x 1 root root 4096 Oct 9 05:55 .. 10 | drwxr-xr-x 5 root root 4096 Oct 9 05:55 ml 11 | 12 | /opt/ml: 13 | total 20 14 | drwxr-xr-x 5 root root 4096 Oct 9 05:55 . 15 | drwxr-xr-x 1 root root 4096 Oct 9 05:55 .. 16 | drw-r--r-- 2 root root 4096 Oct 9 05:53 config 17 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 output 18 | drwxr-xr-x 6 root root 4096 Oct 9 05:55 processing 19 | 20 | /opt/ml/config: 21 | total 16 22 | drw-r--r-- 2 root root 4096 Oct 9 05:53 . 23 | drwxr-xr-x 5 root root 4096 Oct 9 05:55 .. 24 | -rw-r--r-- 1 root root 3628 Oct 9 05:53 processingjobconfig.json 25 | -rw-r--r-- 1 root root 44 Oct 9 05:53 resourceconfig.json 26 | 27 | /opt/ml/output: 28 | total 16 29 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 . 30 | drwxr-xr-x 5 root root 4096 Oct 9 05:55 .. 31 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 data 32 | drwxr-xr-x 3 root root 4096 Oct 9 05:55 metrics 33 | 34 | /opt/ml/output/data: 35 | total 16 36 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 . 37 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 .. 
38 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 output 39 | drwxr-xr-x 2 root root 4096 Oct 9 05:53 state 40 | 41 | /opt/ml/output/data/output: 42 | total 8 43 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 . 44 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 .. 45 | 46 | /opt/ml/output/data/state: 47 | total 8 48 | drwxr-xr-x 2 root root 4096 Oct 9 05:53 . 49 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 .. 50 | 51 | /opt/ml/output/metrics: 52 | total 12 53 | drwxr-xr-x 3 root root 4096 Oct 9 05:55 . 54 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 .. 55 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 cloudwatch 56 | 57 | /opt/ml/output/metrics/cloudwatch: 58 | total 8 59 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 . 60 | drwxr-xr-x 3 root root 4096 Oct 9 05:55 .. 61 | 62 | /opt/ml/processing: 63 | total 24 64 | drwxr-xr-x 6 root root 4096 Oct 9 05:55 . 65 | drwxr-xr-x 5 root root 4096 Oct 9 05:55 .. 66 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 data 67 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 input 68 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 output 69 | drwxr-xr-x 2 root root 4096 Oct 9 05:53 state 70 | 71 | /opt/ml/processing/data: 72 | total 8 73 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 . 74 | drwxr-xr-x 6 root root 4096 Oct 9 05:55 .. 75 | -rw-r--r-- 1 root root 0 Oct 9 05:55 sample_data.txt 76 | 77 | /opt/ml/processing/input: 78 | total 16 79 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 . 80 | drwxr-xr-x 6 root root 4096 Oct 9 05:55 .. 81 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 code 82 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 data 83 | 84 | /opt/ml/processing/input/code: 85 | total 20 86 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 . 87 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 .. 88 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 dep 89 | -rw-r--r-- 1 root root 325 Oct 9 05:55 ex3.sh 90 | drwxr-xr-x 3 root root 4096 Oct 9 05:55 worker_toolkit 91 | 92 | /opt/ml/processing/input/code/dep: 93 | total 12 94 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 . 95 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 .. 
96 | -rw-r--r-- 1 root root 23 Oct 9 05:55 ex1_dep.py 97 | 98 | /opt/ml/processing/input/code/worker_toolkit: 99 | total 24 100 | drwxr-xr-x 3 root root 4096 Oct 9 05:55 . 101 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 .. 102 | -rw-r--r-- 1 root root 0 Oct 9 05:55 __init__.py 103 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 __pycache__ 104 | -rw-r--r-- 1 root root 9637 Oct 9 05:55 worker_lib.py 105 | 106 | /opt/ml/processing/input/code/worker_toolkit/__pycache__: 107 | total 20 108 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 . 109 | drwxr-xr-x 3 root root 4096 Oct 9 05:55 .. 110 | -rw-r--r-- 1 root root 218 Oct 9 05:55 __init__.cpython-37.pyc 111 | -rw-r--r-- 1 root root 7460 Oct 9 05:55 worker_lib.cpython-37.pyc 112 | 113 | /opt/ml/processing/input/data: 114 | total 16 115 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 . 116 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 .. 117 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 cli_code_output 118 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 cli_code_state 119 | 120 | /opt/ml/processing/input/data/cli_code_output: 121 | total 12 122 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 . 123 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 .. 124 | -rw-r--r-- 1 root root 6 Oct 9 05:55 output 125 | 126 | /opt/ml/processing/input/data/cli_code_state: 127 | total 12 128 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 . 129 | drwxr-xr-x 4 root root 4096 Oct 9 05:55 .. 130 | -rw-r--r-- 1 root root 5 Oct 9 05:55 state 131 | 132 | /opt/ml/processing/output: 133 | total 8 134 | drwxr-xr-x 2 root root 4096 Oct 9 05:55 . 135 | drwxr-xr-x 6 root root 4096 Oct 9 05:55 .. 136 | 137 | /opt/ml/processing/state: 138 | total 8 139 | drwxr-xr-x 2 root root 4096 Oct 9 05:53 . 140 | drwxr-xr-x 6 root root 4096 Oct 9 05:55 .. 
141 | *** END file listing /opt -------------------------------------------------------------------------------- /examples/processing_cli/expected_output/output1/logs/logs0: -------------------------------------------------------------------------------- 1 | Dependency!!! 2 | INFO:__main__:======= Starting python script ... 3 | INFO:worker_toolkit.worker_lib:Deleting other instances' state 4 | INFO:worker_toolkit.worker_lib:Creating state dir 5 | INFO:worker_toolkit.worker_lib:Worker config: Namespace(channel_data='', channel_model='', channels=[], current_host='algo-1', host_rank=0, hosts=['algo-1'], hps=[], input_config_dir='', input_data_config='', input_dir='', instance_state='/opt/ml/processing/state/algo-1', job_name='cli-code-2020-10-06-23-25-36-bE5AcbXg', model_dir='', network_interface_name='', num_cpus=2, num_gpus=-1, num_nodes=1, output_data_dir='', output_dir='', resource_config='', state='/opt/ml/processing/state') 6 | Environ: environ({'PATH': '/miniconda3/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin', 'HOSTNAME': 'ip-10-0-228-147.ec2.internal', 'SSM_STATE': '/opt/ml/processing/state', 'SSM_OUTPUT': '/opt/ml/processing/output', 'AWS_REGION': 'us-east-1', 'AWS_CONTAINER_CREDENTIALS_RELATIVE_URI': '/v2/credentials/2ucF5alLTY-9jFs9Hp_ZIv8OpNqr93rvz-omNZVcG6E', 'PYTHONDONTWRITEBYTECODE': '1', 'PYTHONUNBUFFERED': '1', 'PYTHONIOENCODING': 'UTF-8', 'LANG': 'C.UTF-8', 'LC_ALL': 'C.UTF-8', 'SAGEMAKER_TRAINING_MODULE': 'sagemaker_sklearn_container.training:main', 'SAGEMAKER_SERVING_MODULE': 'sagemaker_sklearn_container.serving:main', 'HOME': '/root', 'SAGEMAKER_JOB_NAME': 'cli-code-2020-10-06-23-25-36-bE5AcbXg', 'SM_HOSTS': '["algo-1"]', 'SM_CURRENT_HOST': 'algo-1', 'SSM_NUM_NODES': '1', 'SSM_HOST_RANK': '0', 'SSM_INSTANCE_STATE': '/opt/ml/processing/state/algo-1', 'SMDEBUG_LOG_LEVEL': 'warning'}) 7 | Args: ['/opt/ml/processing/input/code/ex1.py', 'arg1', '-arg2', '--arg3', 'argument 4'] 8 | INFO:__main__:*** START listing files in /opt/ 9 | 
INFO:__main__:/opt/: 10 | total 12 11 | drwxr-xr-x 1 root root 4096 Oct 6 23:29 . 12 | drwxr-xr-x 1 root root 4096 Oct 6 23:29 .. 13 | drwxr-xr-x 5 root root 4096 Oct 6 23:29 ml 14 | 15 | /opt/ml: 16 | total 20 17 | drwxr-xr-x 5 root root 4096 Oct 6 23:29 . 18 | drwxr-xr-x 1 root root 4096 Oct 6 23:29 .. 19 | drw-r--r-- 2 root root 4096 Oct 6 23:27 config 20 | drwxr-xr-x 4 root root 4096 Oct 6 23:29 output 21 | drwxr-xr-x 5 root root 4096 Oct 6 23:29 processing 22 | 23 | /opt/ml/config: 24 | total 16 25 | drw-r--r-- 2 root root 4096 Oct 6 23:27 . 26 | drwxr-xr-x 5 root root 4096 Oct 6 23:29 .. 27 | -rw-r--r-- 1 root root 2271 Oct 6 23:27 processingjobconfig.json 28 | -rw-r--r-- 1 root root 44 Oct 6 23:27 resourceconfig.json 29 | 30 | /opt/ml/output: 31 | total 16 32 | drwxr-xr-x 4 root root 4096 Oct 6 23:29 . 33 | drwxr-xr-x 5 root root 4096 Oct 6 23:29 .. 34 | drwxr-xr-x 4 root root 4096 Oct 6 23:29 data 35 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 metrics 36 | 37 | /opt/ml/output/data: 38 | total 16 39 | drwxr-xr-x 4 root root 4096 Oct 6 23:29 . 40 | drwxr-xr-x 4 root root 4096 Oct 6 23:29 .. 41 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 output 42 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 state 43 | 44 | /opt/ml/output/data/output: 45 | total 8 46 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 . 47 | drwxr-xr-x 4 root root 4096 Oct 6 23:29 .. 48 | 49 | /opt/ml/output/data/state: 50 | total 12 51 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 . 52 | drwxr-xr-x 4 root root 4096 Oct 6 23:29 .. 53 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 algo-1 54 | 55 | /opt/ml/output/data/state/algo-1: 56 | total 8 57 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 . 58 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 .. 59 | 60 | /opt/ml/output/metrics: 61 | total 12 62 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 . 63 | drwxr-xr-x 4 root root 4096 Oct 6 23:29 .. 
64 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 cloudwatch 65 | 66 | /opt/ml/output/metrics/cloudwatch: 67 | total 8 68 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 . 69 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 .. 70 | 71 | /opt/ml/processing: 72 | total 20 73 | drwxr-xr-x 5 root root 4096 Oct 6 23:29 . 74 | drwxr-xr-x 5 root root 4096 Oct 6 23:29 .. 75 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 input 76 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 output 77 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 state 78 | 79 | /opt/ml/processing/input: 80 | total 12 81 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 . 82 | drwxr-xr-x 5 root root 4096 Oct 6 23:29 .. 83 | drwxr-xr-x 4 root root 4096 Oct 6 23:29 code 84 | 85 | /opt/ml/processing/input/code: 86 | total 20 87 | drwxr-xr-x 4 root root 4096 Oct 6 23:29 . 88 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 .. 89 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 dep 90 | -rw-r--r-- 1 root root 1037 Oct 6 23:29 ex1.py 91 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 worker_toolkit 92 | 93 | /opt/ml/processing/input/code/dep: 94 | total 12 95 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 . 96 | drwxr-xr-x 4 root root 4096 Oct 6 23:29 .. 97 | -rw-r--r-- 1 root root 23 Oct 6 23:29 ex1_dep.py 98 | 99 | /opt/ml/processing/input/code/worker_toolkit: 100 | total 24 101 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 . 102 | drwxr-xr-x 4 root root 4096 Oct 6 23:29 .. 103 | -rw-r--r-- 1 root root 0 Oct 6 23:29 __init__.py 104 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 __pycache__ 105 | -rw-r--r-- 1 root root 10325 Oct 6 23:29 worker_lib.py 106 | 107 | /opt/ml/processing/input/code/worker_toolkit/__pycache__: 108 | total 20 109 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 . 110 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 .. 111 | -rw-r--r-- 1 root root 201 Oct 6 23:29 __init__.cpython-37.pyc 112 | -rw-r--r-- 1 root root 8111 Oct 6 23:29 worker_lib.cpython-37.pyc 113 | 114 | /opt/ml/processing/output: 115 | total 8 116 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 . 
117 | drwxr-xr-x 5 root root 4096 Oct 6 23:29 .. 118 | 119 | /opt/ml/processing/state: 120 | total 12 121 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 . 122 | drwxr-xr-x 5 root root 4096 Oct 6 23:29 .. 123 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 algo-1 124 | 125 | /opt/ml/processing/state/algo-1: 126 | total 8 127 | drwxr-xr-x 2 root root 4096 Oct 6 23:29 . 128 | drwxr-xr-x 3 root root 4096 Oct 6 23:29 .. 129 | 130 | INFO:__main__:*** END file listing /opt/ 131 | INFO:__main__:finished! 132 | -------------------------------------------------------------------------------- /src/simple_sagemaker/ecr_sync.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import logging 3 | import os 4 | from io import BytesIO 5 | 6 | import docker 7 | from sagemaker import image_uris 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class ECRSync: 13 | def __init__(self, boto3_session): 14 | self.boto3_session = boto3_session 15 | self.ecrClient = self.boto3_session.client("ecr") 16 | 17 | def getRpoUri(self, aws_repo_name): 18 | repo_uri = None 19 | for repo in self.ecrClient.describe_repositories()["repositories"]: 20 | if repo["repositoryName"] == aws_repo_name: 21 | repo_uri = repo["repositoryUri"] 22 | return repo_uri 23 | 24 | def getOrCreateRepo(self, aws_repo_name): 25 | repo_uri = self.getRpoUri(aws_repo_name) 26 | if repo_uri is None: 27 | logging.info(f"Creating ECR repository: {aws_repo_name}") 28 | repo = self.ecrClient.create_repository(repositoryName=aws_repo_name) 29 | repo_uri = repo["repository"]["repositoryUri"] 30 | return repo_uri 31 | 32 | def getPrebuiltImage( 33 | self, 34 | instance_type, 35 | framework, 36 | framework_version, 37 | py_version, 38 | image_scope="training", 39 | ): 40 | assert framework, "Framework has to be specified" 41 | defaults = { 42 | "pytorch": ("1.6.0", "py3"), 43 | "tensorflow": ("2.3.0", "py37"), 44 | "sklearn": ("0.20.0", None), 45 | } 46 | 47 | if framework in 
defaults: 48 | if framework_version is None or py_version is None: 49 | framework_version, py_version = defaults[framework] 50 | 51 | logger.debug( 52 | f"Getting the image for {framework}, framework_version {framework_version}, python version {py_version}" 53 | ) 54 | 55 | region_name = self.boto3_session.region_name 56 | 57 | # Get the base image name, validate Dockerfile is based on it (TODO: replace in file) 58 | baseimage_uri = image_uris.retrieve( 59 | framework, 60 | region=region_name, 61 | version=framework_version, 62 | py_version=py_version, 63 | image_scope=image_scope, 64 | instance_type=instance_type, 65 | ) 66 | return baseimage_uri 67 | 68 | def buildAndPushDockerImage( 69 | self, 70 | docker_file_path_or_content, 71 | aws_repo_name, 72 | repo_name, 73 | image_tag, 74 | instance_type, 75 | framework, 76 | framework_version, 77 | py_version, 78 | ): 79 | baseimage_uri = self.getPrebuiltImage( 80 | instance_type, framework, framework_version, py_version 81 | ) 82 | 83 | if not docker_file_path_or_content: 84 | logger.debug(f"Using a pre-built image {baseimage_uri}...") 85 | return baseimage_uri 86 | 87 | repo_uri = self.getOrCreateRepo(aws_repo_name) 88 | 89 | build_args = dict() 90 | build_args["tag"] = repo_name + ":" + image_tag 91 | 92 | if os.path.isdir(docker_file_path_or_content): 93 | docker_file_path_or_content = open( 94 | os.path.join(docker_file_path_or_content, "Dockerfile"), "rt" 95 | ).read() 96 | elif os.path.isfile(docker_file_path_or_content): 97 | docker_file_path_or_content = open(docker_file_path_or_content, "rt").read() 98 | 99 | # If it's not there -> add it :) 100 | if "__BASE_IMAGE__" not in docker_file_path_or_content: 101 | logger.warning( 102 | "__BASE_IMAGE__ couln't be found in docker_file_path_or_content, it was added on the beginning!" 
103 | ) 104 | docker_file_path_or_content = ( 105 | f"FROM {baseimage_uri}\n" + docker_file_path_or_content 106 | ) 107 | else: 108 | docker_file_path_or_content = docker_file_path_or_content.replace( 109 | "__BASE_IMAGE__", baseimage_uri 110 | ) 111 | 112 | logging.info( 113 | f"Building {docker_file_path_or_content} to {repo_name}:{image_tag} and pushing to {aws_repo_name}..." 114 | ) 115 | 116 | fileObj = BytesIO(docker_file_path_or_content.encode("utf-8")) 117 | build_args["fileobj"] = fileObj 118 | 119 | # Create auth config 120 | resp = self.ecrClient.get_authorization_token() 121 | token = resp["authorizationData"][0]["authorizationToken"] 122 | token = base64.b64decode(token).decode() 123 | username, password = token.split(":") 124 | auth_config = {"username": username, "password": password} 125 | 126 | client = docker.from_env() 127 | # pull the base image 128 | client.images.pull(baseimage_uri, auth_config=auth_config) 129 | # build and tag the image 130 | image = client.images.build(**build_args) 131 | 132 | images = self.ecrClient.describe_images(repositoryName=aws_repo_name) 133 | images_digests = [x["imageDigest"] for x in images["imageDetails"]] 134 | build_repo_digests = image[0].attrs["RepoDigests"] 135 | if build_repo_digests: 136 | builtImageDigest = build_repo_digests[0].split("@")[1] 137 | if not build_repo_digests or (builtImageDigest not in images_digests): 138 | logging.info("Tagging and pushing the image...") 139 | res = image[0].tag(repo_uri, image_tag) 140 | assert res 141 | 142 | # push the image to ECR 143 | for line in client.images.push( 144 | repo_uri, image_tag, auth_config=auth_config, stream=True, decode=True 145 | ): 146 | logging.info(line) 147 | image_uri = f"{repo_uri}:{image_tag}" 148 | else: 149 | logging.info("Image already exists!") 150 | image_idx = images_digests.index(builtImageDigest) 151 | image_details = images["imageDetails"][image_idx] 152 | # see 
https://docs.aws.amazon.com/AmazonECR/latest/userguide/docker-pull-ecr-image.html 153 | image_uri = f'{repo_uri}@{image_details["imageDigest"]}' 154 | logging.info(f"Image uri: {image_uri}") 155 | return image_uri 156 | -------------------------------------------------------------------------------- /examples/readme_examples/expected_output_rest/example5/logs/logs0: -------------------------------------------------------------------------------- 1 | bash: cannot set terminal process group (-1): Inappropriate ioctl for device 2 | bash: no job control in this shell 3 | 2020-09-13 13:23:30,697 sagemaker-training-toolkit INFO Imported framework sagemaker_pytorch_container.training 4 | 2020-09-13 13:23:30,702 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 5 | 2020-09-13 13:23:30,711 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed. 6 | 2020-09-13 13:23:33,730 sagemaker_pytorch_container.training INFO Invoking user training script. 
7 | 2020-09-13 13:23:33,996 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 8 | 2020-09-13 13:23:34,008 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 9 | 2020-09-13 13:23:34,020 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 10 | 2020-09-13 13:23:34,030 sagemaker-training-toolkit INFO Invoking user script 11 | 12 | Training Env: 13 | 14 | { 15 | "additional_framework_parameters": {}, 16 | "channel_input_dirs": { 17 | "bucket": "/opt/ml/input/data/bucket" 18 | }, 19 | "current_host": "algo-1", 20 | "framework_module": "sagemaker_pytorch_container.training:main", 21 | "hosts": [ 22 | "algo-1" 23 | ], 24 | "hyperparameters": {}, 25 | "input_config_dir": "/opt/ml/input/config", 26 | "channel_data_config": { 27 | "bucket": { 28 | "TrainingInputMode": "File", 29 | "S3DistributionType": "FullyReplicated", 30 | "RecordWrapperType": "None" 31 | } 32 | }, 33 | "input_dir": "/opt/ml/input", 34 | "is_master": true, 35 | "job_name": "task5-2020-09-13-13-20-31-Cz53I5to", 36 | "log_level": 20, 37 | "master_hostname": "algo-1", 38 | "model_dir": "/opt/ml/model", 39 | "module_dir": "s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-09-13-13-16-10_py37/task5/task5-2020-09-13-13-20-31-Cz53I5to/source/sourcedir.tar.gz", 40 | "module_name": "worker4", 41 | "network_interface_name": "eth0", 42 | "num_cpus": 2, 43 | "num_gpus": 0, 44 | "output_data_dir": "/opt/ml/output/data", 45 | "output_dir": "/opt/ml/output", 46 | "output_intermediate_dir": "/opt/ml/output/intermediate", 47 | "resource_config": { 48 | "current_host": "algo-1", 49 | "hosts": [ 50 | "algo-1" 51 | ], 52 | "network_interface_name": "eth0" 53 | }, 54 | "user_entry_point": "worker4.py" 55 | } 56 | 57 | Environment variables: 58 | 59 | SM_HOSTS=["algo-1"] 60 | SM_NETWORK_INTERFACE_NAME=eth0 61 | SM_HPS={} 62 | SM_USER_ENTRY_POINT=worker4.py 63 | SM_FRAMEWORK_PARAMS={} 64 | 
SM_RESOURCE_CONFIG={"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"} 65 | SM_channel_data_CONFIG={"bucket":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}} 66 | SM_OUTPUT_DATA_DIR=/opt/ml/output/data 67 | SM_CHANNELS=["bucket"] 68 | SM_CURRENT_HOST=algo-1 69 | SM_MODULE_NAME=worker4 70 | SM_LOG_LEVEL=20 71 | SM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main 72 | SM_INPUT_DIR=/opt/ml/input 73 | SM_INPUT_CONFIG_DIR=/opt/ml/input/config 74 | SM_OUTPUT_DIR=/opt/ml/output 75 | SM_NUM_CPUS=2 76 | SM_NUM_GPUS=0 77 | SM_MODEL_DIR=/opt/ml/model 78 | SM_MODULE_DIR=s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-09-13-13-16-10_py37/task5/task5-2020-09-13-13-20-31-Cz53I5to/source/sourcedir.tar.gz 79 | SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{"bucket":"/opt/ml/input/data/bucket"},"current_host":"algo-1","framework_module":"sagemaker_pytorch_container.training:main","hosts":["algo-1"],"hyperparameters":{},"input_config_dir":"/opt/ml/input/config","channel_data_config":{"bucket":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}},"input_dir":"/opt/ml/input","is_master":true,"job_name":"task5-2020-09-13-13-20-31-Cz53I5to","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli_2020-09-13-13-16-10_py37/task5/task5-2020-09-13-13-20-31-Cz53I5to/source/sourcedir.tar.gz","module_name":"worker4","network_interface_name":"eth0","num_cpus":2,"num_gpus":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"},"user_entry_point":"worker4.py"} 80 | SM_USER_ARGS=[] 81 | SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate 82 | 
SM_CHANNEL_BUCKET=/opt/ml/input/data/bucket 83 | PYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python36.zip:/opt/conda/lib/python3.6:/opt/conda/lib/python3.6/lib-dynload:/opt/conda/lib/python3.6/site-packages 84 | 85 | Invoking script with the following command: 86 | 87 | /opt/conda/bin/python worker4.py 88 | 89 | 90 | INFO:worker_toolkit.worker_lib:Args: Namespace(batch_size=64, channel_names=['bucket'], current_host='algo-1', epochs=50, hosts=['algo-1'], hps={}, channel_bucket='/opt/ml/input/data/bucket', input_config_dir='/opt/ml/input/config', channel_data='', channel_data_config='{"bucket":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}}', input_dir='/opt/ml/input', channel_model='', job_name='task5-2020-09-13-13-20-31-Cz53I5to', learning_rate=0.05, model_dir='/opt/ml/model', network_interface='eth0', num_cpus=2, num_gpus=0, output_data_dir='/opt/ml/output/data', output_dir='/opt/ml/output', resource_config='{"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"}', state='/state', use_cuda=False) 91 | INFO:worker_toolkit.worker_lib:Unmatched: [] 92 | INFO:__main__:*** START listing files in 93 | INFO:__main__: 94 | INFO:__main__:*** END file listing 95 | INFO:__main__:*** START listing files in /opt/ml/input/data/bucket 96 | INFO:__main__:/opt/ml/input/data/bucket: 97 | total 12 98 | drwxr-xr-x 2 root root 4096 Sep 13 13:23 . 99 | drwxr-xr-x 3 root root 4096 Sep 13 13:23 .. 
100 | -rw-r--r-- 1 root root 127 Sep 13 13:23 model.tar.gz 101 | 102 | INFO:__main__:*** END file listing /opt/ml/input/data/bucket 103 | 2020-09-13 13:23:34,107 sagemaker-training-toolkit INFO Reporting training SUCCESS 104 | -------------------------------------------------------------------------------- /examples/cli_multi/expected_output/output1/logs/logs0: -------------------------------------------------------------------------------- 1 | bash: cannot set terminal process group (-1): Inappropriate ioctl for device 2 | bash: no job control in this shell 3 | 2020-09-13 13:20:23,180 sagemaker-training-toolkit INFO Imported framework sagemaker_pytorch_container.training 4 | 2020-09-13 13:20:23,182 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 5 | 2020-09-13 13:20:23,192 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed. 6 | 2020-09-13 13:20:26,236 sagemaker_pytorch_container.training INFO Invoking user training script. 7 | 2020-09-13 13:20:26,500 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 8 | 2020-09-13 13:20:26,512 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 9 | 2020-09-13 13:20:26,524 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 10 | 2020-09-13 13:20:26,533 sagemaker-training-toolkit INFO Invoking user script 11 | 12 | Training Env: 13 | 14 | { 15 | "additional_framework_parameters": {}, 16 | "channel_input_dirs": { 17 | "data": "/opt/ml/input/data/data" 18 | }, 19 | "current_host": "algo-1", 20 | "framework_module": "sagemaker_pytorch_container.training:main", 21 | "hosts": [ 22 | "algo-1" 23 | ], 24 | "hyperparameters": { 25 | "task_type": "1" 26 | }, 27 | "input_config_dir": "/opt/ml/input/config", 28 | "channel_data_config": { 29 | "data": { 30 | "TrainingInputMode": "File", 31 | "S3DistributionType": "ShardedByS3Key", 32 | "RecordWrapperType": "None" 33 | } 34 | }, 35 | "input_dir": 
"/opt/ml/input", 36 | "is_master": true, 37 | "job_name": "task1-2020-09-13-13-16-15-3uM65148", 38 | "log_level": 20, 39 | "master_hostname": "algo-1", 40 | "model_dir": "/opt/ml/model", 41 | "module_dir": "s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli-multi_2020-09-13-13-16-10_py37/task1/task1-2020-09-13-13-16-15-3uM65148/source/sourcedir.tar.gz", 42 | "module_name": "worker", 43 | "network_interface_name": "eth0", 44 | "num_cpus": 2, 45 | "num_gpus": 0, 46 | "output_data_dir": "/opt/ml/output/data", 47 | "output_dir": "/opt/ml/output", 48 | "output_intermediate_dir": "/opt/ml/output/intermediate", 49 | "resource_config": { 50 | "current_host": "algo-1", 51 | "hosts": [ 52 | "algo-1" 53 | ], 54 | "network_interface_name": "eth0" 55 | }, 56 | "user_entry_point": "worker.py" 57 | } 58 | 59 | Environment variables: 60 | 61 | SM_HOSTS=["algo-1"] 62 | SM_NETWORK_INTERFACE_NAME=eth0 63 | SM_HPS={"task_type":"1"} 64 | SM_USER_ENTRY_POINT=worker.py 65 | SM_FRAMEWORK_PARAMS={} 66 | SM_RESOURCE_CONFIG={"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"} 67 | SM_channel_data_CONFIG={"data":{"RecordWrapperType":"None","S3DistributionType":"ShardedByS3Key","TrainingInputMode":"File"}} 68 | SM_OUTPUT_DATA_DIR=/opt/ml/output/data 69 | SM_CHANNELS=["data"] 70 | SM_CURRENT_HOST=algo-1 71 | SM_MODULE_NAME=worker 72 | SM_LOG_LEVEL=20 73 | SM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main 74 | SM_INPUT_DIR=/opt/ml/input 75 | SM_INPUT_CONFIG_DIR=/opt/ml/input/config 76 | SM_OUTPUT_DIR=/opt/ml/output 77 | SM_NUM_CPUS=2 78 | SM_NUM_GPUS=0 79 | SM_MODEL_DIR=/opt/ml/model 80 | SM_MODULE_DIR=s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli-multi_2020-09-13-13-16-10_py37/task1/task1-2020-09-13-13-16-15-3uM65148/source/sourcedir.tar.gz 81 | 
SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{"data":"/opt/ml/input/data/data"},"current_host":"algo-1","framework_module":"sagemaker_pytorch_container.training:main","hosts":["algo-1"],"hyperparameters":{"task_type":"1"},"input_config_dir":"/opt/ml/input/config","channel_data_config":{"data":{"RecordWrapperType":"None","S3DistributionType":"ShardedByS3Key","TrainingInputMode":"File"}},"input_dir":"/opt/ml/input","is_master":true,"job_name":"task1-2020-09-13-13-16-15-3uM65148","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-example-cli-multi_2020-09-13-13-16-10_py37/task1/task1-2020-09-13-13-16-15-3uM65148/source/sourcedir.tar.gz","module_name":"worker","network_interface_name":"eth0","num_cpus":2,"num_gpus":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"},"user_entry_point":"worker.py"} 82 | SM_USER_ARGS=["--task_type","1"] 83 | SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate 84 | SM_CHANNEL_DATA=/opt/ml/input/data/data 85 | SM_HP_TASK_TYPE=1 86 | PYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python36.zip:/opt/conda/lib/python3.6:/opt/conda/lib/python3.6/lib-dynload:/opt/conda/lib/python3.6/site-packages 87 | 88 | Invoking script with the following command: 89 | 90 | /opt/conda/bin/python worker.py --task_type 1 91 | 92 | 93 | INFO:__main__:Starting worker... 
94 | INFO:worker_toolkit.worker_lib:Args: Namespace(batch_size=64, channel_names=['data'], current_host='algo-1', epochs=50, hosts=['algo-1'], hps={'task_type': '1'}, input_config_dir='/opt/ml/input/config', channel_data='/opt/ml/input/data/data', channel_data_config='{"data":{"RecordWrapperType":"None","S3DistributionType":"ShardedByS3Key","TrainingInputMode":"File"}}', input_dir='/opt/ml/input', channel_model='', job_name='task1-2020-09-13-13-16-15-3uM65148', learning_rate=0.05, model_dir='/opt/ml/model', network_interface='eth0', num_cpus=2, num_gpus=0, output_data_dir='/opt/ml/output/data', output_dir='/opt/ml/output', resource_config='{"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"}', state='/state', use_cuda=False) 95 | INFO:worker_toolkit.worker_lib:Unmatched: ['--task_type', '1'] 96 | INFO:worker_toolkit.worker_lib:Deleting other instances' state 97 | INFO:worker_toolkit.worker_lib:Creating instance specific state dir 98 | INFO:__main__:Hyperparams: {'task_type': '1'} 99 | INFO:__main__:Input data files: [PosixPath('/opt/ml/input/data/data/sample_data1.txt'), PosixPath('/opt/ml/input/data/data/sample_data2.txt')] 100 | INFO:__main__:State files: [PosixPath('/state/algo-1')] 101 | INFO:__main__:finished! 
102 | 2020-09-13 13:20:26,601 sagemaker-training-toolkit INFO Reporting training SUCCESS 103 | -------------------------------------------------------------------------------- /examples/single_file/expected_output/output1/logs/logs0: -------------------------------------------------------------------------------- 1 | bash: cannot set terminal process group (-1): Inappropriate ioctl for device 2 | bash: no job control in this shell 3 | 2020-10-04 09:21:03,620 sagemaker-training-toolkit INFO Imported framework sagemaker_pytorch_container.training 4 | 2020-10-04 09:21:03,623 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 5 | 2020-10-04 09:21:03,632 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed. 6 | 2020-10-04 09:21:16,308 sagemaker_pytorch_container.training INFO Invoking user training script. 7 | 2020-10-04 09:21:16,553 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 8 | 2020-10-04 09:21:16,564 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 9 | 2020-10-04 09:21:16,575 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 10 | 2020-10-04 09:21:16,585 sagemaker-training-toolkit INFO Invoking user script 11 | 12 | Training Env: 13 | 14 | { 15 | "additional_framework_parameters": {}, 16 | "channel_input_dirs": { 17 | "data": "/opt/ml/input/data/data" 18 | }, 19 | "current_host": "algo-1", 20 | "framework_module": "sagemaker_pytorch_container.training:main", 21 | "hosts": [ 22 | "algo-1", 23 | "algo-2" 24 | ], 25 | "hyperparameters": { 26 | "task": 1, 27 | "arg": "hello world!", 28 | "worker": 1 29 | }, 30 | "input_config_dir": "/opt/ml/input/config", 31 | "input_data_config": { 32 | "data": { 33 | "TrainingInputMode": "File", 34 | "S3DistributionType": "ShardedByS3Key", 35 | "RecordWrapperType": "None" 36 | } 37 | }, 38 | "input_dir": "/opt/ml/input", 39 | "is_master": true, 40 | "job_name": 
"single-file-task1-2020-10-04-09-17-17-PMGHWPsv", 41 | "log_level": 20, 42 | "master_hostname": "algo-1", 43 | "model_dir": "/opt/ml/model", 44 | "module_dir": "s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-sf_2020-10-04-09-16-49_py37/single-file-task1/single-file-task1-2020-10-04-09-17-17-PMGHWPsv/source/sourcedir.tar.gz", 45 | "module_name": "example", 46 | "network_interface_name": "eth0", 47 | "num_cpus": 2, 48 | "num_gpus": 0, 49 | "output_data_dir": "/opt/ml/output/data", 50 | "output_dir": "/opt/ml/output", 51 | "output_intermediate_dir": "/opt/ml/output/intermediate", 52 | "resource_config": { 53 | "current_host": "algo-1", 54 | "hosts": [ 55 | "algo-1", 56 | "algo-2" 57 | ], 58 | "network_interface_name": "eth0" 59 | }, 60 | "user_entry_point": "example.py" 61 | } 62 | 63 | Environment variables: 64 | 65 | SM_HOSTS=["algo-1","algo-2"] 66 | SM_NETWORK_INTERFACE_NAME=eth0 67 | SM_HPS={"arg":"hello world!","task":1,"worker":1} 68 | SM_USER_ENTRY_POINT=example.py 69 | SM_FRAMEWORK_PARAMS={} 70 | SM_RESOURCE_CONFIG={"current_host":"algo-1","hosts":["algo-1","algo-2"],"network_interface_name":"eth0"} 71 | SM_INPUT_DATA_CONFIG={"data":{"RecordWrapperType":"None","S3DistributionType":"ShardedByS3Key","TrainingInputMode":"File"}} 72 | SM_OUTPUT_DATA_DIR=/opt/ml/output/data 73 | SM_CHANNELS=["data"] 74 | SM_CURRENT_HOST=algo-1 75 | SM_MODULE_NAME=example 76 | SM_LOG_LEVEL=20 77 | SM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main 78 | SM_INPUT_DIR=/opt/ml/input 79 | SM_INPUT_CONFIG_DIR=/opt/ml/input/config 80 | SM_OUTPUT_DIR=/opt/ml/output 81 | SM_NUM_CPUS=2 82 | SM_NUM_GPUS=0 83 | SM_MODEL_DIR=/opt/ml/model 84 | SM_MODULE_DIR=s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-sf_2020-10-04-09-16-49_py37/single-file-task1/single-file-task1-2020-10-04-09-17-17-PMGHWPsv/source/sourcedir.tar.gz 85 | 
SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{"data":"/opt/ml/input/data/data"},"current_host":"algo-1","framework_module":"sagemaker_pytorch_container.training:main","hosts":["algo-1","algo-2"],"hyperparameters":{"arg":"hello world!","task":1,"worker":1},"input_config_dir":"/opt/ml/input/config","input_data_config":{"data":{"RecordWrapperType":"None","S3DistributionType":"ShardedByS3Key","TrainingInputMode":"File"}},"input_dir":"/opt/ml/input","is_master":true,"job_name":"single-file-task1-2020-10-04-09-17-17-PMGHWPsv","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-sf_2020-10-04-09-16-49_py37/single-file-task1/single-file-task1-2020-10-04-09-17-17-PMGHWPsv/source/sourcedir.tar.gz","module_name":"example","network_interface_name":"eth0","num_cpus":2,"num_gpus":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-1","hosts":["algo-1","algo-2"],"network_interface_name":"eth0"},"user_entry_point":"example.py"} 86 | SM_USER_ARGS=["--arg","hello world!","--task","1","--worker","1"] 87 | SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate 88 | SM_CHANNEL_DATA=/opt/ml/input/data/data 89 | SM_HP_TASK=1 90 | SM_HP_ARG=hello world! 91 | SM_HP_WORKER=1 92 | PYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python36.zip:/opt/conda/lib/python3.6:/opt/conda/lib/python3.6/lib-dynload:/opt/conda/lib/python3.6/site-packages 93 | 94 | Invoking script with the following command: 95 | 96 | /opt/conda/bin/python example.py --arg hello world! --task 1 --worker 1 97 | 98 | 99 | INFO:__main__:Starting worker... 
100 | INFO:worker_toolkit.worker_lib:Deleting other instances' state 101 | INFO:worker_toolkit.worker_lib:Creating state dir 102 | INFO:worker_toolkit.worker_lib:Worker config: Namespace(channel_data='/opt/ml/input/data/data', channel_model='', channels=['data'], current_host='algo-1', host_rank=0, hosts=['algo-1', 'algo-2'], hps={'arg': 'hello world!', 'task': 1, 'worker': 1}, input_config_dir='/opt/ml/input/config', input_data_config='{"data":{"RecordWrapperType":"None","S3DistributionType":"ShardedByS3Key","TrainingInputMode":"File"}}', input_dir='/opt/ml/input', instance_state='/state/algo-1', job_name='single-file-task1-2020-10-04-09-17-17-PMGHWPsv', model_dir='/opt/ml/model', network_interface_name='eth0', num_cpus=2, num_gpus=0, num_nodes=2, output_data_dir='/opt/ml/output/data', output_dir='/opt/ml/output', resource_config='{"current_host":"algo-1","hosts":["algo-1","algo-2"],"network_interface_name":"eth0"}', state='/state') 103 | INFO:__main__:Hyperparams: {'arg': 'hello world!', 'task': 1, 'worker': 1} 104 | INFO:__main__:Input data files: [PosixPath('/opt/ml/input/data/data/sample_data1.txt')] 105 | INFO:__main__:State files: [PosixPath('/state/algo-1')] 106 | INFO:__main__:finished! 107 | 2020-10-04 09:21:16,653 sagemaker-training-toolkit INFO Reporting training SUCCESS 108 | -------------------------------------------------------------------------------- /examples/single_file/expected_output/output1/logs/logs1: -------------------------------------------------------------------------------- 1 | bash: cannot set terminal process group (-1): Inappropriate ioctl for device 2 | bash: no job control in this shell 3 | 2020-10-04 09:21:07,312 sagemaker-training-toolkit INFO Imported framework sagemaker_pytorch_container.training 4 | 2020-10-04 09:21:07,314 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 5 | 2020-10-04 09:21:07,324 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed. 
6 | 2020-10-04 09:21:10,395 sagemaker_pytorch_container.training INFO Invoking user training script. 7 | 2020-10-04 09:21:10,647 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 8 | 2020-10-04 09:21:10,659 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 9 | 2020-10-04 09:21:10,671 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed) 10 | 2020-10-04 09:21:10,680 sagemaker-training-toolkit INFO Invoking user script 11 | 12 | Training Env: 13 | 14 | { 15 | "additional_framework_parameters": {}, 16 | "channel_input_dirs": { 17 | "data": "/opt/ml/input/data/data" 18 | }, 19 | "current_host": "algo-2", 20 | "framework_module": "sagemaker_pytorch_container.training:main", 21 | "hosts": [ 22 | "algo-1", 23 | "algo-2" 24 | ], 25 | "hyperparameters": { 26 | "task": 1, 27 | "arg": "hello world!", 28 | "worker": 1 29 | }, 30 | "input_config_dir": "/opt/ml/input/config", 31 | "input_data_config": { 32 | "data": { 33 | "TrainingInputMode": "File", 34 | "S3DistributionType": "ShardedByS3Key", 35 | "RecordWrapperType": "None" 36 | } 37 | }, 38 | "input_dir": "/opt/ml/input", 39 | "is_master": false, 40 | "job_name": "single-file-task1-2020-10-04-09-17-17-PMGHWPsv", 41 | "log_level": 20, 42 | "master_hostname": "algo-1", 43 | "model_dir": "/opt/ml/model", 44 | "module_dir": "s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-sf_2020-10-04-09-16-49_py37/single-file-task1/single-file-task1-2020-10-04-09-17-17-PMGHWPsv/source/sourcedir.tar.gz", 45 | "module_name": "example", 46 | "network_interface_name": "eth0", 47 | "num_cpus": 2, 48 | "num_gpus": 0, 49 | "output_data_dir": "/opt/ml/output/data", 50 | "output_dir": "/opt/ml/output", 51 | "output_intermediate_dir": "/opt/ml/output/intermediate", 52 | "resource_config": { 53 | "current_host": "algo-2", 54 | "hosts": [ 55 | "algo-1", 56 | "algo-2" 57 | ], 58 | "network_interface_name": "eth0" 59 | }, 60 | "user_entry_point": "example.py" 61 
| } 62 | 63 | Environment variables: 64 | 65 | SM_HOSTS=["algo-1","algo-2"] 66 | SM_NETWORK_INTERFACE_NAME=eth0 67 | SM_HPS={"arg":"hello world!","task":1,"worker":1} 68 | SM_USER_ENTRY_POINT=example.py 69 | SM_FRAMEWORK_PARAMS={} 70 | SM_RESOURCE_CONFIG={"current_host":"algo-2","hosts":["algo-1","algo-2"],"network_interface_name":"eth0"} 71 | SM_INPUT_DATA_CONFIG={"data":{"RecordWrapperType":"None","S3DistributionType":"ShardedByS3Key","TrainingInputMode":"File"}} 72 | SM_OUTPUT_DATA_DIR=/opt/ml/output/data 73 | SM_CHANNELS=["data"] 74 | SM_CURRENT_HOST=algo-2 75 | SM_MODULE_NAME=example 76 | SM_LOG_LEVEL=20 77 | SM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main 78 | SM_INPUT_DIR=/opt/ml/input 79 | SM_INPUT_CONFIG_DIR=/opt/ml/input/config 80 | SM_OUTPUT_DIR=/opt/ml/output 81 | SM_NUM_CPUS=2 82 | SM_NUM_GPUS=0 83 | SM_MODEL_DIR=/opt/ml/model 84 | SM_MODULE_DIR=s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-sf_2020-10-04-09-16-49_py37/single-file-task1/single-file-task1-2020-10-04-09-17-17-PMGHWPsv/source/sourcedir.tar.gz 85 | SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{"data":"/opt/ml/input/data/data"},"current_host":"algo-2","framework_module":"sagemaker_pytorch_container.training:main","hosts":["algo-1","algo-2"],"hyperparameters":{"arg":"hello 
world!","task":1,"worker":1},"input_config_dir":"/opt/ml/input/config","input_data_config":{"data":{"RecordWrapperType":"None","S3DistributionType":"ShardedByS3Key","TrainingInputMode":"File"}},"input_dir":"/opt/ml/input","is_master":false,"job_name":"single-file-task1-2020-10-04-09-17-17-PMGHWPsv","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-us-east-1-XXXXXXXXXXXX/tests/simple-sagemaker-sf_2020-10-04-09-16-49_py37/single-file-task1/single-file-task1-2020-10-04-09-17-17-PMGHWPsv/source/sourcedir.tar.gz","module_name":"example","network_interface_name":"eth0","num_cpus":2,"num_gpus":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-2","hosts":["algo-1","algo-2"],"network_interface_name":"eth0"},"user_entry_point":"example.py"} 86 | SM_USER_ARGS=["--arg","hello world!","--task","1","--worker","1"] 87 | SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate 88 | SM_CHANNEL_DATA=/opt/ml/input/data/data 89 | SM_HP_TASK=1 90 | SM_HP_ARG=hello world! 91 | SM_HP_WORKER=1 92 | PYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python36.zip:/opt/conda/lib/python3.6:/opt/conda/lib/python3.6/lib-dynload:/opt/conda/lib/python3.6/site-packages 93 | 94 | Invoking script with the following command: 95 | 96 | /opt/conda/bin/python example.py --arg hello world! --task 1 --worker 1 97 | 98 | 99 | INFO:__main__:Starting worker... 
100 | INFO:worker_toolkit.worker_lib:Deleting other instances' state 101 | INFO:worker_toolkit.worker_lib:Creating state dir 102 | INFO:worker_toolkit.worker_lib:Worker config: Namespace(channel_data='/opt/ml/input/data/data', channel_model='', channels=['data'], current_host='algo-2', host_rank=1, hosts=['algo-1', 'algo-2'], hps={'arg': 'hello world!', 'task': 1, 'worker': 1}, input_config_dir='/opt/ml/input/config', input_data_config='{"data":{"RecordWrapperType":"None","S3DistributionType":"ShardedByS3Key","TrainingInputMode":"File"}}', input_dir='/opt/ml/input', instance_state='/state/algo-2', job_name='single-file-task1-2020-10-04-09-17-17-PMGHWPsv', model_dir='/opt/ml/model', network_interface_name='eth0', num_cpus=2, num_gpus=0, num_nodes=2, output_data_dir='/opt/ml/output/data', output_dir='/opt/ml/output', resource_config='{"current_host":"algo-2","hosts":["algo-1","algo-2"],"network_interface_name":"eth0"}', state='/state') 103 | INFO:__main__:Hyperparams: {'arg': 'hello world!', 'task': 1, 'worker': 1} 104 | INFO:__main__:Input data files: [PosixPath('/opt/ml/input/data/data/sample_data2.txt')] 105 | INFO:__main__:State files: [PosixPath('/state/algo-2')] 106 | INFO:__main__:finished! 107 | 2020-10-04 09:21:10,751 sagemaker-training-toolkit INFO Reporting training SUCCESS 108 | --------------------------------------------------------------------------------