├── tests ├── __init__.py ├── core │ └── __init__.py ├── spark │ ├── __init__.py │ └── models │ │ └── test_spark_configuration.py ├── utils │ ├── __init__.py │ ├── test_helpers.py │ └── test_command_builder.py ├── models │ ├── __init__.py │ └── internal │ │ ├── test_plugin_reference.py │ │ └── test_plugin-manager.py └── integration_tests │ ├── __init__.py │ └── spark │ ├── __init__.py │ └── sdk │ ├── __init__.py │ ├── cluster │ └── __init__.py │ ├── job │ └── __init__.py │ ├── wait_for_all_nodes.py │ ├── clean_up_cluster.py │ ├── ensure_spark_processes.py │ └── get_client.py ├── aztk ├── core │ ├── __init__.py │ └── models │ │ └── __init__.py ├── spark │ ├── utils │ │ ├── __init__.py │ │ ├── constants.py │ │ └── util.py │ ├── client │ │ ├── base │ │ │ ├── helpers │ │ │ │ ├── __init__.py │ │ │ │ └── list_applications.py │ │ │ └── __init__.py │ │ ├── job │ │ │ ├── helpers │ │ │ │ ├── __init__.py │ │ │ │ ├── stop_application.py │ │ │ │ ├── list.py │ │ │ │ ├── stop.py │ │ │ │ ├── wait_until_complete.py │ │ │ │ ├── get.py │ │ │ │ ├── get_application.py │ │ │ │ ├── delete.py │ │ │ │ └── list_applications.py │ │ │ └── __init__.py │ │ ├── cluster │ │ │ ├── helpers │ │ │ │ ├── __init__.py │ │ │ │ ├── wait.py │ │ │ │ ├── get_configuration.py │ │ │ │ ├── delete.py │ │ │ │ ├── get.py │ │ │ │ ├── get_application_log.py │ │ │ │ ├── get_remote_login_settings.py │ │ │ │ ├── get_application_state.py │ │ │ │ ├── list.py │ │ │ │ ├── run.py │ │ │ │ ├── node_run.py │ │ │ │ ├── ssh_into_master.py │ │ │ │ ├── create_user.py │ │ │ │ ├── copy.py │ │ │ │ ├── download.py │ │ │ │ └── diagnostics.py │ │ │ └── __init__.py │ │ ├── __init__.py │ │ └── client.py │ ├── __init__.py │ └── models │ │ ├── __init__.py │ │ └── plugins │ │ ├── hdfs │ │ ├── __init__.py │ │ └── configuration.py │ │ ├── install │ │ ├── __init__.py │ │ ├── pip │ │ │ ├── __init__.py │ │ │ └── configuration.py │ │ ├── apt_get │ │ │ ├── __init__.py │ │ │ └── configuration.py │ │ ├── conda │ │ │ ├── __init__.py │ │ │ └── configuration.py │ │ ├── install.sh │ │ └── configuration.py │ │ ├── jupyter │ │ ├── __init__.py │ │ ├── configuration.py │ │ └── jupyter.sh │ │ ├── nvblas │ │ ├── __init__.py │ │ └── configuration.py │ │ ├── simple │ │ ├── __init__.py │ │ ├── simple.sh │ │ └── configuration.py │ │ ├── jupyter_lab │ │ ├── __init__.py │ │ ├── configuration.py │ │ └── jupyter_lab.sh │ │ ├── openblas │ │ ├── __init__.py │ │ ├── openblas.sh │ │ └── configuration.py │ │ ├── resource_monitor │ │ ├── __init__.py │ │ ├── images │ │ │ ├── chronograf_hosts.png │ │ │ ├── chronograf_single_host.png │ │ │ ├── chronograf_build_dashboard.png │ │ │ └── chronograf_create_dashboard.png │ │ ├── start_monitor.sh │ │ ├── docker-compose.yml │ │ └── configuration.py │ │ ├── rstudio_server │ │ ├── __init__.py │ │ ├── configuration.py │ │ └── rstudio_server.sh │ │ ├── spark_ui_proxy │ │ ├── __init__.py │ │ ├── spark_ui_proxy.sh │ │ └── configuration.py │ │ ├── tensorflow_on_spark │ │ ├── __init__.py │ │ ├── configuration.py │ │ └── tensorflow_on_spark.sh │ │ └── __init__.py ├── node_scripts │ ├── __init__.py │ ├── install │ │ └── __init__.py │ ├── scheduling │ │ ├── __init__.py │ │ └── scheduling_target.py │ ├── core │ │ ├── __init__.py │ │ └── logger.py │ ├── requirements.txt │ ├── wait_until_setup_complete.py │ ├── Pipfile │ ├── main.py │ ├── wait_until_master_selected.py │ └── docker_main.sh ├── client │ ├── base │ │ ├── helpers │ │ │ ├── __init__.py │ │ │ ├── delete_user_on_cluster.py │ │ │ ├── delete_user_on_node.py │ │ │ ├── generate_user_on_node.py │ │ │ ├── 
create_user_on_cluster.py │ │ │ ├── get_recent_job.py │ │ │ ├── generate_user_on_cluster.py │ │ │ ├── get_task_state.py │ │ │ ├── get_remote_login_settings.py │ │ │ ├── list_tasks.py │ │ │ ├── ssh_into_node.py │ │ │ ├── node_run.py │ │ │ ├── run.py │ │ │ └── create_user_on_node.py │ │ └── __init__.py │ ├── job │ │ ├── helpers │ │ │ └── __init__.py │ │ ├── __init__.py │ │ └── operations.py │ ├── cluster │ │ ├── helpers │ │ │ ├── __init__.py │ │ │ ├── wait_for_task_to_complete.py │ │ │ ├── get.py │ │ │ ├── list.py │ │ │ ├── delete.py │ │ │ └── copy.py │ │ └── __init__.py │ ├── __init__.py │ └── client.py ├── models │ ├── plugins │ │ ├── __init__.py │ │ ├── internal │ │ │ └── __init__.py │ │ └── plugin_file.py │ ├── software.py │ ├── ssh_log.py │ ├── remote_login.py │ ├── file.py │ ├── vm_image.py │ ├── task_state.py │ ├── cluster_state.py │ ├── port_forward_specification.py │ ├── user_configuration.py │ ├── file_share.py │ ├── node_output.py │ ├── scheduling_target.py │ ├── task.py │ ├── application_log.py │ ├── __init__.py │ ├── cluster.py │ └── secrets_configuration.py ├── internal │ ├── cluster_data │ │ ├── __init__.py │ │ └── blob_data.py │ ├── __init__.py │ ├── docker_cmd.py │ └── configuration_base.py ├── utils │ ├── file_utils.py │ ├── __init__.py │ ├── try_func.py │ ├── get_ssh_key.py │ ├── secure_utils.py │ ├── retry.py │ ├── deprecation.py │ └── command_builder.py ├── __init__.py ├── version.py └── error.py ├── aztk_cli ├── config │ ├── jars │ │ └── .null │ ├── ssh.yaml │ ├── secrets.yaml.template │ └── core-site.xml ├── spark │ ├── __init__.py │ └── endpoints │ │ ├── __init__.py │ │ ├── job │ │ ├── __init__.py │ │ ├── list.py │ │ ├── get.py │ │ ├── list_apps.py │ │ ├── stop.py │ │ ├── get_app.py │ │ ├── stop_app.py │ │ ├── get_app_logs.py │ │ ├── delete.py │ │ └── submit.py │ │ ├── cluster │ │ ├── __init__.py │ │ ├── cluster_list.py │ │ ├── cluster_debug.py │ │ ├── cluster_get.py │ │ ├── cluster_copy.py │ │ ├── cluster_app_logs.py │ │ ├── cluster_run.py │ │ ├── cluster_delete.py │ │ └── cluster_add_user.py │ │ └── spark.py ├── __init__.py ├── constants.py ├── plugins.py └── toolkit.py ├── pytest.ini ├── examples └── src │ └── main │ ├── resources │ ├── people.txt │ ├── people.json │ ├── users.avro │ ├── users.parquet │ ├── employees.json │ ├── user.avsc │ └── full_user.avsc │ ├── scala │ └── org │ │ └── apache │ │ └── spark │ │ └── examples │ │ ├── LocalPi.scala │ │ ├── SparkPi.scala │ │ └── DriverSubmissionTest.scala │ ├── python │ ├── wordcount.py │ ├── pi.py │ └── sort.py │ ├── r │ └── dataframe.R │ └── java │ └── org │ └── apache │ └── spark │ └── examples │ └── JavaSparkPi.java ├── docs ├── misc │ ├── AAD_1.png │ ├── Batch_1.png │ ├── Batch_2.png │ ├── Batch_3.png │ ├── Batch_4.png │ ├── Storage_1.png │ ├── Storage_2.png │ ├── Storage_3.png │ ├── Storage_4.png │ ├── plugin-logs.png │ ├── Batch_secrets.png │ ├── Storage_secrets.png │ ├── AppRegistrations_1.png │ ├── AppRegistrations_2.png │ ├── AppRegistrations_3.png │ ├── PySpark Shell (wiki).png │ └── PySpark Jupypter (wiki).png ├── aztk.models.rst ├── aztk.spark.models.plugins.rst ├── aztk.spark.models.rst ├── dev │ ├── docs.md │ ├── tests.md │ └── writing-models.md ├── aztk.spark.rst ├── Makefile ├── aztk.rst ├── make.bat ├── 60-gpu.md ├── index.rst ├── 14-azure-files.md └── 20-spark-submit.md ├── .gitattributes ├── .style.yapf ├── .editorconfig ├── docker-image ├── README.md ├── miniconda │ ├── spark1.6.3 │ │ ├── base │ │ │ └── Dockerfile │ │ └── gpu │ │ │ └── Dockerfile │ ├── spark2.1.0 │ │ ├── base │ │ │ └── Dockerfile 
│ │ └── gpu │ │ │ └── Dockerfile │ ├── spark2.2.0 │ │ ├── base │ │ │ └── Dockerfile │ │ └── gpu │ │ │ └── Dockerfile │ └── spark2.3.0 │ │ ├── base │ │ └── Dockerfile │ │ └── gpu │ │ └── Dockerfile ├── anaconda │ ├── spark1.6.3 │ │ ├── gpu │ │ │ └── Dockerfile │ │ └── base │ │ │ └── Dockerfile │ ├── spark2.1.0 │ │ ├── gpu │ │ │ └── Dockerfile │ │ └── base │ │ │ └── Dockerfile │ ├── spark2.2.0 │ │ ├── gpu │ │ │ └── Dockerfile │ │ └── base │ │ │ └── Dockerfile │ └── spark2.3.0 │ │ ├── gpu │ │ └── Dockerfile │ │ └── base │ │ └── Dockerfile ├── base │ └── README.md └── r │ └── README.md ├── custom-scripts ├── simple.sh ├── rstudio_server.sh └── jupyter.sh ├── requirements.txt ├── account_setup.sh ├── .vscode └── settings.json ├── .gitignore ├── .travis.yml ├── LICENSE └── .vsts-ci.yml /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aztk/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/spark/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aztk/spark/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aztk_cli/config/jars/.null: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aztk_cli/spark/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aztk/node_scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aztk/client/base/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aztk/client/job/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aztk/node_scripts/install/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aztk_cli/spark/endpoints/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
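The files above are all empty `__init__.py` package markers: they contain no code, but they make the `tests/`, `aztk/`, and `aztk_cli/` subdirectories importable as ordinary Python packages, which is what lets the test modules and CLI endpoints reach the SDK by dotted path. A minimal sketch of what that layout enables, assuming the project and the dev requirements from `requirements.txt` are installed on the Python path; the three aztk imports appear verbatim in files later in this dump, and the `pytest.main` call is just the programmatic equivalent of running `pytest` against the `testpaths = tests` setting in `pytest.ini` below:

```python
# Empty __init__.py markers make these dotted imports resolve.
import aztk.spark                   # SDK entry point (aztk/spark/__init__.py)
from aztk.utils import helpers      # same import used by tests/utils/test_helpers.py
from aztk_cli import config, utils  # same import used by the CLI endpoint modules

# Programmatic equivalent of `pytest tests` (see pytest.ini below).
import pytest
pytest.main(["tests"])
```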
/tests/integration_tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aztk/client/cluster/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aztk/node_scripts/scheduling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aztk/spark/client/base/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aztk/spark/client/job/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aztk_cli/spark/endpoints/job/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/integration_tests/spark/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aztk/spark/client/cluster/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aztk_cli/spark/endpoints/cluster/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = tests 3 | -------------------------------------------------------------------------------- /tests/integration_tests/spark/sdk/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aztk/spark/__init__.py: -------------------------------------------------------------------------------- 1 | from .client import Client 2 | -------------------------------------------------------------------------------- /tests/integration_tests/spark/sdk/cluster/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/integration_tests/spark/sdk/job/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aztk/client/__init__.py: -------------------------------------------------------------------------------- 1 | from .client import CoreClient 2 | -------------------------------------------------------------------------------- /aztk/spark/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import * 2 | -------------------------------------------------------------------------------- /aztk/node_scripts/core/__init__.py: -------------------------------------------------------------------------------- 1 | 
from .logger import log 2 | -------------------------------------------------------------------------------- /aztk/spark/client/__init__.py: -------------------------------------------------------------------------------- 1 | from .client import Client 2 | -------------------------------------------------------------------------------- /aztk/client/job/__init__.py: -------------------------------------------------------------------------------- 1 | from .operations import CoreJobOperations 2 | -------------------------------------------------------------------------------- /aztk/client/base/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_operations import BaseOperations 2 | -------------------------------------------------------------------------------- /aztk/spark/client/job/__init__.py: -------------------------------------------------------------------------------- 1 | from .operations import JobOperations 2 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/hdfs/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration import * 2 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/install/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration import * 2 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/jupyter/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration import * 2 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/nvblas/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration import * 2 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/simple/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration import * 2 | -------------------------------------------------------------------------------- /aztk/client/cluster/__init__.py: -------------------------------------------------------------------------------- 1 | from .operations import CoreClusterOperations 2 | -------------------------------------------------------------------------------- /aztk/spark/client/base/__init__.py: -------------------------------------------------------------------------------- 1 | from .operations import SparkBaseOperations 2 | -------------------------------------------------------------------------------- /aztk/spark/client/cluster/__init__.py: -------------------------------------------------------------------------------- 1 | from .operations import ClusterOperations 2 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/install/pip/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration import * 2 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/jupyter_lab/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration import * 2 | -------------------------------------------------------------------------------- 
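Note the pattern in the `__init__.py` files just listed: each package re-exports its public names from an implementation module (`from .client import Client`, `from .operations import ClusterOperations`, `from .configuration import *`), so callers import from the package root rather than the deeper module path. A minimal sketch of the effect: the client construction and job listing lines mirror the CLI endpoint modules shown later in this dump (e.g. `aztk_cli/spark/endpoints/job/list.py`) and assume a populated `secrets.yaml` for `config.load_aztk_secrets()` to read.

```python
import aztk.spark            # aztk/spark/__init__.py re-exports Client from client.py
from aztk_cli import config  # CLI helper that loads the secrets configuration

# Same calls the CLI endpoints use to build a Spark client and list jobs.
spark_client = aztk.spark.Client(config.load_aztk_secrets())
print(spark_client.job.list())
```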
/aztk/spark/models/plugins/openblas/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration import * 2 | -------------------------------------------------------------------------------- /aztk_cli/__init__.py: -------------------------------------------------------------------------------- 1 | import aztk_cli.logger 2 | 3 | log = aztk_cli.logger.root 4 | -------------------------------------------------------------------------------- /examples/src/main/resources/people.txt: -------------------------------------------------------------------------------- 1 | Michael, 29 2 | Andy, 30 3 | Justin, 19 4 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/install/apt_get/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration import * 2 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/install/conda/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration import * 2 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/install/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | eval $COMMAND $@ 4 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/resource_monitor/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration import * 2 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/rstudio_server/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration import * 2 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/spark_ui_proxy/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration import * 2 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/tensorflow_on_spark/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration import * 2 | -------------------------------------------------------------------------------- /aztk_cli/constants.py: -------------------------------------------------------------------------------- 1 | """ 2 | Name of the executable 3 | """ 4 | CLI_EXE = "aztk" 5 | -------------------------------------------------------------------------------- /docs/misc/AAD_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/aztk/master/docs/misc/AAD_1.png -------------------------------------------------------------------------------- /aztk/models/plugins/__init__.py: -------------------------------------------------------------------------------- 1 | from .plugin_file import * 2 | from .plugin_configuration import * 3 | -------------------------------------------------------------------------------- /docs/misc/Batch_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/aztk/master/docs/misc/Batch_1.png 
-------------------------------------------------------------------------------- /docs/misc/Batch_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/aztk/master/docs/misc/Batch_2.png -------------------------------------------------------------------------------- /docs/misc/Batch_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/aztk/master/docs/misc/Batch_3.png -------------------------------------------------------------------------------- /docs/misc/Batch_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/aztk/master/docs/misc/Batch_4.png -------------------------------------------------------------------------------- /docs/misc/Storage_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/aztk/master/docs/misc/Storage_1.png -------------------------------------------------------------------------------- /docs/misc/Storage_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/aztk/master/docs/misc/Storage_2.png -------------------------------------------------------------------------------- /docs/misc/Storage_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/aztk/master/docs/misc/Storage_3.png -------------------------------------------------------------------------------- /docs/misc/Storage_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/aztk/master/docs/misc/Storage_4.png -------------------------------------------------------------------------------- /docs/misc/plugin-logs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/aztk/master/docs/misc/plugin-logs.png -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text eol=lf 2 | 3 | *.jar binary 4 | *.png binary 5 | *.avro binary 6 | *.parquet binary 7 | -------------------------------------------------------------------------------- /aztk/models/plugins/internal/__init__.py: -------------------------------------------------------------------------------- 1 | from .plugin_manager import * 2 | from .plugin_reference import * 3 | -------------------------------------------------------------------------------- /docs/misc/Batch_secrets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/aztk/master/docs/misc/Batch_secrets.png -------------------------------------------------------------------------------- /docs/misc/Storage_secrets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/aztk/master/docs/misc/Storage_secrets.png -------------------------------------------------------------------------------- /docs/misc/AppRegistrations_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/manthanthakker/aztk/master/docs/misc/AppRegistrations_1.png -------------------------------------------------------------------------------- /docs/misc/AppRegistrations_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/aztk/master/docs/misc/AppRegistrations_2.png -------------------------------------------------------------------------------- /docs/misc/AppRegistrations_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/aztk/master/docs/misc/AppRegistrations_3.png -------------------------------------------------------------------------------- /docs/misc/PySpark Shell (wiki).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/aztk/master/docs/misc/PySpark Shell (wiki).png -------------------------------------------------------------------------------- /examples/src/main/resources/people.json: -------------------------------------------------------------------------------- 1 | {"name":"Michael"} 2 | {"name":"Andy", "age":30} 3 | {"name":"Justin", "age":19} 4 | -------------------------------------------------------------------------------- /docs/misc/PySpark Jupypter (wiki).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/aztk/master/docs/misc/PySpark Jupypter (wiki).png -------------------------------------------------------------------------------- /examples/src/main/resources/users.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/aztk/master/examples/src/main/resources/users.avro -------------------------------------------------------------------------------- /examples/src/main/resources/users.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/aztk/master/examples/src/main/resources/users.parquet -------------------------------------------------------------------------------- /aztk/models/software.py: -------------------------------------------------------------------------------- 1 | class Software: 2 | """ 3 | Enum with list of available softwares 4 | """ 5 | 6 | spark = "spark" 7 | -------------------------------------------------------------------------------- /aztk/internal/cluster_data/__init__.py: -------------------------------------------------------------------------------- 1 | from .blob_data import BlobData 2 | from .node_data import NodeData 3 | from .cluster_data import ClusterData 4 | -------------------------------------------------------------------------------- /aztk/models/ssh_log.py: -------------------------------------------------------------------------------- 1 | class SSHLog: 2 | def __init__(self, output, node_id): 3 | self.output = output 4 | self.node_id = node_id 5 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/spark_ui_proxy/spark_ui_proxy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python $AZTK_WORKING_DIR/plugins/spark_ui_proxy/spark_ui_proxy.py $1 $2 & 3 | -------------------------------------------------------------------------------- /aztk/spark/utils/constants.py: 
--------------------------------------------------------------------------------
1 | from aztk.spark import models
2 |
3 | SPARK_VM_IMAGE = models.VmImage(publisher="Canonical", offer="UbuntuServer", sku="16.04")
4 |
--------------------------------------------------------------------------------
/aztk/models/remote_login.py:
--------------------------------------------------------------------------------
1 | class RemoteLogin:
2 |     def __init__(self, ip_address, port):
3 |         self.ip_address = ip_address
4 |         self.port = port
5 |
--------------------------------------------------------------------------------
/aztk/core/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .fields import (Boolean, Datetime, Float, Integer, List, ListMergeStrategy, ModelMergeStrategy, String)
2 | from .model import Model
3 |
--------------------------------------------------------------------------------
/aztk/spark/models/plugins/openblas/openblas.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | apt-get update &&
3 | apt-get install -y libopenblas-base &&
4 | update-alternatives --config libblas.so.3
5 |
--------------------------------------------------------------------------------
/aztk/spark/models/plugins/simple/simple.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | echo "Is master: $AZTK_IS_MASTER"
4 | echo "Is worker: $AZTK_IS_WORKER"
5 | echo "Master node ip: $AZTK_MASTER_IP"
6 |
--------------------------------------------------------------------------------
/aztk/models/file.py:
--------------------------------------------------------------------------------
1 | import io
2 |
3 |
4 | class File:
5 |     def __init__(self, name: str, payload: io.StringIO):
6 |         self.name = name
7 |         self.payload = payload
8 |
--------------------------------------------------------------------------------
/examples/src/main/resources/employees.json:
--------------------------------------------------------------------------------
1 | {"name":"Michael", "salary":3000}
2 | {"name":"Andy", "salary":4500}
3 | {"name":"Justin", "salary":3500}
4 | {"name":"Berta", "salary":4000}
5 |
--------------------------------------------------------------------------------
/aztk/models/vm_image.py:
--------------------------------------------------------------------------------
1 | class VmImage:
2 |     def __init__(self, publisher, offer, sku):
3 |         self.publisher = publisher
4 |         self.offer = offer
5 |         self.sku = sku
6 |
--------------------------------------------------------------------------------
/aztk/node_scripts/requirements.txt:
--------------------------------------------------------------------------------
1 | azure-batch==4.1.3
2 | azure-mgmt-batch==5.0.0
3 | azure-mgmt-storage==1.5.0
4 | azure-storage-blob==1.1.0
5 | pyyaml==3.12
6 | pycryptodomex>=3.4
7 |
8 |
--------------------------------------------------------------------------------
/aztk/internal/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Module containing classes used in the library but without any use for SDK user
3 | """
4 |
5 | from .configuration_base import *
6 | from .docker_cmd import *
7 |
--------------------------------------------------------------------------------
/aztk/spark/models/plugins/resource_monitor/images/chronograf_hosts.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/manthanthakker/aztk/master/aztk/spark/models/plugins/resource_monitor/images/chronograf_hosts.png -------------------------------------------------------------------------------- /aztk/utils/file_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def ensure_dir(file_path): 5 | directory = os.path.dirname(file_path) 6 | if not os.path.exists(directory): 7 | os.makedirs(directory) 8 | -------------------------------------------------------------------------------- /aztk/models/task_state.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class TaskState(Enum): 5 | Running = "running" 6 | Completed = "completed" 7 | Failed = "failed" 8 | Preparing = "preparing" 9 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/resource_monitor/images/chronograf_single_host.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/aztk/master/aztk/spark/models/plugins/resource_monitor/images/chronograf_single_host.png -------------------------------------------------------------------------------- /aztk/models/cluster_state.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class ClusterState(Enum): 5 | deleting = "deleting" 6 | resizing = "resizing" 7 | steady = "steady" 8 | stopping_resize = "stopping" 9 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/resource_monitor/images/chronograf_build_dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/aztk/master/aztk/spark/models/plugins/resource_monitor/images/chronograf_build_dashboard.png -------------------------------------------------------------------------------- /docs/aztk.models.rst: -------------------------------------------------------------------------------- 1 | aztk.models package 2 | =================== 3 | 4 | 5 | .. 
automodule:: aztk.models 6 | :members: 7 | :show-inheritance: 8 | :imported-members: 9 | :undoc-members: 10 | -------------------------------------------------------------------------------- /aztk/models/port_forward_specification.py: -------------------------------------------------------------------------------- 1 | from aztk.core.models import Model, fields 2 | 3 | 4 | class PortForwardingSpecification(Model): 5 | remote_port = fields.Integer() 6 | local_port = fields.Integer() 7 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/resource_monitor/images/chronograf_create_dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/aztk/master/aztk/spark/models/plugins/resource_monitor/images/chronograf_create_dashboard.png -------------------------------------------------------------------------------- /aztk/node_scripts/wait_until_setup_complete.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | 4 | while not os.path.exists("/tmp/setup_complete"): 5 | time.sleep(1) 6 | 7 | print("SETUP FINISHED") 8 | os.remove("/tmp/setup_complete") 9 | -------------------------------------------------------------------------------- /.style.yapf: -------------------------------------------------------------------------------- 1 | [style] 2 | based_on_style=google 3 | spaces_before_comment=4 4 | split_before_logical_operator=True 5 | indent_width=4 6 | column_limit=120 7 | split_arguments_when_comma_terminated=True 8 | blank_line_before_nested_class_or_def=False -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | [*] 2 | indent_style = space 3 | indent_size = 4 4 | insert_final_newline = true 5 | trim_trailing_whitespace = true 6 | end_of_line = lf 7 | 8 | [*.{json,yml,yaml}] 9 | indent_size = 2 10 | 11 | [*.xml] 12 | indent_size = 2 13 | -------------------------------------------------------------------------------- /aztk/models/user_configuration.py: -------------------------------------------------------------------------------- 1 | from aztk.core.models import Model, fields 2 | 3 | 4 | class UserConfiguration(Model): 5 | username = fields.String() 6 | ssh_key = fields.String(default=None) 7 | password = fields.String(default=None) 8 | -------------------------------------------------------------------------------- /aztk/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from . import (azure_api, command_builder, constants, file_utils, get_ssh_key, helpers, secure_utils) 2 | from .deprecation import deprecate, deprecated 3 | from .retry import BackOffPolicy, retry 4 | from .try_func import try_func 5 | -------------------------------------------------------------------------------- /docs/aztk.spark.models.plugins.rst: -------------------------------------------------------------------------------- 1 | aztk.spark.models.plugins package 2 | ================================= 3 | 4 | .. 
automodule:: aztk.spark.models.plugins 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | :imported-members: 9 | -------------------------------------------------------------------------------- /examples/src/main/resources/user.avsc: -------------------------------------------------------------------------------- 1 | {"namespace": "example.avro", 2 | "type": "record", 3 | "name": "User", 4 | "fields": [ 5 | {"name": "name", "type": "string"}, 6 | {"name": "favorite_color", "type": ["string", "null"]} 7 | ] 8 | } 9 | -------------------------------------------------------------------------------- /docker-image/README.md: -------------------------------------------------------------------------------- 1 | # Docker Image Gallery 2 | Azure Distributed Data Engineering Toolkit uses Docker containers to run Spark. 3 | 4 | Please refer to the docs for details on [how to select a docker-repo at cluster creation time](../docs/12-docker-image.md). 5 | -------------------------------------------------------------------------------- /aztk/models/file_share.py: -------------------------------------------------------------------------------- 1 | from aztk.core.models import Model, fields 2 | 3 | 4 | class FileShare(Model): 5 | storage_account_name = fields.String() 6 | storage_account_key = fields.String() 7 | file_share_path = fields.String() 8 | mount_path = fields.String() 9 | -------------------------------------------------------------------------------- /examples/src/main/resources/full_user.avsc: -------------------------------------------------------------------------------- 1 | {"type": "record", "namespace": "example.avro", "name": "User", "fields": [{"type": "string", "name": "name"}, {"type": ["string", "null"], "name": "favorite_color"}, {"type": {"items": "int", "type": "array"}, "name": "favorite_numbers"}]} -------------------------------------------------------------------------------- /tests/utils/test_helpers.py: -------------------------------------------------------------------------------- 1 | from aztk.utils import helpers 2 | 3 | 4 | def test_bool_env(): 5 | assert helpers.bool_env(True) == "true" 6 | assert helpers.bool_env(False) == "false" 7 | assert helpers.bool_env(None) == "false" 8 | assert helpers.bool_env("some") == "false" 9 | -------------------------------------------------------------------------------- /docs/aztk.spark.models.rst: -------------------------------------------------------------------------------- 1 | aztk.spark.models package 2 | ========================= 3 | 4 | .. toctree:: 5 | 6 | aztk.spark.models.plugins 7 | 8 | .. 
automodule:: aztk.spark.models 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | :imported-members: 13 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/install/pip/configuration.py: -------------------------------------------------------------------------------- 1 | import os 2 | from aztk.spark.models.plugins.install import InstallPlugin 3 | 4 | dir_path = os.path.dirname(os.path.realpath(__file__)) 5 | 6 | 7 | def PipPlugin(packages=None): 8 | return InstallPlugin(name="pip", command="pip install", packages=packages) 9 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/install/conda/configuration.py: -------------------------------------------------------------------------------- 1 | import os 2 | from aztk.spark.models.plugins.install import InstallPlugin 3 | 4 | dir_path = os.path.dirname(os.path.realpath(__file__)) 5 | 6 | 7 | def CondaPlugin(packages=None): 8 | return InstallPlugin(name="conda", command="conda install -y", packages=packages) 9 | -------------------------------------------------------------------------------- /aztk/models/node_output.py: -------------------------------------------------------------------------------- 1 | from tempfile import SpooledTemporaryFile 2 | from typing import Union 3 | 4 | 5 | class NodeOutput: 6 | def __init__(self, id: str, output: Union[SpooledTemporaryFile, str] = None, error: Exception = None): 7 | self.id = id 8 | self.output = output 9 | self.error = error 10 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/install/apt_get/configuration.py: -------------------------------------------------------------------------------- 1 | import os 2 | from aztk.spark.models.plugins.install import InstallPlugin 3 | 4 | dir_path = os.path.dirname(os.path.realpath(__file__)) 5 | 6 | 7 | def AptGetPlugin(packages=None): 8 | return InstallPlugin(name="apt-get", command="apt-get update && apt-get install -y", packages=packages) 9 | -------------------------------------------------------------------------------- /aztk/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | # Azure storage is logging error in the console which make the CLI quite confusing 4 | logging.getLogger("azure.storage").setLevel(logging.CRITICAL) 5 | 6 | # msrestazure logs warning for keyring 7 | logging.getLogger("msrestazure").setLevel(logging.CRITICAL) 8 | 9 | # msrest 10 | logging.getLogger("msrest").setLevel(logging.CRITICAL) 11 | -------------------------------------------------------------------------------- /aztk/client/base/helpers/delete_user_on_cluster.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | 3 | 4 | # TODO: remove nodes param 5 | def delete_user_on_cluster(base_client, id, nodes, username): 6 | with concurrent.futures.ThreadPoolExecutor() as executor: 7 | futures = [executor.submit(base_client.delete_user_on_node, id, node.id, username) for node in nodes] 8 | concurrent.futures.wait(futures) 9 | -------------------------------------------------------------------------------- /aztk/node_scripts/core/logger.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import logging 3 | 4 | log = logging.getLogger("aztk.node-agent") 5 | 6 | DEFAULT_FORMAT = "%(message)s" 7 | 8 | 9 | def 
setup_logging(): 10 | for handler in logging.root.handlers[:]: 11 | logging.root.removeHandler(handler) 12 | 13 | log.setLevel(logging.INFO) 14 | logging.basicConfig(stream=sys.stdout, format=DEFAULT_FORMAT) 15 | -------------------------------------------------------------------------------- /aztk_cli/spark/endpoints/job/list.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import typing 3 | 4 | import aztk.spark 5 | from aztk_cli import config, utils 6 | 7 | 8 | def setup_parser(_: argparse.ArgumentParser): 9 | # No arguments for list yet 10 | pass 11 | 12 | 13 | def execute(_: typing.NamedTuple): 14 | spark_client = aztk.spark.Client(config.load_aztk_secrets()) 15 | 16 | utils.print_jobs(spark_client.job.list()) 17 | -------------------------------------------------------------------------------- /custom-scripts/simple.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Demo plugin. Not actually meant to be used. 4 | 5 | if [ "$AZTK_IS_MASTER" = "true" ]; then 6 | echo "This is a custom script running on just the master!" 7 | fi 8 | 9 | if [ "$AZTK_IS_WORKER" = "true" ]; then 10 | echo "This is a custom script running on just the workers!" 11 | fi 12 | 13 | echo "This is a custom script running all workers and the master!" 14 | 15 | -------------------------------------------------------------------------------- /aztk/models/scheduling_target.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class SchedulingTarget(Enum): 5 | """ 6 | Target where task will get scheduled. 7 | For spark this is where the driver will live. 8 | """ 9 | 10 | Master = "master" 11 | """ 12 | Only master is allowed to run task 13 | """ 14 | 15 | Any = "any" 16 | """ 17 | Any node(Not recommended if using low pri) (Default) 18 | """ 19 | -------------------------------------------------------------------------------- /aztk/spark/client/cluster/helpers/wait.py: -------------------------------------------------------------------------------- 1 | from azure.batch.models import BatchErrorException 2 | 3 | from aztk import error 4 | from aztk.utils import helpers 5 | 6 | 7 | def wait_for_application_to_complete(core_cluster_operations, id, application_name): 8 | try: 9 | return core_cluster_operations.wait(id, application_name) 10 | except BatchErrorException as e: 11 | raise error.AztkError(helpers.format_batch_exception(e)) 12 | -------------------------------------------------------------------------------- /aztk/client/base/helpers/delete_user_on_node.py: -------------------------------------------------------------------------------- 1 | def delete_user(self, pool_id: str, node_id: str, username: str) -> str: 2 | """ 3 | Create a pool user 4 | :param pool: the pool to add the user to 5 | :param node: the node to add the user to 6 | :param username: username of the user to add 7 | """ 8 | # Delete a user on the given node 9 | self.batch_client.compute_node.delete_user(pool_id, node_id, username) 10 | -------------------------------------------------------------------------------- /aztk/spark/client/cluster/helpers/get_configuration.py: -------------------------------------------------------------------------------- 1 | from azure.batch.models import BatchErrorException 2 | 3 | from aztk import error 4 | from aztk.utils import helpers 5 | 6 | 7 | def get_configuration(core_cluster_operations, cluster_id: str): 8 | try: 9 | 
return core_cluster_operations.get_cluster_configuration(cluster_id) 10 | except BatchErrorException as e: 11 | raise error.AztkError(helpers.format_batch_exception(e)) 12 | -------------------------------------------------------------------------------- /aztk/spark/client/cluster/helpers/delete.py: -------------------------------------------------------------------------------- 1 | from azure.batch.models import BatchErrorException 2 | 3 | from aztk import error 4 | from aztk.utils import helpers 5 | 6 | 7 | def delete_cluster(core_cluster_operations, cluster_id: str, keep_logs: bool = False): 8 | try: 9 | return core_cluster_operations.delete(cluster_id, keep_logs) 10 | except BatchErrorException as e: 11 | raise error.AztkError(helpers.format_batch_exception(e)) 12 | -------------------------------------------------------------------------------- /aztk/client/cluster/helpers/wait_for_task_to_complete.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import azure.batch.models as batch_models 4 | 5 | 6 | def wait_for_task_to_complete(core_cluster_operations, job_id: str, task_id: str): 7 | while True: 8 | task = core_cluster_operations.batch_client.task.get(job_id=job_id, task_id=task_id) 9 | if task.state != batch_models.TaskState.completed: 10 | time.sleep(2) 11 | else: 12 | return 13 | -------------------------------------------------------------------------------- /aztk/spark/client/job/helpers/stop_application.py: -------------------------------------------------------------------------------- 1 | from azure.batch.models import BatchErrorException 2 | 3 | 4 | def stop_app(core_job_operations, job_id, application_name): 5 | recent_run_job = core_job_operations.get_recent_job(job_id) 6 | 7 | # stop batch task 8 | try: 9 | core_job_operations.batch_client.task.terminate(job_id=recent_run_job.id, task_id=application_name) 10 | return True 11 | except BatchErrorException: 12 | return False 13 | -------------------------------------------------------------------------------- /aztk/spark/client/cluster/helpers/get.py: -------------------------------------------------------------------------------- 1 | from azure.batch.models import BatchErrorException 2 | 3 | from aztk import error 4 | from aztk.spark import models 5 | from aztk.utils import helpers 6 | 7 | 8 | def get_cluster(core_cluster_operations, cluster_id: str): 9 | try: 10 | cluster = core_cluster_operations.get(cluster_id) 11 | return models.Cluster(cluster) 12 | except BatchErrorException as e: 13 | raise error.AztkError(helpers.format_batch_exception(e)) 14 | -------------------------------------------------------------------------------- /aztk_cli/spark/endpoints/job/get.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import typing 3 | 4 | import aztk.spark 5 | from aztk_cli import config, utils 6 | 7 | 8 | def setup_parser(parser: argparse.ArgumentParser): 9 | parser.add_argument("--id", dest="job_id", required=True, help="The unique id of your AZTK job") 10 | 11 | 12 | def execute(args: typing.NamedTuple): 13 | spark_client = aztk.spark.Client(config.load_aztk_secrets()) 14 | 15 | utils.print_job(spark_client, spark_client.job.get(id=args.job_id)) 16 | -------------------------------------------------------------------------------- /aztk/spark/client/cluster/helpers/get_application_log.py: -------------------------------------------------------------------------------- 1 | from aztk.spark import models 2 | 3 
| 4 | def get_application_log(core_base_operations, 5 | cluster_id: str, 6 | application_name: str, 7 | tail=False, 8 | current_bytes: int = 0): 9 | base_application_log = core_base_operations.get_application_log(cluster_id, application_name, tail, current_bytes) 10 | return models.ApplicationLog(base_application_log) 11 | -------------------------------------------------------------------------------- /aztk_cli/spark/endpoints/job/list_apps.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import typing 3 | 4 | import aztk.spark 5 | from aztk_cli import config, utils 6 | 7 | 8 | def setup_parser(parser: argparse.ArgumentParser): 9 | parser.add_argument("--id", dest="job_id", required=True, help="The unique id of your AZTK job") 10 | 11 | 12 | def execute(args: typing.NamedTuple): 13 | spark_client = aztk.spark.Client(config.load_aztk_secrets()) 14 | utils.print_applications(spark_client.job.list_applications(args.job_id)) 15 | -------------------------------------------------------------------------------- /aztk/client/base/helpers/generate_user_on_node.py: -------------------------------------------------------------------------------- 1 | from Cryptodome.PublicKey import RSA 2 | 3 | from aztk.utils import secure_utils 4 | 5 | 6 | def generate_user_on_node(base_client, pool_id, node_id): 7 | generated_username = secure_utils.generate_random_string() 8 | ssh_key = RSA.generate(2048) 9 | ssh_pub_key = ssh_key.publickey().exportKey("OpenSSH").decode("utf-8") 10 | base_client.create_user_on_node(pool_id, node_id, generated_username, ssh_pub_key) 11 | return generated_username, ssh_key 12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Distribution 2 | azure-batch~=5.1.0 3 | azure-mgmt-batch~=5.0.1 4 | azure-mgmt-storage~=3.0.0 5 | azure-storage-blob~=1.3.1 6 | azure-cosmosdb-table~=1.0.5 7 | pycryptodomex~=3.6.6 8 | PyYAML~=3.13 9 | paramiko~=2.4.2 10 | 11 | # Development 12 | yapf==0.22.0 13 | pylint==2.1.1 14 | pytest==5.3.5 15 | pytest-xdist==1.31.0 16 | twine==1.11.0 17 | docker==3.2.1 18 | 19 | # Docs 20 | sphinx==1.7.2 21 | sphinx-autobuild==0.7.1 22 | recommonmark==0.4.0 23 | sphinx_rtd_theme==0.3.0 24 | docutils==0.12 25 | -------------------------------------------------------------------------------- /aztk/client/base/helpers/create_user_on_cluster.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | 3 | 4 | # TODO: remove nodes param 5 | def create_user_on_cluster(base_operations, id, nodes, username, ssh_pub_key=None, password=None): 6 | with concurrent.futures.ThreadPoolExecutor() as executor: 7 | futures = { 8 | executor.submit(base_operations.create_user_on_node, id, node.id, username, ssh_pub_key, password): node 9 | for node in nodes 10 | } 11 | concurrent.futures.wait(futures) 12 | -------------------------------------------------------------------------------- /aztk_cli/spark/endpoints/job/stop.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import typing 3 | 4 | import aztk.spark 5 | from aztk_cli import config, log 6 | 7 | 8 | def setup_parser(parser: argparse.ArgumentParser): 9 | parser.add_argument("--id", dest="job_id", required=True, help="The unique id of your AZTK job") 10 | 11 | 12 | def execute(args: typing.NamedTuple): 13 | spark_client = 
aztk.spark.Client(config.load_aztk_secrets()) 14 | spark_client.job.stop(args.job_id) 15 | log.print("Stopped Job {0}".format(args.job_id)) 16 | -------------------------------------------------------------------------------- /aztk/spark/client/cluster/helpers/get_remote_login_settings.py: -------------------------------------------------------------------------------- 1 | from azure.batch.models import BatchErrorException 2 | 3 | from aztk import error 4 | from aztk.spark import models 5 | from aztk.utils import helpers 6 | 7 | 8 | def get_remote_login_settings(core_cluster_operations, id: str, node_id: str): 9 | try: 10 | return models.RemoteLogin(core_cluster_operations.get_remote_login_settings(id, node_id)) 11 | except BatchErrorException as e: 12 | raise error.AztkError(helpers.format_batch_exception(e)) 13 | -------------------------------------------------------------------------------- /docs/dev/docs.md: -------------------------------------------------------------------------------- 1 | # Writing docs 2 | 3 | Docs are located in the docs folder. We are using `sphinx` to generate the docs and then hosting them on `readthedocs`. 4 | 5 | ## Start docs autobuild to test locally 6 | ```bash 7 | sphinx-autobuild docs docs/_build/html --watch aztk 8 | ``` 9 | Open `docs/_build/index.html` 10 | 11 | ## Publish the docs 12 | 13 | Docs should be published automatically to read the docs as soon as you push to master under the `latest` tag. 14 | You when creating a git tag readthedocs can also build that one. 15 | -------------------------------------------------------------------------------- /aztk/spark/client/cluster/helpers/get_application_state.py: -------------------------------------------------------------------------------- 1 | from azure.batch.models import BatchErrorException 2 | 3 | from aztk import error 4 | from aztk.spark.models import ApplicationState 5 | from aztk.utils import helpers 6 | 7 | 8 | def get_application_state(core_cluster_operations, cluster_id: str, app_name: str): 9 | try: 10 | return ApplicationState(core_cluster_operations.get_task_state(cluster_id, app_name).value) 11 | except BatchErrorException as e: 12 | raise error.AztkError(helpers.format_batch_exception(e)) 13 | -------------------------------------------------------------------------------- /aztk/node_scripts/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.python.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | pip = "==18.0" 8 | wheel = "==0.32.1" 9 | setuptools = "==40.4.3" 10 | azure-batch = "==5.1.0" 11 | azure-mgmt-batch = "==5.0.1" 12 | azure-mgmt-storage = "==3.0.0" 13 | azure-storage-blob = "==1.3.1" 14 | pycryptodomex = "==3.6.6" 15 | PyYAML = "==5.1" 16 | requests = "==2.20.0" 17 | paramiko = "==2.4.2" 18 | azure-cosmosdb-table = "==1.0.5" 19 | 20 | [dev-packages] 21 | 22 | [requires] 23 | python_version = "3.5" 24 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/nvblas/configuration.py: -------------------------------------------------------------------------------- 1 | import os 2 | from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginTargetRole 3 | from aztk.models.plugins.plugin_file import PluginFile 4 | 5 | dir_path = os.path.dirname(os.path.realpath(__file__)) 6 | 7 | 8 | def NvBLASPlugin(): 9 | return PluginConfiguration( 10 | name="nvblas", 11 | ports=[], 12 | 
target_role=PluginTargetRole.All, 13 | execute="nvblas.sh", 14 | files=[PluginFile("nvblas.sh", os.path.join(dir_path, "nvblas.sh"))], 15 | ) 16 | -------------------------------------------------------------------------------- /account_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Installing dependencies..." && 4 | pip install --force-reinstall --upgrade --user pyyaml==3.12 azure==3.0.0 azure-cli-core==2.0.30 msrestazure==0.4.25 > /dev/null 2>&1 && 5 | echo "Finished installing dependencies." && 6 | echo "Getting account setup script..." && 7 | wget -q https://raw.githubusercontent.com/Azure/aztk/v0.10.3/account_setup.py -O account_setup.py && 8 | chmod 755 account_setup.py && 9 | echo "Finished getting account setup script." && 10 | echo "Running account setup script..." && 11 | python3 account_setup.py 12 | -------------------------------------------------------------------------------- /docs/aztk.spark.rst: -------------------------------------------------------------------------------- 1 | aztk.spark package 2 | ================== 3 | 4 | .. toctree:: 5 | 6 | aztk.spark.models 7 | 8 | aztk.spark.client module 9 | ------------------------ 10 | 11 | .. autoclass:: aztk.spark.client.Client 12 | :members: 13 | :undoc-members: 14 | :show-inheritance: 15 | 16 | 17 | .. autoclass:: aztk.spark.client.cluster.ClusterOperations 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | 23 | .. autoclass:: aztk.spark.client.job.JobOperations 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: -------------------------------------------------------------------------------- /aztk/client/cluster/helpers/get.py: -------------------------------------------------------------------------------- 1 | # TODO: return Cluster instead of (pool, nodes) 2 | from aztk import models 3 | 4 | 5 | def get_pool_details(core_cluster_operations, cluster_id: str): 6 | """ 7 | Print the information for the given cluster 8 | :param cluster_id: Id of the cluster 9 | :return pool: CloudPool, nodes: ComputeNodePaged 10 | """ 11 | pool = core_cluster_operations.batch_client.pool.get(cluster_id) 12 | nodes = core_cluster_operations.batch_client.compute_node.list(pool_id=cluster_id) 13 | return models.Cluster(pool, nodes) 14 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/openblas/configuration.py: -------------------------------------------------------------------------------- 1 | import os 2 | from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginTargetRole 3 | from aztk.models.plugins.plugin_file import PluginFile 4 | 5 | dir_path = os.path.dirname(os.path.realpath(__file__)) 6 | 7 | 8 | def OpenBLASPlugin(): 9 | return PluginConfiguration( 10 | name="openblas", 11 | ports=[], 12 | target_role=PluginTargetRole.All, 13 | execute="openblas.sh", 14 | files=[PluginFile("openblas.sh", os.path.join(dir_path, "openblas.sh"))], 15 | ) 16 | -------------------------------------------------------------------------------- /aztk/models/task.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from aztk.core.models import Model, fields 4 | 5 | 6 | class Task(Model): 7 | id = fields.String() 8 | node_id = fields.String(default=None) 9 | state = fields.String(default=None) 10 | state_transition_time = fields.String(default=None) 11 | command_line = fields.String(default=None) 12 | 
exit_code = fields.Integer(default=None) 13 | start_time = fields.Datetime(datetime, default=None) 14 | end_time = fields.Datetime(datetime, default=None) 15 | failure_info = fields.String(default=None) 16 | -------------------------------------------------------------------------------- /aztk/spark/client/cluster/helpers/list.py: -------------------------------------------------------------------------------- 1 | from azure.batch.models import BatchErrorException 2 | 3 | from aztk import error 4 | from aztk import models as base_models 5 | from aztk.spark import models 6 | from aztk.utils import helpers 7 | 8 | 9 | def list_clusters(core_cluster_operations): 10 | try: 11 | software_metadata_key = base_models.Software.spark 12 | return [models.Cluster(cluster) for cluster in core_cluster_operations.list(software_metadata_key)] 13 | except BatchErrorException as e: 14 | raise error.AztkError(helpers.format_batch_exception(e)) 15 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/__init__.py: -------------------------------------------------------------------------------- 1 | from .hdfs import HDFSPlugin 2 | from .jupyter import JupyterPlugin 3 | from .jupyter_lab import JupyterLabPlugin 4 | from .resource_monitor import ResourceMonitorPlugin 5 | from .rstudio_server import RStudioServerPlugin 6 | from .simple import SimplePlugin 7 | from .spark_ui_proxy import SparkUIProxyPlugin 8 | from .tensorflow_on_spark import TensorflowOnSparkPlugin 9 | from .openblas import OpenBLASPlugin 10 | from .nvblas import NvBLASPlugin 11 | from .install.conda import CondaPlugin 12 | from .install.apt_get import AptGetPlugin 13 | from .install.pip import PipPlugin 14 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/jupyter/configuration.py: -------------------------------------------------------------------------------- 1 | import os 2 | from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole 3 | from aztk.models.plugins.plugin_file import PluginFile 4 | 5 | dir_path = os.path.dirname(os.path.realpath(__file__)) 6 | 7 | 8 | def JupyterPlugin(): 9 | return PluginConfiguration( 10 | name="jupyter", 11 | ports=[PluginPort(internal=8888, public=True)], 12 | target_role=PluginTargetRole.All, 13 | execute="jupyter.sh", 14 | files=[PluginFile("jupyter.sh", os.path.join(dir_path, "jupyter.sh"))], 15 | ) 16 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/tensorflow_on_spark/configuration.py: -------------------------------------------------------------------------------- 1 | import os 2 | from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginTargetRole 3 | from aztk.models.plugins.plugin_file import PluginFile 4 | 5 | dir_path = os.path.dirname(os.path.realpath(__file__)) 6 | 7 | 8 | def TensorflowOnSparkPlugin(): 9 | return PluginConfiguration( 10 | name="tensorflow_on_spark", 11 | target_role=PluginTargetRole.Master, 12 | execute="tensorflow_on_spark.sh", 13 | files=[PluginFile("tensorflow_on_spark.sh", os.path.join(dir_path, "tensorflow_on_spark.sh"))], 14 | ) 15 | -------------------------------------------------------------------------------- /aztk/utils/try_func.py: -------------------------------------------------------------------------------- 1 | def try_func(exception_formatter=None, raise_exception=None, catch_exceptions=()): 2 | import functools 3 | 4 | def 
decorator(function): 5 | @functools.wraps(function) 6 | def wrapper(*args, **kwargs): 7 | try: 8 | return function(*args, **kwargs) 9 | except catch_exceptions as e: 10 | if exception_formatter: 11 | raise raise_exception(exception_formatter(e)) 12 | else: 13 | raise raise_exception(str(e)) 14 | 15 | return wrapper 16 | 17 | return decorator 18 | -------------------------------------------------------------------------------- /aztk/client/base/helpers/get_recent_job.py: -------------------------------------------------------------------------------- 1 | from azure.batch.models import BatchErrorException 2 | 3 | from aztk import error 4 | from aztk.utils import helpers 5 | 6 | 7 | # Note: this only works with jobs, not clusters 8 | # cluster impl is planned to change to job schedule 9 | def get_recent_job(core_job_operations, id): 10 | try: 11 | job_schedule = core_job_operations.batch_client.job_schedule.get(id) 12 | return core_job_operations.batch_client.job.get(job_schedule.execution_info.recent_job.id) 13 | except BatchErrorException as e: 14 | raise error.AztkError(helpers.format_batch_exception(e)) 15 | -------------------------------------------------------------------------------- /aztk/spark/client/job/helpers/list.py: -------------------------------------------------------------------------------- 1 | from azure.batch.models import BatchErrorException 2 | 3 | from aztk import error 4 | from aztk.spark import models 5 | from aztk.utils import helpers 6 | 7 | 8 | def _list_jobs(core_job_operations): 9 | return [cloud_job_schedule for cloud_job_schedule in core_job_operations.batch_client.job_schedule.list()] 10 | 11 | 12 | def list_jobs(core_job_operations): 13 | try: 14 | return [models.Job(cloud_job_schedule) for cloud_job_schedule in _list_jobs(core_job_operations)] 15 | except BatchErrorException as e: 16 | raise error.AztkError(helpers.format_batch_exception(e)) 17 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/install/configuration.py: -------------------------------------------------------------------------------- 1 | import os 2 | from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginTargetRole 3 | from aztk.models.plugins.plugin_file import PluginFile 4 | 5 | dir_path = os.path.dirname(os.path.realpath(__file__)) 6 | 7 | 8 | def InstallPlugin(name, command, packages=None): 9 | return PluginConfiguration( 10 | name=name, 11 | target_role=PluginTargetRole.All, 12 | execute="install.sh", 13 | files=[PluginFile("install.sh", os.path.join(dir_path, "install.sh"))], 14 | args=packages, 15 | env=dict(COMMAND=command), 16 | ) 17 | -------------------------------------------------------------------------------- /aztk_cli/spark/endpoints/job/get_app.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import typing 3 | 4 | import aztk.spark 5 | from aztk_cli import config, utils 6 | 7 | 8 | def setup_parser(parser: argparse.ArgumentParser): 9 | parser.add_argument("--id", dest="job_id", required=True, help="The unique id of your AZTK job") 10 | parser.add_argument("--name", dest="app_name", required=True, help="The unique id of your job name") 11 | 12 | 13 | def execute(args: typing.NamedTuple): 14 | spark_client = aztk.spark.Client(config.load_aztk_secrets()) 15 | 16 | utils.print_application(spark_client.job.get_application(args.job_id, args.app_name)) 17 | -------------------------------------------------------------------------------- 
/aztk/node_scripts/main.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from aztk.node_scripts.core import logger 4 | from aztk.node_scripts.install import install 5 | 6 | 7 | def run(): 8 | if len(sys.argv) < 2: 9 | print("Error: Expected at least one argument") 10 | exit(1) 11 | 12 | action = sys.argv[1] 13 | 14 | if action == "setup-node": 15 | install.setup_host(sys.argv[2], sys.argv[3]) 16 | elif action == "setup-spark-container": 17 | install.setup_spark_container() 18 | else: 19 | print("Action not supported") 20 | 21 | 22 | if __name__ == "__main__": 23 | logger.setup_logging() 24 | run() 25 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/jupyter_lab/configuration.py: -------------------------------------------------------------------------------- 1 | import os 2 | from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole 3 | from aztk.models.plugins.plugin_file import PluginFile 4 | 5 | dir_path = os.path.dirname(os.path.realpath(__file__)) 6 | 7 | 8 | def JupyterLabPlugin(): 9 | return PluginConfiguration( 10 | name="jupyterlab", 11 | ports=[PluginPort(internal=8889, public=True)], 12 | target_role=PluginTargetRole.All, 13 | execute="jupyter_lab.sh", 14 | files=[PluginFile("jupyter_lab.sh", os.path.join(dir_path, "jupyter_lab.sh"))], 15 | ) 16 | -------------------------------------------------------------------------------- /aztk/node_scripts/wait_until_master_selected.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | def main(): 5 | master = None 6 | 7 | while master is None: 8 | try: 9 | from aztk.node_scripts.core import config 10 | from aztk.node_scripts.install.pick_master import get_master_node_id 11 | 12 | batch_client = config.batch_client 13 | pool = batch_client.pool.get(config.pool_id) 14 | master = get_master_node_id(pool) 15 | time.sleep(1) 16 | 17 | except Exception as e: 18 | print(e) 19 | time.sleep(1) 20 | 21 | 22 | if __name__ == "__main__": 23 | main() 24 | -------------------------------------------------------------------------------- /aztk/models/application_log.py: -------------------------------------------------------------------------------- 1 | import azure.batch.models as batch_models 2 | 3 | 4 | class ApplicationLog: 5 | def __init__( 6 | self, 7 | name: str, 8 | cluster_id: str, 9 | log: str, 10 | total_bytes: int, 11 | application_state: batch_models.TaskState, 12 | exit_code: int, 13 | ): 14 | self.name = name 15 | self.cluster_id = cluster_id # TODO: change to something cluster/job agnostic 16 | self.log = log 17 | self.total_bytes = total_bytes 18 | self.application_state = application_state 19 | self.exit_code = exit_code 20 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/simple/configuration.py: -------------------------------------------------------------------------------- 1 | import os 2 | from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginTarget, PluginTargetRole 3 | from aztk.models.plugins.plugin_file import PluginFile 4 | 5 | dir_path = os.path.dirname(os.path.realpath(__file__)) 6 | 7 | 8 | class SimplePlugin(PluginConfiguration): 9 | def __init__(self): 10 | super().__init__( 11 | name="simple", 12 | target_role=PluginTargetRole.All, 13 | target=PluginTarget.Host, 14 | execute="simple.sh", 15 | 
files=[PluginFile("simple.sh", os.path.join(dir_path, "simple.sh"))], 16 | ) 17 | -------------------------------------------------------------------------------- /aztk/spark/client/cluster/helpers/run.py: -------------------------------------------------------------------------------- 1 | from azure.batch.models import BatchErrorException 2 | 3 | from aztk import error 4 | from aztk.utils import helpers 5 | 6 | 7 | def cluster_run(core_cluster_operations, 8 | cluster_id: str, 9 | command: str, 10 | host=False, 11 | internal: bool = False, 12 | timeout=None): 13 | try: 14 | return core_cluster_operations.run( 15 | cluster_id, command, internal, container_name="spark" if not host else None, timeout=timeout) 16 | except BatchErrorException as e: 17 | raise error.AztkError(helpers.format_batch_exception(e)) 18 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | // "python.linting.pylintEnabled": false, 3 | "search.exclude": { 4 | "build/**": true, 5 | "bin/**": true 6 | }, 7 | "files.exclude": { 8 | "**/__pycache__": true, 9 | "*.egg-info": true, 10 | }, 11 | "python.autoComplete.extraPaths": [ 12 | "${workspaceRoot}/node_scripts" 13 | ], 14 | "python.formatting.provider": "yapf", 15 | "python.venvPath": "${workspaceFolder}/.venv/", 16 | "python.pythonPath": "${workspaceFolder}/.venv/Scripts/python.exe", 17 | "python.unitTest.pyTestEnabled": true, 18 | "editor.formatOnSave": true, 19 | "editor.codeActionsOnSave": { 20 | "source.organizeImports": true 21 | } 22 | } -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = aztk 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # custom 2 | .aztk 3 | 4 | # Environments 5 | .env 6 | .venv 7 | env/ 8 | venv/ 9 | ENV/ 10 | env.bak/ 11 | venv.bak/ 12 | 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | *$py.class 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Distribution / packaging 22 | .Python 23 | env/ 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | tmp/ 40 | 41 | # Visual Studio Code 42 | .vscode 43 | .DS_Store 44 | 45 | # PyTest 46 | .cache/ 47 | 48 | 49 | # Built docs 50 | docs/_build/ 51 | 52 | # PyCharm 53 | .idea/ 54 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/tensorflow_on_spark/tensorflow_on_spark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This plugin requires HDFS to be enabled and on the path 4 | 5 | # setup TensorFlowOnSpark 6 | git clone https://github.com/yahoo/TensorFlowOnSpark.git 7 | cd TensorFlowOnSpark 8 | export TFoS_HOME=$(pwd) 9 | echo "export TFoS_HOME=$TFoS_HOME" >> ~/.bashrc 10 | 11 | if [ "$AZTK_GPU_ENABLED" = "true" ]; then 12 | pip install tensorflow-gpu 13 | pip install tensorflowonspark 14 | else 15 | pip install tensorflow 16 | pip install tensorflowonspark 17 | fi 18 | 19 | # add libhdfs.so to path 20 | echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HADOOP_HOME/lib/native/libhdfs.so" >> ~/.bashrc 21 | -------------------------------------------------------------------------------- /aztk_cli/spark/endpoints/cluster/cluster_list.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import typing 3 | 4 | import aztk 5 | from aztk_cli import config, utils 6 | 7 | 8 | def setup_parser(parser: argparse.ArgumentParser): 9 | parser.add_argument( 10 | "-q", "--quiet", dest="quiet", required=False, action="store_true", help="Only print the ids of the clusters") 11 | parser.set_defaults(quiet=False) 12 | 13 | 14 | def execute(args: typing.NamedTuple): 15 | spark_client = aztk.spark.Client(config.load_aztk_secrets()) 16 | clusters = spark_client.cluster.list() 17 | if args.quiet: 18 | utils.print_clusters_quiet(clusters) 19 | else: 20 | utils.print_clusters(clusters) 21 | -------------------------------------------------------------------------------- /aztk/spark/client/job/helpers/stop.py: -------------------------------------------------------------------------------- 1 | from azure.batch.models import BatchErrorException 2 | 3 | from aztk import error 4 | from aztk.utils import helpers 5 | 6 | 7 | def _stop(core_job_operations, job_id): 8 | # terminate currently running job and tasks 9 | recent_run_job = core_job_operations.get_recent_job(job_id) 10 | core_job_operations.batch_client.job.terminate(recent_run_job.id) 11 | # terminate job_schedule 12 | core_job_operations.batch_client.job_schedule.terminate(job_id) 13 | 14 | 15 | def stop(self, job_id): 16 | try: 17 | return _stop(self, job_id) 18 | except BatchErrorException as e: 19 | raise error.AztkError(helpers.format_batch_exception(e)) 20 |
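[Editor's note: a minimal usage sketch, not a file in the repository, showing how the job helpers above are reached through the public client, mirroring the CLI endpoints shown earlier; the job id is a hypothetical placeholder and a local secrets configuration is assumed to already exist.]

import aztk.spark
from aztk_cli import config

# Build the Spark client from the locally configured secrets, then stop a job by id.
spark_client = aztk.spark.Client(config.load_aztk_secrets())
spark_client.job.stop("my-aztk-job")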
-------------------------------------------------------------------------------- /aztk/spark/models/plugins/rstudio_server/configuration.py: -------------------------------------------------------------------------------- 1 | import os 2 | from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole 3 | from aztk.models.plugins.plugin_file import PluginFile 4 | 5 | dir_path = os.path.dirname(os.path.realpath(__file__)) 6 | 7 | 8 | def RStudioServerPlugin(version="1.1.383"): 9 | return PluginConfiguration( 10 | name="rstudio_server", 11 | ports=[PluginPort(internal=8787, public=True)], 12 | target_role=PluginTargetRole.Master, 13 | execute="rstudio_server.sh", 14 | files=[PluginFile("rstudio_server.sh", os.path.join(dir_path, "rstudio_server.sh"))], 15 | env=dict(RSTUDIO_SERVER_VERSION=version), 16 | ) 17 | -------------------------------------------------------------------------------- /aztk_cli/spark/endpoints/job/stop_app.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import typing 3 | 4 | import aztk.spark 5 | from aztk_cli import config, log 6 | 7 | 8 | def setup_parser(parser: argparse.ArgumentParser): 9 | parser.add_argument("--id", dest="job_id", required=True, help="The unique id of your AZTK job") 10 | parser.add_argument("--name", dest="app_name", required=True, help="The unique id of your job name") 11 | 12 | 13 | def execute(args: typing.NamedTuple): 14 | spark_client = aztk.spark.Client(config.load_aztk_secrets()) 15 | 16 | if spark_client.job.stop_application(args.job_id, args.app_name): 17 | log.info("Stopped app %s", args.app_name) 18 | else: 19 | log.error("App with name %s does not exist or was already deleted", args.app_name) 20 | -------------------------------------------------------------------------------- /aztk/spark/client/client.py: -------------------------------------------------------------------------------- 1 | from aztk.client import CoreClient 2 | from aztk.spark import models 3 | from aztk.spark.client.cluster import ClusterOperations 4 | from aztk.spark.client.job import JobOperations 5 | 6 | 7 | class Client(CoreClient): 8 | """The client used to create and manage Spark clusters 9 | 10 | Attributes: 11 | cluster (:obj:`aztk.spark.client.cluster.ClusterOperations`): Cluster 12 | job (:obj:`aztk.spark.client.job.JobOperations`): Job 13 | """ 14 | 15 | def __init__(self, secrets_configuration: models.SecretsConfiguration): 16 | super().__init__() 17 | context = self._get_context(secrets_configuration) 18 | self.cluster = ClusterOperations(context) 19 | self.job = JobOperations(context) 20 | -------------------------------------------------------------------------------- /aztk/client/base/helpers/generate_user_on_cluster.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | 3 | from Cryptodome.PublicKey import RSA 4 | 5 | from aztk.utils import secure_utils 6 | 7 | 8 | # TODO: remove nodes param 9 | def generate_user_on_cluster(base_operations, id, nodes): 10 | generated_username = secure_utils.generate_random_string() 11 | ssh_key = RSA.generate(2048) 12 | ssh_pub_key = ssh_key.publickey().exportKey("OpenSSH").decode("utf-8") 13 | with concurrent.futures.ThreadPoolExecutor() as executor: 14 | futures = { 15 | executor.submit(base_operations.create_user_on_node, id, node.id, generated_username, ssh_pub_key): node 16 | for node in nodes 17 | } 18 | concurrent.futures.wait(futures) 19 | 20 | 
return generated_username, ssh_key 21 | -------------------------------------------------------------------------------- /aztk_cli/config/ssh.yaml: -------------------------------------------------------------------------------- 1 | # ssh configuration 2 | 3 | # cluster_id: 4 | 5 | # username: 6 | username: spark 7 | 8 | # job_ui_port: 9 | job_ui_port: 4040 10 | 11 | # job_history_ui_port: 12 | job_history_ui_port: 18080 13 | 14 | # web_ui_port: 15 | web_ui_port: 8080 16 | 17 | # connect: 18 | connect: true 19 | 20 | # internal: 21 | internal: false 22 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/resource_monitor/start_monitor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 3 | cd $DIR 4 | 5 | mkdir /etc/telegraf 6 | cp ./etc/telegraf.conf /etc/telegraf/telegraf.conf 7 | 8 | echo "Install telegraf" 9 | curl -sL https://repos.influxdata.com/influxdb.key | apt-key add - 10 | source /etc/lsb-release 11 | echo "deb https://repos.influxdata.com/${DISTRIB_ID,,} ${DISTRIB_CODENAME} stable" | tee /etc/apt/sources.list.d/influxdb.list 12 | apt-get update && apt-get install telegraf 13 | 14 | if [ "$AZTK_IS_MASTER" = "true" ]; then 15 | echo "Create docker containers" 16 | sudo docker-compose up --no-start 17 | echo "Run the containers" 18 | sudo docker-compose start 19 | fi 20 | 21 | echo "Run telegraf" 22 | telegraf --config ./etc/telegraf.conf & 23 | -------------------------------------------------------------------------------- /aztk/spark/client/cluster/helpers/node_run.py: -------------------------------------------------------------------------------- 1 | from azure.batch.models import BatchErrorException 2 | 3 | from aztk import error 4 | from aztk.utils import helpers 5 | 6 | 7 | def node_run( 8 | core_cluster_operations, 9 | cluster_id: str, 10 | node_id: str, 11 | command: str, 12 | host=False, 13 | internal: bool = False, 14 | timeout=None, 15 | block=False, 16 | ): 17 | try: 18 | return core_cluster_operations.node_run( 19 | cluster_id, 20 | node_id, 21 | command, 22 | internal, 23 | container_name="spark" if not host else None, 24 | timeout=timeout, 25 | block=block) 26 | except BatchErrorException as e: 27 | raise error.AztkError(helpers.format_batch_exception(e)) 28 | -------------------------------------------------------------------------------- /aztk/spark/client/cluster/helpers/ssh_into_master.py: -------------------------------------------------------------------------------- 1 | from azure.batch.models import BatchErrorException 2 | 3 | from aztk import error 4 | from aztk.utils import helpers 5 | 6 | 7 | def ssh_into_master( 8 | spark_cluster_operations, 9 | core_cluster_operations, 10 | cluster_id, 11 | username, 12 | ssh_key=None, 13 | password=None, 14 | port_forward_list=None, 15 | internal=False, 16 | ): 17 | try: 18 | master_node_id = spark_cluster_operations.get(cluster_id).master_node_id 19 | core_cluster_operations.ssh_into_node(cluster_id, master_node_id, username, ssh_key, password, 20 | port_forward_list, internal) 21 | except BatchErrorException as e: 22 | raise error.AztkError(helpers.format_batch_exception(e)) 23 | -------------------------------------------------------------------------------- /aztk/spark/client/base/helpers/list_applications.py: -------------------------------------------------------------------------------- 1 | import azure.batch.models as batch_models 2 | from 
azure.batch.models import BatchErrorException 3 | 4 | from aztk import error 5 | from aztk.spark.models import Application, SchedulingTarget 6 | from aztk.utils import helpers 7 | 8 | 9 | def list_applications(core_operations, cluster_id): 10 | try: 11 | scheduling_target = core_operations.get_cluster_configuration(cluster_id).scheduling_target 12 | if scheduling_target is not SchedulingTarget.Any: 13 | tasks = core_operations.list_task_table_entries(cluster_id) 14 | else: 15 | tasks = core_operations.list_batch_tasks(cluster_id) 16 | return [Application(task) for task in tasks] 17 | except BatchErrorException as e: 18 | raise error.AztkError(helpers.format_batch_exception(e)) 19 | -------------------------------------------------------------------------------- /aztk/client/base/helpers/get_task_state.py: -------------------------------------------------------------------------------- 1 | from azure.batch.models import BatchErrorException 2 | 3 | from aztk import error 4 | from aztk.models import SchedulingTarget, TaskState 5 | from aztk.utils import helpers 6 | 7 | 8 | def get_task_state(core_cluster_operations, cluster_id: str, task_id: str): 9 | try: 10 | scheduling_target = core_cluster_operations.get_cluster_configuration(cluster_id).scheduling_target 11 | if scheduling_target is not SchedulingTarget.Any: 12 | task = core_cluster_operations.get_task_from_table(cluster_id, task_id) 13 | return task.state 14 | else: 15 | task = core_cluster_operations.get_batch_task(cluster_id, task_id) 16 | return task.state 17 | except BatchErrorException as e: 18 | raise error.AztkError(helpers.format_batch_exception(e)) 19 | -------------------------------------------------------------------------------- /docs/aztk.rst: -------------------------------------------------------------------------------- 1 | aztk package 2 | ============ 3 | 4 | .. toctree:: 5 | 6 | aztk.models 7 | aztk.spark 8 | 9 | aztk.client module 10 | ------------------ 11 | 12 | .. autoclass:: aztk.client.CoreClient 13 | :members: 14 | :undoc-members: 15 | :show-inheritance: 16 | 17 | 18 | .. autoclass:: aztk.client.base.BaseOperations 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | .. autoclass:: aztk.client.cluster.CoreClusterOperations 25 | :members: 26 | :undoc-members: 27 | :show-inheritance: 28 | 29 | 30 | .. autoclass:: aztk.client.job.CoreJobOperations 31 | :members: 32 | :undoc-members: 33 | :show-inheritance: 34 | 35 | aztk.error module 36 | ----------------- 37 | 38 | .. 
automodule:: aztk.error 39 | :members: 40 | :undoc-members: 41 | :show-inheritance: 42 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/resource_monitor/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | # Define an InfluxDB service 5 | influxdb: 6 | image: influxdb:1.3.5 7 | volumes: 8 | - ./data/influxdb:/var/lib/influxdb 9 | ports: 10 | - "8086:8086" 11 | # Define a Chronograf service 12 | chronograf: 13 | image: chronograf:1.3.8 14 | environment: 15 | INFLUXDB_URL: http://influxdb:8086 16 | KAPACITOR_URL: http://kapacitor:9092 17 | ports: 18 | - "8890:8888" 19 | links: 20 | - influxdb 21 | - kapacitor 22 | # Define a Kapacitor service 23 | kapacitor: 24 | image: kapacitor:1.3.3 25 | environment: 26 | KAPACITOR_HOSTNAME: kapacitor 27 | KAPACITOR_INFLUXDB_0_URLS_0: http://influxdb:8086 28 | links: 29 | - influxdb 30 | ports: 31 | - "9092:9092" 32 | -------------------------------------------------------------------------------- /aztk/client/cluster/helpers/list.py: -------------------------------------------------------------------------------- 1 | from aztk import models 2 | from aztk.utils import constants 3 | 4 | 5 | def list_clusters(cluster_client, software_metadata_key): 6 | """ 7 | List all the cluster on your account. 8 | """ 9 | pools = cluster_client.batch_client.pool.list() 10 | software_metadata = (constants.AZTK_SOFTWARE_METADATA_KEY, software_metadata_key) 11 | cluster_metadata = (constants.AZTK_MODE_METADATA_KEY, constants.AZTK_CLUSTER_MODE_METADATA) 12 | 13 | aztk_clusters = [] 14 | for pool in [pool for pool in pools if pool.metadata]: 15 | pool_metadata = [(metadata.name, metadata.value) for metadata in pool.metadata] 16 | if all([metadata in pool_metadata for metadata in [software_metadata, cluster_metadata]]): 17 | aztk_clusters.append(models.Cluster(pool)) 18 | return aztk_clusters 19 | -------------------------------------------------------------------------------- /aztk/spark/client/job/helpers/wait_until_complete.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import azure.batch.models as batch_models 4 | from azure.batch.models import BatchErrorException 5 | 6 | from aztk import error 7 | from aztk.utils import helpers 8 | 9 | 10 | def _wait_until_job_finished(core_job_operations, job_id): 11 | job_state = core_job_operations.batch_client.job_schedule.get(job_id).state 12 | 13 | while job_state not in [batch_models.JobScheduleState.completed, batch_models.JobScheduleState.terminating]: 14 | time.sleep(3) 15 | job_state = core_job_operations.batch_client.job_schedule.get(job_id).state 16 | 17 | 18 | def wait_until_job_finished(core_job_operations, job_id): 19 | try: 20 | _wait_until_job_finished(core_job_operations, job_id) 21 | except BatchErrorException as e: 22 | raise error.AztkError(helpers.format_batch_exception(e)) 23 | -------------------------------------------------------------------------------- /docker-image/miniconda/spark1.6.3/base/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM aztk/spark:v0.1.0-spark1.6.3-base 2 | 3 | ARG MINICONDA_VERISON=Miniconda3-4.4.10 4 | 5 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 6 | ENV PATH /opt/conda/bin:$PATH 7 | 8 | RUN apt-get update --fix-missing \ 9 | && apt-get install -y wget bzip2 ca-certificates curl git \ 10 | && apt-get clean \ 11 | && rm -rf 
/var/lib/apt/lists/* 12 | 13 | RUN wget --quiet https://repo.continuum.io/miniconda/${MINICONDA_VERISON}-Linux-x86_64.sh -O ~/miniconda.sh \ 14 | && /bin/bash ~/miniconda.sh -b -p /opt/conda \ 15 | && rm ~/miniconda.sh \ 16 | && /opt/conda/bin/conda clean -tipsy \ 17 | && ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh \ 18 | && echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc 19 | # install extras 20 | # && conda install numba pandas scikit-learn 21 | 22 | CMD ["/bin/bash"] 23 | -------------------------------------------------------------------------------- /docker-image/miniconda/spark1.6.3/gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM aztk/spark:v0.1.0-spark1.6.3-gpu 2 | 3 | ARG MINICONDA_VERISON=Miniconda3-4.4.10 4 | 5 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 6 | ENV PATH /opt/conda/bin:$PATH 7 | 8 | RUN apt-get update --fix-missing \ 9 | && apt-get install -y wget bzip2 ca-certificates curl git \ 10 | && apt-get clean \ 11 | && rm -rf /var/lib/apt/lists/* 12 | 13 | RUN wget --quiet https://repo.continuum.io/miniconda/${MINICONDA_VERISON}-Linux-x86_64.sh -O ~/miniconda.sh \ 14 | && /bin/bash ~/miniconda.sh -b -p /opt/conda \ 15 | && rm ~/miniconda.sh \ 16 | && /opt/conda/bin/conda clean -tipsy \ 17 | && ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh \ 18 | && echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc 19 | # install extras 20 | # && conda install numba pandas scikit-learn 21 | 22 | CMD ["/bin/bash"] 23 | -------------------------------------------------------------------------------- /docker-image/miniconda/spark2.1.0/base/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM aztk/spark:v0.1.0-spark2.1.0-base 2 | 3 | ARG MINICONDA_VERISON=Miniconda3-4.4.10 4 | 5 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 6 | ENV PATH /opt/conda/bin:$PATH 7 | 8 | RUN apt-get update --fix-missing \ 9 | && apt-get install -y wget bzip2 ca-certificates curl git \ 10 | && apt-get clean \ 11 | && rm -rf /var/lib/apt/lists/* 12 | 13 | RUN wget --quiet https://repo.continuum.io/miniconda/${MINICONDA_VERISON}-Linux-x86_64.sh -O ~/miniconda.sh \ 14 | && /bin/bash ~/miniconda.sh -b -p /opt/conda \ 15 | && rm ~/miniconda.sh \ 16 | && /opt/conda/bin/conda clean -tipsy \ 17 | && ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh \ 18 | && echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc 19 | # install extras 20 | # && conda install numba pandas scikit-learn 21 | 22 | CMD ["/bin/bash"] 23 | -------------------------------------------------------------------------------- /docker-image/miniconda/spark2.1.0/gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM aztk/spark:v0.1.0-spark2.1.0-gpu 2 | 3 | ARG MINICONDA_VERISON=Miniconda3-4.4.10 4 | 5 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 6 | ENV PATH /opt/conda/bin:$PATH 7 | 8 | RUN apt-get update --fix-missing \ 9 | && apt-get install -y wget bzip2 ca-certificates curl git \ 10 | && apt-get clean \ 11 | && rm -rf /var/lib/apt/lists/* 12 | 13 | RUN wget --quiet https://repo.continuum.io/miniconda/${MINICONDA_VERISON}-Linux-x86_64.sh -O ~/miniconda.sh \ 14 | && /bin/bash ~/miniconda.sh -b -p /opt/conda \ 15 | && rm ~/miniconda.sh \ 16 | && /opt/conda/bin/conda clean -tipsy \ 17 | && ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh \ 18 | && echo ". 
/opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc 19 | # install extras 20 | # && conda install numba pandas scikit-learn 21 | 22 | CMD ["/bin/bash"] 23 | -------------------------------------------------------------------------------- /docker-image/miniconda/spark2.2.0/base/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM aztk/spark:v0.1.0-spark2.2.0-base 2 | 3 | ARG MINICONDA_VERISON=Miniconda3-4.4.10 4 | 5 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 6 | ENV PATH /opt/conda/bin:$PATH 7 | 8 | RUN apt-get update --fix-missing \ 9 | && apt-get install -y wget bzip2 ca-certificates curl git \ 10 | && apt-get clean \ 11 | && rm -rf /var/lib/apt/lists/* 12 | 13 | RUN wget --quiet https://repo.continuum.io/miniconda/${MINICONDA_VERISON}-Linux-x86_64.sh -O ~/miniconda.sh \ 14 | && /bin/bash ~/miniconda.sh -b -p /opt/conda \ 15 | && rm ~/miniconda.sh \ 16 | && /opt/conda/bin/conda clean -tipsy \ 17 | && ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh \ 18 | && echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc 19 | # install extras 20 | # && conda install numba pandas scikit-learn 21 | 22 | CMD ["/bin/bash"] 23 | -------------------------------------------------------------------------------- /docker-image/miniconda/spark2.2.0/gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM aztk/spark:v0.1.0-spark2.2.0-gpu 2 | 3 | ARG MINICONDA_VERISON=Miniconda3-4.4.10 4 | 5 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 6 | ENV PATH /opt/conda/bin:$PATH 7 | 8 | RUN apt-get update --fix-missing \ 9 | && apt-get install -y wget bzip2 ca-certificates curl git \ 10 | && apt-get clean \ 11 | && rm -rf /var/lib/apt/lists/* 12 | 13 | RUN wget --quiet https://repo.continuum.io/miniconda/${MINICONDA_VERISON}-Linux-x86_64.sh -O ~/miniconda.sh \ 14 | && /bin/bash ~/miniconda.sh -b -p /opt/conda \ 15 | && rm ~/miniconda.sh \ 16 | && /opt/conda/bin/conda clean -tipsy \ 17 | && ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh \ 18 | && echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc 19 | # install extras 20 | # && conda install numba pandas scikit-learn 21 | 22 | CMD ["/bin/bash"] 23 | -------------------------------------------------------------------------------- /docker-image/miniconda/spark2.3.0/base/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM aztk/spark:v0.1.0-spark2.3.0-base 2 | 3 | ARG MINICONDA_VERISON=Miniconda3-4.4.10 4 | 5 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 6 | ENV PATH /opt/conda/bin:$PATH 7 | 8 | RUN apt-get update --fix-missing \ 9 | && apt-get install -y wget bzip2 ca-certificates curl git \ 10 | && apt-get clean \ 11 | && rm -rf /var/lib/apt/lists/* 12 | 13 | RUN wget --quiet https://repo.continuum.io/miniconda/${MINICONDA_VERISON}-Linux-x86_64.sh -O ~/miniconda.sh \ 14 | && /bin/bash ~/miniconda.sh -b -p /opt/conda \ 15 | && rm ~/miniconda.sh \ 16 | && /opt/conda/bin/conda clean -tipsy \ 17 | && ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh \ 18 | && echo ". 
/opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc 19 | # install extras 20 | # && conda install numba pandas scikit-learn 21 | 22 | CMD ["/bin/bash"] 23 | -------------------------------------------------------------------------------- /docker-image/miniconda/spark2.3.0/gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM aztk/spark:v0.1.0-spark2.3.0-gpu 2 | 3 | ARG MINICONDA_VERISON=Miniconda3-4.4.10 4 | 5 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 6 | ENV PATH /opt/conda/bin:$PATH 7 | 8 | RUN apt-get update --fix-missing \ 9 | && apt-get install -y wget bzip2 ca-certificates curl git \ 10 | && apt-get clean \ 11 | && rm -rf /var/lib/apt/lists/* 12 | 13 | RUN wget --quiet https://repo.continuum.io/miniconda/${MINICONDA_VERISON}-Linux-x86_64.sh -O ~/miniconda.sh \ 14 | && /bin/bash ~/miniconda.sh -b -p /opt/conda \ 15 | && rm ~/miniconda.sh \ 16 | && /opt/conda/bin/conda clean -tipsy \ 17 | && ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh \ 18 | && echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc 19 | # install extras 20 | # && conda install numba pandas scikit-learn 21 | 22 | CMD ["/bin/bash"] 23 | -------------------------------------------------------------------------------- /aztk/spark/client/cluster/helpers/create_user.py: -------------------------------------------------------------------------------- 1 | from azure.batch.models import BatchErrorException 2 | 3 | from aztk import error 4 | from aztk.utils import helpers 5 | 6 | 7 | def create_user( 8 | core_cluster_operations, 9 | spark_cluster_operations, 10 | cluster_id: str, 11 | username: str, 12 | password: str = None, 13 | ssh_key: str = None, 14 | ) -> str: 15 | try: 16 | cluster = spark_cluster_operations.get(cluster_id) 17 | master_node_id = cluster.master_node_id 18 | if not master_node_id: 19 | raise error.ClusterNotReadyError("The master has not yet been picked, a user cannot be added.") 20 | core_cluster_operations.create_user_on_cluster(cluster.id, cluster.nodes, username, ssh_key, password) 21 | except BatchErrorException as e: 22 | raise error.AztkError(helpers.format_batch_exception(e)) 23 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/spark_ui_proxy/configuration.py: -------------------------------------------------------------------------------- 1 | import os 2 | from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole 3 | from aztk.models.plugins.plugin_file import PluginFile 4 | 5 | dir_path = os.path.dirname(os.path.realpath(__file__)) 6 | 7 | 8 | class SparkUIProxyPlugin(PluginConfiguration): 9 | def __init__(self): 10 | super().__init__( 11 | name="spark_ui_proxy", 12 | ports=[PluginPort(internal=9999, public=True)], 13 | target_role=PluginTargetRole.Master, 14 | execute="spark_ui_proxy.sh", 15 | args=["localhost:8080", "9999"], 16 | files=[ 17 | PluginFile("spark_ui_proxy.sh", os.path.join(dir_path, "spark_ui_proxy.sh")), 18 | PluginFile("spark_ui_proxy.py", os.path.join(dir_path, "spark_ui_proxy.py")), 19 | ], 20 | ) 21 | -------------------------------------------------------------------------------- /aztk/client/base/helpers/get_remote_login_settings.py: -------------------------------------------------------------------------------- 1 | from azure.batch.models import BatchErrorException 2 | 3 | from aztk import error, models 4 | from aztk.utils import helpers 5 | 6 | 7 | def _get_remote_login_settings(base_client, pool_id: str, 
node_id: str): 8 | """ 9 | Get the remote_login_settings for node 10 | :param pool_id 11 | :param node_id 12 | :returns aztk.models.RemoteLogin 13 | """ 14 | result = base_client.batch_client.compute_node.get_remote_login_settings(pool_id, node_id) 15 | return models.RemoteLogin(ip_address=result.remote_login_ip_address, port=str(result.remote_login_port)) 16 | 17 | 18 | def get_remote_login_settings(base_client, cluster_id: str, node_id: str): 19 | try: 20 | return _get_remote_login_settings(base_client, cluster_id, node_id) 21 | except BatchErrorException as e: 22 | raise error.AztkError(helpers.format_batch_exception(e)) 23 | -------------------------------------------------------------------------------- /aztk/spark/client/cluster/helpers/copy.py: -------------------------------------------------------------------------------- 1 | from azure.batch.models import BatchErrorException 2 | 3 | from aztk import error 4 | from aztk.utils import helpers 5 | 6 | 7 | def cluster_copy( 8 | core_cluster_operations, 9 | cluster_id: str, 10 | source_path: str, 11 | destination_path: str, 12 | host: bool = False, 13 | internal: bool = False, 14 | timeout: int = None, 15 | ): 16 | try: 17 | container_name = None if host else "spark" 18 | return core_cluster_operations.copy( 19 | cluster_id, 20 | source_path, 21 | destination_path=destination_path, 22 | container_name=container_name, 23 | get=False, 24 | internal=internal, 25 | timeout=timeout, 26 | ) 27 | except BatchErrorException as e: 28 | raise error.AztkError(helpers.format_batch_exception(e)) 29 | -------------------------------------------------------------------------------- /tests/integration_tests/spark/sdk/wait_for_all_nodes.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import azure.batch.models as batch_models 4 | 5 | from aztk.error import AztkError 6 | 7 | 8 | def wait_for_all_nodes(spark_client, id, nodes): 9 | nodes = [node for node in nodes] 10 | start_time = time.time() 11 | while (time.time() - start_time) < 300: 12 | if any([ 13 | node.state in [batch_models.ComputeNodeState.unusable, batch_models.ComputeNodeState.start_task_failed] 14 | for node in nodes 15 | ]): 16 | raise AztkError("A node is unusable or had its start task fail.") 17 | 18 | if not all(node.state in [batch_models.ComputeNodeState.idle, batch_models.ComputeNodeState.running] 19 | for node in nodes): 20 | nodes = [node for node in spark_client.cluster.get(id).nodes] 21 | time.sleep(1) 22 | else: 23 | break 24 | -------------------------------------------------------------------------------- /aztk/spark/client/cluster/helpers/download.py: -------------------------------------------------------------------------------- 1 | from azure.batch.models import BatchErrorException 2 | 3 | from aztk import error 4 | from aztk.utils import helpers 5 | 6 | 7 | def cluster_download( 8 | core_cluster_operations, 9 | cluster_id: str, 10 | source_path: str, 11 | destination_path: str = None, 12 | host: bool = False, 13 | internal: bool = False, 14 | timeout: int = None, 15 | ): 16 | try: 17 | container_name = None if host else "spark" 18 | return core_cluster_operations.copy( 19 | cluster_id, 20 | source_path, 21 | destination_path=destination_path, 22 | container_name=container_name, 23 | get=True, 24 | internal=internal, 25 | timeout=timeout, 26 | ) 27 | except BatchErrorException as e: 28 | raise error.AztkError(helpers.format_batch_exception(e)) 29 | 
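[Editor's note: a brief sketch, not a repository file, of how the wait_for_all_nodes integration-test helper shown above might be combined with the client; the cluster id is a hypothetical placeholder and the tests package is assumed to be importable from the repository root.]

import aztk.spark
from aztk_cli import config
from tests.integration_tests.spark.sdk.wait_for_all_nodes import wait_for_all_nodes

# Fetch the cluster, then block until every node is idle or running; the helper
# re-polls for up to ~300 seconds and raises AztkError if a node is unusable or
# its start task failed.
spark_client = aztk.spark.Client(config.load_aztk_secrets())
cluster = spark_client.cluster.get("my-test-cluster")
wait_for_all_nodes(spark_client, cluster.id, cluster.nodes)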
-------------------------------------------------------------------------------- /aztk_cli/spark/endpoints/spark.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import typing 3 | 4 | from . import init 5 | from .cluster import cluster 6 | from .job import job 7 | 8 | 9 | def setup_parser(parser: argparse.ArgumentParser): 10 | subparsers = parser.add_subparsers(title="Actions", dest="action", metavar="") 11 | subparsers.required = True 12 | 13 | cluster_parser = subparsers.add_parser("cluster", help="Commands to manage a cluster") 14 | job_parser = subparsers.add_parser("job", help="Commands to manage a Job") 15 | init_parser = subparsers.add_parser("init", help="Initialize your environment") 16 | 17 | cluster.setup_parser(cluster_parser) 18 | job.setup_parser(job_parser) 19 | init.setup_parser(init_parser) 20 | 21 | 22 | def execute(args: typing.NamedTuple): 23 | actions = dict(cluster=cluster.execute, job=job.execute, init=init.execute) 24 | func = actions[args.action] 25 | func(args) 26 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=aztk 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /aztk/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .application_log import ApplicationLog 2 | from .cluster import Cluster 3 | from .cluster_configuration import ClusterConfiguration 4 | from .cluster_state import ClusterState 5 | from .file import File 6 | from .file_share import FileShare 7 | from .node_output import NodeOutput 8 | from .plugins import * 9 | from .port_forward_specification import PortForwardingSpecification 10 | from .remote_login import RemoteLogin 11 | from .scheduling_target import SchedulingTarget 12 | from .secrets_configuration import (DockerConfiguration, SecretsConfiguration, ServicePrincipalConfiguration, 13 | SharedKeyConfiguration) 14 | from .software import Software 15 | from .ssh_log import SSHLog 16 | from .task import Task 17 | from .task_state import TaskState 18 | from .toolkit import TOOLKIT_MAP, Toolkit 19 | from .user_configuration import UserConfiguration 20 | from .vm_image import VmImage 21 | -------------------------------------------------------------------------------- /docker-image/anaconda/spark1.6.3/gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM aztk/spark:v0.1.0-spark1.6.3-gpu 2 | 3 | ARG ANACONDA_VERSION=Anaconda3-5.1.0 4 | 5 | ENV PATH /opt/conda/bin:$PATH 6 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 7 | 8 | RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \ 9 | libglib2.0-0 libxext6 libsm6 libxrender1 \ 10 | git mercurial subversion \ 11 | && apt-get clean \ 12 | && rm -rf /var/lib/apt/lists/* 13 | 14 | RUN wget --quiet https://repo.continuum.io/archive/${ANACONDA_VERSION}-Linux-x86_64.sh -O ~/anaconda.sh \ 15 | && /bin/bash ~/anaconda.sh -b -p /opt/conda \ 16 | && rm ~/anaconda.sh \ 17 | && ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh \ 18 | && echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc \ 19 | # reset default python to 3.5 20 | && rm /usr/bin/python \ 21 | && ln -s /usr/bin/python3.5 /usr/bin/python 22 | 23 | CMD ["/bin/bash"] 24 | -------------------------------------------------------------------------------- /docker-image/anaconda/spark2.1.0/gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM aztk/spark:v0.1.0-spark2.1.0-gpu 2 | 3 | ARG ANACONDA_VERSION=Anaconda3-5.1.0 4 | 5 | ENV PATH /opt/conda/bin:$PATH 6 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 7 | 8 | RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \ 9 | libglib2.0-0 libxext6 libsm6 libxrender1 \ 10 | git mercurial subversion \ 11 | && apt-get clean \ 12 | && rm -rf /var/lib/apt/lists/* 13 | 14 | RUN wget --quiet https://repo.continuum.io/archive/${ANACONDA_VERSION}-Linux-x86_64.sh -O ~/anaconda.sh \ 15 | && /bin/bash ~/anaconda.sh -b -p /opt/conda \ 16 | && rm ~/anaconda.sh \ 17 | && ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh \ 18 | && echo ". 
/opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc \ 19 | # reset default python to 3.5 20 | && rm /usr/bin/python \ 21 | && ln -s /usr/bin/python3.5 /usr/bin/python 22 | 23 | CMD ["/bin/bash"] 24 | -------------------------------------------------------------------------------- /docker-image/anaconda/spark2.2.0/gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM aztk/spark:v0.1.0-spark2.2.0-gpu 2 | 3 | ARG ANACONDA_VERSION=Anaconda3-5.1.0 4 | 5 | ENV PATH /opt/conda/bin:$PATH 6 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 7 | 8 | RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \ 9 | libglib2.0-0 libxext6 libsm6 libxrender1 \ 10 | git mercurial subversion \ 11 | && apt-get clean \ 12 | && rm -rf /var/lib/apt/lists/* 13 | 14 | RUN wget --quiet https://repo.continuum.io/archive/${ANACONDA_VERSION}-Linux-x86_64.sh -O ~/anaconda.sh \ 15 | && /bin/bash ~/anaconda.sh -b -p /opt/conda \ 16 | && rm ~/anaconda.sh \ 17 | && ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh \ 18 | && echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc \ 19 | # reset default python to 3.5 20 | && rm /usr/bin/python \ 21 | && ln -s /usr/bin/python3.5 /usr/bin/python 22 | 23 | CMD ["/bin/bash"] 24 | -------------------------------------------------------------------------------- /docker-image/anaconda/spark2.3.0/gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM aztk/spark:v0.1.0-spark2.3.0-gpu 2 | 3 | ARG ANACONDA_VERSION=Anaconda3-5.1.0 4 | 5 | ENV PATH /opt/conda/bin:$PATH 6 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 7 | 8 | RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \ 9 | libglib2.0-0 libxext6 libsm6 libxrender1 \ 10 | git mercurial subversion \ 11 | && apt-get clean \ 12 | && rm -rf /var/lib/apt/lists/* 13 | 14 | RUN wget --quiet https://repo.continuum.io/archive/${ANACONDA_VERSION}-Linux-x86_64.sh -O ~/anaconda.sh \ 15 | && /bin/bash ~/anaconda.sh -b -p /opt/conda \ 16 | && rm ~/anaconda.sh \ 17 | && ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh \ 18 | && echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc \ 19 | # reset default python to 3.5 20 | && rm /usr/bin/python \ 21 | && ln -s /usr/bin/python3.5 /usr/bin/python 22 | 23 | CMD ["/bin/bash"] 24 | -------------------------------------------------------------------------------- /docker-image/anaconda/spark1.6.3/base/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM aztk/spark:v0.1.0-spark1.6.3-base 2 | 3 | ARG ANACONDA_VERSION=Anaconda3-5.1.0 4 | 5 | ENV PATH /opt/conda/bin:$PATH 6 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 7 | 8 | RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \ 9 | libglib2.0-0 libxext6 libsm6 libxrender1 \ 10 | git mercurial subversion \ 11 | && apt-get clean \ 12 | && rm -rf /var/lib/apt/lists/* 13 | 14 | RUN wget --quiet https://repo.continuum.io/archive/${ANACONDA_VERSION}-Linux-x86_64.sh -O ~/anaconda.sh \ 15 | && /bin/bash ~/anaconda.sh -b -p /opt/conda \ 16 | && rm ~/anaconda.sh \ 17 | && ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh \ 18 | && echo ". 
/opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc \ 19 | # reset default python to 3.5 20 | && rm /usr/bin/python \ 21 | && ln -s /usr/bin/python3.5 /usr/bin/python 22 | 23 | CMD ["/bin/bash"] 24 | -------------------------------------------------------------------------------- /docker-image/anaconda/spark2.1.0/base/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM aztk/spark:v0.1.0-spark2.1.0-base 2 | 3 | ARG ANACONDA_VERSION=Anaconda3-5.1.0 4 | 5 | ENV PATH /opt/conda/bin:$PATH 6 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 7 | 8 | RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \ 9 | libglib2.0-0 libxext6 libsm6 libxrender1 \ 10 | git mercurial subversion \ 11 | && apt-get clean \ 12 | && rm -rf /var/lib/apt/lists/* 13 | 14 | RUN wget --quiet https://repo.continuum.io/archive/${ANACONDA_VERSION}-Linux-x86_64.sh -O ~/anaconda.sh \ 15 | && /bin/bash ~/anaconda.sh -b -p /opt/conda \ 16 | && rm ~/anaconda.sh \ 17 | && ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh \ 18 | && echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc \ 19 | # reset default python to 3.5 20 | && rm /usr/bin/python \ 21 | && ln -s /usr/bin/python3.5 /usr/bin/python 22 | 23 | CMD ["/bin/bash"] 24 | -------------------------------------------------------------------------------- /docker-image/anaconda/spark2.2.0/base/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM aztk/spark:v0.1.0-spark2.2.0-base 2 | 3 | ARG ANACONDA_VERSION=Anaconda3-5.1.0 4 | 5 | ENV PATH /opt/conda/bin:$PATH 6 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 7 | 8 | RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \ 9 | libglib2.0-0 libxext6 libsm6 libxrender1 \ 10 | git mercurial subversion \ 11 | && apt-get clean \ 12 | && rm -rf /var/lib/apt/lists/* 13 | 14 | RUN wget --quiet https://repo.continuum.io/archive/${ANACONDA_VERSION}-Linux-x86_64.sh -O ~/anaconda.sh \ 15 | && /bin/bash ~/anaconda.sh -b -p /opt/conda \ 16 | && rm ~/anaconda.sh \ 17 | && ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh \ 18 | && echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc \ 19 | # reset default python to 3.5 20 | && rm /usr/bin/python \ 21 | && ln -s /usr/bin/python3.5 /usr/bin/python 22 | 23 | CMD ["/bin/bash"] 24 | -------------------------------------------------------------------------------- /docker-image/anaconda/spark2.3.0/base/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM aztk/spark:v0.1.0-spark2.3.0-base 2 | 3 | ARG ANACONDA_VERSION=Anaconda3-5.1.0 4 | 5 | ENV PATH /opt/conda/bin:$PATH 6 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 7 | 8 | RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \ 9 | libglib2.0-0 libxext6 libsm6 libxrender1 \ 10 | git mercurial subversion \ 11 | && apt-get clean \ 12 | && rm -rf /var/lib/apt/lists/* 13 | 14 | RUN wget --quiet https://repo.continuum.io/archive/${ANACONDA_VERSION}-Linux-x86_64.sh -O ~/anaconda.sh \ 15 | && /bin/bash ~/anaconda.sh -b -p /opt/conda \ 16 | && rm ~/anaconda.sh \ 17 | && ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh \ 18 | && echo ". 
/opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc \ 19 | # reset default python to 3.5 20 | && rm /usr/bin/python \ 21 | && ln -s /usr/bin/python3.5 /usr/bin/python 22 | 23 | CMD ["/bin/bash"] 24 | -------------------------------------------------------------------------------- /tests/spark/models/test_spark_configuration.py: -------------------------------------------------------------------------------- 1 | from aztk.spark.models import SparkConfiguration 2 | from aztk.error import InvalidModelError 3 | 4 | 5 | def test_spark_configuration_defaults(): 6 | spark_configuration = SparkConfiguration() 7 | spark_configuration.validate() 8 | 9 | assert spark_configuration.spark_defaults_conf is None 10 | assert spark_configuration.spark_env_sh is None 11 | assert spark_configuration.core_site_xml is None 12 | 13 | 14 | def test_spark_configuration_fields(): 15 | spark_configuration = SparkConfiguration( 16 | spark_defaults_conf="spark-defaults.conf", 17 | spark_env_sh="spark-env.sh", 18 | core_site_xml="core-site.xml", 19 | ) 20 | spark_configuration.validate() 21 | 22 | assert spark_configuration.spark_defaults_conf == "spark-defaults.conf" 23 | assert spark_configuration.spark_env_sh == "spark-env.sh" 24 | assert spark_configuration.core_site_xml == "core-site.xml" 25 | -------------------------------------------------------------------------------- /custom-scripts/rstudio_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This custom script only works on images where rstudio server is pre-installed on the Docker image 4 | # 5 | # This custom script has been tested to work on the following docker images: 6 | # - jiata/aztk-r:0.1.0-spark2.2.0-r3.4.1 7 | # - jiata/aztk-r:0.1.0-spark2.1.0-r3.4.1 8 | # - jiata/aztk-r:0.1.0-spark1.6.3-r3.4.1 9 | 10 | if [ "$AZTK_IS_MASTER" = "true" ]; then 11 | 12 | ## Download and install Rstudio Server 13 | wget https://download2.rstudio.org/rstudio-server-$RSTUDIO_SERVER_VERSION-amd64.deb 14 | gdebi rstudio-server-$RSTUDIO_SERVER_VERSION-amd64.deb --non-interactive 15 | echo "server-app-armor-enabled=0" | tee -a /etc/rstudio/rserver.conf 16 | rm rstudio-server-$RSTUDIO_SERVER_VERSION-amd64.deb 17 | 18 | ## Preparing default user for Rstudio Server 19 | set -e 20 | useradd -m -d /home/rstudio rstudio -g staff 21 | echo rstudio:rstudio | chpasswd 22 | 23 | rstudio-server start 24 | 25 | fi 26 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/resource_monitor/configuration.py: -------------------------------------------------------------------------------- 1 | import os 2 | from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTarget, PluginTargetRole 3 | from aztk.models.plugins.plugin_file import PluginFile 4 | 5 | dir_path = os.path.dirname(os.path.realpath(__file__)) 6 | 7 | 8 | class ResourceMonitorPlugin(PluginConfiguration): 9 | def __init__(self): 10 | super().__init__( 11 | name="resource_monitor", 12 | ports=[PluginPort(internal=8890, public=True)], 13 | target=PluginTarget.Host, 14 | target_role=PluginTargetRole.All, 15 | execute="start_monitor.sh", 16 | files=[ 17 | PluginFile("start_monitor.sh", os.path.join(dir_path, "start_monitor.sh")), 18 | PluginFile("etc/telegraf.conf", os.path.join(dir_path, "telegraf.conf")), 19 | PluginFile("docker-compose.yml", os.path.join(dir_path, "docker-compose.yml")), 20 | ], 21 | ) 22 |
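The resource monitor configuration above shows the shape every AZTK plugin follows: a `PluginConfiguration` subclass declaring a name, the ports to expose, a target role, an entry-point script, and the `PluginFile` objects to upload. A minimal sketch of a custom plugin built from the same classes is shown below; the `my_tool` name, port, and `my_tool.sh` script are hypothetical placeholders, not files that ship with AZTK.

```python
# Hypothetical sketch of a custom plugin, modelled on the resource_monitor
# configuration above. "my_tool" and "my_tool.sh" are placeholders, not part of AZTK.
import os

from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole
from aztk.models.plugins.plugin_file import PluginFile

dir_path = os.path.dirname(os.path.realpath(__file__))


class MyToolPlugin(PluginConfiguration):
    def __init__(self):
        super().__init__(
            name="my_tool",
            ports=[PluginPort(internal=9999, public=True)],    # expose the tool's web UI
            target_role=PluginTargetRole.All,                  # run on the master and all workers
            execute="my_tool.sh",                              # entry point, relative to the plugin working dir
            files=[PluginFile("my_tool.sh", os.path.join(dir_path, "my_tool.sh"))],
        )
```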
-------------------------------------------------------------------------------- /aztk/client/base/helpers/list_tasks.py: -------------------------------------------------------------------------------- 1 | from aztk.models import SchedulingTarget 2 | 3 | from .get_recent_job import get_recent_job 4 | from .task_table import list_task_table_entries 5 | 6 | 7 | def list_tasks(core_base_operations, id): 8 | """List all tasks on a job or cluster 9 | 10 | This will work for both Batch scheduling and scheduling_target 11 | 12 | Args: 13 | id: cluster or job id 14 | Returns: 15 | List[aztk.models.Task] 16 | 17 | """ 18 | scheduling_target = core_base_operations.get_cluster_configuration(id).scheduling_target 19 | if scheduling_target is not SchedulingTarget.Any: 20 | return list_task_table_entries(core_base_operations.table_service, id) 21 | else: 22 | # note: this currently only works for job_schedules 23 | # cluster impl is planned to move to job schedules 24 | recent_run_job = get_recent_job(core_base_operations, id) 25 | tasks = core_base_operations.list_batch_tasks(id=recent_run_job.id) 26 | return tasks 27 | -------------------------------------------------------------------------------- /aztk/utils/get_ssh_key.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def get_user_public_key(key_or_path: str = None, secrets_config=None): 5 | """ 6 | Return the ssh key. 7 | It will first check if the given argument is a ssh key or a path to one 8 | otherwise will check the configuration file. 9 | """ 10 | if not key_or_path: 11 | if not secrets_config.ssh_pub_key: 12 | return None 13 | 14 | key_or_path = secrets_config.ssh_pub_key 15 | 16 | if not key_or_path: 17 | return None 18 | 19 | key = None 20 | if os.path.isfile(os.path.expanduser(key_or_path)): 21 | key = __read_ssh_key_from_file(key_or_path) 22 | else: 23 | key = key_or_path 24 | 25 | return key 26 | 27 | 28 | def __read_ssh_key_from_file(path: str) -> str: 29 | """ 30 | Read the content of the given file 31 | """ 32 | with open(os.path.expanduser(path), "r", encoding="UTF-8") as content_file: 33 | content = content_file.read() 34 | return content 35 | -------------------------------------------------------------------------------- /aztk/node_scripts/docker_main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This file is the entry point of the docker container. 
4 | 5 | set -e 6 | source ~/.bashrc 7 | echo "Initializing spark container" 8 | 9 | aztk_dir=$AZTK_WORKING_DIR/aztk 10 | 11 | # ----------------------- 12 | # Preload jupyter samples 13 | # ----------------------- 14 | mkdir -p /mnt/samples 15 | 16 | # add all files from 'jupyter-samples' to container folder '/pyspark/samples' 17 | for file in $(dirname $0)/jupyter-samples/*; do 18 | cp $file /mnt/samples 19 | done 20 | 21 | # ---------------------------- 22 | # Run aztk setup python scripts 23 | # ---------------------------- 24 | # setup docker container 25 | echo "Starting setup using Docker" 26 | 27 | export PYTHONPATH=$PYTHONPATH:$AZTK_WORKING_DIR 28 | echo 'export PYTHONPATH=$PYTHONPATH:$AZTK_WORKING_DIR' >> ~/.bashrc 29 | 30 | echo "Running main.py script" 31 | $AZTK_WORKING_DIR/.aztk-env/.venv/bin/python $(dirname $0)/main.py setup-spark-container 32 | 33 | # sleep to keep container running 34 | while true; do sleep 1; done 35 | -------------------------------------------------------------------------------- /aztk/models/cluster.py: -------------------------------------------------------------------------------- 1 | import azure.batch.models as batch_models 2 | 3 | from .cluster_state import ClusterState 4 | 5 | 6 | class Cluster: 7 | def __init__(self, pool: batch_models.CloudPool, nodes: batch_models.ComputeNodePaged = None): 8 | self.id = pool.id 9 | self.pool = pool 10 | self.nodes = nodes 11 | self.vm_size = pool.vm_size 12 | if pool.state is batch_models.PoolState.active: 13 | self.state = ClusterState(pool.allocation_state.value) 14 | else: 15 | self.state = ClusterState(pool.state.value) 16 | self.total_current_nodes = pool.current_dedicated_nodes + pool.current_low_priority_nodes 17 | self.total_target_nodes = pool.target_dedicated_nodes + pool.target_low_priority_nodes 18 | self.current_dedicated_nodes = pool.current_dedicated_nodes 19 | self.current_low_pri_nodes = pool.current_low_priority_nodes 20 | self.target_dedicated_nodes = pool.target_dedicated_nodes 21 | self.target_low_pri_nodes = pool.target_low_priority_nodes 22 | -------------------------------------------------------------------------------- /aztk/utils/secure_utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | 4 | from Cryptodome.Cipher import AES, PKCS1_OAEP 5 | from Cryptodome.PublicKey import RSA 6 | from Cryptodome.Random import get_random_bytes 7 | 8 | 9 | def encrypt_password(ssh_pub_key, password): 10 | if not password: 11 | return [None, None, None, None] 12 | recipient_key = RSA.import_key(ssh_pub_key) 13 | session_key = get_random_bytes(16) 14 | 15 | # Encrypt the session key with the public RSA key 16 | cipher_rsa = PKCS1_OAEP.new(recipient_key) 17 | encrypted_aes_session_key = cipher_rsa.encrypt(session_key) 18 | 19 | # Encrypt the data with the AES session key 20 | cipher_aes = AES.new(session_key, AES.MODE_EAX) 21 | ciphertext, tag = cipher_aes.encrypt_and_digest(password.encode()) 22 | return [encrypted_aes_session_key, cipher_aes.nonce, tag, ciphertext] 23 | 24 | 25 | def generate_random_string(charset=string.ascii_uppercase + string.ascii_lowercase, length=16): 26 | return "".join(random.SystemRandom().choice(charset) for _ in range(length)) 27 | -------------------------------------------------------------------------------- /aztk_cli/config/secrets.yaml.template: -------------------------------------------------------------------------------- 1 | # For instructions on creating a Batch and Storage account, see 
2 | # Getting Started (http://aztk.readthedocs.io/en/latest/00-getting-started.html) 3 | # NOTE - YAML requires a space after the colon. Ex: "batchaccountname: mybatchaccount" 4 | 5 | service_principal: 6 | tenant_id: 7 | client_id: 8 | credential: 9 | batch_account_resource_id: 10 | storage_account_resource_id: 11 | 12 | # shared_key: 13 | # batch_account_name: 14 | # batch_account_key: 15 | # batch_service_url: 16 | # storage_account_name: 17 | # storage_account_key: 18 | # storage_account_suffix: core.windows.net 19 | 20 | 21 | # Configuration for private docker repositories. If using public containers you do not need to provide authentication 22 | docker: 23 | # username: 24 | # password: 25 | # endpoint: 26 | 27 | 28 | # SSH keys used to create a user and connect to a server. 29 | # The public key can either be the public key itself (ssh-rsa ...) or the path to the ssh key. 30 | # ssh_pub_key: ~/.ssh/id_rsa.pub 31 | -------------------------------------------------------------------------------- /aztk/client/base/helpers/ssh_into_node.py: -------------------------------------------------------------------------------- 1 | import aztk.models as models 2 | from aztk.utils import ssh as ssh_lib 3 | 4 | 5 | def ssh_into_node(base_client, 6 | pool_id, 7 | node_id, 8 | username, 9 | ssh_key=None, 10 | password=None, 11 | port_forward_list=None, 12 | internal=False): 13 | if internal: 14 | result = base_client.batch_client.compute_node.get(pool_id=pool_id, node_id=node_id) 15 | rls = models.RemoteLogin(ip_address=result.ip_address, port="22") 16 | else: 17 | result = base_client.batch_client.compute_node.get_remote_login_settings(pool_id, node_id) 18 | rls = models.RemoteLogin(ip_address=result.remote_login_ip_address, port=str(result.remote_login_port)) 19 | 20 | ssh_lib.node_ssh( 21 | username=username, 22 | hostname=rls.ip_address, 23 | port=rls.port, 24 | ssh_key=ssh_key, 25 | password=password, 26 | port_forward_list=port_forward_list, 27 | ) 28 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/rstudio_server/rstudio_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This custom script only works on images where rstudio server is pre-installed on the Docker image 4 | # 5 | # This custom script has been tested to work on the following docker images: 6 | # - jiata/aztk-r:0.1.0-spark2.2.0-r3.4.1 7 | # - jiata/aztk-r:0.1.0-spark2.1.0-r3.4.1 8 | # - jiata/aztk-r:0.1.0-spark1.6.3-r3.4.1 9 | 10 | if [ "$AZTK_IS_MASTER" = "true" ]; then 11 | 12 | ## Download and install Rstudio Server 13 | wget https://download2.rstudio.org/rstudio-server-$RSTUDIO_SERVER_VERSION-amd64.deb 14 | apt-get install -y --no-install-recommends gdebi-core 15 | gdebi rstudio-server-$RSTUDIO_SERVER_VERSION-amd64.deb --non-interactive 16 | echo "server-app-armor-enabled=0" | tee -a /etc/rstudio/rserver.conf 17 | rm rstudio-server-$RSTUDIO_SERVER_VERSION-amd64.deb 18 | 19 | ## Preparing default user for Rstudio Server 20 | set -e 21 | useradd -m -d /home/rstudio rstudio -g staff 22 | echo rstudio:rstudio | chpasswd 23 | 24 | rstudio-server start 25 | 26 | fi 27 | -------------------------------------------------------------------------------- /aztk_cli/spark/endpoints/job/get_app_logs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import typing 4 | 5 | import aztk.spark 6 | from aztk_cli import config, log, utils 7 | 8 | 9 | 
def setup_parser(parser: argparse.ArgumentParser): 10 | parser.add_argument("--id", dest="job_id", required=True, help="The unique id of your AZTK job") 11 | parser.add_argument("--name", dest="app_name", required=True, help="The unique id of your job name") 12 | parser.add_argument( 13 | "--output", 14 | help="Path to the file you wish to output to. If not \ 15 | specified, output is printed to stdout", 16 | ) 17 | 18 | 19 | def execute(args: typing.NamedTuple): 20 | spark_client = aztk.spark.Client(config.load_aztk_secrets()) 21 | app_log = spark_client.job.get_application_log(args.job_id, args.app_name) 22 | if args.output: 23 | with utils.Spinner(): 24 | with open(os.path.abspath(os.path.expanduser(args.output)), "w", encoding="UTF-8") as f: 25 | f.write(app_log.log) 26 | else: 27 | log.print(app_log.log) 28 | -------------------------------------------------------------------------------- /aztk_cli/plugins.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import typing 3 | 4 | from aztk.models.plugins.internal import plugin_manager 5 | from aztk_cli import log 6 | 7 | 8 | def setup_parser(_: argparse.ArgumentParser): 9 | pass 10 | 11 | 12 | def execute(args: typing.NamedTuple): 13 | plugins = plugin_manager.plugins 14 | log.info("------------------------------------------------------") 15 | log.info(" Plugins (%i available)", len(plugins)) 16 | log.info("------------------------------------------------------") 17 | for name, plugin in plugins.items(): 18 | log.info("- %s", name) 19 | args = plugin_manager.get_args_for(plugin) 20 | if args: 21 | log.info(" Arguments:") 22 | for arg in args.values(): 23 | log.info(" - %s", arg_str(arg)) 24 | else: 25 | log.info(" Arguments: None") 26 | log.info("") 27 | 28 | 29 | def arg_str(arg): 30 | required = "Required" if arg.required else "Optional(Default: {0})".format(arg.default) 31 | return "{0}: {1}".format(arg.name, required) 32 | -------------------------------------------------------------------------------- /aztk/internal/cluster_data/blob_data.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import azure.batch.models as batch_models 4 | from azure.storage.blob import BlobPermissions, BlockBlobService 5 | 6 | 7 | class BlobData: 8 | """ 9 | Object mapping to a blob entry. 
Can generate resource files for batch 10 | """ 11 | 12 | def __init__(self, blob_client: BlockBlobService, container: str, blob: str): 13 | self.container = container 14 | self.blob = blob 15 | self.dest = blob 16 | self.blob_client = blob_client 17 | 18 | def to_resource_file(self, dest: str = None) -> batch_models.ResourceFile: 19 | sas_token = self.blob_client.generate_blob_shared_access_signature( 20 | self.container, 21 | self.blob, 22 | permission=BlobPermissions.READ, 23 | expiry=datetime.datetime.utcnow() + datetime.timedelta(days=365), 24 | ) 25 | 26 | sas_url = self.blob_client.make_blob_url(self.container, self.blob, sas_token=sas_token) 27 | 28 | return batch_models.ResourceFile(file_path=dest or self.dest, blob_source=sas_url) 29 | -------------------------------------------------------------------------------- /tests/integration_tests/spark/sdk/clean_up_cluster.py: -------------------------------------------------------------------------------- 1 | import azure.batch.models as batch_models 2 | from azure.batch.models import BatchErrorException 3 | 4 | from aztk.error import AztkError 5 | 6 | 7 | def clean_up_cluster(spark_client, id): 8 | try: 9 | cluster = spark_client.cluster.get(id) 10 | nodes = [node for node in cluster.nodes] 11 | if not any([ 12 | node.state in [batch_models.ComputeNodeState.unusable, batch_models.ComputeNodeState.start_task_failed] 13 | for node in nodes 14 | ]): 15 | spark_client.cluster.delete(id=id) 16 | except (BatchErrorException, AztkError) as e: 17 | # pass in the event that the cluster does not exist 18 | print(str(e)) 19 | acceptable_failures = [ 20 | "The specified job has been marked for deletion and is being garbage collected.", 21 | "The specified pool has been marked for deletion and is being reclaimed." 
22 | ] 23 | if any(item in str(e) for item in acceptable_failures): 24 | pass 25 | else: 26 | raise e 27 | -------------------------------------------------------------------------------- /tests/utils/test_command_builder.py: -------------------------------------------------------------------------------- 1 | from aztk.utils.command_builder import CommandBuilder 2 | 3 | 4 | def test_only_command(): 5 | cmd = CommandBuilder("ssh") 6 | assert cmd.to_str() == "ssh" 7 | 8 | 9 | def test_with_option(): 10 | cmd = CommandBuilder("ssh") 11 | cmd.add_option("-L", "8080:localhost:8080") 12 | assert cmd.to_str() == "ssh -L 8080:localhost:8080" 13 | 14 | 15 | def test_with_multiple_options(): 16 | cmd = CommandBuilder("ssh") 17 | cmd.add_option("-L", "8080:localhost:8080") 18 | cmd.add_option("-p", "2020") 19 | assert cmd.to_str() == "ssh -L 8080:localhost:8080 -p 2020" 20 | 21 | 22 | def test_with_arg_and_option(): 23 | cmd = CommandBuilder("ssh") 24 | cmd.add_argument("admin@1.2.3.4") 25 | cmd.add_option("-p", "2020") 26 | assert cmd.to_str() == "ssh -p 2020 admin@1.2.3.4" 27 | 28 | 29 | def test_with_disabled_options(): 30 | cmd = CommandBuilder("ssh") 31 | 32 | cmd.add_option("--verbose", enable=True) 33 | cmd.add_option("-p", None) 34 | cmd.add_option("-L", "8080:localhost:8080", enable=False) 35 | assert cmd.to_str() == "ssh --verbose" 36 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/hdfs/configuration.py: -------------------------------------------------------------------------------- 1 | import os 2 | from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole 3 | from aztk.models.plugins.plugin_file import PluginFile 4 | 5 | dir_path = os.path.dirname(os.path.realpath(__file__)) 6 | 7 | 8 | class HDFSPlugin(PluginConfiguration): 9 | def __init__(self): 10 | super().__init__( 11 | name="hdfs", 12 | ports=[ 13 | PluginPort(name="File system metadata operations", internal=8020), 14 | PluginPort(name="File system metadata operations(Backup)", internal=9000), 15 | PluginPort(name="Datanode data transfer", internal=50010), 16 | PluginPort(name="Datanode IPC metadata operations", internal=50020), 17 | PluginPort(name="Namenode", internal=50070, public=True), 18 | PluginPort(name="Datanodes", internal=50075, public=True), 19 | ], 20 | target_role=PluginTargetRole.All, 21 | execute="hdfs.sh", 22 | files=[PluginFile("hdfs.sh", os.path.join(dir_path, "hdfs.sh"))], 23 | ) 24 | -------------------------------------------------------------------------------- /aztk/utils/retry.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import time 3 | from enum import Enum 4 | 5 | 6 | class BackOffPolicy(Enum): 7 | linear = "linear" 8 | exponential = "exponential" 9 | 10 | 11 | def retry(retry_count=1, retry_interval=0, backoff_policy=BackOffPolicy.linear, exceptions=()): 12 | def decorator(function): 13 | @functools.wraps(function) 14 | def wrapper(*args, **kwargs): 15 | for i in range(retry_count - 1): 16 | try: 17 | return function(*args, **kwargs) 18 | except exceptions: 19 | if backoff_policy == BackOffPolicy.linear: 20 | time.sleep(i * retry_interval) 21 | if backoff_policy == BackOffPolicy.exponential: 22 | # TODO: enable logger 23 | # log.debug("{} failed, sleeping for".format(function), 2**(i * retry_interval)) 24 | time.sleep(2**(i * retry_interval)) 25 | # do not retry on the last iteration 26 | return function(*args, **kwargs) 27 | 28 | 
return wrapper 29 | 30 | return decorator 31 | -------------------------------------------------------------------------------- /aztk_cli/spark/endpoints/cluster/cluster_debug.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import time 4 | import typing 5 | 6 | import aztk.spark 7 | from aztk_cli import config, utils 8 | 9 | 10 | def setup_parser(parser: argparse.ArgumentParser): 11 | parser.add_argument("--id", dest="cluster_id", required=True, help="The unique id of your spark cluster") 12 | 13 | parser.add_argument("--output", "-o", required=False, help="the directory for the output folder") 14 | parser.add_argument( 15 | "--brief", "-b", required=False, action="store_true", help="Only gets a small subset of key logs") 16 | parser.set_defaults(brief=False) 17 | 18 | 19 | def execute(args: typing.NamedTuple): 20 | spark_client = aztk.spark.Client(config.load_aztk_secrets()) 21 | timestr = time.strftime("%Y%m%d-%H%M%S") 22 | 23 | if not args.output: 24 | args.output = os.path.join(os.getcwd(), "debug-{0}-{1}".format(args.cluster_id, timestr)) 25 | with utils.Spinner(): 26 | spark_client.cluster.diagnostics(id=args.cluster_id, output_directory=args.output, brief=args.brief) 27 | # TODO: analyze results, display some info about status 28 | -------------------------------------------------------------------------------- /docs/60-gpu.md: -------------------------------------------------------------------------------- 1 | # GPU 2 | 3 | Use GPUs to accelerate your Spark applications. When using a [GPU enabled Azure VM](https://azure.microsoft.com/en-us/pricing/details/batch/), your docker image will contain CUDA-8.0 and cuDnn-6.0 by default. See [Docker Image](./12-docker-image.html) for more information about the AZTK Docker images. 4 | 5 | [NOTE: Azure does not have GPU enabled VMs in all regions. Please use this [link](https://azure.microsoft.com/en-us/pricing/details/batch/) to make sure that your Batch account is in a region that has GPU enabled VMs] 6 | 7 | AZTK uses Nvidia-Docker to expose the VM's GPU(s) inside the container. Nvidia drivers (ver. 384) are installed at runtime. 8 | 9 | 10 | ### Tutorial 11 | 12 | Create a cluster, specifying a GPU enabled VM: 13 | ```sh 14 | aztk spark cluster create --id gpu-cluster --vm-size standard_nc6 --size 1 15 | ``` 16 | 17 | Submit an application to the cluster that will take advantage of the GPU: 18 | ```sh 19 | aztk spark cluster submit --id gpu-cluster --name gpu-app ./examples/src/main/python/gpu/nubma_example.py 20 | ``` 21 | ### Installation Location 22 | By default, CUDA is installed at `/usr/local/cuda-8.0`. 23 | -------------------------------------------------------------------------------- /docker-image/base/README.md: -------------------------------------------------------------------------------- 1 | # Base AZTK Docker image 2 | 3 | This Dockerfile is used to build the __aztk-base__ image used by this toolkit. This Dockerfile produces the Docker image that is selected by AZTK by default. 4 | 5 | You can modify this Dockerfile to build your own image. 6 | 7 | ## How to build this image 8 | This Dockerfile takes in a single variable at build time that allows you to specify your desired Spark version: **SPARK_VERSION_KEY**. 9 | 10 | By default, this image also comes with Python v3.5.4 installed, which this toolkit requires.
11 | 12 | ```sh 13 | # For example, if I want to use Spark 1.6.3 I would build the image as follows: 14 | docker build \ 15 | --build-arg SPARK_VERSION_KEY=spark-1.6.3-bin-hadoop2.6 \ 16 | -t <image_tag> . 17 | ``` 18 | 19 | **SPARK_VERSION_KEY** is used to locate which version of Spark to download. These are the values that have been tested and known to work: 20 | - spark-1.6.3-bin-hadoop2.6 21 | - spark-2.1.0-bin-hadoop2.7 22 | - spark-2.2.0-bin-hadoop2.7 23 | 24 | For a full list of supported keys, please see this [page](https://d3kbcqa49mib13.cloudfront.net) 25 | 26 | NOTE: Do not include the '.tgz' suffix as part of the Spark version key. 27 | -------------------------------------------------------------------------------- /docker-image/r/README.md: -------------------------------------------------------------------------------- 1 | # R 2 | This Dockerfile is used to build the __aztk-r__ Docker image used by this toolkit. This image uses CRAN R3.4.1, RStudio-Server v1.1.383, SparklyR and comes packaged with Tidyverse. 3 | 4 | You can modify these Dockerfiles to build your own image. However, in most cases, building on top of the __aztk-base__ image is recommended. 5 | 6 | NOTE: If you plan to use RStudio-Server, hosted on the Spark cluster's master node, with your Spark cluster, we recommend using this image. 7 | 8 | ## How to build this image 9 | This Dockerfile takes in a variable at build time that allows you to specify your desired R version: **R_VERSION**. 10 | 11 | By default, we set **R_VERSION=3.4.1**. 12 | 13 | For example, if I wanted to use R v3.4.0 with Spark v2.1.0, I would select the appropriate Dockerfile and build the image as follows: 14 | ```sh 15 | # spark2.1.0/Dockerfile 16 | docker build \ 17 | --build-arg R_VERSION=3.4.0 \ 18 | -t <image_tag> . 19 | ``` 20 | 21 | **R_VERSION** is used to set the version of R for your cluster. 22 | 23 | NOTE: Most versions of R will work. However, when selecting your R version, please make sure that it is compatible with your selected version of Spark.
24 | -------------------------------------------------------------------------------- /aztk/spark/client/job/helpers/get.py: -------------------------------------------------------------------------------- 1 | from azure.batch.models import BatchErrorException 2 | 3 | from aztk import error 4 | from aztk.spark import models 5 | from aztk.utils import helpers 6 | 7 | 8 | def _get_job(core_job_operations, job_id): 9 | job = core_job_operations.batch_client.job_schedule.get(job_id) 10 | tasks = [app for app in core_job_operations.list_tasks(id=job_id) if app.id != job_id] 11 | recent_run_job = core_job_operations.get_recent_job(job_id) 12 | pool_prefix = recent_run_job.pool_info.auto_pool_specification.auto_pool_id_prefix 13 | pool = nodes = None 14 | for cloud_pool in core_job_operations.batch_client.pool.list(): 15 | if pool_prefix in cloud_pool.id: 16 | pool = cloud_pool 17 | break 18 | if pool: 19 | nodes = core_job_operations.batch_client.compute_node.list(pool_id=pool.id) 20 | return job, tasks, pool, nodes 21 | 22 | 23 | def get_job(core_job_operations, job_id): 24 | try: 25 | job, tasks, pool, nodes = _get_job(core_job_operations, job_id) 26 | return models.Job(job, tasks, pool, nodes) 27 | except BatchErrorException as e: 28 | raise error.AztkError(helpers.format_batch_exception(e)) 29 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 3.5 4 | - 3.6 5 | 6 | install: 7 | - pip install -r requirements.txt 8 | - pip install -e . 9 | 10 | script: 11 | - yapf --style .style.yapf -dpr aztk/ aztk_cli/ 12 | - pylint -j 2 -E aztk aztk_cli 13 | - pytest --ignore=tests/integration_tests 14 | 15 | branches: 16 | only: 17 | - master 18 | - /^v.*$/ 19 | 20 | deploy: 21 | provider: pypi 22 | distributions: "sdist bdist_wheel" 23 | user: aztk 24 | password: 25 | secure: j/mwA+hWudujDZ+JkgN3hDBUcIH3Vt1SWsvxnyUadErFsi9S9bW8OZ8kOp3R8yj73pnaENADmvDPjBt6w39b3h7l/EmNYlEdsAuPBMmaNQ0+fmPiKLCcALVIt/Odxdc+a1p2p+F3HEatqMCkaSRUs9gcHSYA1P57rt7y6i28xrCTt1ayQrdOZofgyAx4egocFQlgXgFpQIO3vNXySmc5HaIJF8h84CUKvLG/jv9bHgDoT7n2F00sQQV/tLbXdWEA2LcJFN5q4gjk3AwFtofTtMzlk3IE3y3qp9DagWbdsKoy9+b31lypVDBiIp/N+kCm/pwYDJXyG0H7U1pev2F8K/f4xF2x0AJDJJamLczAY4Ac4AMPImlI3cCpSS6htQhb+i5+KeEUW2tm+xGmdprOSto2712C4TNMna25x+WZteuPmRGYTmmDxGSeD769lE6TMGqrJTlmpkzndJiyp8ek/86hJPJUQoPfDadWdIdcA4wCjpXFBach4DYEItKvYoIk/KfK3wdqp3Lbs+MhT+JhuXGR0RRWlkCROiIo48TwQ716ddvjy2PPxwxNhplu3BdyEXLEDGN9EnEL+3L5P0GELhB4+KstjXSbECBYzULJ1+4kzvrdSB3K9F41stpT7x9PN9mK0t9FrVpXWbNozuJqkCvVVtz2ZSJ+20kvSuI1TPA= 26 | on: 27 | python: 3.6 28 | tags: true 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /aztk/client/client.py: -------------------------------------------------------------------------------- 1 | from aztk import models 2 | from aztk.utils import azure_api 3 | 4 | 5 | class CoreClient: 6 | """The base AZTK client that all other clients inherit from. 7 | 8 | **This client should not be used directly. Only software specific clients 9 | should be used.** 10 | 11 | """ 12 | 13 | def __init__(self): 14 | self.secrets_configuration = None 15 | self.batch_client = None 16 | self.blob_client = None 17 | self.table_service = None 18 | 19 | def _get_context(self, secrets_configuration: models.SecretsConfiguration): 20 | self.secrets_configuration = secrets_configuration 21 | 22 | azure_api.validate_secrets(secrets_configuration) 23 | self.batch_client = azure_api.make_batch_client(secrets_configuration) 24 | self.blob_client = azure_api.make_blob_client(secrets_configuration) 25 | self.table_service = azure_api.make_table_service(secrets_configuration) 26 | context = { 27 | "batch_client": self.batch_client, 28 | "blob_client": self.blob_client, 29 | "table_service": self.table_service, 30 | "secrets_configuration": self.secrets_configuration, 31 | } 32 | return context 33 | -------------------------------------------------------------------------------- /aztk/spark/client/job/helpers/get_application.py: -------------------------------------------------------------------------------- 1 | import azure.batch.models as batch_models 2 | from azure.batch.models import BatchErrorException 3 | 4 | from aztk import error 5 | from aztk.spark import models 6 | from aztk.utils import helpers 7 | 8 | 9 | def _get_application(core_operations, job_id, application_name): 10 | # info about the app 11 | recent_run_job = core_operations.get_recent_job(job_id) 12 | scheduling_target = core_operations.get_cluster_configuration(job_id).scheduling_target 13 | if scheduling_target is not models.SchedulingTarget.Any: 14 | return core_operations.get_task_from_table(job_id, application_name) 15 | try: 16 | return core_operations.get_batch_task(id=recent_run_job.id, task_id=application_name) 17 | except batch_models.BatchErrorException: 18 | raise error.AztkError( 19 | "The Spark application {0} is still being provisioned or does not exist.".format(application_name)) 20 | 21 | 22 | def get_application(core_operations, job_id, application_name): 23 | try: 24 | return models.Application(_get_application(core_operations, job_id, application_name)) 25 | except BatchErrorException as e: 26 | raise error.AztkError(helpers.format_batch_exception(e)) 27 | -------------------------------------------------------------------------------- /aztk/models/plugins/plugin_file.py: -------------------------------------------------------------------------------- 1 | import io 2 | from typing import Union 3 | from aztk.core.models import Model, fields 4 | 5 | 6 | class PluginFile(Model): 7 | """ 8 | Reference to a file for a plugin. 
9 | """ 10 | 11 | target = fields.String() 12 | local_path = fields.String() 13 | 14 | def __init__(self, target: str = None, local_path: str = None): 15 | super().__init__(target=target, local_path=local_path) 16 | 17 | def content(self): 18 | with open(self.local_path, "r", encoding="UTF-8") as f: 19 | return f.read() 20 | 21 | 22 | class TextPluginFile(Model): 23 | """ 24 | Reference to a file for a plugin. 25 | 26 | Args: 27 | target (str): Where should the file be uploaded relative to the plugin working dir 28 | content (str|io.StringIO): Content of the file. Can either be a string or a StringIO 29 | """ 30 | 31 | target = fields.String() 32 | 33 | def __init__(self, target: str, content: Union[str, io.StringIO]): 34 | super().__init__(target=target) 35 | if isinstance(content, str): 36 | self._content = content 37 | else: 38 | self._content = content.getvalue() 39 | 40 | def content(self): 41 | return self._content 42 | -------------------------------------------------------------------------------- /docs/dev/tests.md: -------------------------------------------------------------------------------- 1 | # Tests 2 | 3 | AZTK comes with a testing library that can be used for verification and debugging. Please note that some tests will provision and test real resources in Azure, and as a result, will cost money to run. See [Integration Tests](#integration-tests) for more details. 4 | 5 | ## Integration Tests 6 | 7 | Integration tests use the credentials given in your `.aztk/secrets.yaml` file to spin up real Clusters and Jobs to verify the functionality of the library. Please note that these tests __will__ cost money to run. All created Clusters and Jobs will be deleted when the test completes. 8 | 9 | Since each integration test spins up a Cluster or Job, you may want to run the tests in parallel to reduce the time needed to run the full test suite: 10 | 11 | ```sh 12 | pytest $path_to_repo_root -n 5 13 | ``` 14 | _Note: $path_to_repo_root represents the path to the root of the aztk repository, and is only required if you are running the tests from a different location._ 15 | 16 | Please note that the number passed to the `-n` flag determines the number of tests you wish to run in parallel. Parallelizing the tests will increase the number of CPU cores used at one time, so please verify that you have the available core quota in your Batch account. 17 | 18 | -------------------------------------------------------------------------------- /aztk_cli/spark/endpoints/cluster/cluster_get.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import typing 3 | 4 | import aztk 5 | from aztk_cli import config, log, utils 6 | 7 | 8 | def setup_parser(parser: argparse.ArgumentParser): 9 | parser.add_argument("--id", dest="cluster_id", required=True, help="The unique id of your spark cluster") 10 | parser.add_argument("--show-config", dest="show_config", action="store_true", help="Show the cluster configuration") 11 | parser.add_argument( 12 | "--internal", 13 | action="store_true", 14 | help="Show the local IP of the nodes. " 15 | "Only use if connecting with a VPN.", 16 | ) 17 | parser.set_defaults(internal=False) 18 | 19 | 20 | def execute(args: typing.NamedTuple): 21 | spark_client = aztk.spark.Client(config.load_aztk_secrets()) 22 | cluster_id = args.cluster_id 23 | cluster = spark_client.cluster.get(cluster_id) 24 | utils.print_cluster(spark_client, cluster, args.internal) 25 | 26 | if args.show_config: 27 | configuration = spark_client.cluster.get_configuration(cluster_id) 28 | if configuration: 29 | log.info("-------------------------------------------") 30 | log.info("Cluster configuration:") 31 | utils.print_cluster_conf(configuration, False) 32 | -------------------------------------------------------------------------------- /tests/models/internal/test_plugin_reference.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from aztk.error import AztkError, AztkAttributeError 4 | from aztk.models.plugins.internal import PluginReference, PluginTarget, PluginTargetRole 5 | 6 | 7 | def test_from_dict(): 8 | ref = PluginReference.from_dict( 9 | dict( 10 | name="my-test-script", 11 | script="path/to/script.sh", 12 | target="host", 13 | target_role="worker", 14 | )) 15 | 16 | assert ref.name == "my-test-script" 17 | assert ref.script == "path/to/script.sh" 18 | assert ref.target == PluginTarget.Host 19 | assert ref.target_role == PluginTargetRole.Worker 20 | 21 | 22 | def test_from_dict_invalid_param(): 23 | with pytest.raises(AztkAttributeError): 24 | PluginReference.from_dict(dict(name2="invalid")) 25 | 26 | 27 | def test_from_dict_invalid_target(): 28 | with pytest.raises(AztkError): 29 | PluginReference.from_dict(dict( 30 | script="path/to/script.sh", 31 | target="host-invalid", 32 | )) 33 | 34 | 35 | def test_from_dict_invalid_target_role(): 36 | with pytest.raises(AztkError): 37 | PluginReference.from_dict(dict( 38 | script="path/to/script.sh", 39 | target_role="worker-invalid", 40 | )) 41 | -------------------------------------------------------------------------------- /aztk/client/base/helpers/node_run.py: -------------------------------------------------------------------------------- 1 | from aztk import error, models 2 | from aztk.utils import ssh as ssh_lib 3 | 4 | 5 | def node_run(base_client, cluster_id, node_id, command, internal, container_name=None, timeout=None, block=True): 6 | cluster = base_client.get(cluster_id) 7 | pool, nodes = cluster.pool, list(cluster.nodes) 8 | try: 9 | node = next(node for node in nodes if node.id == node_id) 10 | except StopIteration: 11 | raise error.AztkError("Node with id {} not found".format(node_id)) 12 | if internal: 13 | node_rls = models.RemoteLogin(ip_address=node.ip_address, port="22") 14 | else: 15 | node_rls = base_client.get_remote_login_settings(pool.id, node.id) 16 | generated_username, ssh_key = base_client.generate_user_on_node(pool.id, node.id) 17 | try: 18 | output = ssh_lib.node_exec_command( 19 | node.id, 20 | command, 21 | generated_username, 22 | node_rls.ip_address, 23 | node_rls.port, 24 | ssh_key=ssh_key.exportKey().decode("utf-8"), 25 | container_name=container_name, 26 | timeout=timeout, 27 | block=block) 28 | return output 29 | finally: 30 | base_client.delete_user_on_node(cluster_id, node.id, generated_username) 31 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Azure Distributed Data Engineering Toolkit 2 |
========================================== 3 | Azure Distributed Data Engineering Toolkit (AZTK) is a python CLI application for provisioning on-demand Spark on Docker clusters in Azure. It's a cheap and easy way to get up and running with a Spark cluster, and a great tool for Spark users who want to experiment and start testing at scale. 4 | 5 | This toolkit is built on top of Azure Batch but does not require any Azure Batch knowledge to use. 6 | 7 | .. _user-docs: 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: User documentation: 12 | 13 | 00-getting-started 14 | 10-clusters 15 | 12-docker-image 16 | 13-configuration 17 | 14-azure-files 18 | 15-plugins 19 | 20-spark-submit 20 | 30-cloud-storage 21 | 60-gpu 22 | 70-jobs 23 | 80-migration 24 | 25 | .. _sdk-docs: 26 | .. toctree:: 27 | :maxdepth: 2 28 | :caption: SDK documentation: 29 | 30 | sdk-examples 31 | 51-define-plugin 32 | aztk 33 | 34 | 35 | .. _dev-docs: 36 | 37 | .. toctree:: 38 | :maxdepth: 2 39 | :caption: Developer documentation: 40 | 41 | dev/docs 42 | dev/writing-models 43 | dev/tests 44 | 45 | 46 | 47 | Indices and tables 48 | ================== 49 | 50 | * :ref:`genindex` 51 | * :ref:`modindex` 52 | * :ref:`search` 53 | -------------------------------------------------------------------------------- /examples/src/main/scala/org/apache/spark/examples/LocalPi.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | // scalastyle:off println 19 | package org.apache.spark.examples 20 | 21 | import scala.math.random 22 | 23 | object LocalPi { 24 | def main(args: Array[String]) { 25 | var count = 0 26 | for (i <- 1 to 100000) { 27 | val x = random * 2 - 1 28 | val y = random * 2 - 1 29 | if (x*x + y*y <= 1) count += 1 30 | } 31 | println("Pi is roughly " + 4 * count / 100000.0) 32 | } 33 | } 34 | // scalastyle:on println 35 | -------------------------------------------------------------------------------- /tests/integration_tests/spark/sdk/ensure_spark_processes.py: -------------------------------------------------------------------------------- 1 | import azure.batch.models as batch_models 2 | 3 | from aztk.error import AztkError 4 | 5 | 6 | def ensure_spark_master(spark_client, id): 7 | results = spark_client.cluster.run( 8 | id, 9 | "if $AZTK_IS_MASTER ; then $SPARK_HOME/sbin/spark-daemon.sh status org.apache.spark.deploy.master.Master 1 ;" 10 | " else echo AZTK_IS_MASTER is false ; fi") 11 | for result in results: 12 | if result.error: 13 | raise result.error 14 | assert result.output.rstrip() in [ 15 | "org.apache.spark.deploy.master.Master is running.", "AZTK_IS_MASTER is false" 16 | ] 17 | 18 | 19 | def ensure_spark_worker(spark_client, id): 20 | results = spark_client.cluster.run( 21 | id, 22 | "if $AZTK_IS_WORKER ; then $SPARK_HOME/sbin/spark-daemon.sh status org.apache.spark.deploy.worker.Worker 1 ;" 23 | " else echo AZTK_IS_WORKER is false ; fi") 24 | for result in results: 25 | if result.error: 26 | raise result.error 27 | assert result.output.rstrip() in [ 28 | "org.apache.spark.deploy.worker.Worker is running.", "AZTK_IS_WORKER is false" 29 | ] 30 | 31 | 32 | def ensure_spark_processes(spark_client, id): 33 | ensure_spark_master(spark_client, id) 34 | ensure_spark_worker(spark_client, id) 35 | -------------------------------------------------------------------------------- /aztk/internal/docker_cmd.py: -------------------------------------------------------------------------------- 1 | from aztk.utils.command_builder import CommandBuilder 2 | 3 | 4 | class DockerCmd: 5 | """ 6 | Class helping to write a docker command 7 | """ 8 | 9 | def __init__(self, name: str, docker_repo: str, docker_run_options: str, cmd: str, gpu_enabled=False): 10 | if gpu_enabled: 11 | self.cmd = CommandBuilder("nvidia-docker run") 12 | else: 13 | self.cmd = CommandBuilder("docker run") 14 | self.cmd.add_option("--net", "host") 15 | self.cmd.add_option("--name", name) 16 | self.cmd.add_argument("-d") 17 | self.cmd.add_argument(docker_run_options) 18 | self.cmd.add_argument(docker_repo) 19 | self.cmd.add_argument(cmd) 20 | 21 | def add_env(self, env: str, value: str): 22 | self.cmd.add_option("-e", "{0}={1}".format(env, value)) 23 | 24 | def pass_env(self, env: str): 25 | """ 26 | Give the value of an environment variable in the main process to the docker image 27 | """ 28 | self.cmd.add_option("-e", "{0}".format(env)) 29 | 30 | def share_folder(self, folder: str): 31 | self.cmd.add_option("-v", "{0}:{0}".format(folder)) 32 | 33 | def open_port(self, port: int): 34 | self.cmd.add_option("-p", "{0}:{0}".format(port)) # map the port 1:1 between host and container 35 | 36 | def to_str(self): 37 | return self.cmd.to_str() 38 | -------------------------------------------------------------------------------- /aztk/spark/client/job/helpers/delete.py: -------------------------------------------------------------------------------- 1 | from azure.batch.models import BatchErrorException 2 | from msrest.exceptions import ClientRequestError 3 |
4 | from aztk import error 5 | from aztk.utils import BackOffPolicy, helpers, retry 6 | 7 | 8 | def _delete(core_job_operations, spark_job_operations, job_id, keep_logs: bool = False): 9 | deleted_job_schedule = False 10 | 11 | # delete job_schedule 12 | try: 13 | core_job_operations.batch_client.job_schedule.delete(job_id) 14 | deleted_job_schedule = True 15 | except BatchErrorException: 16 | pass 17 | 18 | # delete the storage container unless the caller asked to keep the logs 19 | if not keep_logs: 20 | cluster_data = core_job_operations.get_cluster_data(job_id) 21 | cluster_data.delete_container(job_id) 22 | 23 | table_exists = core_job_operations.table_service.exists(job_id) 24 | if table_exists: 25 | core_job_operations.delete_task_table(job_id) 26 | 27 | return deleted_job_schedule 28 | 29 | 30 | @retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError)) 31 | def delete(core_job_operations, spark_job_operations, job_id: str, keep_logs: bool = False): 32 | try: 33 | return _delete(core_job_operations, spark_job_operations, job_id, keep_logs) 34 | except BatchErrorException as e: 35 | raise error.AztkError(helpers.format_batch_exception(e)) 36 | -------------------------------------------------------------------------------- /aztk/version.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation 2 | # 3 | # All rights reserved. 4 | # 5 | # MIT License 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a 8 | # copy of this software and associated documentation files (the "Software"), 9 | # to deal in the Software without restriction, including without limitation 10 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 11 | # and/or sell copies of the Software, and to permit persons to whom the 12 | # Software is furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in 15 | # all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 22 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | # DEALINGS IN THE SOFTWARE.
24 | major = 0 25 | minor = 10 26 | patch = 3 27 | 28 | suffix = "" 29 | 30 | __version__ = "{major}.{minor}.{patch}{suffix}".format(major=major, minor=minor, patch=patch, suffix=suffix) 31 | -------------------------------------------------------------------------------- /tests/models/internal/test_plugin-manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | from aztk.models.plugins import PluginConfiguration 4 | from aztk.models.plugins.internal import PluginManager 5 | from aztk.error import InvalidPluginReferenceError 6 | 7 | dir_path = os.path.dirname(os.path.realpath(__file__)) 8 | fake_plugin_dir = os.path.join(dir_path, "fake_plugins") 9 | 10 | 11 | def RequiredArgPlugin(req_arg): 12 | return PluginConfiguration(name="required-arg") 13 | 14 | 15 | def test_missing_plugin(): 16 | plugin_manager = PluginManager() 17 | message = "Cannot find a plugin with name .*" 18 | with pytest.raises(InvalidPluginReferenceError, match=message): 19 | plugin_manager.get_plugin("non-existing-plugin") 20 | 21 | 22 | def test_extra_args_plugin(): 23 | plugin_manager = PluginManager() 24 | message = "Plugin JupyterPlugin doesn't have an argument called 'invalid'" 25 | with pytest.raises(InvalidPluginReferenceError, match=message): 26 | plugin_manager.get_plugin("jupyter", args=dict(invalid="foo")) 27 | 28 | 29 | def test_missing_required_arg(): 30 | plugin_manager = PluginManager() 31 | plugin_manager.plugins["required-arg"] = RequiredArgPlugin 32 | message = "Missing a required argument req_arg for plugin RequiredArgPlugin" 33 | with pytest.raises(InvalidPluginReferenceError, match=message): 34 | plugin_manager.get_plugin("required-arg") 35 | -------------------------------------------------------------------------------- /aztk/error.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains all errors used in Aztk. 
3 | All error should inherit from `AztkError` 4 | """ 5 | 6 | 7 | class AztkError(Exception): 8 | pass 9 | 10 | 11 | class AztkAttributeError(AztkError): 12 | pass 13 | 14 | 15 | class ClusterNotReadyError(AztkError): 16 | pass 17 | 18 | 19 | class AzureApiInitError(AztkError): 20 | pass 21 | 22 | 23 | class InvalidPluginConfigurationError(AztkError): 24 | pass 25 | 26 | 27 | class InvalidModelError(AztkError): 28 | def __init__(self, message: str, model=None): 29 | super().__init__() 30 | self.message = message 31 | self.model = model 32 | 33 | def __str__(self): 34 | model_name = self.model and self.model.__class__.__name__ 35 | return "{model} {message}".format(model=model_name, message=self.message) 36 | 37 | 38 | class MissingRequiredAttributeError(InvalidModelError): 39 | pass 40 | 41 | 42 | class InvalidPluginReferenceError(InvalidModelError): 43 | pass 44 | 45 | 46 | class InvalidModelFieldError(InvalidModelError): 47 | def __init__(self, message: str, model=None, field=None): 48 | super().__init__(message, model) 49 | self.field = field 50 | 51 | def __str__(self): 52 | model_name = self.model and self.model.__class__.__name__ 53 | return "{model} {field} {message}".format(model=model_name, field=self.field, message=self.message) 54 | -------------------------------------------------------------------------------- /aztk_cli/spark/endpoints/cluster/cluster_copy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import typing 4 | 5 | import aztk.spark 6 | from aztk_cli import config, utils 7 | 8 | 9 | def setup_parser(parser: argparse.ArgumentParser): 10 | parser.add_argument("--id", dest="cluster_id", required=True, help="The unique id of your spark cluster") 11 | 12 | parser.add_argument("--source-path", required=True, help="the local file you wish to copy to the cluster") 13 | 14 | parser.add_argument( 15 | "--dest-path", 16 | required=True, 17 | help="the path the file will be copied to on each node in the cluster." 18 | "Note that this must include the file name.", 19 | ) 20 | parser.add_argument( 21 | "--internal", 22 | action="store_true", 23 | help="Connect using the local IP of the master node. 
Only use if using a VPN.", 24 | ) 25 | parser.set_defaults(internal=False) 26 | 27 | 28 | def execute(args: typing.NamedTuple): 29 | spark_client = aztk.spark.Client(config.load_aztk_secrets()) 30 | with utils.Spinner(): 31 | copy_output = spark_client.cluster.copy( 32 | id=args.cluster_id, source_path=args.source_path, destination_path=args.dest_path, internal=args.internal) 33 | for node_output in copy_output: 34 | utils.log_node_copy_output(node_output) 35 | sys.exit(0 if not any([node_output.error for node_output in copy_output]) else 1) 36 | -------------------------------------------------------------------------------- /aztk/spark/client/job/helpers/list_applications.py: -------------------------------------------------------------------------------- 1 | from azure.batch.models import BatchErrorException 2 | 3 | from aztk import error 4 | from aztk.spark import models 5 | from aztk.utils import helpers 6 | 7 | 8 | def _list_applications(core_job_operations, job_id): 9 | recent_run_job = core_job_operations.get_recent_job(job_id) 10 | # get application names from Batch job metadata 11 | applications = {} 12 | for metadata_item in recent_run_job.metadata: 13 | if metadata_item.name == "applications": 14 | for app_name in metadata_item.value.split("\n"): 15 | applications[app_name] = None 16 | 17 | tasks = core_job_operations.list_tasks(job_id) 18 | for task in tasks: 19 | if task.id != job_id: 20 | applications[task.id] = task 21 | 22 | return applications 23 | 24 | 25 | # TODO: this needs to be changed to return a list of aztk.model.Task 26 | # currently, it returns a dictionary indicating whether 27 | # a task has been scheduled or not 28 | def list_applications(core_job_operations, job_id): 29 | try: 30 | applications = _list_applications(core_job_operations, job_id) 31 | for item in applications: 32 | if applications[item]: 33 | applications[item] = models.Application(applications[item]) 34 | return applications 35 | except BatchErrorException as e: 36 | raise error.AztkError(helpers.format_batch_exception(e)) 37 | -------------------------------------------------------------------------------- /aztk_cli/spark/endpoints/cluster/cluster_app_logs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import typing 4 | 5 | import aztk 6 | from aztk_cli import config, utils, log 7 | 8 | 9 | def setup_parser(parser: argparse.ArgumentParser): 10 | parser.add_argument("--id", dest="cluster_id", required=True, help="The unique id of your spark cluster") 11 | parser.add_argument("--name", dest="app_name", required=True, help="The unique id of your job name") 12 | 13 | output_group = parser.add_mutually_exclusive_group() 14 | 15 | output_group.add_argument( 16 | "--output", 17 | help="Path to the file you wish to output to. 
If not \ 18 | specified, output is printed to stdout", 19 | ) 20 | output_group.add_argument("--tail", dest="tail", action="store_true") 21 | 22 | 23 | def execute(args: typing.NamedTuple): 24 | spark_client = aztk.spark.Client(config.load_aztk_secrets()) 25 | 26 | if args.tail: 27 | utils.stream_logs(client=spark_client, cluster_id=args.cluster_id, application_name=args.app_name) 28 | else: 29 | app_log = spark_client.cluster.get_application_log(id=args.cluster_id, application_name=args.app_name) 30 | if args.output: 31 | with utils.Spinner(): 32 | with open(os.path.abspath(os.path.expanduser(args.output)), "w", encoding="UTF-8") as f: 33 | f.write(app_log.log) 34 | else: 35 | log.print(app_log.log) 36 | -------------------------------------------------------------------------------- /aztk/client/base/helpers/run.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from azure.batch.models import BatchErrorException 4 | 5 | import aztk.models as models 6 | from aztk import error 7 | from aztk.utils import ssh as ssh_lib 8 | from aztk.utils import helpers 9 | 10 | 11 | def cluster_run(base_operations, cluster_id, command, internal, container_name=None, timeout=None): 12 | cluster = base_operations.get(cluster_id) 13 | pool, nodes = cluster.pool, list(cluster.nodes) 14 | if internal: 15 | cluster_nodes = [(node, models.RemoteLogin(ip_address=node.ip_address, port="22")) for node in nodes] 16 | else: 17 | cluster_nodes = [(node, base_operations.get_remote_login_settings(pool.id, node.id)) for node in nodes] 18 | try: 19 | generated_username, ssh_key = base_operations.generate_user_on_cluster(pool.id, nodes) 20 | except BatchErrorException as e: 21 | raise error.AztkError(helpers.format_batch_exception(e)) 22 | 23 | try: 24 | output = asyncio.get_event_loop().run_until_complete( 25 | ssh_lib.clus_exec_command( 26 | command, 27 | generated_username, 28 | cluster_nodes, 29 | ssh_key=ssh_key.exportKey().decode("utf-8"), 30 | container_name=container_name, 31 | timeout=timeout, 32 | )) 33 | return output 34 | except OSError as exc: 35 | raise exc 36 | finally: 37 | base_operations.delete_user_on_cluster(pool.id, nodes, generated_username) 38 | -------------------------------------------------------------------------------- /aztk/client/cluster/helpers/delete.py: -------------------------------------------------------------------------------- 1 | from azure.batch.models import BatchErrorException 2 | from msrest.exceptions import ClientRequestError 3 | 4 | from aztk.utils import BackOffPolicy, retry 5 | 6 | 7 | def delete_pool_and_job_and_table(core_cluster_operations, pool_id: str, keep_logs: bool = False): 8 | """ 9 | Delete a pool and it's associated job 10 | :param cluster_id: the pool to add the user to 11 | :return bool: deleted the pool if exists and job if exists 12 | """ 13 | # job id is equal to pool id 14 | job_exists = True 15 | 16 | try: 17 | core_cluster_operations.batch_client.job.get(pool_id) 18 | except BatchErrorException: 19 | job_exists = False 20 | 21 | pool_exists = core_cluster_operations.batch_client.pool.exists(pool_id) 22 | 23 | table_deleted = core_cluster_operations.delete_task_table(pool_id) 24 | 25 | if job_exists: 26 | delete_object(core_cluster_operations.batch_client.job.delete, pool_id) 27 | 28 | if pool_exists: 29 | delete_object(core_cluster_operations.batch_client.pool.delete, pool_id) 30 | 31 | if not keep_logs: 32 | cluster_data = core_cluster_operations.get_cluster_data(pool_id) 33 | 
cluster_data.delete_container(pool_id) 34 | 35 | return job_exists or pool_exists or table_deleted 36 | 37 | 38 | @retry(retry_count=4, retry_interval=1, backoff_policy=BackOffPolicy.exponential, exceptions=(ClientRequestError)) 39 | def delete_object(function, *args, **kwargs): 40 | return function(*args, **kwargs) 41 | -------------------------------------------------------------------------------- /aztk_cli/spark/endpoints/cluster/cluster_run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import typing 3 | 4 | import aztk.spark 5 | from aztk_cli import config, utils 6 | 7 | 8 | def setup_parser(parser: argparse.ArgumentParser): 9 | parser.add_argument("--id", dest="cluster_id", required=True, help="The unique id of your spark cluster") 10 | parser.add_argument( 11 | "--node-id", 12 | "-n", 13 | dest="node_id", 14 | required=False, 15 | help="The unique id of the node in the cluster to run the command on", 16 | ) 17 | parser.add_argument("command", help="The command to run on your spark cluster") 18 | parser.add_argument( 19 | "--internal", 20 | action="store_true", 21 | help="Connect using the local IP of the master node. Only use if using a VPN") 22 | parser.add_argument( 23 | "--host", action="store_true", help="Run the command on the host instead of the Spark Docker container") 24 | parser.set_defaults(internal=False, host=False) 25 | 26 | 27 | def execute(args: typing.NamedTuple): 28 | spark_client = aztk.spark.Client(config.load_aztk_secrets()) 29 | with utils.Spinner(): 30 | if args.node_id: 31 | results = [ 32 | spark_client.cluster.node_run(args.cluster_id, args.node_id, args.command, args.host, args.internal) 33 | ] 34 | else: 35 | results = spark_client.cluster.run(args.cluster_id, args.command, args.host, args.internal) 36 | for node_output in results: 37 | utils.log_node_run_output(node_output) 38 | -------------------------------------------------------------------------------- /aztk/spark/utils/util.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import datetime 4 | import time 5 | 6 | import azure.batch.models as batch_models 7 | 8 | from aztk.utils import constants 9 | 10 | 11 | class MasterInvalidStateError(Exception): 12 | pass 13 | 14 | 15 | def wait_for_master_to_be_ready(core_operations, spark_operations, cluster_id: str): 16 | 17 | master_node_id = None 18 | start_time = datetime.datetime.now() 19 | while True: 20 | if not master_node_id: 21 | master_node_id = spark_operations.get(cluster_id).master_node_id 22 | if not master_node_id: 23 | time.sleep(5) 24 | continue 25 | 26 | master_node = core_operations.batch_client.compute_node.get(cluster_id, master_node_id) 27 | 28 | if master_node.state in [batch_models.ComputeNodeState.idle, batch_models.ComputeNodeState.running]: 29 | break 30 | elif master_node.state is batch_models.ComputeNodeState.start_task_failed: 31 | raise MasterInvalidStateError("Start task failed on master") 32 | elif master_node.state in [batch_models.ComputeNodeState.unknown, batch_models.ComputeNodeState.unusable]: 33 | raise MasterInvalidStateError("Master is in an invalid state") 34 | else: 35 | now = datetime.datetime.now() 36 | 37 | delta = now - start_time 38 | if delta.total_seconds() > constants.WAIT_FOR_MASTER_TIMEOUT: 39 | raise MasterInvalidStateError("Master didn't become ready before timeout.") 40 | 41 | time.sleep(10) 42 | 
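The `cluster_run.py` endpoint above is a thin wrapper over the SDK's cluster operations. A minimal sketch of driving the same call directly from Python, assuming a populated `.aztk/secrets.yaml`; the cluster id and command below are placeholders, not values taken from this repository:

```python
# Minimal sketch mirroring aztk_cli/spark/endpoints/cluster/cluster_run.py.
# "example-cluster" and the command are placeholders; host=False runs the command
# inside the Spark container, internal=False uses the nodes' public login settings.
import aztk.spark
from aztk_cli import config

spark_client = aztk.spark.Client(config.load_aztk_secrets())

# Same positional arguments the CLI passes: (cluster_id, command, host, internal)
results = spark_client.cluster.run("example-cluster", "hostname", False, False)
for node_output in results:
    # NodeOutput carries either the command output or the error for each node
    print(node_output.error or node_output.output)
```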
-------------------------------------------------------------------------------- /docs/14-azure-files.md: -------------------------------------------------------------------------------- 1 | # Azure Files 2 | 3 | The ability to load a file share on the cluster is really useful when you want to share data across all the nodes, and/or want that data to be persisted longer than the lifetime of the cluster. [Azure Files](https://docs.microsoft.com/azure/storage/files/storage-files-introduction) provides a very easy way to mount a share into the cluster and have it accessible to all nodes. This is useful in cases where you have small data sets you want to process (less than 1GB) or have notebooks that you want to re-use between clusters. 4 | 5 | Mounting an Azure Files share in the cluster only requires updating the cluster.yaml file at `.aztk/cluster.yaml`. For example, the following configuration will load two file shares into the cluster, one with my notebooks and one with a small data set that I have previously uploaded to Azure Files. 6 | 7 | ```yaml 8 | azure_files: 9 | - storage_account_name: STORAGE_ACCOUNT_NAME 10 | storage_account_key: STORAGE_ACCOUNT_KEY 11 | # Name of the file share in Azure Files 12 | file_share_path: data 13 | # Mount point on the node in the cluster 14 | mount_path: /mnt/data 15 | - storage_account_name: STORAGE_ACCOUNT_NAME 16 | storage_account_key: STORAGE_ACCOUNT_KEY 17 | # Name of the file share in Azure Files 18 | file_share_path: notebooks 19 | # Mount point on the node in the cluster 20 | mount_path: /mnt/notebooks 21 | ``` 22 | 23 | From the cluster I can now access both of these file shares directly by navigating to /mnt/data or /mnt/notebooks respectively. 24 | -------------------------------------------------------------------------------- /aztk/internal/configuration_base.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | from aztk.error import AztkError, InvalidModelError 3 | 4 | 5 | class ConfigurationBase: 6 | """ 7 | Base class for any configuration.
8 | Include methods to help with validation 9 | """ 10 | 11 | @classmethod 12 | def from_dict(cls, args: dict): 13 | """ 14 | Create a new model from a dict values 15 | The dict is cleaned from null values and passed expanded to the constructor 16 | """ 17 | try: 18 | return cls._from_dict(args) 19 | except (ValueError, TypeError) as e: 20 | pretty_args = yaml.dump(args, default_flow_style=False) 21 | raise AztkError("{0} {1}\n{2}".format(cls.__name__, str(e), pretty_args)) 22 | 23 | @classmethod 24 | def _from_dict(cls, args: dict): 25 | clean = dict((k, v) for k, v in args.items() if v) 26 | return cls(**clean) 27 | 28 | def validate(self): 29 | raise NotImplementedError("Validate not implemented") 30 | 31 | def valid(self): 32 | try: 33 | self.validate() 34 | return True 35 | except AztkError: 36 | return False 37 | 38 | def _validate_required(self, attrs): 39 | for attr in attrs: 40 | if not getattr(self, attr): 41 | raise InvalidModelError("{0} missing {1}.".format(self.__class__.__name__, attr)) 42 | 43 | def _merge_attributes(self, other, attrs): 44 | for attr in attrs: 45 | val = getattr(other, attr) 46 | if val is not None: 47 | setattr(self, attr, val) 48 | -------------------------------------------------------------------------------- /examples/src/main/python/wordcount.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from __future__ import print_function 19 | 20 | import sys 21 | from operator import add 22 | 23 | from pyspark.sql import SparkSession 24 | 25 | 26 | if __name__ == "__main__": 27 | if len(sys.argv) != 2: 28 | print("Usage: wordcount ", file=sys.stderr) 29 | exit(-1) 30 | 31 | spark = SparkSession\ 32 | .builder\ 33 | .appName("PythonWordCount")\ 34 | .getOrCreate() 35 | 36 | lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0]) 37 | counts = lines.flatMap(lambda x: x.split(' ')) \ 38 | .map(lambda x: (x, 1)) \ 39 | .reduceByKey(add) 40 | output = counts.collect() 41 | for (word, count) in output: 42 | print("%s: %i" % (word, count)) 43 | 44 | spark.stop() 45 | -------------------------------------------------------------------------------- /examples/src/main/python/pi.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from __future__ import print_function 19 | 20 | import sys 21 | from random import random 22 | from operator import add 23 | 24 | from pyspark.sql import SparkSession 25 | 26 | 27 | if __name__ == "__main__": 28 | """ 29 | Usage: pi [partitions] 30 | """ 31 | spark = SparkSession\ 32 | .builder\ 33 | .appName("PythonPi")\ 34 | .getOrCreate() 35 | 36 | partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2 37 | n = 100000 * partitions 38 | 39 | def f(_): 40 | x = random() * 2 - 1 41 | y = random() * 2 - 1 42 | return 1 if x ** 2 + y ** 2 <= 1 else 0 43 | 44 | count = spark.sparkContext.parallelize(range(1, n + 1), partitions).map(f).reduce(add) 45 | print("Pi is roughly %f" % (4.0 * count / n)) 46 | 47 | spark.stop() 48 | -------------------------------------------------------------------------------- /aztk/utils/deprecation.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import functools 3 | import inspect 4 | 5 | 6 | def deprecated(version: str, advice: str = None): 7 | """ 8 | This is a decorator which can be used to mark functions 9 | as deprecated. It will result in a warning being emitted 10 | when the function is used. 11 | 12 | Args: 13 | version (str): The version in which the deprecated functionality will be removed 14 | advice (str): Sentence explaining alternatives to the deprecated functionality. 15 | """ 16 | 17 | def decorator(func): 18 | if inspect.isclass(func): 19 | msg = "Call to deprecated class {name}." 20 | else: 21 | msg = "Call to deprecated function {name}." 22 | 23 | @functools.wraps(func) 24 | def new_func(*args, **kwargs): 25 | deprecate(version=version, message=msg.format(name=func.__name__, advice=advice), advice=advice) 26 | return func(*args, **kwargs) 27 | 28 | return new_func 29 | 30 | return decorator 31 | 32 | 33 | def deprecate(version: str, message: str, advice: str = ""): 34 | """ 35 | Print a deprecation warning. 36 | 37 | Args: 38 | message (str): Sentence explaining what is deprecated. 39 | advice (str): Sentence explaining alternatives to the deprecated functionality. 40 | """ 41 | 42 | warnings.simplefilter("always", DeprecationWarning) # turn off filter 43 | warnings.warn( 44 | "{0} It will be removed in Aztk version {1}. 
{2}".format(message, version, advice), 45 | category=DeprecationWarning, 46 | stacklevel=2, 47 | ) 48 | warnings.simplefilter("default", DeprecationWarning) # reset filter 49 | -------------------------------------------------------------------------------- /aztk/client/base/helpers/create_user_on_node.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta, timezone 2 | 3 | import azure.batch.models as batch_models 4 | from azure.batch.models import BatchErrorException 5 | 6 | from aztk.utils import get_ssh_key 7 | 8 | 9 | def __create_user(self, id: str, node_id: str, username: str, password: str = None, ssh_key: str = None) -> str: 10 | """ 11 | Create a pool user 12 | :param pool: the pool to add the user to 13 | :param node: the node to add the user to 14 | :param username: username of the user to add 15 | :param password: password of the user to add 16 | :param ssh_key: ssh_key of the user to add 17 | """ 18 | # Create new ssh user for the given node 19 | self.batch_client.compute_node.add_user( 20 | id, 21 | node_id, 22 | batch_models.ComputeNodeUser( 23 | name=username, 24 | is_admin=True, 25 | password=password, 26 | ssh_public_key=get_ssh_key.get_user_public_key(ssh_key, self.secrets_configuration), 27 | expiry_time=datetime.now(timezone.utc) + timedelta(days=365), 28 | ), 29 | ) 30 | 31 | 32 | def create_user_on_node(base_client, id, node_id, username, ssh_key=None, password=None): 33 | try: 34 | __create_user(base_client, id=id, node_id=node_id, username=username, ssh_key=ssh_key, password=password) 35 | except BatchErrorException as error: 36 | try: 37 | base_client.delete_user_on_node(id, node_id, username) 38 | base_client.create_user_on_node(id=id, node_id=node_id, username=username, ssh_key=ssh_key) 39 | except BatchErrorException as error: 40 | raise error 41 | -------------------------------------------------------------------------------- /examples/src/main/python/sort.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from __future__ import print_function 19 | 20 | import sys 21 | 22 | from pyspark.sql import SparkSession 23 | 24 | 25 | if __name__ == "__main__": 26 | if len(sys.argv) != 2: 27 | print("Usage: sort ", file=sys.stderr) 28 | exit(-1) 29 | 30 | spark = SparkSession\ 31 | .builder\ 32 | .appName("PythonSort")\ 33 | .getOrCreate() 34 | 35 | lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0]) 36 | sortedCount = lines.flatMap(lambda x: x.split(' ')) \ 37 | .map(lambda x: (int(x), 1)) \ 38 | .sortByKey() 39 | # This is just a demo on how to bring all the sorted data back to a single node. 
40 | # In reality, we wouldn't want to collect all the data to the driver node. 41 | output = sortedCount.collect() 42 | for (num, unitcount) in output: 43 | print(num) 44 | 45 | spark.stop() 46 | -------------------------------------------------------------------------------- /aztk_cli/spark/endpoints/job/delete.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import typing 3 | 4 | import aztk.spark 5 | from aztk_cli import config, log 6 | 7 | 8 | def setup_parser(parser: argparse.ArgumentParser): 9 | parser.add_argument("--id", dest="job_id", required=True, help="The unique id of your AZTK Job") 10 | parser.add_argument( 11 | "--force", 12 | "-f", 13 | dest="force", 14 | required=False, 15 | action="store_true", 16 | help="Do not prompt for confirmation, force deletion of cluster.", 17 | ) 18 | parser.add_argument( 19 | "--keep-logs", 20 | "-k", 21 | dest="keep_logs", 22 | action="store_true", 23 | required=False, 24 | help="Prevent logs in storage from being deleted.", 25 | ) 26 | parser.set_defaults(force=False, keep_logs=False) 27 | 28 | 29 | def execute(args: typing.NamedTuple): 30 | spark_client = aztk.spark.Client(config.load_aztk_secrets()) 31 | job_id = args.job_id 32 | 33 | if not args.force: 34 | # check if job exists before prompting for confirmation 35 | spark_client.job.get(id=job_id) 36 | 37 | if not args.keep_logs: 38 | log.warning("All logs persisted for this job will be deleted.") 39 | 40 | confirmation_cluster_id = input("Please confirm the id of the cluster you wish to delete: ") 41 | 42 | if confirmation_cluster_id != job_id: 43 | log.error("Confirmation cluster id does not match. Please try again.") 44 | return 45 | 46 | if spark_client.job.delete(id=job_id, keep_logs=args.keep_logs): 47 | log.info("Deleting Job %s", job_id) 48 | else: 49 | log.error("Job with id '%s' doesn't exist or was already deleted.", job_id) 50 | -------------------------------------------------------------------------------- /examples/src/main/scala/org/apache/spark/examples/SparkPi.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | // scalastyle:off println 19 | package org.apache.spark.examples 20 | 21 | import scala.math.random 22 | 23 | import org.apache.spark.sql.SparkSession 24 | 25 | /** Computes an approximation to pi */ 26 | object SparkPi { 27 | def main(args: Array[String]) { 28 | val spark = SparkSession 29 | .builder 30 | .appName("Spark Pi") 31 | .getOrCreate() 32 | val slices = if (args.length > 0) args(0).toInt else 2 33 | val n = math.min(100000L * slices, Int.MaxValue).toInt // avoid overflow 34 | val count = spark.sparkContext.parallelize(1 until n, slices).map { i => 35 | val x = random * 2 - 1 36 | val y = random * 2 - 1 37 | if (x*x + y*y <= 1) 1 else 0 38 | }.reduce(_ + _) 39 | println("Pi is roughly " + 4.0 * count / (n - 1)) 40 | spark.stop() 41 | } 42 | } 43 | // scalastyle:on println 44 | -------------------------------------------------------------------------------- /aztk_cli/config/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | io.file.buffer.size 7 | 131072 8 | 9 | 10 | 11 | 12 | 20 | 21 | 22 | 23 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /.vsts-ci.yml: -------------------------------------------------------------------------------- 1 | name: $(Build.SourceBranch)$(Rev:.r) 2 | 3 | trigger: 4 | - master 5 | 6 | jobs: 7 | - job: Test 8 | pool: 9 | vmImage: 'ubuntu-16.04' 10 | steps: 11 | - task: UsePythonVersion@0 12 | inputs: 13 | versionSpec: '3.6 >= 3.5' 14 | addToPath: true 15 | architecture: 'x64' 16 | 17 | - script: | 18 | pip install -r requirements.txt 19 | pip install -e . 20 | condition: succeeded() 21 | displayName: install aztk 22 | 23 | - script: | 24 | yapf --style .style.yapf -dpr aztk/ aztk_cli/ 25 | condition: succeeded() 26 | displayName: yapf 27 | 28 | - script: | 29 | pylint --jobs 2 --errors-only aztk aztk_cli 30 | condition: succeeded() 31 | displayName: pylint error check 32 | 33 | - script: | 34 | pytest --ignore=tests/integration_tests 35 | condition: succeeded() 36 | displayName: unit tests 37 | 38 | - script: | 39 | export BATCH_ACCOUNT_RESOURCE_ID=$(BATCH_ACCOUNT_RESOURCE_ID) 40 | export CLIENT_ID=$(CLIENT_ID) 41 | export CREDENTIAL=$(CREDENTIAL) 42 | export ID_RSA="$(ID_RSA)" 43 | export ID_RSA_PUB="$(ID_RSA_PUB)" 44 | export STORAGE_ACCOUNT_RESOURCE_ID=$(STORAGE_ACCOUNT_RESOURCE_ID) 45 | export TENANT_ID=$(TENANT_ID) 46 | pytest --numprocesses=70 tests/integration_tests 47 | condition: and(succeeded(), or(startsWith(variables['Build.SourceBranchName'], 'release'), eq(variables['Build.SourceBranchName'], 'master'))) 48 | displayName: integration tests 49 | 50 | - script: | 51 | pylint --jobs 2 --disable=fixme aztk aztk_cli 52 | continueOnError: true 53 | condition: succeeded() 54 | displayName: pylint report 55 | -------------------------------------------------------------------------------- /aztk/client/cluster/helpers/copy.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from azure.batch.models import BatchErrorException 4 | 5 | import aztk.models as models 6 | from aztk import error 7 | from aztk.utils import ssh as ssh_lib 8 | from aztk.utils import helpers 9 | 10 | 11 | def cluster_copy( 12 | cluster_operations, 13 | cluster_id, 14 | source_path, 15 | destination_path=None, 16 | container_name=None, 17 | internal=False, 18 | get=False, 19 | timeout=None, 20 | ): 21 | cluster = cluster_operations.get(cluster_id) 22 | pool, nodes = cluster.pool, 
list(cluster.nodes) 23 | if internal: 24 | cluster_nodes = [(node, models.RemoteLogin(ip_address=node.ip_address, port="22")) for node in nodes] 25 | else: 26 | cluster_nodes = [(node, cluster_operations.get_remote_login_settings(pool.id, node.id)) for node in nodes] 27 | 28 | try: 29 | generated_username, ssh_key = cluster_operations.generate_user_on_cluster(pool.id, nodes) 30 | except BatchErrorException as e: 31 | raise error.AztkError(helpers.format_batch_exception(e)) 32 | 33 | try: 34 | output = asyncio.get_event_loop().run_until_complete( 35 | ssh_lib.clus_copy( 36 | container_name=container_name, 37 | username=generated_username, 38 | nodes=cluster_nodes, 39 | source_path=source_path, 40 | destination_path=destination_path, 41 | ssh_key=ssh_key.exportKey().decode("utf-8"), 42 | get=get, 43 | timeout=timeout, 44 | )) 45 | return output 46 | except (OSError, BatchErrorException) as exc: 47 | raise exc 48 | finally: 49 | cluster_operations.delete_user_on_cluster(pool.id, nodes, generated_username) 50 | -------------------------------------------------------------------------------- /aztk_cli/spark/endpoints/cluster/cluster_delete.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import typing 3 | 4 | import aztk 5 | from aztk_cli import config, log 6 | 7 | 8 | def setup_parser(parser: argparse.ArgumentParser): 9 | parser.add_argument( 10 | "--id", dest="cluster_ids", nargs="*", required=True, help="The unique id of your spark cluster") 11 | parser.add_argument( 12 | "--force", 13 | "-f", 14 | dest="force", 15 | required=False, 16 | action="store_true", 17 | help="Do not prompt for confirmation, force deletion of cluster.", 18 | ) 19 | parser.add_argument( 20 | "--keep-logs", 21 | "-k", 22 | dest="keep_logs", 23 | action="store_true", 24 | required=False, 25 | help="Prevent logs in storage from being deleted.", 26 | ) 27 | parser.set_defaults(force=False, keep_logs=False) 28 | 29 | 30 | def execute(args: typing.NamedTuple): 31 | spark_client = aztk.spark.Client(config.load_aztk_secrets()) 32 | cluster_ids = args.cluster_ids 33 | 34 | for cluster_id in cluster_ids: 35 | if not args.force: 36 | if not args.keep_logs: 37 | log.warning("All logs persisted for this cluster will be deleted.") 38 | 39 | confirmation_cluster_id = input( 40 | "Please confirm the id of the cluster you wish to delete [{}]: ".format(cluster_id)) 41 | 42 | if confirmation_cluster_id != cluster_id: 43 | log.error("Confirmation cluster id does not match. 
Please try again.") 44 | return 45 | 46 | if spark_client.cluster.delete(id=cluster_id, keep_logs=args.keep_logs): 47 | log.info("Deleting cluster %s", cluster_id) 48 | else: 49 | log.error("Cluster with id '%s' doesn't exist or was already deleted.", cluster_id) 50 | -------------------------------------------------------------------------------- /tests/integration_tests/spark/sdk/get_client.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | import aztk.spark 5 | from aztk_cli import config 6 | 7 | 8 | def get_spark_client(): 9 | # load secrets 10 | # note: this assumes secrets are set up in .aztk/secrets 11 | tenant_id = os.environ.get("TENANT_ID") 12 | client_id = os.environ.get("CLIENT_ID") 13 | credential = os.environ.get("CREDENTIAL") 14 | batch_account_resource_id = os.environ.get("BATCH_ACCOUNT_RESOURCE_ID") 15 | storage_account_resource_id = os.environ.get("STORAGE_ACCOUNT_RESOURCE_ID") 16 | ssh_pub_key = os.environ.get("ID_RSA_PUB") 17 | ssh_private_key = os.environ.get("ID_RSA") 18 | keys = [ 19 | tenant_id, client_id, credential, batch_account_resource_id, storage_account_resource_id, ssh_private_key, 20 | ssh_pub_key 21 | ] 22 | 23 | spark_client = None 24 | if all(keys): 25 | spark_client = aztk.spark.Client( 26 | aztk.spark.models.SecretsConfiguration( 27 | service_principal=aztk.spark.models.ServicePrincipalConfiguration( 28 | tenant_id=tenant_id, 29 | client_id=client_id, 30 | credential=credential, 31 | batch_account_resource_id=batch_account_resource_id, 32 | storage_account_resource_id=storage_account_resource_id), 33 | ssh_pub_key=ssh_pub_key, 34 | ssh_priv_key=ssh_private_key)) 35 | else: 36 | # fallback to local secrets if environment variables don't exist 37 | spark_client = aztk.spark.Client(config.load_aztk_secrets()) 38 | 39 | return spark_client 40 | 41 | 42 | def get_test_suffix(prefix: str): 43 | # base cluster name 44 | dt = datetime.now() 45 | current_time = dt.microsecond 46 | base_cluster_id = "{0}-{1}".format(prefix, current_time) 47 | return base_cluster_id 48 | -------------------------------------------------------------------------------- /aztk/utils/command_builder.py: -------------------------------------------------------------------------------- 1 | class CommandOption: 2 | def __init__(self, name: str, value: str): 3 | self.name = name 4 | self.value = value 5 | 6 | 7 | class CommandBuilder: 8 | """ 9 | Helper class to build a command line 10 | """ 11 | 12 | def __init__(self, executable: str): 13 | """ 14 | :param executable: Path/name of the executable to run 15 | """ 16 | self.executable = executable 17 | self.options = [] 18 | self.arguments = [] 19 | 20 | def add_option(self, name: str, value: str = None, enable: bool = None): 21 | """ 22 | Add an option to the command line. 
23 | 24 | :param name: Option name (with the dash(es)) 25 | :param value: Value for the option (if None and enable is not provided it won't add the option) 26 | :param enable: To explicitly add or ignore the option 27 | 28 | Usage: 29 | >>> command.add_option("--id", myId) # => Will only add to the command if myId is not null 30 | >>> command.add_option("--id", myId, enable=False) # => Will not add it to the list 31 | """ 32 | if enable is None: 33 | enable = value 34 | if enable: 35 | self.options.append(CommandOption(name=name, value=value)) 36 | return True 37 | 38 | return False 39 | 40 | def add_argument(self, arg): 41 | self.arguments.append(arg) 42 | 43 | def to_array(self): 44 | cmd = [self.executable] 45 | for option in self.options: 46 | cmd.append(option.name) 47 | if option.value is not None: 48 | cmd.append(option.value) 49 | 50 | for arg in self.arguments: 51 | cmd.append(arg) 52 | return cmd 53 | 54 | def to_str(self): 55 | cmd = self.to_array() 56 | return " ".join(cmd) 57 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/jupyter_lab/jupyter_lab.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This custom script has been tested to work on the following docker images: 4 | # - aztk/python:spark2.2.0-python3.6.2-base 5 | # - aztk/python:spark2.2.0-python3.6.2-gpu 6 | # - aztk/python:spark2.1.0-python3.6.2-base 7 | # - aztk/python:spark2.1.0-python3.6.2-gpu 8 | 9 | if [ "$AZTK_IS_MASTER" = "true" ]; then 10 | conda install -c conda-forge jupyterlab 11 | 12 | PYSPARK_DRIVER_PYTHON="/opt/conda/bin/jupyter" 13 | JUPYTER_KERNELS="/opt/conda/share/jupyter/kernels" 14 | 15 | # disable password/token on jupyter lab 16 | jupyter lab --generate-config --allow-root 17 | JUPYTER_CONFIG='/root/.jupyter/jupyter_notebook_config.py' 18 | echo >> $JUPYTER_CONFIG 19 | echo -e 'c.NotebookApp.token=""' >> $JUPYTER_CONFIG 20 | echo -e 'c.NotebookApp.password=""' >> $JUPYTER_CONFIG 21 | 22 | # get master ip 23 | MASTER_IP=$(hostname -i) 24 | 25 | # remove existing kernels 26 | rm -rf $JUPYTER_KERNELS/* 27 | 28 | # set up jupyter to use pyspark 29 | mkdir $JUPYTER_KERNELS/pyspark 30 | touch $JUPYTER_KERNELS/pyspark/kernel.json 31 | cat << EOF > $JUPYTER_KERNELS/pyspark/kernel.json 32 | { 33 | "display_name": "PySpark", 34 | "language": "python", 35 | "argv": [ 36 | "python", 37 | "-m", 38 | "ipykernel", 39 | "-f", 40 | "{connection_file}" 41 | ], 42 | "env": { 43 | "SPARK_HOME": "$SPARK_HOME", 44 | "PYSPARK_PYTHON": "python", 45 | "PYSPARK_SUBMIT_ARGS": "--master spark://$MASTER_IP:7077 pyspark-shell" 46 | } 47 | } 48 | EOF 49 | 50 | # start jupyter lab from /mnt - this is where we recommend you put your azure files mount point as well 51 | cd /mnt 52 | (PYSPARK_DRIVER_PYTHON=$PYSPARK_DRIVER_PYTHON PYSPARK_DRIVER_PYTHON_OPTS="lab --no-browser --port=8889 --allow-root" pyspark &) 53 | fi 54 | 55 | 56 | -------------------------------------------------------------------------------- /docs/dev/writing-models.md: -------------------------------------------------------------------------------- 1 | # Writing a model 2 | 3 | 4 | ## Getting started 5 | In `aztk/models` create a new file with the name of your model `my_model.py` 6 | 7 | In `aztk/models/__init__.py` add `from .my_model import MyModel` 8 | 9 | Create a new class `MyModel` that inherits `Model` 10 | ```python 11 | from aztk.core.models import Model, fields 12 | 13 | class MyModel(Model): 14 | """ 15 | MyModel is
a sample model 16 | 17 | Args: 18 | input1 (str): This is the first input 19 | """ 20 | 21 | input1 = fields.String() 22 | 23 | def __validate__(self): 24 | pass 25 | 26 | ``` 27 | 28 | ### Available field types 29 | 30 | Check `aztk/core/models/fields.py` for the source 31 | 32 | * `Field`: Base field class 33 | * `String`: Field that validates it is given a string 34 | * `Integer`: Field that validates it is given an int 35 | * `Float`: Field that validates it is given a float 36 | * `Boolean`: Field that validates it is given a boolean 37 | * `List`: Field that validates it is given a list and can also automatically convert entries to the given model type. 38 | * `Model`: Field that maps to another model. If passed a dict it will automatically try to convert to the Model type 39 | * `Enum`: Field whose value should be an enum. It will convert automatically to the enum if given the value. 40 | 41 | ## Add validation 42 | The fields provide basic validation automatically. A field without a default will be marked as required. 43 | 44 | To provide model-wide validation, implement a `__validate__` method and raise an `InvalidModelError` if there are any problems with the values 45 | 46 | ```python 47 | def __validate__(self): 48 | if 'secret' in self.input1: 49 | raise InvalidModelError("Input1 contains secrets") 50 | 51 | ``` 52 | 53 | ## Convert dict to model 54 | 55 | When inheriting from `Model`, the class comes with a `from_dict` class method which converts a dict into an instance of that class 56 | -------------------------------------------------------------------------------- /docs/20-spark-submit.md: -------------------------------------------------------------------------------- 1 | # Submitting an Application 2 | Submitting a job to your Spark cluster in this package mimics the experience of a typical standalone cluster. A Spark job will be submitted to the system and run to completion. 3 | 4 | ## Spark-Submit 5 | The spark-submit experience is mostly the same as any regular Spark cluster with a few minor differences. You can take a look at `aztk spark cluster --help` for more detailed information and options. 6 | 7 | Run a Spark job: 8 | ```sh 9 | aztk spark cluster submit --id <name_of_spark_cluster> --name <name_of_spark_app> [options] <app jar | python file> [app arguments] 10 | ``` 11 | 12 | For example, to run a local pi.py file on a Spark cluster, simply specify the local path of the file: 13 | ```sh 14 | aztk spark cluster submit --id spark --name pipy examples/src/main/python/pi.py 100 15 | ``` 16 | 17 | To run a remotely hosted pi.py file on a Spark cluster, specify the remote path of the file and use the '--remote' flag: 18 | ```sh 19 | aztk spark cluster submit --id spark --name pipy --remote wasbs://path@remote/pi.py 100 20 | ``` 21 | 22 | NOTE: The job name (--name) must be at least 3 characters long, can only contain alphanumeric characters including hyphens but excluding underscores, and cannot contain uppercase letters. Each job you submit **must** have a unique name. 23 | 24 | ## Monitoring job 25 | If you have set up an [SSH tunnel](./10-clusters.html#ssh-and-port-forwarding) with port forwarding, you can navigate to http://localhost:8080 and http://localhost:4040 to view the progress of the job using the Spark UI. 26 | 27 | 28 | ## Getting output logs 29 | The default setting when running a job is --wait. This will simply submit a job to the cluster and wait for the job to run.
If you want to just submit the job and not wait, use the --no-wait flag and tail the logs manually: 30 | 31 | ```sh 32 | aztk spark cluster submit --id spark --name pipy --no-wait examples/src/main/python/pi.py 1000 33 | ``` 34 | 35 | ```sh 36 | aztk spark cluster app-logs --id spark --name pipy --tail 37 | ``` 38 | -------------------------------------------------------------------------------- /aztk_cli/toolkit.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import typing 3 | 4 | from aztk.models import TOOLKIT_MAP, Toolkit 5 | from aztk_cli import log 6 | 7 | 8 | def setup_parser(parser: argparse.ArgumentParser): 9 | parser.add_argument("toolkit_software", nargs="?") 10 | parser.add_argument("version", nargs="?") 11 | parser.add_argument("environment", nargs="?") 12 | parser.add_argument("--gpu", action="store_true") 13 | 14 | 15 | def execute(args: typing.NamedTuple): 16 | if not args.toolkit_software: 17 | return print_available_softwares() 18 | 19 | if not validate_software(args.toolkit_software): 20 | return None 21 | 22 | if not args.version: 23 | return print_available_software_version(args.toolkit_software) 24 | if not args.environment: 25 | print_available_environments(args.toolkit_software) 26 | 27 | toolkit = Toolkit(software=args.toolkit_software, version=args.version, environment=args.environment) 28 | 29 | toolkit.validate() 30 | log.info("Docker image picked for this toolkit: %s", toolkit.get_docker_repo(args.gpu)) 31 | return None 32 | 33 | 34 | def print_available_softwares(): 35 | log.info("Available toolkits: ") 36 | for toolkit in TOOLKIT_MAP: 37 | log.info(" - %s", toolkit) 38 | 39 | 40 | def validate_software(software: str): 41 | if software not in TOOLKIT_MAP: 42 | log.error("Software '%s' is not supported.", software) 43 | print_available_softwares() 44 | return False 45 | return True 46 | 47 | 48 | def print_available_software_version(software: str): 49 | toolkit_def = TOOLKIT_MAP.get(software) 50 | log.info("Available version for %s: ", software) 51 | for version in toolkit_def.versions: 52 | log.info(" - %s", version) 53 | 54 | 55 | def print_available_environments(software: str): 56 | toolkit_def = TOOLKIT_MAP.get(software) 57 | 58 | log.info("Available environment for %s: ", software) 59 | for env in toolkit_def.environments: 60 | log.info(" - %s", env) 61 | -------------------------------------------------------------------------------- /custom-scripts/jupyter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This custom script only works on images where jupyter is pre-installed on the Docker image 4 | # 5 | # This custom script has been tested to work on the following docker images: 6 | # - aztk/python:spark2.2.0-python3.6.2-base 7 | # - aztk/python:spark2.2.0-python3.6.2-gpu 8 | # - aztk/python:spark2.1.0-python3.6.2-base 9 | # - aztk/python:spark2.1.0-python3.6.2-gpu 10 | 11 | if [ "$AZTK_IS_MASTER" = "true" ]; then 12 | pip install jupyter --upgrade 13 | pip install notebook --upgrade 14 | 15 | PYSPARK_DRIVER_PYTHON="/opt/conda/bin/jupyter" 16 | JUPYTER_KERNELS="/opt/conda/share/jupyter/kernels" 17 | 18 | # disable password/token on jupyter notebook 19 | jupyter notebook --generate-config --allow-root 20 | JUPYTER_CONFIG='/root/.jupyter/jupyter_notebook_config.py' 21 | echo >> $JUPYTER_CONFIG 22 | echo -e 'c.NotebookApp.token=""' >> $JUPYTER_CONFIG 23 | echo -e 'c.NotebookApp.password=""' >> $JUPYTER_CONFIG 24 | 25 | # get master ip 
26 | MASTER_IP=$(hostname -i) 27 | 28 | # remove existing kernels 29 | rm -rf $JUPYTER_KERNELS/* 30 | 31 | # set up jupyter to use pyspark 32 | mkdir $JUPYTER_KERNELS/pyspark 33 | touch $JUPYTER_KERNELS/pyspark/kernel.json 34 | cat << EOF > $JUPYTER_KERNELS/pyspark/kernel.json 35 | { 36 | "display_name": "PySpark", 37 | "language": "python", 38 | "argv": [ 39 | "python", 40 | "-m", 41 | "ipykernel", 42 | "-f", 43 | "{connection_file}" 44 | ], 45 | "env": { 46 | "SPARK_HOME": "$SPARK_HOME", 47 | "PYSPARK_PYTHON": "python", 48 | "PYSPARK_SUBMIT_ARGS": "--master spark://$AZTK_MASTER_IP:7077 pyspark-shell" 49 | } 50 | } 51 | EOF 52 | 53 | # start jupyter notebook from /mnt - this is where we recommend you put your azure files mount point as well 54 | cd /mnt 55 | (PYSPARK_DRIVER_PYTHON=$PYSPARK_DRIVER_PYTHON PYSPARK_DRIVER_PYTHON_OPTS="notebook --no-browser --port=8888 --allow-root" pyspark &) 56 | fi 57 | 58 | 59 | -------------------------------------------------------------------------------- /aztk_cli/spark/endpoints/cluster/cluster_add_user.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import typing 3 | 4 | import aztk.spark 5 | from aztk_cli import config, log, utils 6 | 7 | 8 | def setup_parser(parser: argparse.ArgumentParser): 9 | parser.add_argument("--id", dest="cluster_id", required=True, help="The unique id of your spark cluster") 10 | parser.add_argument("-u", "--username", help="The username to access your spark cluster's head node") 11 | 12 | auth_group = parser.add_mutually_exclusive_group() 13 | auth_group.add_argument( 14 | "-p", 15 | "--password", 16 | help="The password to access your spark cluster's master node. If not provided will use ssh public key.", 17 | ) 18 | auth_group.add_argument( 19 | "--ssh-key", 20 | help="The ssh public key to access your spark cluster's master node. " 21 | "You can also set the ssh-key in the configuration file.", 22 | ) 23 | parser.set_defaults(username="admin") 24 | 25 | 26 | def execute(args: typing.NamedTuple): 27 | spark_client = aztk.spark.Client(config.load_aztk_secrets()) 28 | 29 | log.info("-------------------------------------------") 30 | log.info("spark cluster id: %s", args.cluster_id) 31 | log.info("username: %s", args.username) 32 | log.info("-------------------------------------------") 33 | 34 | if args.ssh_key: 35 | ssh_key = args.ssh_key 36 | else: 37 | ssh_key = spark_client.secrets_configuration.ssh_pub_key 38 | 39 | ssh_key, password = utils.get_ssh_key_or_prompt(ssh_key, args.username, args.password, 40 | spark_client.secrets_configuration) 41 | 42 | spark_client.cluster.create_user(id=args.cluster_id, username=args.username, password=password, ssh_key=ssh_key) 43 | 44 | if password: 45 | log.info("password: %s", "*" * len(password)) 46 | elif ssh_key: 47 | log.info("ssh public key: %s", ssh_key) 48 | 49 | log.info("-------------------------------------------") 50 | -------------------------------------------------------------------------------- /aztk_cli/spark/endpoints/job/submit.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import typing 3 | 4 | import aztk.spark 5 | from aztk_cli import config 6 | from aztk_cli.config import JobConfig 7 | 8 | 9 | def setup_parser(parser: argparse.ArgumentParser): 10 | parser.add_argument( 11 | "--id", 12 | dest="job_id", 13 | required=False, 14 | help="The unique id of your Spark Job. 
Defaults to the id value in .aztk/job.yaml", 15 | ) 16 | parser.add_argument( 17 | "--configuration", 18 | "-c", 19 | dest="job_conf", 20 | required=False, 21 | help="Path to the job.yaml configuration file. Defaults to .aztk/job.yaml", 22 | ) 23 | 24 | 25 | def execute(args: typing.NamedTuple): 26 | spark_client = aztk.spark.Client(config.load_aztk_secrets()) 27 | job_conf = JobConfig() 28 | 29 | job_conf.merge(args.job_id, args.job_conf) 30 | 31 | # by default, load spark configuration files in .aztk/ 32 | spark_configuration = config.load_aztk_spark_config() 33 | # overwrite with values in job_conf if they exist 34 | if job_conf.spark_defaults_conf: 35 | spark_configuration.spark_defaults_conf = job_conf.spark_defaults_conf 36 | if job_conf.spark_env_sh: 37 | spark_configuration.spark_env_sh = job_conf.spark_env_sh 38 | if job_conf.core_site_xml: 39 | spark_configuration.core_site_xml = job_conf.core_site_xml 40 | 41 | job_configuration = aztk.spark.models.JobConfiguration( 42 | id=job_conf.id, 43 | applications=job_conf.applications, 44 | spark_configuration=spark_configuration, 45 | vm_size=job_conf.vm_size, 46 | toolkit=job_conf.toolkit, 47 | max_dedicated_nodes=job_conf.max_dedicated_nodes, 48 | max_low_pri_nodes=job_conf.max_low_pri_nodes, 49 | subnet_id=job_conf.subnet_id, 50 | worker_on_master=job_conf.worker_on_master, 51 | scheduling_target=job_conf.scheduling_target, 52 | ) 53 | 54 | # TODO: utils.print_job_conf(job_configuration) 55 | spark_client.job.submit(job_configuration) 56 | -------------------------------------------------------------------------------- /aztk/spark/models/plugins/jupyter/jupyter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This custom script only works on images where jupyter is pre-installed on the Docker image 4 | # 5 | # This custom script has been tested to work on the following docker images: 6 | # - aztk/python:spark2.2.0-python3.6.2-base 7 | # - aztk/python:spark2.2.0-python3.6.2-gpu 8 | # - aztk/python:spark2.1.0-python3.6.2-base 9 | # - aztk/python:spark2.1.0-python3.6.2-gpu 10 | 11 | echo "Is master: $AZTK_IS_MASTER" 12 | 13 | if [ "$AZTK_IS_MASTER" = "true" ]; then 14 | pip install jupyter --upgrade 15 | pip install notebook --upgrade 16 | 17 | PYSPARK_DRIVER_PYTHON="/opt/conda/bin/jupyter" 18 | JUPYTER_KERNELS="/opt/conda/share/jupyter/kernels" 19 | 20 | # disable password/token on jupyter notebook 21 | jupyter notebook --generate-config --allow-root 22 | JUPYTER_CONFIG='/root/.jupyter/jupyter_notebook_config.py' 23 | echo >> $JUPYTER_CONFIG 24 | echo -e 'c.NotebookApp.token=""' >> $JUPYTER_CONFIG 25 | echo -e 'c.NotebookApp.password=""' >> $JUPYTER_CONFIG 26 | 27 | # get master ip 28 | MASTER_IP=$(hostname -i) 29 | 30 | # remove existing kernels 31 | rm -rf $JUPYTER_KERNELS/* 32 | 33 | # set up jupyter to use pyspark 34 | mkdir $JUPYTER_KERNELS/pyspark 35 | touch $JUPYTER_KERNELS/pyspark/kernel.json 36 | cat << EOF > $JUPYTER_KERNELS/pyspark/kernel.json 37 | { 38 | "display_name": "PySpark", 39 | "language": "python", 40 | "argv": [ 41 | "python", 42 | "-m", 43 | "ipykernel", 44 | "-f", 45 | "{connection_file}" 46 | ], 47 | "env": { 48 | "SPARK_HOME": "$SPARK_HOME", 49 | "PYSPARK_PYTHON": "python", 50 | "PYSPARK_SUBMIT_ARGS": "--master spark://$AZTK_MASTER_IP:7077 pyspark-shell" 51 | } 52 | } 53 | EOF 54 | 55 | # start jupyter notebook from /mnt - this is where we recommend you put your azure files mount point as well 56 | cd /mnt 57 | 
(PYSPARK_DRIVER_PYTHON=$PYSPARK_DRIVER_PYTHON PYSPARK_DRIVER_PYTHON_OPTS="notebook --no-browser --port=8888 --allow-root" pyspark &) 58 | fi 59 | 60 | 61 | -------------------------------------------------------------------------------- /examples/src/main/scala/org/apache/spark/examples/DriverSubmissionTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | // scalastyle:off println 19 | package org.apache.spark.examples 20 | 21 | import scala.collection.JavaConverters._ 22 | 23 | import org.apache.spark.util.Utils 24 | 25 | /** 26 | * Prints out environmental information, sleeps, and then exits. Made to 27 | * test driver submission in the standalone scheduler. 28 | */ 29 | object DriverSubmissionTest { 30 | def main(args: Array[String]) { 31 | if (args.length < 1) { 32 | println("Usage: DriverSubmissionTest ") 33 | System.exit(0) 34 | } 35 | val numSecondsToSleep = args(0).toInt 36 | 37 | val env = System.getenv() 38 | val properties = Utils.getSystemProperties 39 | 40 | println("Environment variables containing SPARK_TEST:") 41 | env.asScala.filter { case (k, _) => k.contains("SPARK_TEST")}.foreach(println) 42 | 43 | println("System properties containing spark.test:") 44 | properties.filter { case (k, _) => k.toString.contains("spark.test") }.foreach(println) 45 | 46 | for (i <- 1 until numSecondsToSleep) { 47 | println(s"Alive for $i out of $numSecondsToSleep seconds") 48 | Thread.sleep(1000) 49 | } 50 | } 51 | } 52 | // scalastyle:on println 53 | -------------------------------------------------------------------------------- /aztk/client/job/operations.py: -------------------------------------------------------------------------------- 1 | from aztk.client.base import BaseOperations 2 | 3 | from .helpers import submit 4 | 5 | 6 | class CoreJobOperations(BaseOperations): 7 | def submit( 8 | self, 9 | job_configuration, 10 | start_task, 11 | job_manager_task, 12 | autoscale_formula, 13 | software_metadata_key: str, 14 | vm_image_model, 15 | application_metadata, 16 | ): 17 | """Submit a job 18 | 19 | Jobs are a cluster definition and one or many application definitions which run on the cluster. The job's 20 | cluster will be allocated and configured, then the applications will be executed with their output stored 21 | in Azure Storage. When all applications have completed, the cluster will be automatically deleted. 22 | 23 | Args: 24 | job_configuration (:obj:`aztk.models.JobConfiguration`): Model defining the job's configuration. 
25 | start_task (:obj:`azure.batch.models.StartTask`): Batch StartTask definition to configure the Batch Pool 26 | job_manager_task (:obj:`azure.batch.models.JobManagerTask`): Batch JobManagerTask definition to schedule 27 | the defined applications on the cluster. 28 | autoscale_formula (:obj:`str`): formula that defines the number of nodes allocated to the cluster. 29 | software_metadata_key (:obj:`str`): the key of the primary software running on the cluster. 30 | vm_image_model 31 | application_metadata (:obj:`List[str]`): list of the names of all applications that will be run as a 32 | part of the job 33 | 34 | Returns: 35 | :obj:`azure.batch.models.CloudJobSchedule`: Model representing the Azure Batch JobSchedule state. 36 | """ 37 | return submit.submit_job( 38 | self, 39 | job_configuration, 40 | start_task, 41 | job_manager_task, 42 | autoscale_formula, 43 | software_metadata_key, 44 | vm_image_model, 45 | application_metadata, 46 | ) 47 | -------------------------------------------------------------------------------- /aztk/node_scripts/scheduling/scheduling_target.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | import os 3 | import time 4 | 5 | import requests 6 | 7 | from aztk import error 8 | 9 | 10 | def http_request_wrapper(func, *args, timeout=None, max_execution_time=300, **kwargs): 11 | start_time = time.clock() 12 | while True: 13 | try: 14 | response = func(*args, timeout=timeout, **kwargs) 15 | response.raise_for_status() 16 | return response 17 | except requests.Timeout: 18 | pass 19 | 20 | if (time.clock() - start_time > max_execution_time): 21 | raise error.AztkError("Waited {} seconds for request {}, exceeded max_execution_time={}".format( 22 | time.clock() - start_time, 23 | func.__name__, 24 | max_execution_time, 25 | )) 26 | 27 | 28 | def _download_resource_file(task_id, resource_file): 29 | response = http_request_wrapper(requests.get, url=resource_file.blob_source, timeout=None, stream=True) 30 | if resource_file.file_path: 31 | write_path = os.path.join(os.environ.get("AZ_BATCH_TASK_WORKING_DIR"), resource_file.file_path) 32 | with open(write_path, 'wb') as stream: 33 | for chunk in response.iter_content(chunk_size=16777216): 34 | stream.write(chunk) 35 | return None 36 | 37 | raise error.AztkError("ResourceFile file_path not set.") 38 | 39 | 40 | def download_task_resource_files(task_id, resource_files): 41 | with concurrent.futures.ThreadPoolExecutor() as executor: 42 | futures = { 43 | executor.submit(_download_resource_file, task_id, resource_file): resource_file 44 | for resource_file in resource_files 45 | } 46 | done, not_done = concurrent.futures.wait(futures) 47 | if not_done: 48 | raise error.AztkError("Not all futures completed. 
{}".format(not_done.pop().result())) 49 | errors = [result.result() for result in done if isinstance(result.result(), Exception)] 50 | if errors: 51 | raise error.AztkError(errors) 52 | else: 53 | return [result.result() for result in done] 54 | -------------------------------------------------------------------------------- /aztk/spark/client/cluster/helpers/diagnostics.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from azure.batch.models import BatchErrorException 4 | 5 | from aztk import error 6 | from aztk.utils import helpers 7 | 8 | 9 | def _run(spark_cluster_operations, cluster_id, output_directory=None, brief=False): 10 | # copy debug program to each node 11 | copy_output = spark_cluster_operations.copy( 12 | cluster_id, os.path.abspath("./aztk/spark/utils/debug.py"), "/tmp/debug.py", host=True) 13 | for node_output in copy_output: 14 | if node_output.error: 15 | raise error.AztkError("Failed to copy diagnostic script to cluster.") 16 | ssh_cmd = _build_diagnostic_ssh_command(brief) 17 | run_output = spark_cluster_operations.run(cluster_id, ssh_cmd, host=True) 18 | remote_path = "/tmp/debug.zip" 19 | result = None 20 | if output_directory: 21 | local_path = os.path.join(os.path.abspath(output_directory), "debug.zip") 22 | result = spark_cluster_operations.download(cluster_id, remote_path, local_path, host=True) 23 | 24 | # write run output or error to debug/ directory 25 | with open(os.path.join(output_directory, "debug-output.txt"), "w", encoding="UTF-8") as stream: 26 | for node_output in run_output: 27 | stream.write(node_output.error) if node_output.error else stream.write(node_output.output) 28 | else: 29 | result = spark_cluster_operations.download(cluster_id, remote_path, host=True) 30 | 31 | return result 32 | 33 | 34 | def _build_diagnostic_ssh_command(brief): 35 | return ("sudo rm -rf /tmp/debug.zip; " 36 | "sudo apt-get install -y python3-pip; " 37 | "sudo -H pip3 install --upgrade pip; " 38 | "sudo -H pip3 install docker; " 39 | "sudo python3 /tmp/debug.py {}".format(brief)) 40 | 41 | 42 | def run_cluster_diagnostics(spark_cluster_operations, cluster_id, output_directory=None, brief=False): 43 | try: 44 | output = _run(spark_cluster_operations, cluster_id, output_directory, brief) 45 | return output 46 | except BatchErrorException as e: 47 | raise error.AztkError(helpers.format_batch_exception(e)) 48 | -------------------------------------------------------------------------------- /examples/src/main/r/dataframe.R: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | # To run this example use 19 | # ./bin/spark-submit examples/src/main/r/dataframe.R 20 | 21 | library(SparkR) 22 | 23 | # Initialize SparkSession 24 | sparkR.session(appName = "SparkR-DataFrame-example") 25 | 26 | # Create a simple local data.frame 27 | localDF <- data.frame(name=c("John", "Smith", "Sarah"), age=c(19, 23, 18)) 28 | 29 | # Convert local data frame to a SparkDataFrame 30 | df <- createDataFrame(localDF) 31 | 32 | # Print its schema 33 | printSchema(df) 34 | # root 35 | # |-- name: string (nullable = true) 36 | # |-- age: double (nullable = true) 37 | 38 | # Create a DataFrame from a JSON file 39 | path <- file.path(Sys.getenv("SPARK_HOME"), "examples/src/main/resources/people.json") 40 | peopleDF <- read.json(path) 41 | printSchema(peopleDF) 42 | # root 43 | # |-- age: long (nullable = true) 44 | # |-- name: string (nullable = true) 45 | 46 | # Register this DataFrame as a table. 47 | createOrReplaceTempView(peopleDF, "people") 48 | 49 | # SQL statements can be run by using the sql methods 50 | teenagers <- sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") 51 | 52 | # Call collect to get a local data.frame 53 | teenagersLocalDF <- collect(teenagers) 54 | 55 | # Print the teenagers in our dataset 56 | print(teenagersLocalDF) 57 | 58 | # Stop the SparkSession now 59 | sparkR.session.stop() 60 | -------------------------------------------------------------------------------- /aztk/models/secrets_configuration.py: -------------------------------------------------------------------------------- 1 | from aztk.core.models import Model, fields 2 | from aztk.error import InvalidModelError 3 | 4 | 5 | class ServicePrincipalConfiguration(Model): 6 | """ 7 | Container class for AAD authentication 8 | """ 9 | 10 | tenant_id = fields.String() 11 | client_id = fields.String() 12 | credential = fields.String() 13 | batch_account_resource_id = fields.String() 14 | storage_account_resource_id = fields.String() 15 | 16 | 17 | class SharedKeyConfiguration(Model): 18 | """ 19 | Container class for shared key authentication 20 | """ 21 | 22 | batch_account_name = fields.String() 23 | batch_account_key = fields.String() 24 | batch_service_url = fields.String() 25 | storage_account_name = fields.String() 26 | storage_account_key = fields.String() 27 | storage_account_suffix = fields.String() 28 | 29 | 30 | class DockerConfiguration(Model): 31 | """ 32 | Configuration for connecting to private docker 33 | 34 | Args: 35 | endpoint (str): Which docker endpoint to use. Default to docker hub. 
36 | username (str): Docker endpoint username 37 | password (str): Docker endpoint password 38 | """ 39 | 40 | endpoint = fields.String(default=None) 41 | username = fields.String(default=None) 42 | password = fields.String(default=None) 43 | 44 | 45 | class SecretsConfiguration(Model): 46 | service_principal = fields.Model(ServicePrincipalConfiguration, default=None) 47 | shared_key = fields.Model(SharedKeyConfiguration, default=None) 48 | docker = fields.Model(DockerConfiguration, default=None) 49 | ssh_pub_key = fields.String(default=None) 50 | ssh_priv_key = fields.String(default=None) 51 | 52 | def __validate__(self): 53 | if self.service_principal and self.shared_key: 54 | raise InvalidModelError("Both service_principal and shared_key auth are configured, must use only one") 55 | 56 | if not self.service_principal and not self.shared_key: 57 | raise InvalidModelError("Neither service_principal and shared_key auth are configured, must use only one") 58 | 59 | def is_aad(self): 60 | return self.service_principal is not None 61 | -------------------------------------------------------------------------------- /examples/src/main/java/org/apache/spark/examples/JavaSparkPi.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples; 19 | 20 | import org.apache.spark.api.java.JavaRDD; 21 | import org.apache.spark.api.java.JavaSparkContext; 22 | import org.apache.spark.sql.SparkSession; 23 | 24 | import java.util.ArrayList; 25 | import java.util.List; 26 | 27 | /** 28 | * Computes an approximation to pi 29 | * Usage: JavaSparkPi [partitions] 30 | */ 31 | public final class JavaSparkPi { 32 | 33 | public static void main(String[] args) throws Exception { 34 | SparkSession spark = SparkSession 35 | .builder() 36 | .appName("JavaSparkPi") 37 | .getOrCreate(); 38 | 39 | JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext()); 40 | 41 | int slices = (args.length == 1) ? Integer.parseInt(args[0]) : 2; 42 | int n = 100000 * slices; 43 | List l = new ArrayList<>(n); 44 | for (int i = 0; i < n; i++) { 45 | l.add(i); 46 | } 47 | 48 | JavaRDD dataSet = jsc.parallelize(l, slices); 49 | 50 | int count = dataSet.map(integer -> { 51 | double x = Math.random() * 2 - 1; 52 | double y = Math.random() * 2 - 1; 53 | return (x * x + y * y <= 1) ? 1 : 0; 54 | }).reduce((integer, integer2) -> integer + integer2); 55 | 56 | System.out.println("Pi is roughly " + 4.0 * count / n); 57 | 58 | spark.stop(); 59 | } 60 | } 61 | --------------------------------------------------------------------------------
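For the AAD path through `SecretsConfiguration` above, the integration-test helper (`tests/integration_tests/spark/sdk/get_client.py`) shows the intended construction. A minimal sketch, assuming service-principal credentials are at hand and using the `validate()` method the core models expose (as `aztk_cli/toolkit.py` does); every credential string below is a placeholder:

```python
# Minimal sketch: building a SecretsConfiguration with service-principal (AAD) auth
# and creating a Spark client from it. All credential values are placeholders.
import aztk.spark

secrets = aztk.spark.models.SecretsConfiguration(
    service_principal=aztk.spark.models.ServicePrincipalConfiguration(
        tenant_id="<tenant-id>",
        client_id="<client-id>",
        credential="<client-secret>",
        batch_account_resource_id="<batch-account-resource-id>",
        storage_account_resource_id="<storage-account-resource-id>",
    ),
    ssh_pub_key="<ssh-public-key>",
    ssh_priv_key="<ssh-private-key>",
)

# __validate__ rejects configurations that set both or neither of
# service_principal and shared_key.
secrets.validate()

client = aztk.spark.Client(secrets)
```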