├── .devcontainer
│   ├── devcontainer.json
│   └── docker-compose.override.yaml
├── .env
├── .gitignore
├── Dockerfile
├── README.md
├── docker-compose.yaml
├── img
│   └── devcontainer.png
└── notebooks
    ├── Spark-Shuffling.ipynb
    └── teste.ipynb

--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
{
  "name": "Dev Spark Jupyter",
  "dockerComposeFile": [
    "../docker-compose.yaml",
    "./docker-compose.override.yaml"
  ],
  "service": "jupyter-notebook",
  "workspaceFolder": "/home/spark/notebooks",
  "customizations": {
    "vscode": {
      "settings": {
        "terminal.integrated.shell.linux": "/bin/bash"
      },
      "extensions": [
        "ms-python.python",
        "ms-toolsai.jupyter"
      ]
    }
  },
  "remoteUser": "spark"
}
--------------------------------------------------------------------------------
/.devcontainer/docker-compose.override.yaml:
--------------------------------------------------------------------------------
services:
  spark-worker:
    scale: 2
--------------------------------------------------------------------------------
/.env:
--------------------------------------------------------------------------------
USER_NAME=spark
USER_HOME=/home/spark
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# project
logs
# Jupyter Notebook
.ipynb_checkpoints

# Environments
# .env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.12-slim AS base

# Install common dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    openjdk-17-jre-headless \
    procps \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Download and install Spark
ENV SPARK_VERSION=3.5.1
ENV HADOOP_VERSION=3
RUN curl -O https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
    && tar -xzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz -C /opt \
    && rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz

# Build arguments for the user setup
ARG USER_NAME
ARG USER_HOME

# Create user
RUN useradd -m -s /bin/bash ${USER_NAME}

# Switch to the created user
USER ${USER_NAME}
WORKDIR ${USER_HOME}

# Set environment variables for Spark
ENV SPARK_HOME=/opt/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}
ENV PATH=$SPARK_HOME/bin:$USER_HOME/venv/bin:$PATH

# Ensure the virtual environment is used in future sessions
ENV VIRTUAL_ENV=$USER_HOME/venv
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
ENV SPARK_RPC_AUTHENTICATION_ENABLED=no
ENV SPARK_RPC_ENCRYPTION_ENABLED=no
ENV SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
ENV SPARK_SSL_ENABLED=no
ENV SPARK_USER=spark
ENV PYSPARK_PYTHON=$VIRTUAL_ENV/bin/python
ENV PYSPARK_DRIVER_PYTHON=$PYSPARK_PYTHON

# Download the Spark Measure JAR
ARG JARS_USER_DIR=$USER_HOME/jars
ARG SPARK_MEASURE_URL="https://repo1.maven.org/maven2/ch/cern/sparkmeasure/spark-measure_2.12/0.24/spark-measure_2.12-0.24.jar"
RUN mkdir -p ${JARS_USER_DIR} \
    && curl -o ${JARS_USER_DIR}/$(basename ${SPARK_MEASURE_URL}) ${SPARK_MEASURE_URL}

# Set up Python virtual environment
RUN python -m venv venv

# Activate virtual environment and install packages
RUN . venv/bin/activate && pip install --no-cache-dir jupyter pyspark findspark sparkmeasure
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
> Written with [StackEdit]

This repository contains the files needed to build Docker containers for:

- Spark Master (v. 3.5.1)
- Spark Worker
- Spark History Server
- Jupyter Notebook

## Prerequisites

- Docker installed

## File structure

- `docker-compose.yaml`: defines the Docker services for the Spark Master, Spark Workers, Spark History Server, and Jupyter Notebook.
- `Dockerfile`: builds the single image shared by the Spark and Jupyter Notebook services.

## Using Docker Compose

### Starting the containers

1. Clone the repository:

   ```bash
   git clone https://github.com/airtoncarneiro/dev-spark-jupyter
   cd dev-spark-jupyter
   ```

2. To start the containers with 1 worker, run:
   ```bash
   docker compose up -d
   ```

3. To start the containers with one or more workers, use the `--scale` option to set the desired number of workers. For example, to start with 2 workers:
   ```bash
   docker compose up -d --scale spark-worker=2
   ```
   *If you do not want any worker: `--scale spark-worker=0`*


### Stopping/tearing down the containers

1. Stop the containers; the containers, the network, and the volumes are kept:
   ```bash
   docker compose stop
   ```
2. Remove the containers (tears everything down):
   ```bash
   docker compose down
   ```

## Accessing the web interfaces

- **Spark Master**:
  - URL: [http://localhost:8080](http://localhost:8080)
  - Spark Master web UI for monitoring the cluster.

- **Jupyter Notebook**:
  - URL: [http://localhost:8888](http://localhost:8888)
  - Jupyter web UI for running interactive notebooks.

- **Spark History Server**:
  - URL: [http://localhost:18080](http://localhost:18080)
  - Spark History Server web UI for browsing the history of job executions.
  - Logs are stored in the host's `./logs` directory.

## Using VSCode (devcontainer)

When you open the project in VSCode, a message will pop up asking to reopen it in the _devcontainer_. Just accept it: the whole environment is built automatically and you can use Jupyter integrated into VSCode.
Edit the _.devcontainer/docker-compose.override.yaml_ file if you want to change the defaults established in _docker-compose.yaml_.

![vscode devcontainer popup image](./img/devcontainer.png)


## Customization

You can customize the container settings by editing `docker-compose.yaml` and the `Dockerfile` as needed.
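
If you work through the devcontainer, the same kind of customization can also go into `.devcontainer/docker-compose.override.yaml`, which Compose layers on top of `docker-compose.yaml`. The override shipped with this repository, for example, only scales the worker service to two instances:

```yaml
# .devcontainer/docker-compose.override.yaml (as shipped in this repository)
services:
  spark-worker:
    scale: 2
```

Any other service-level setting (environment variables, ports, volumes) can be overridden the same way without touching the base file.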

In `docker-compose.yaml` itself, for example, you can change the Spark Worker's memory or core count by adjusting (or adding) the `SPARK_WORKER_MEMORY` and `SPARK_WORKER_CORES` environment variables of the `spark-worker` service:

```yaml
environment:
  - SPARK_MODE=worker
  - SPARK_WORKER_MEMORY=1G # Change this value to adjust the memory (e.g. 2G)
  - SPARK_WORKER_CORES=1   # Change this value to adjust the number of cores (e.g. 2)
```

## Contributing

Contributions are welcome! Feel free to suggest improvements or new environments.

## Notes

[2024-06-02] Added **Spark Measure**
If you want to use it, here is an example:

```python
from sparkmeasure import StageMetrics
stagemetrics = StageMetrics(spark)

query = """
spark.sql("select count(*) \
from range(1000) \
cross join range(1000) \
cross join range(1000)").show()
"""

stagemetrics.runandmeasure(globals(), query)
```
and point `spark.jars` at the downloaded JAR when building the SparkSession (the Dockerfile places it in the user's `jars` directory):
```python
.config("spark.jars", "/home/spark/jars/spark-measure_2.12-0.24.jar")
```
---
[2024-06-14]
1. Switched from Jupyter to JupyterLab as the default
2. Added a devcontainer for VSCode
--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
x-common-args:
  &common-args
  USER_NAME: ${USER_NAME}
  USER_HOME: ${USER_HOME}

services:
  spark-master:
    build:
      context: .
      args:
        <<: *common-args
    image: dev-spark-jupyter-pyspark:3.5.1
    environment:
      SPARK_MODE: master
    ports:
      - "8080:8080"
      - "7077:7077"
    user: ${USER_NAME}
    command: /bin/bash -c "/opt/spark-3.5.1-bin-hadoop3/sbin/start-master.sh && tail -f /opt/spark-3.5.1-bin-hadoop3/logs/*"
    networks:
      - spark-network

  spark-worker:
    build:
      context: .
      args:
        <<: *common-args
    image: dev-spark-jupyter-pyspark:3.5.1
    environment:
      SPARK_MODE: worker
      SPARK_MASTER_URL: spark://spark-master:7077
    # ports:
    #   - "8081:8081"
    user: ${USER_NAME}
    command: /bin/bash -c "/opt/spark-3.5.1-bin-hadoop3/sbin/start-worker.sh spark://spark-master:7077 && tail -f /opt/spark-3.5.1-bin-hadoop3/logs/*"
    depends_on:
      - spark-master
    networks:
      - spark-network

  history-server:
    build:
      context: .
      args:
        <<: *common-args
    image: dev-spark-jupyter-pyspark:3.5.1
    environment:
      - SPARK_MODE=history-server
      - SPARK_HISTORY_OPTS=-Dspark.history.fs.logDirectory=${USER_HOME}/logs
      - SPARK_HISTORY_FS_UPDATE_INTERVAL=10s
    ports:
      - "18080:18080"
    volumes:
      - ./logs:${USER_HOME}/logs
    command: /bin/bash -c "/opt/spark-3.5.1-bin-hadoop3/sbin/start-history-server.sh && tail -f ${USER_HOME}/logs/*"
    depends_on:
      - spark-master
    networks:
      - spark-network


  jupyter-notebook:
    build:
      context: .
      args:
        <<: *common-args
    image: dev-spark-jupyter-pyspark:3.5.1
    # environment:
    ports:
      - "8888:8888"
    volumes:
      - ./notebooks:${USER_HOME}/notebooks
      - ./logs:${USER_HOME}/logs
    working_dir: ${USER_HOME}/notebooks
    user: ${USER_NAME}
    command: ["jupyter-lab", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root", "--NotebookApp.token=''"]
    depends_on:
      - spark-master
    networks:
      - spark-network

networks:
  spark-network:
    driver: bridge
--------------------------------------------------------------------------------
/img/devcontainer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airtoncarneiro/dev-spark-jupyter/55e6703a2eac588aa5af7c3589408748831ccc8e/img/devcontainer.png
--------------------------------------------------------------------------------
/notebooks/Spark-Shuffling.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import findspark # type: ignore\n",
    "findspark.init() # type: ignore\n",
    "\n",
    "from pyspark.sql import SparkSession"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize SparkSession\n",
    "spark = SparkSession.builder \\\n",
    "    .appName(\"Product\") \\\n",
    "    .getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a sample DataFrame\n",
    "data = [(\"Apple\", 10), (\"Samsung\", 30), (\"Apple\", 20), (\"Samsung\", 20)]\n",
    "df = spark.createDataFrame(data, [\"Product\", \"Qty\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Perform a groupBy operation, triggering shuffling\n",
    "grouped_df = df.groupBy(\"Product\").sum(\"Qty\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+--------+\n",
      "|Product|sum(Qty)|\n",
      "+-------+--------+\n",
      "|  Apple|      30|\n",
      "|Samsung|      50|\n",
      "+-------+--------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Show the result\n",
    "grouped_df.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stop SparkSession\n",
    "spark.stop()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
--------------------------------------------------------------------------------
/notebooks/teste.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "code",
| "execution_count": 1, 6 | "id": "86b10e83-fff2-4768-994b-dca9f3c1acc3", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Obter o caminho para os notebooks e logs a partir das variáveis de ambiente\n", 11 | "import os\n", 12 | "user_name = os.environ.get('USER_NAME', 'spark')\n", 13 | "user_home = os.environ.get('USER_HOME', '/home/spark')" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "id": "69270f6c", 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import findspark # type: ignore\n", 24 | "# findspark.init() # type: ignore\n", 25 | "\n", 26 | "from pyspark.sql import SparkSession" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "df6b4b6c-d262-4cf5-89db-6fd961326a27", 32 | "metadata": {}, 33 | "source": [ 34 | "### SparkSession - STANDALONE" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "id": "26ef24d7-ff33-402a-88ea-1d99c119fde8", 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stderr", 45 | "output_type": "stream", 46 | "text": [ 47 | "Setting default log level to \"WARN\".\n", 48 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 49 | "24/06/16 13:01:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "# Construir a sessão Spark\n", 55 | "spark = SparkSession.builder \\\n", 56 | " .appName(\"JupyterSparkExample\") \\\n", 57 | " .master(\"spark://spark-master:7077\") \\\n", 58 | " .config(\"spark.eventLog.enabled\", \"true\") \\\n", 59 | " .config(\"spark.eventLog.dir\", f\"file://{user_home}/logs\") \\\n", 60 | " .getOrCreate()" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "id": "e503eba3-5189-494c-8655-c6bc1d794b47", 66 | "metadata": {}, 67 | "source": [ 68 | "### SparkSession - LOCAL" 69 | ] 70 | }, 71 | { 72 | "cell_type": "raw", 73 | "id": "a5f7e074-c389-4350-8cdd-7c24075a194f", 74 | "metadata": {}, 75 | "source": [ 76 | "# se quiser executar o Spark em modo LOCAL usando todos os cores da CPU\n", 77 | "spark = SparkSession.builder \\\n", 78 | " .appName(\"LocalSparkExample\") \\\n", 79 | " .master(\"local[*]\") \\\n", 80 | " .getOrCreate()" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "id": "4998a90a-5a1c-4df0-a937-ebf982f73315", 86 | "metadata": {}, 87 | "source": [ 88 | "#### Código" 89 | ] 90 | }, 91 | { 92 | "cell_type": "raw", 93 | "id": "01d4e0f3-9019-4fec-83da-b5746f0aa150", 94 | "metadata": {}, 95 | "source": [ 96 | "print(spark.version)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 4, 102 | "id": "f37af223-1c65-4655-90d0-7c0f31e83761", 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "data = [(\"Alice\", 34), (\"Bob\", 45), (\"Cathy\", 29)]\n", 107 | "columns = [\"Name\", \"Age\"]" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "id": "19806373-12b6-44e7-b91b-b01e3014dcdf", 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "df = spark.createDataFrame(data, columns)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "b385df48-ab8e-48f9-aa61-ab6337ba391e", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "df.show()" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "id": "128db74c-0a90-4e46-8d48-4dc3ea98e4a4", 134 | "metadata": {}, 135 | "outputs": [], 136 
| "source": [ 137 | "df_grouped = df.groupBy(\"Age\").count()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "id": "6953a4a9-5248-4bfd-ad94-2391f85bb0b9", 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "df_grouped.show()" 148 | ] 149 | }, 150 | { 151 | "cell_type": "raw", 152 | "id": "474dd9c9-a2a7-4b34-a75a-4a3e2ba509f2", 153 | "metadata": {}, 154 | "source": [ 155 | "# Salva o DataFrame transformado em um arquivo Parquet (opcional)\n", 156 | "df_grouped.write.parquet(\"/opt/bitnami/spark/data/age_counts.parquet\")" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "id": "547f5b3a-21cc-4651-9a70-04f9afe1e9df", 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "spark.stop()" 167 | ] 168 | } 169 | ], 170 | "metadata": { 171 | "kernelspec": { 172 | "display_name": "Python 3 (ipykernel)", 173 | "language": "python", 174 | "name": "python3" 175 | }, 176 | "language_info": { 177 | "codemirror_mode": { 178 | "name": "ipython", 179 | "version": 3 180 | }, 181 | "file_extension": ".py", 182 | "mimetype": "text/x-python", 183 | "name": "python", 184 | "nbconvert_exporter": "python", 185 | "pygments_lexer": "ipython3", 186 | "version": "3.12.4" 187 | } 188 | }, 189 | "nbformat": 4, 190 | "nbformat_minor": 5 191 | } 192 | --------------------------------------------------------------------------------