├── data ├── test_log1.csv └── test_log2.csv ├── master ├── work_dir │ ├── python_apps │ │ └── example │ │ │ ├── main.py │ │ │ ├── functions.py │ │ │ └── example.py │ ├── requirements.txt │ ├── pyproject.toml │ ├── notebooks │ │ └── Example.ipynb │ └── scala_apps │ │ └── example │ │ ├── pom.xml │ │ └── src │ │ └── main │ │ └── scala │ │ └── stubs │ │ └── example.scala └── Dockerfile ├── docker-compose.yml ├── conf ├── master │ └── spark-defaults.conf └── worker │ └── spark-defaults.conf ├── .gitignore └── README.md /data/test_log1.csv: -------------------------------------------------------------------------------- 1 | userId;region;type;paid 2 | 27103;0;0;true 3 | 74637;2;1;false 4 | 4052;1;2;false 5 | 36462;1;0;true 6 | 12832;0;1;true 7 | 55466;2;0;false 8 | -------------------------------------------------------------------------------- /master/work_dir/python_apps/example/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | main.py: It executes an example function. 5 | """ 6 | 7 | __author__ = "alvertogit" 8 | __copyright__ = "Copyright 2018-2025" 9 | 10 | import sys 11 | 12 | from functions import example_function 13 | 14 | if __name__ == "__main__": 15 | arguments = sys.argv[1:] 16 | print("Executing example_function") 17 | example_function(*arguments) 18 | print("Ending example_function") 19 | -------------------------------------------------------------------------------- /data/test_log2.csv: -------------------------------------------------------------------------------- 1 | hour;userId;songId;genderId;deviceId 2 | 18-10-2017 00:00:25;27103;231990117;23;1_27103 3 | 18-10-2017 00:02:00;74637;241781021;24;1_74637 4 | 18-10-2017 23:02:01;4052;220134142;22;0_4052 5 | 18-10-2017 00:05:31;36462;282102171;28;0_36462 6 | 18-10-2017 00:08:46;12832;152921597;15;0_12832 7 | 18-10-2017 01:09:21;27103;160764835;16;0_27103 8 | 18-10-2017 00:11:19;55466;251274066;25;1_55466 9 | 18-10-2017 00:11:39;74637;273954131;27;2_74637 10 | 18-10-2017 23:14:54;4052;273735239;27;0_4052 11 | -------------------------------------------------------------------------------- /master/work_dir/requirements.txt: -------------------------------------------------------------------------------- 1 | findspark==2.0.1 2 | folium==0.20.0 3 | geojson==3.2.0 4 | graphviz==0.21 5 | imbalanced-learn==0.14.0 6 | ipyleaflet==0.20.0 7 | ipywidgets==8.1.8 8 | joblib==1.5.2 9 | jupyterlab==4.5.0 10 | matplotlib==3.10.8 11 | numpy==2.3.5 12 | pandas==2.3.3 13 | pre-commit==4.5.0 14 | pur==7.3.3 15 | pydot==4.0.1 16 | pyspark-client==4.0.1 17 | requests==2.32.5 18 | ruff==0.14.9 19 | scikit-image==0.25.2 20 | scikit-learn==1.8.0 21 | seaborn==0.13.2 22 | Sphinx==9.0.4 23 | sphinx-rtd-theme==3.0.2 24 | tensorflow==2.20.0 25 | tpot==1.1.0 26 | xgboost==3.1.2 27 | -------------------------------------------------------------------------------- /master/work_dir/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "bigdata_docker" 3 | version = "1.0.0" 4 | authors = [ 5 | { name="alvertogit" }, 6 | ] 7 | description = "A containerized cluster for Big Data" 8 | readme = "README.md" 9 | classifiers = [ 10 | "Programming Language :: Python :: 3", 11 | ] 12 | 13 | [project.urls] 14 | Homepage = "https://github.com/alvertogit/bigdata_docker" 15 | 16 | [tool.ruff] 17 | extend-include = ["*.ipynb"] 18 | 19 | line-length = 100 20 | target-version = "py312" 21 | 22 | [tool.ruff.lint] 23 | select = [ 24 
| # pycodestyle 25 | "E", 26 | "W", 27 | # Pyflakes 28 | "F", 29 | # pyupgrade 30 | "UP", 31 | # flake8-bugbear 32 | "B", 33 | # flake8-simplify 34 | "SIM", 35 | # isort 36 | "I", 37 | ] 38 | -------------------------------------------------------------------------------- /master/work_dir/python_apps/example/functions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | functions.py: It contents example functions. 5 | """ 6 | 7 | __author__ = "alvertogit" 8 | __copyright__ = "Copyright 2018-2025" 9 | 10 | import sys 11 | 12 | 13 | def example_function(*args): 14 | """Function example that process input data and prints a number. 15 | 16 | Args: 17 | number (int): The integer parameter to be printed. 18 | 19 | Raises: 20 | Exception: If the number of args is lower than 1. 21 | ValueError: If args[0] is not an integer. 22 | 23 | """ 24 | 25 | if len(args) < 1: 26 | raise ValueError("""Error: required arguments """) 27 | 28 | try: 29 | int(args[0]) 30 | except ValueError: 31 | print("Error: args[0] is not an integer") 32 | sys.exit(1) 33 | 34 | print(f"Number: {int(args[0])}") 35 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | master: 3 | build: 4 | context: ./master 5 | target: spark-master 6 | command: sbin/start-master.sh 7 | init: true 8 | hostname: master 9 | environment: 10 | MASTER: spark://master:7077 11 | SPARK_CONF_DIR: /conf 12 | SPARK_NO_DAEMONIZE: true 13 | SPARK_PUBLIC_DNS: localhost 14 | expose: 15 | - 7001 16 | - 7002 17 | - 7077 18 | - 6066 19 | ports: 20 | - 4040:4040 21 | - 6066:6066 22 | - 7077:7077 23 | - 8080:8080 24 | - 8888:8888 25 | volumes: 26 | - ./conf/master:/conf 27 | - ./data:/tmp/data 28 | networks: 29 | - spark_network 30 | 31 | worker: 32 | build: 33 | context: ./master 34 | target: spark-node 35 | command: sbin/start-worker.sh spark://master:7077 36 | init: true 37 | depends_on: 38 | - master 39 | hostname: worker 40 | environment: 41 | SPARK_CONF_DIR: /conf 42 | SPARK_NO_DAEMONIZE: true 43 | SPARK_PUBLIC_DNS: localhost 44 | SPARK_WORKER_CORES: 2 45 | SPARK_WORKER_MEMORY: 2g 46 | SPARK_WORKER_PORT: 8881 47 | SPARK_WORKER_WEBUI_PORT: 8081 48 | expose: 49 | - 7012 50 | - 8881 51 | ports: 52 | - 8081:8081 53 | volumes: 54 | - ./conf/worker:/conf 55 | - ./data:/tmp/data 56 | networks: 57 | - spark_network 58 | 59 | networks: 60 | spark_network: 61 | driver: bridge 62 | -------------------------------------------------------------------------------- /conf/master/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Default system properties included when running spark-submit. 19 | # This is useful for setting default environmental settings. 20 | 21 | # Example: 22 | # spark.master spark://master:7077 23 | # spark.eventLog.enabled true 24 | # spark.eventLog.dir hdfs://namenode:8021/directory 25 | # spark.serializer org.apache.spark.serializer.KryoSerializer 26 | # spark.driver.memory 5g 27 | # spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" 28 | 29 | # spark.driver.memory 10g 30 | spark.driver.port 7001 31 | spark.blockManager.port 7002 32 | -------------------------------------------------------------------------------- /conf/worker/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Default system properties included when running spark-submit. 19 | # This is useful for setting default environmental settings. 20 | 21 | # Example: 22 | # spark.master spark://master:7077 23 | # spark.eventLog.enabled true 24 | # spark.eventLog.dir hdfs://namenode:8021/directory 25 | # spark.serializer org.apache.spark.serializer.KryoSerializer 26 | # spark.driver.memory 5g 27 | # spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" 28 | 29 | # spark.driver.memory 10g 30 | # spark.driver.port 7011 31 | spark.blockManager.port 7012 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /master/work_dir/notebooks/Example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Example of Jupyter Notebook" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import findspark\n", 17 | "\n", 18 | "findspark.init()" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import pyspark" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "sc = pyspark.SparkContext()" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "Hello, Hola\n" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "print(\"Hello, Hola\")" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [] 62 | } 63 | ], 64 | "metadata": { 65 | "kernelspec": { 66 | "display_name": "Python 3", 67 | "language": "python", 68 | "name": "python3" 69 | }, 70 | "language_info": { 71 | "codemirror_mode": { 72 | "name": "ipython", 73 | "version": 3 74 | }, 75 | "file_extension": ".py", 76 | "mimetype": "text/x-python", 77 | "name": "python", 78 | "nbconvert_exporter": "python", 79 | "pygments_lexer": "ipython3", 80 | "version": "3.7.5" 81 | } 82 | }, 83 | "nbformat": 4, 84 | "nbformat_minor": 2 85 | } 86 | -------------------------------------------------------------------------------- /master/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM eclipse-temurin:17-jdk-noble AS spark-node 2 | 3 | RUN apt-get update \ 4 | && apt-get install -y curl procps unzip wget \ 5 | && apt-get clean \ 6 | && rm -rf /var/lib/apt/lists/* 7 | 8 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 9 | 10 | # HADOOP 11 | 12 | ENV HADOOP_VERSION=3.4.2 13 | ENV HADOOP_HOME=/usr/hadoop-$HADOOP_VERSION 14 | ENV HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop 15 | ENV PATH=$PATH:$HADOOP_HOME/bin 16 | RUN curl -sL --retry 3 \ 17 | 
"https://www-eu.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION-lean.tar.gz" \ 18 | | gunzip \ 19 | | tar -x -C /usr/ \ 20 | && rm -rf $HADOOP_HOME/share/doc \ 21 | && chown -R root:root $HADOOP_HOME 22 | 23 | # SPARK 24 | 25 | ENV SPARK_VERSION=4.0.1 26 | ENV SPARK_PACKAGE=spark-${SPARK_VERSION}-bin-without-hadoop 27 | ENV SPARK_HOME=/usr/spark-${SPARK_VERSION} 28 | ENV SPARK_DIST_CLASSPATH="$HADOOP_HOME/etc/hadoop/*:$HADOOP_HOME/share/hadoop/common/lib/*:$HADOOP_HOME/share/hadoop/common/*:$HADOOP_HOME/share/hadoop/hdfs/*:$HADOOP_HOME/share/hadoop/hdfs/lib/*:$HADOOP_HOME/share/hadoop/hdfs/*:$HADOOP_HOME/share/hadoop/yarn/lib/*:$HADOOP_HOME/share/hadoop/yarn/*:$HADOOP_HOME/share/hadoop/mapreduce/lib/*:$HADOOP_HOME/share/hadoop/mapreduce/*:$HADOOP_HOME/share/hadoop/tools/lib/*" 29 | ENV PATH=$PATH:${SPARK_HOME}/bin 30 | RUN curl -sL --retry 3 \ 31 | "https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename=spark/spark-${SPARK_VERSION}/${SPARK_PACKAGE}.tgz" \ 32 | | gunzip \ 33 | | tar x -C /usr/ \ 34 | && mv /usr/$SPARK_PACKAGE $SPARK_HOME \ 35 | && chown -R root:root $SPARK_HOME 36 | 37 | WORKDIR $SPARK_HOME 38 | 39 | CMD ["sbin/start-master.sh"] 40 | 41 | FROM spark-node AS spark-master 42 | 43 | # PYTHON 44 | 45 | RUN apt-get update --fix-missing \ 46 | && apt-get install -y bzip2 ca-certificates git python3.12 python3.12-venv python3-pip \ 47 | && apt-get clean \ 48 | && rm -rf /var/lib/apt/lists/* 49 | 50 | # MAVEN 51 | 52 | WORKDIR /usr/spark-${SPARK_VERSION}/ 53 | 54 | ENV MAVEN_VERSION=3.9.11 55 | RUN mkdir /opt/apache-maven 56 | RUN wget --quiet -c https://dlcdn.apache.org/maven/maven-3/${MAVEN_VERSION}/binaries/apache-maven-${MAVEN_VERSION}-bin.tar.gz -O apache-maven.tar.gz && \ 57 | tar xzvf apache-maven.tar.gz -C /opt/apache-maven --strip-components=1 && \ 58 | rm -f apache-maven.tar.gz && \ 59 | ln -s /opt/apache-maven/bin/mvn /usr/bin/mvn 60 | 61 | ENV MAVEN_HOME=/opt/apache-maven 62 | ENV PATH=MAVEN_HOME:$PATH 63 | 64 | # Work dir files 65 | 66 | RUN mkdir /usr/spark-${SPARK_VERSION}/work_dir 67 | WORKDIR /usr/spark-${SPARK_VERSION}/work_dir 68 | ADD ./work_dir /usr/spark-${SPARK_VERSION}/work_dir 69 | 70 | # Python virtual environment 71 | 72 | RUN python3 -m venv /opt/venv 73 | ENV PATH="/opt/venv/bin:$PATH" 74 | RUN python3 -m pip install pip==25.3 75 | RUN python3 -m pip install setuptools==80.9.0 76 | RUN python3 -m pip install --no-cache-dir -r requirements.txt 77 | 78 | # Compiling Scala example with MAVEN 79 | 80 | WORKDIR /usr/spark-${SPARK_VERSION}/work_dir/scala_apps/example 81 | RUN mvn package 82 | 83 | WORKDIR $SPARK_HOME 84 | CMD ["sbin/start-master.sh"] 85 | -------------------------------------------------------------------------------- /master/work_dir/scala_apps/example/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | alvertogit.bigdata_docker 4 | example 5 | 1.0 6 | jar 7 | "Example App" 8 | 9 | 10 | 3.4.2 11 | 4.0.1 12 | 2.13.16 13 | 2.13 14 | 3.2.0 15 | 17 16 | ${spark.version} 17 | 18 | 19 | 20 | 21 | apache-repo 22 | Apache Repository 23 | https://repository.apache.org/content/repositories/releases 24 | 25 | true 26 | 27 | 28 | false 29 | 30 | 31 | 32 | 33 | 34 | 35 | org.scala-lang 36 | scala-library 37 | ${scala.version} 38 | 39 | 40 | 41 | org.apache.spark 42 | spark-core_${scala.binary.version} 43 | ${spark.version} 44 | 45 | 46 | 47 | org.apache.spark 48 | spark-sql_${scala.binary.version} 49 | ${spark.version} 50 | 51 | 52 | 53 | 
org.apache.hadoop 54 | hadoop-client 55 | ${hadoop.version} 56 | 57 | 58 | 59 | org.opensearch 60 | opensearch 61 | ${opensearch.version} 62 | provided 63 | 64 | 65 | 66 | org.opensearch.client 67 | opensearch-hadoop 68 | 1.3.0 69 | 70 | 71 | 72 | org.opensearch.client 73 | opensearch-spark-30_2.13 74 | 1.3.0 75 | 76 | 77 | 78 | 79 | 80 | 81 | net.alchim31.maven 82 | scala-maven-plugin 83 | 4.9.5 84 | 85 | 86 | 87 | compile 88 | 89 | 90 | 91 | 92 | 93 | maven-compiler-plugin 94 | 3.14.0 95 | 96 | ${java.version} 97 | ${java.version} 98 | 99 | 100 | 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /master/work_dir/scala_apps/example/src/main/scala/stubs/example.scala: -------------------------------------------------------------------------------- 1 | /** 2 | 3 | @author alvertogit 4 | Copyright 2018-2025 5 | 6 | Local execution command example: 7 | 8 | ~/usr/spark-4.0.1/work_dir/scala_apps/example$ spark-submit \ 9 | --master local[2] \ 10 | --driver-memory 10g \ 11 | --class stubs.Example \ 12 | target/example-1.0.jar \ 13 | 1 49999 \ 14 | /tmp/data/test_log1.csv \ 15 | /tmp/data/test_log2.csv \ 16 | /tmp/data/result_local_log 17 | 18 | Worker execution command example: 19 | 20 | ~/usr/spark-4.0.1/work_dir/scala_apps/example$ spark-submit \ 21 | --master spark://master:7077 \ 22 | --driver-memory 10g \ 23 | --class stubs.Example \ 24 | target/example-1.0.jar \ 25 | 1 49999 \ 26 | /tmp/data/test_log1.csv \ 27 | /tmp/data/test_log2.csv \ 28 | /tmp/data/result_worker_log 29 | 30 | */ 31 | 32 | package stubs 33 | 34 | import org.apache.spark.SparkContext 35 | import org.apache.spark.sql.SparkSession 36 | import org.apache.spark.sql.expressions.Window 37 | import org.apache.spark.sql.types._ 38 | import org.apache.spark.sql.Row 39 | import org.apache.spark.sql.functions._ 40 | 41 | object Example { 42 | def main(args: Array[String]) { 43 | 44 | if (args.length < 4) { 45 | System.err.println("Usage: stubs.Example ") 46 | System.exit(1) 47 | } 48 | 49 | val minRangeId = args(0).toInt 50 | val maxRangeId = args(1).toInt 51 | val path_input_log1 = args(2) 52 | val path_input_log2 = args(3) 53 | val path_output_log = args(4) 54 | 55 | def rangeId (user_id: Int) = { 56 | if (user_id >= minRangeId & user_id <= maxRangeId) { 57 | true 58 | } 59 | else{ 60 | false 61 | } 62 | } 63 | 64 | val sc = new SparkContext() 65 | val spark = SparkSession.builder.appName("ExampleApp").getOrCreate() 66 | import spark.implicits._ 67 | 68 | // test_log1.csv input data format example 69 | // 70 | // userId;region;type;paid 71 | // 27103;0;0;true 72 | // 74637;2;1;false 73 | // ... 74 | 75 | val infoSchema = StructType(Array( 76 | StructField("userId",IntegerType,true), 77 | StructField("region",IntegerType,true), 78 | StructField("type",IntegerType,true), 79 | StructField("paid",BooleanType,true) 80 | )) 81 | 82 | val infoDF = spark.read.schema(infoSchema).option("header","true").option("delimiter", ";").csv(path_input_log1) 83 | 84 | // test_log2.csv input data format example 85 | // 86 | // hour;userId;songId;genderId;deviceId 87 | // 18-10-2017 00:00:25;27103;231990117;23;1_27103 88 | // 18-10-2017 00:02:00;74637;241781021;24;1_74637 89 | // ... 
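// Note (added comment): in the mapping further below, the raw "hour" timestamp (e.g. "18-10-2017 00:00:25")
// is reduced to an integer hour of day and deviceId keeps only the numeric prefix before '_'
// (e.g. "1_27103" -> 1), which is why every field of songSchema can be declared as IntegerType.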
90 | 91 | val songSchema = StructType(Array( 92 | StructField("userId",IntegerType,true), 93 | StructField("hour",IntegerType,true), 94 | StructField("songId",IntegerType,true), 95 | StructField("genderId",IntegerType,true), 96 | StructField("deviceId",IntegerType,true) 97 | )) 98 | 99 | val songRDD = sc.textFile(path_input_log2) 100 | 101 | val songHeader = songRDD.first 102 | 103 | val songRDDFiltered = songRDD.filter(record => record != songHeader).map(line => line.split(";")).filter( 104 | field => rangeId(field(1).toInt 105 | )).map( 106 | rec => Row( 107 | rec(1).toInt, 108 | rec(0).split(' ')(1).split(':')(0).toInt, 109 | rec(2).toInt, 110 | rec(3).toInt, 111 | rec(4).split('_')(0).toInt 112 | )) 113 | 114 | val songDF = spark.createDataFrame(songRDDFiltered,songSchema) 115 | 116 | val distSongDF = songDF.groupBy($"userId").agg(countDistinct($"songId")).withColumnRenamed("count(DISTINCT songId)","distSongIds") 117 | 118 | val genderSongDF = songDF.groupBy($"userId",$"genderId").count.withColumnRenamed("count","topGenderIdSongs") 119 | 120 | val genderWindow = Window.partitionBy("userId").orderBy($"topGenderIdSongs".desc) 121 | 122 | val windowGenderSongDF = genderSongDF.withColumn("rank", row_number().over(genderWindow)).where($"rank" === 1).drop($"rank" 123 | ).withColumnRenamed("genderId","topGenderId") 124 | 125 | val globalSongDF = windowGenderSongDF.join(distSongDF,"userId") 126 | 127 | // globalSongDF format example 128 | // 129 | // userId;topGenderId;topGenderIdSongs;distSongIds 130 | // 27103;23;1;2 131 | // 4052;27;1;2 132 | // ... 133 | 134 | // joining global song and info DFs and ordering output 135 | val resultDF = globalSongDF.join(infoDF ,"userId") 136 | 137 | // resultDF format example 138 | // 139 | // userId;topGenderId;topGenderIdSongs;distSongIds;region;type;paid 140 | // 27103;23;1;2;0;0;true 141 | // 4052;27;1;2;1;2;false 142 | // ... 143 | 144 | resultDF.coalesce(1).write.option("header","true").option("delimiter", ";").csv(path_output_log) 145 | 146 | spark.stop 147 | 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /master/work_dir/python_apps/example/example.py: -------------------------------------------------------------------------------- 1 | """ 2 | example.py: pyspark application example. 
3 | 4 | Local execution command example: 5 | 6 | ~/usr/spark-4.0.1/work_dir/python_apps/example$ spark-submit \ 7 | --master local[2] \ 8 | --driver-memory 10g \ 9 | example.py \ 10 | 1 49999 \ 11 | /tmp/data/test_log1.csv \ 12 | /tmp/data/test_log2.csv \ 13 | /tmp/data/result_local_log 14 | 15 | Worker execution command example: 16 | 17 | ~/usr/spark-4.0.1/work_dir/python_apps/example$ spark-submit \ 18 | --master spark://master:7077 \ 19 | --driver-memory 10g \ 20 | example.py \ 21 | 1 49999 \ 22 | /tmp/data/test_log1.csv \ 23 | /tmp/data/test_log2.csv \ 24 | /tmp/data/result_worker_log 25 | """ 26 | 27 | __author__ = "alvertogit" 28 | __copyright__ = "Copyright 2018-2025" 29 | 30 | 31 | import os 32 | import sys 33 | 34 | from pyspark.sql import Row, SparkSession, Window 35 | from pyspark.sql.functions import countDistinct, desc, row_number 36 | from pyspark.sql.types import BooleanType, IntegerType, StructField, StructType 37 | 38 | if __name__ == "__main__": 39 | if len(sys.argv) < 5: 40 | print(""" 41 | Usage error: example.py 42 | 43 | """) 44 | sys.exit(1) 45 | 46 | minRangeId = int(sys.argv[1]) 47 | maxRangeId = int(sys.argv[2]) 48 | path_input_log1 = sys.argv[3] 49 | path_input_log2 = sys.argv[4] 50 | path_output_log = sys.argv[5] 51 | 52 | def rangeId(user_id): 53 | if user_id >= minRangeId and user_id <= maxRangeId: 54 | return True 55 | return False 56 | 57 | os.environ["PYSPARK_PYTHON"] = sys.executable 58 | os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable 59 | spark = SparkSession.builder.appName("pyspark example").getOrCreate() 60 | # spark.conf.set("spark.driver.memory", "10g") 61 | # spark.sparkContext.setLogLevel("WARN") 62 | sc = spark.sparkContext 63 | 64 | # test_log1.csv input data format example 65 | # 66 | # userId;region;type;paid 67 | # 27103;0;0;true 68 | # 74637;2;1;false 69 | # ... 70 | 71 | infoSchema = StructType( 72 | [ 73 | StructField("userId", IntegerType(), True), 74 | StructField("region", IntegerType(), True), 75 | StructField("type", IntegerType(), True), 76 | StructField("paid", BooleanType(), True), 77 | ] 78 | ) 79 | 80 | infoDF = ( 81 | spark.read.schema(infoSchema) 82 | .option("header", "true") 83 | .option("delimiter", ";") 84 | .csv(path_input_log1) 85 | ) 86 | 87 | # test_log2.csv input data format example 88 | # 89 | # hour;userId;songId;genderId;deviceId 90 | # 18-10-2017 00:00:25;27103;231990117;23;1_27103 91 | # 18-10-2017 00:02:00;74637;241781021;24;1_74637 92 | # ... 
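# Note (added comment): in the mapping further below, the raw "hour" timestamp (e.g. "18-10-2017 00:00:25")
# is reduced to an integer hour of day and deviceId keeps only the numeric prefix before "_"
# (e.g. "1_27103" -> 1), which is why every field of songSchema can be declared as IntegerType().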
93 | 94 | songSchema = StructType( 95 | [ 96 | StructField("userId", IntegerType(), True), 97 | StructField("hour", IntegerType(), True), 98 | StructField("songId", IntegerType(), True), 99 | StructField("genderId", IntegerType(), True), 100 | StructField("deviceId", IntegerType(), True), 101 | ] 102 | ) 103 | 104 | songRDD = sc.textFile(path_input_log2) 105 | 106 | songHeader = songRDD.first() 107 | 108 | songRDDFiltered = ( 109 | songRDD.filter(lambda record: record != songHeader) 110 | .map(lambda line: line.split(";")) 111 | .filter(lambda field: rangeId(int(field[1]))) 112 | .map( 113 | lambda rec: Row( 114 | int(rec[1]), 115 | int(rec[0].split(" ")[1].split(":")[0]), 116 | int(rec[2]), 117 | int(rec[3]), 118 | int(rec[4].split("_")[0]), 119 | ) 120 | ) 121 | ) 122 | 123 | songDF = spark.createDataFrame(songRDDFiltered, songSchema) 124 | 125 | distSongDF = ( 126 | songDF.groupBy("userId") 127 | .agg(countDistinct("songId")) 128 | .withColumnRenamed("count(DISTINCT songId)", "distSongIds") 129 | ) 130 | 131 | genderSongDF = ( 132 | songDF.groupBy("userId", "genderId").count().withColumnRenamed("count", "topGenderIdSongs") 133 | ) 134 | 135 | genderWindow = Window.partitionBy("userId").orderBy(desc("topGenderIdSongs")) 136 | 137 | windowGenderSongDF = ( 138 | genderSongDF.withColumn("rank", row_number().over(genderWindow)) 139 | .where("rank = 1") 140 | .drop("rank") 141 | .withColumnRenamed("genderId", "topGenderId") 142 | ) 143 | 144 | globalSongDF = windowGenderSongDF.join(distSongDF, "userId") 145 | 146 | # globalSongDF format example 147 | # 148 | # userId;topGenderId;topGenderIdSongs;distSongIds 149 | # 27103;23;1;2 150 | # 4052;27;1;2 151 | # ... 152 | 153 | # joining global song and info DFs and ordering output 154 | resultDF = globalSongDF.join(infoDF, "userId") 155 | 156 | # resultDF format example 157 | # 158 | # userId;topGenderId;topGenderIdSongs;distSongIds;region;type;paid 159 | # 27103;23;1;2;0;0;true 160 | # 4052;27;1;2;1;2;false 161 | # ... 162 | 163 | resultDF.coalesce(1).write.option("header", "true").option("delimiter", ";").csv( 164 | path_output_log 165 | ) 166 | 167 | spark.stop() 168 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SPARK DOCKER CLUSTER FOR BIG DATA & DATA SCIENCE 2 | 3 | This repository stores all the required components to build a containerized cluster for [Big Data] and [Data Science] applications. It allows scalable production services using technologies such as [Machine Learning] [Python] libraries, [Apache Spark] analytics engine, [Scala] language, [HDFS] and [Docker] containers among others. 
4 | 5 | - [DEPENDENCIES](#dependencies) 6 | - [PYTHON VIRTUAL ENVIRONMENT](#python-virtual-environment) 7 | - [REPOSITORY CONTENT](#repository-content) 8 | - [WORK DIRECTORY CONTENT](#work-directory-content) 9 | - [ARCHITECTURE](#architecture) 10 | - [CONTAINERIZED BIG DATA CLUSTER](#containerized-big-data-cluster) 11 | - [HOW TO RUN CONTAINERIZED CLUSTER WITH DOCKER COMPOSE](#how-to-run-containerized-cluster-with-docker-compose) 12 | - [SCALA BIG DATA APPLICATIONS](#scala-big-data-applications) 13 | - [PYTHON DATA SCIENCE APPLICATIONS](#python-data-science-applications) 14 | - [JUPYTER LAB \& NOTEBOOKS](#jupyter-lab--notebooks) 15 | - [CREDITS](#credits) 16 | 17 | ## DEPENDENCIES 18 | 19 | The code has been tested using: 20 | 21 | - [Apache Spark] (4.0): an unified analytics engine for [Big Data] processing, with built-in modules for streaming, [SQL], [Machine Learning] and graph processing. It has high-level [API]s in [Scala] and [Python]. 22 | - [Hadoop] (3.4): an open-source software for reliable, scalable, distributed computing. It uses [Hadoop Distributed File System] ([HDFS]) which is suitable to work with large [RDD] (Resilient Distributed Datasets). 23 | - [Docker] (28.5): an open platform for developers and sysadmins to build, ship, and run distributed applications, whether on laptops, data center VMs, or the cloud. 24 | - [Docker Compose] (2.40): a tool for defining and running multi-container [Docker] applications. 25 | 26 | ### PYTHON VIRTUAL ENVIRONMENT 27 | 28 | The virtual environment employed for [Data Science] applications is generated from **requirements.txt** file located in the repository. 29 | 30 | The main components of this virtual environment are listed below: 31 | 32 | - [Python] (3.12): an interpreted high-level programming language for general-purpose programming. 33 | - [Jupyter Lab] (4.5): a web-based interactive development environment for [Jupyter Notebooks], code, and data. 34 | - [Keras] ([TensorFlow] built-in): a high-level neural networks [API], written in [Python] and capable of running on top of [TensorFlow], CNTK, or Theano. 35 | - [TensorFlow] (2.20): an open source [Deep Learning] library for high performance numerical computation using data flow graphs. 36 | - [Matplotlib] (3.10): a plotting library for [Python] and its numerical mathematics extension [NumPy]. 37 | - [NumPy] (2.3): a library for [Python], adding support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays. 38 | - [Pandas] (2.3): an open source library providing high-performance, easy-to-use data structures and data analysis tools for [Python]. 39 | - [scikit-learn] (1.7): a [machine learning] library for [Python]. It features various classification, regression and clustering algorithms including support vector machines, [random forest], [gradient boosting], k-means and DBSCAN. 40 | - [scikit-image] (0.25): a collection of algorithms for image processing with [Python]. 41 | - [TPOT] (1.1): a [Python] Automated [Machine Learning] tool that optimizes [machine learning] pipelines using genetic programming. 42 | - [XGBoost] (3.0): an optimized distributed [gradient boosting] library designed to be highly efficient, flexible and portable. 43 | - [Folium] (0.20): an open source library to visualize data that has been manipulated in [Python] on an interactive [Leaflet.js] map. 44 | - [ipyleaflet] (0.20): a [Jupyter] / [Leaflet.js] bridge enabling interactive maps in [Jupyter Notebook]. 
45 | - [Seaborn] (0.13): a [Python] visualization library based on [Matplotlib]. It provides a high-level interface for drawing attractive statistical graphics. 46 | - [imbalanced-learn] (0.14): a [Python] package offering a number of re-sampling techniques commonly used in datasets showing strong between-class imbalance. It is compatible with [scikit-learn] and it allows [SMOTE (Synthetic Minority Over-sampling Technique)]. 47 | - [joblib] (1.5): a set of tools to provide lightweight pipelining in [Python]. 48 | - [findspark] (2.0): a package to make [Spark] Context available in [Jupyter Notebook]. 49 | 50 | It is available in the [Spark] master node created with [Docker Compose]. 51 | 52 | Command to access [Spark] master node: 53 | 54 | ```bash 55 | ~/bigdata_docker/$ docker compose exec master bash 56 | ~/usr/spark-4.0.1/$ 57 | ``` 58 | 59 | ## REPOSITORY CONTENT 60 | 61 | The **bigdata_docker** main folder contains subfolders, application and data files needed to build [Big Data] and [Data Science] solutions: 62 | 63 | ```bash 64 | bigdata_docker 65 | ├── .gitignore 66 | ├── conf 67 | │   ├── master 68 | │   └── worker 69 | ├── data 70 | │   ├── test_log1.csv 71 | │   └── test_log2.csv 72 | ├── master 73 | │   ├── Dockerfile 74 | │   └── work_dir 75 | │   ├── notebooks 76 | │   ├── pyproject.toml 77 | │   ├── python_apps 78 | │   │   └── example 79 | │   ├── requirements.txt 80 | │   └── scala_apps 81 | │   └── example 82 | ├── docker-compose.yml 83 | └── README.md 84 | ``` 85 | 86 | - **conf**: stores [Spark] configuration files for master and worker nodes. These folders are mapped as volumes in the [Docker Compose] file and they can be accessed from containers through **conf/** path. 87 | - **data**: folder to contain raw, processed and test data. It is mapped as volume in [docker-compose] and it can be accessed from containers through **tmp/data/** path. 88 | - **docker-compose.yml**: creates the [Spark] cluster based on [Docker] in which the applications shall run. 89 | - **master**: stores all configuration and working files for the [Spark] master and worker nodes of the cluster created with [Docker Compose]. 90 | - **Dockerfile**: defines all required tools, virtual environment and work files to be installed in the [Spark] master and worker nodes. 91 | - **work_dir**: stores files employed for [Big Data] and [Data Science] applications. 92 | 93 | ### WORK DIRECTORY CONTENT 94 | 95 | The **work_dir** folder has the following structure: 96 | 97 | ```bash 98 | work_dir 99 | ├── pyproject.toml 100 | ├── requirements.txt 101 | ├── notebooks 102 | │   └── Example.ipynb 103 | ├── python_apps 104 | │   └── example 105 | └── scala_apps 106 | └── example 107 | ``` 108 | 109 | - **requirements.txt**: file which defines the dependencies for the virtual environment employed by [Python] [Data Science] applications and [Jupyter Notebooks]. 110 | - **notebooks**: [Jupyter Notebooks] for data analysis, elaboration and training of prediction models and testing. 111 | - **scala_apps**: used to contain [Spark] applications written in [Scala]. There is one example application compiled using [Maven]. 112 | - **python_apps**: folder to store [Python] applications. There is one example application. 113 | 114 | ## ARCHITECTURE 115 | 116 | The system has three main components: 117 | 118 | 1. Containerized [Big Data] cluster: It shall be the base of the system and it can allow to run large files processing and predictive applications. 119 | 2. 
[Scala] [Big Data] applications: It shall process the available large data files and extract the relevant information that will be used to train and feed the predictive models. 120 | 3. [Python] [Data Science] applications: It shall employ [Python] [Data Science] libraries to use [machine learning] models for tasks such as predictions. 121 | 122 | Apart from the three main components listed above, [Jupyter Notebooks] are also utilized for data analysis, modelling and testing of applications. 123 | 124 | ### CONTAINERIZED BIG DATA CLUSTER 125 | 126 | The system has to be a scalable solution. Thus, the applications shall be deployed in a [Big Data] cluster built on [Apache Spark], [Hadoop] and [Docker] containers. 127 | 128 | The reason for this choice is that [Docker] enables the use of container clustering systems to set up and scale the processing and predictive applications in production. It makes it easy to add new containers to handle additional load. 129 | 130 | The containers shall run [Spark] as the data engine and [HDFS] for storage on the master and worker nodes. The [Spark] master node also has [Maven] and the [Python] virtual environment installed. 131 | 132 | The number of worker nodes can be increased by modifying the docker-compose file. By default it creates one master and one worker node. 133 | 134 | The following diagram illustrates the [Big Data] cluster architecture in blocks: 135 | 136 | ```mermaid 137 | flowchart LR; 138 | Client<-->id1[["Master
--------
Python
Spark
HDFS"]]; 139 | subgraph Big Data Cluster; 140 | id1[["Master
--------
Python
Spark
HDFS"]]<-->id2[["Worker
--------
Spark
HDFS"]]; 141 | end; 142 | ``` 143 | 144 | Other possible improvements to the [Big Data] cluster, not implemented here, could be: 145 | 146 | - Using [Kubernetes] to manage the [Docker] containers. 147 | - Taking advantage of [Cloud Computing] services, such as [AWS EMR], to build up a [Spark] cluster with the desired amount of resources and only use them when required, for cost efficiency. 148 | 149 | ### HOW TO RUN CONTAINERIZED CLUSTER WITH DOCKER COMPOSE 150 | 151 | The steps and commands to run the [Spark] cluster with [Docker Compose] are described below. 152 | 153 | Before executing [Docker Compose], it is strongly recommended to close other applications to free up resources and ports and avoid potential issues. Then [Docker Compose] can be executed to build the services: 154 | 155 | ```bash 156 | ~/bigdata_docker/$ docker compose build 157 | ``` 158 | 159 | The next step is to execute the [Docker Compose] up command: 160 | 161 | ```bash 162 | ~/bigdata_docker/$ docker compose up 163 | ``` 164 | 165 | The first run is likely to take some time downloading [Docker] images and additional packages. If everything goes fine, at the end the cluster should be ready, showing something similar to: 166 | 167 | ```bash 168 | ... 169 | master_1 | 2018-10-19 09:59:53 INFO Master:54 - I have been elected leader! New state: ALIVE 170 | master_1 | 2018-10-19 09:59:53 INFO Master:54 - Registering worker 172.27.0.3:8881 with 2 cores, 2.0 GB RAM 171 | worker_1 | 2018-10-19 09:59:53 INFO Worker:54 - Successfully registered with master spark://master:7077 172 | ``` 173 | 174 | To shut down the cluster, simply press 'Control+C' and wait patiently to return to the shell. 175 | 176 | ### SCALA BIG DATA APPLICATIONS 177 | 178 | It is necessary to filter and prepare the data from [RDD]s to extract the relevant information that will be used by the [Python] [Data Science] applications. One approach to accomplish this task is to use [Spark] applications written in [Scala]. 179 | 180 | A [Scala] [Big Data] example application is stored in the **work_dir/scala_apps/example/** folder, and it must first be compiled with [Maven] to generate the *.jar* target file. This is done automatically by the [Dockerfile], but it can also be done manually using the following command: 181 | 182 | ```bash 183 | ~/usr/spark-4.0.1/work_dir/scala_apps/example$ mvn package 184 | ``` 185 | 186 | The application requires the parameters *min-range-Id*, *max-range-Id*, *path-input-log1*, *path-input-log2* and *path-output-log*. 187 | 188 | Command to run the **Example** application locally on the [Spark] master node with the test logs: 189 | 190 | ```bash 191 | ~/usr/spark-4.0.1/work_dir/scala_apps/example$ spark-submit \ 192 | --master local[2] \ 193 | --class stubs.Example \ 194 | target/example-1.0.jar \ 195 | 1 49999 \ 196 | /tmp/data/test_log1.csv \ 197 | /tmp/data/test_log2.csv \ 198 | /tmp/data/result_local_log 199 | ``` 200 | 201 | Command to run the **Example** application on the [Spark] worker node with the test logs: 202 | 203 | ```bash 204 | ~/usr/spark-4.0.1/work_dir/scala_apps/example$ spark-submit \ 205 | --master spark://master:7077 \ 206 | --class stubs.Example \ 207 | target/example-1.0.jar \ 208 | 1 49999 \ 209 | /tmp/data/test_log1.csv \ 210 | /tmp/data/test_log2.csv \ 211 | /tmp/data/result_worker_log 212 | ``` 213 | 214 | When using larger files it is recommended to tune additional parameters to provide more resources, e.g. "--driver-memory 10g". A quick way to check the generated output is sketched below.
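After either run completes, [Spark] writes the result as a folder of CSV part files (e.g. **/tmp/data/result_local_log/**) rather than a single file. The snippet below is a minimal sketch, run from the [Python] virtual environment on the master node, to load and inspect that output; it assumes the job above has already been executed and it reuses the delimiter and column layout shown in the example sources.

```python
# Minimal sketch: load the CSV part files written by the Example job and show a few rows.
import glob

import pandas as pd

# Spark writes a directory of part files; collect all of them.
parts = glob.glob("/tmp/data/result_local_log/part-*.csv")

# Same ';' separator used by the example applications.
result = pd.concat((pd.read_csv(p, sep=";") for p in parts), ignore_index=True)

# Expected columns: userId;topGenderId;topGenderIdSongs;distSongIds;region;type;paid
print(result.head())
```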
215 | 216 | ### PYTHON DATA SCIENCE APPLICATIONS 217 | 218 | The way to run the [Python] example application is simple. Just go to the **work_dir/python_apps/example/** folder and execute it: 219 | 220 | Command to access the [Spark] master node: 221 | 222 | ```bash 223 | ~/bigdata_docker/$ docker compose exec master bash 224 | ~/usr/spark-4.0.1/$ 225 | ``` 226 | 227 | Command to run the [Python] example application in the master node: 228 | 229 | ```bash 230 | ~/usr/spark-4.0.1/$ cd work_dir/python_apps/example 231 | ~/usr/spark-4.0.1/work_dir/python_apps/example$ python3 main.py 10000 232 | ``` 233 | 234 | ### JUPYTER LAB & NOTEBOOKS 235 | 236 | A good way to analyze data, build [machine learning] models and test them is through [Jupyter Lab]. An example [Jupyter Notebook] is stored in the **work_dir/notebooks/** folder. 237 | 238 | All the required packages to run [Jupyter Notebooks] remotely on the [Spark] master node are installed, so it is possible to run them through the web interface. To achieve this, use the commands shown below: 239 | 240 | Command to access the master node: 241 | 242 | ```bash 243 | ~/bigdata_docker/$ docker compose exec master bash 244 | ~/usr/spark-4.0.1$ 245 | ``` 246 | 247 | Launch the [Jupyter Lab] service in the master node: 248 | 249 | ```bash 250 | ~/usr/spark-4.0.1$ jupyter lab \ 251 | --notebook-dir=/usr/spark-4.0.1/work_dir/notebooks \ 252 | --ip='0.0.0.0' \ 253 | --port=8888 \ 254 | --no-browser \ 255 | --allow-root 256 | ``` 257 | 258 | Now [Jupyter Notebooks] stored in the master node can be run remotely. The next step is to open a local web browser and paste the URL printed after executing the launch command to access the [Jupyter Lab] interface, checking that the server is running fine. Output similar to the following will be shown: 259 | 260 | ```bash 261 | Copy/paste this URL into your browser when you connect for the first time, 262 | to login with a token: 263 | http://(master or 127.0.0.1):8888/?token= 264 | ``` 265 | 266 | Valid URL: 267 | 268 | ```bash 269 | http://localhost:8888/?token= 270 | ``` 271 | 272 | To shut down the [Jupyter Lab] service in the master node, simply press 'Control+C' and then confirm with 'y'.
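As a reference, the cell below sketches how a notebook can attach to the cluster and read one of the test logs. It is only a minimal sketch: it relies on packages already listed in **requirements.txt**, on the master URL and volume paths defined in **docker-compose.yml**, and the application name is arbitrary.

```python
# Minimal sketch for a first notebook cell: locate the Spark installation,
# start a session against the cluster master and read a test log.
import findspark

findspark.init()  # make the pyspark modules shipped with Spark importable

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.master("spark://master:7077")  # master defined in docker-compose.yml
    .appName("notebook-example")  # arbitrary application name
    .getOrCreate()
)

# /tmp/data is the ./data folder mounted as a volume on the master and worker nodes.
df = (
    spark.read.option("header", "true")
    .option("delimiter", ";")
    .csv("/tmp/data/test_log1.csv")
)
df.show()
```

Stopping the session with `spark.stop()` at the end of the notebook releases the cluster resources for other jobs.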
273 | 274 | ## CREDITS 275 | 276 | author: alvertogit 277 | copyright: 2018-2025 278 | 279 | [Data Science]: https://en.wikipedia.org/wiki/Data_science 280 | [Big Data]: https://en.wikipedia.org/wiki/Big_data 281 | [Python]: https://www.python.org/ 282 | [Machine Learning]: https://en.wikipedia.org/wiki/Machine_learning 283 | [Deep Learning]: https://en.wikipedia.org/wiki/Deep_learning 284 | [Random Forest]: https://en.wikipedia.org/wiki/Random_forest 285 | [Gradient Boosting]: https://en.wikipedia.org/wiki/Gradient_boosting 286 | [Scala]: https://www.scala-lang.org/ 287 | [Docker]: https://www.docker.com/ 288 | [Docker Compose]: https://github.com/docker/compose 289 | [Dockerfile]: https://docs.docker.com/engine/reference/builder/ 290 | [Apache Spark]: https://spark.apache.org/ 291 | [Spark]: https://spark.apache.org/ 292 | [API]: https://en.wikipedia.org/wiki/Application_programming_interface 293 | [SQL]: https://en.wikipedia.org/wiki/SQL 294 | [Hadoop]: https://hadoop.apache.org/ 295 | [Hadoop Distributed File System]: https://hadoop.apache.org/docs/r1.2.1/hdfs_design.html 296 | [HDFS]: https://hadoop.apache.org/docs/r1.2.1/hdfs_design.html 297 | [RDD]: https://spark.apache.org/docs/latest/rdd-programming-guide.html 298 | [Kubernetes]: https://kubernetes.io/ 299 | [Keras]: https://keras.io/ 300 | [TensorFlow]: https://www.tensorflow.org/ 301 | [Matplotlib]: https://matplotlib.org/ 302 | [NumPy]: https://numpy.org/ 303 | [Pandas]: https://pandas.pydata.org/ 304 | [scikit-learn]: https://scikit-learn.org/stable/ 305 | [scikit-image]: https://scikit-image.org/ 306 | [TPOT]: https://github.com/EpistasisLab/tpot 307 | [XGBoost]: https://github.com/dmlc/xgboost 308 | [Folium]: https://github.com/python-visualization/folium 309 | [Leaflet.js]: https://leafletjs.com/ 310 | [ipyleaflet]: https://github.com/jupyter-widgets/ipyleaflet 311 | [Seaborn]: http://seaborn.pydata.org/ 312 | [imbalanced-learn]: https://github.com/scikit-learn-contrib/imbalanced-learn 313 | [SMOTE (Synthetic Minority Over-sampling Technique)]: https://jair.org/index.php/jair/article/view/10302 314 | [joblib]: https://pypi.org/project/joblib/ 315 | [findspark]: https://github.com/minrk/findspark 316 | [Jupyter]: https://jupyter.org/ 317 | [Jupyter Lab]: https://jupyter.org/ 318 | [Jupyter Notebook]: https://jupyter.org/ 319 | [Jupyter Notebooks]: https://jupyter.org/ 320 | [Maven]: https://maven.apache.org/ 321 | [Cloud Computing]: https://en.wikipedia.org/wiki/Cloud_computing 322 | [AWS EMR]: https://aws.amazon.com/emr/ 323 | --------------------------------------------------------------------------------