├── data ├── test_log1.csv └── test_log2.csv ├── master ├── work_dir │ ├── python_apps │ │ └── example │ │ │ ├── main.py │ │ │ ├── functions.py │ │ │ └── example.py │ ├── requirements.txt │ ├── pyproject.toml │ ├── notebooks │ │ └── Example.ipynb │ └── scala_apps │ │ └── example │ │ ├── pom.xml │ │ └── src │ │ └── main │ │ └── scala │ │ └── stubs │ │ └── example.scala └── Dockerfile ├── docker-compose.yml ├── conf ├── master │ └── spark-defaults.conf └── worker │ └── spark-defaults.conf ├── .gitignore └── README.md /data/test_log1.csv: -------------------------------------------------------------------------------- 1 | userId;region;type;paid 2 | 27103;0;0;true 3 | 74637;2;1;false 4 | 4052;1;2;false 5 | 36462;1;0;true 6 | 12832;0;1;true 7 | 55466;2;0;false 8 | -------------------------------------------------------------------------------- /master/work_dir/python_apps/example/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | main.py: It executes an example function. 5 | """ 6 | 7 | __author__ = "alvertogit" 8 | __copyright__ = "Copyright 2018-2025" 9 | 10 | import sys 11 | 12 | from functions import example_function 13 | 14 | if __name__ == "__main__": 15 | arguments = sys.argv[1:] 16 | print("Executing example_function") 17 | example_function(*arguments) 18 | print("Ending example_function") 19 | -------------------------------------------------------------------------------- /data/test_log2.csv: -------------------------------------------------------------------------------- 1 | hour;userId;songId;genderId;deviceId 2 | 18-10-2017 00:00:25;27103;231990117;23;1_27103 3 | 18-10-2017 00:02:00;74637;241781021;24;1_74637 4 | 18-10-2017 23:02:01;4052;220134142;22;0_4052 5 | 18-10-2017 00:05:31;36462;282102171;28;0_36462 6 | 18-10-2017 00:08:46;12832;152921597;15;0_12832 7 | 18-10-2017 01:09:21;27103;160764835;16;0_27103 8 | 18-10-2017 00:11:19;55466;251274066;25;1_55466 9 | 18-10-2017 00:11:39;74637;273954131;27;2_74637 10 | 18-10-2017 23:14:54;4052;273735239;27;0_4052 11 | -------------------------------------------------------------------------------- /master/work_dir/requirements.txt: -------------------------------------------------------------------------------- 1 | findspark==2.0.1 2 | folium==0.20.0 3 | geojson==3.2.0 4 | graphviz==0.21 5 | imbalanced-learn==0.14.0 6 | ipyleaflet==0.20.0 7 | ipywidgets==8.1.8 8 | joblib==1.5.2 9 | jupyterlab==4.5.0 10 | matplotlib==3.10.8 11 | numpy==2.3.5 12 | pandas==2.3.3 13 | pre-commit==4.5.0 14 | pur==7.3.3 15 | pydot==4.0.1 16 | pyspark-client==4.0.1 17 | requests==2.32.5 18 | ruff==0.14.9 19 | scikit-image==0.25.2 20 | scikit-learn==1.8.0 21 | seaborn==0.13.2 22 | Sphinx==9.0.4 23 | sphinx-rtd-theme==3.0.2 24 | tensorflow==2.20.0 25 | tpot==1.1.0 26 | xgboost==3.1.2 27 | -------------------------------------------------------------------------------- /master/work_dir/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "bigdata_docker" 3 | version = "1.0.0" 4 | authors = [ 5 | { name="alvertogit" }, 6 | ] 7 | description = "A containerized cluster for Big Data" 8 | readme = "README.md" 9 | classifiers = [ 10 | "Programming Language :: Python :: 3", 11 | ] 12 | 13 | [project.urls] 14 | Homepage = "https://github.com/alvertogit/bigdata_docker" 15 | 16 | [tool.ruff] 17 | extend-include = ["*.ipynb"] 18 | 19 | line-length = 100 20 | target-version = "py312" 21 | 22 | [tool.ruff.lint] 23 | select = [ 24 
| # pycodestyle 25 | "E", 26 | "W", 27 | # Pyflakes 28 | "F", 29 | # pyupgrade 30 | "UP", 31 | # flake8-bugbear 32 | "B", 33 | # flake8-simplify 34 | "SIM", 35 | # isort 36 | "I", 37 | ] 38 | -------------------------------------------------------------------------------- /master/work_dir/python_apps/example/functions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | functions.py: It contents example functions. 5 | """ 6 | 7 | __author__ = "alvertogit" 8 | __copyright__ = "Copyright 2018-2025" 9 | 10 | import sys 11 | 12 | 13 | def example_function(*args): 14 | """Function example that process input data and prints a number. 15 | 16 | Args: 17 | number (int): The integer parameter to be printed. 18 | 19 | Raises: 20 | Exception: If the number of args is lower than 1. 21 | ValueError: If args[0] is not an integer. 22 | 23 | """ 24 | 25 | if len(args) < 1: 26 | raise ValueError("""Error: required arguments """) 27 | 28 | try: 29 | int(args[0]) 30 | except ValueError: 31 | print("Error: args[0] is not an integer") 32 | sys.exit(1) 33 | 34 | print(f"Number: {int(args[0])}") 35 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | master: 3 | build: 4 | context: ./master 5 | target: spark-master 6 | command: sbin/start-master.sh 7 | init: true 8 | hostname: master 9 | environment: 10 | MASTER: spark://master:7077 11 | SPARK_CONF_DIR: /conf 12 | SPARK_NO_DAEMONIZE: true 13 | SPARK_PUBLIC_DNS: localhost 14 | expose: 15 | - 7001 16 | - 7002 17 | - 7077 18 | - 6066 19 | ports: 20 | - 4040:4040 21 | - 6066:6066 22 | - 7077:7077 23 | - 8080:8080 24 | - 8888:8888 25 | volumes: 26 | - ./conf/master:/conf 27 | - ./data:/tmp/data 28 | networks: 29 | - spark_network 30 | 31 | worker: 32 | build: 33 | context: ./master 34 | target: spark-node 35 | command: sbin/start-worker.sh spark://master:7077 36 | init: true 37 | depends_on: 38 | - master 39 | hostname: worker 40 | environment: 41 | SPARK_CONF_DIR: /conf 42 | SPARK_NO_DAEMONIZE: true 43 | SPARK_PUBLIC_DNS: localhost 44 | SPARK_WORKER_CORES: 2 45 | SPARK_WORKER_MEMORY: 2g 46 | SPARK_WORKER_PORT: 8881 47 | SPARK_WORKER_WEBUI_PORT: 8081 48 | expose: 49 | - 7012 50 | - 8881 51 | ports: 52 | - 8081:8081 53 | volumes: 54 | - ./conf/worker:/conf 55 | - ./data:/tmp/data 56 | networks: 57 | - spark_network 58 | 59 | networks: 60 | spark_network: 61 | driver: bridge 62 | -------------------------------------------------------------------------------- /conf/master/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Default system properties included when running spark-submit. 19 | # This is useful for setting default environmental settings. 20 | 21 | # Example: 22 | # spark.master spark://master:7077 23 | # spark.eventLog.enabled true 24 | # spark.eventLog.dir hdfs://namenode:8021/directory 25 | # spark.serializer org.apache.spark.serializer.KryoSerializer 26 | # spark.driver.memory 5g 27 | # spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" 28 | 29 | # spark.driver.memory 10g 30 | spark.driver.port 7001 31 | spark.blockManager.port 7002 32 | -------------------------------------------------------------------------------- /conf/worker/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Default system properties included when running spark-submit. 19 | # This is useful for setting default environmental settings. 20 | 21 | # Example: 22 | # spark.master spark://master:7077 23 | # spark.eventLog.enabled true 24 | # spark.eventLog.dir hdfs://namenode:8021/directory 25 | # spark.serializer org.apache.spark.serializer.KryoSerializer 26 | # spark.driver.memory 5g 27 | # spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" 28 | 29 | # spark.driver.memory 10g 30 | # spark.driver.port 7011 31 | spark.blockManager.port 7012 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /master/work_dir/notebooks/Example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Example of Jupyter Notebook" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import findspark\n", 17 | "\n", 18 | "findspark.init()" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import pyspark" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "sc = pyspark.SparkContext()" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "Hello, Hola\n" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "print(\"Hello, Hola\")" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [] 62 | } 63 | ], 64 | "metadata": { 65 | "kernelspec": { 66 | "display_name": "Python 3", 67 | "language": "python", 68 | "name": "python3" 69 | }, 70 | "language_info": { 71 | "codemirror_mode": { 72 | "name": "ipython", 73 | "version": 3 74 | }, 75 | "file_extension": ".py", 76 | "mimetype": "text/x-python", 77 | "name": "python", 78 | "nbconvert_exporter": "python", 79 | "pygments_lexer": "ipython3", 80 | "version": "3.7.5" 81 | } 82 | }, 83 | "nbformat": 4, 84 | "nbformat_minor": 2 85 | } 86 | -------------------------------------------------------------------------------- /master/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM eclipse-temurin:17-jdk-noble AS spark-node 2 | 3 | RUN apt-get update \ 4 | && apt-get install -y curl procps unzip wget \ 5 | && apt-get clean \ 6 | && rm -rf /var/lib/apt/lists/* 7 | 8 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 9 | 10 | # HADOOP 11 | 12 | ENV HADOOP_VERSION=3.4.2 13 | ENV HADOOP_HOME=/usr/hadoop-$HADOOP_VERSION 14 | ENV HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop 15 | ENV PATH=$PATH:$HADOOP_HOME/bin 16 | RUN curl -sL --retry 3 \ 17 | 
"https://www-eu.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION-lean.tar.gz" \ 18 | | gunzip \ 19 | | tar -x -C /usr/ \ 20 | && rm -rf $HADOOP_HOME/share/doc \ 21 | && chown -R root:root $HADOOP_HOME 22 | 23 | # SPARK 24 | 25 | ENV SPARK_VERSION=4.0.1 26 | ENV SPARK_PACKAGE=spark-${SPARK_VERSION}-bin-without-hadoop 27 | ENV SPARK_HOME=/usr/spark-${SPARK_VERSION} 28 | ENV SPARK_DIST_CLASSPATH="$HADOOP_HOME/etc/hadoop/*:$HADOOP_HOME/share/hadoop/common/lib/*:$HADOOP_HOME/share/hadoop/common/*:$HADOOP_HOME/share/hadoop/hdfs/*:$HADOOP_HOME/share/hadoop/hdfs/lib/*:$HADOOP_HOME/share/hadoop/hdfs/*:$HADOOP_HOME/share/hadoop/yarn/lib/*:$HADOOP_HOME/share/hadoop/yarn/*:$HADOOP_HOME/share/hadoop/mapreduce/lib/*:$HADOOP_HOME/share/hadoop/mapreduce/*:$HADOOP_HOME/share/hadoop/tools/lib/*" 29 | ENV PATH=$PATH:${SPARK_HOME}/bin 30 | RUN curl -sL --retry 3 \ 31 | "https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename=spark/spark-${SPARK_VERSION}/${SPARK_PACKAGE}.tgz" \ 32 | | gunzip \ 33 | | tar x -C /usr/ \ 34 | && mv /usr/$SPARK_PACKAGE $SPARK_HOME \ 35 | && chown -R root:root $SPARK_HOME 36 | 37 | WORKDIR $SPARK_HOME 38 | 39 | CMD ["sbin/start-master.sh"] 40 | 41 | FROM spark-node AS spark-master 42 | 43 | # PYTHON 44 | 45 | RUN apt-get update --fix-missing \ 46 | && apt-get install -y bzip2 ca-certificates git python3.12 python3.12-venv python3-pip \ 47 | && apt-get clean \ 48 | && rm -rf /var/lib/apt/lists/* 49 | 50 | # MAVEN 51 | 52 | WORKDIR /usr/spark-${SPARK_VERSION}/ 53 | 54 | ENV MAVEN_VERSION=3.9.11 55 | RUN mkdir /opt/apache-maven 56 | RUN wget --quiet -c https://dlcdn.apache.org/maven/maven-3/${MAVEN_VERSION}/binaries/apache-maven-${MAVEN_VERSION}-bin.tar.gz -O apache-maven.tar.gz && \ 57 | tar xzvf apache-maven.tar.gz -C /opt/apache-maven --strip-components=1 && \ 58 | rm -f apache-maven.tar.gz && \ 59 | ln -s /opt/apache-maven/bin/mvn /usr/bin/mvn 60 | 61 | ENV MAVEN_HOME=/opt/apache-maven 62 | ENV PATH=MAVEN_HOME:$PATH 63 | 64 | # Work dir files 65 | 66 | RUN mkdir /usr/spark-${SPARK_VERSION}/work_dir 67 | WORKDIR /usr/spark-${SPARK_VERSION}/work_dir 68 | ADD ./work_dir /usr/spark-${SPARK_VERSION}/work_dir 69 | 70 | # Python virtual environment 71 | 72 | RUN python3 -m venv /opt/venv 73 | ENV PATH="/opt/venv/bin:$PATH" 74 | RUN python3 -m pip install pip==25.3 75 | RUN python3 -m pip install setuptools==80.9.0 76 | RUN python3 -m pip install --no-cache-dir -r requirements.txt 77 | 78 | # Compiling Scala example with MAVEN 79 | 80 | WORKDIR /usr/spark-${SPARK_VERSION}/work_dir/scala_apps/example 81 | RUN mvn package 82 | 83 | WORKDIR $SPARK_HOME 84 | CMD ["sbin/start-master.sh"] 85 | -------------------------------------------------------------------------------- /master/work_dir/scala_apps/example/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | alvertogit.bigdata_docker 4 | example 5 | 1.0 6 | jar 7 | "Example App" 8 | 9 | 10 | 3.4.2 11 | 4.0.1 12 | 2.13.16 13 | 2.13 14 | 3.2.0 15 | 17 16 | ${spark.version} 17 | 18 | 19 | 20 | 21 | apache-repo 22 | Apache Repository 23 | https://repository.apache.org/content/repositories/releases 24 | 25 | true 26 | 27 | 28 | false 29 | 30 | 31 | 32 | 33 | 34 | 35 | org.scala-lang 36 | scala-library 37 | ${scala.version} 38 | 39 | 40 | 41 | org.apache.spark 42 | spark-core_${scala.binary.version} 43 | ${spark.version} 44 | 45 | 46 | 47 | org.apache.spark 48 | spark-sql_${scala.binary.version} 49 | ${spark.version} 50 | 51 | 52 | 53 | 
org.apache.hadoop 54 | hadoop-client 55 | ${hadoop.version} 56 | 57 | 58 | 59 | org.opensearch 60 | opensearch 61 | ${opensearch.version} 62 | provided 63 | 64 | 65 | 66 | org.opensearch.client 67 | opensearch-hadoop 68 | 1.3.0 69 | 70 | 71 | 72 | org.opensearch.client 73 | opensearch-spark-30_2.13 74 | 1.3.0 75 | 76 | 77 | 78 | 79 | 80 | 81 | net.alchim31.maven 82 | scala-maven-plugin 83 | 4.9.5 84 | 85 | 86 | 87 | compile 88 | 89 | 90 | 91 | 92 | 93 | maven-compiler-plugin 94 | 3.14.0 95 | 96 | ${java.version} 97 | ${java.version} 98 | 99 | 100 | 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /master/work_dir/scala_apps/example/src/main/scala/stubs/example.scala: -------------------------------------------------------------------------------- 1 | /** 2 | 3 | @author alvertogit 4 | Copyright 2018-2025 5 | 6 | Local execution command example: 7 | 8 | ~/usr/spark-4.0.1/work_dir/scala_apps/example$ spark-submit \ 9 | --master local[2] \ 10 | --driver-memory 10g \ 11 | --class stubs.Example \ 12 | target/example-1.0.jar \ 13 | 1 49999 \ 14 | /tmp/data/test_log1.csv \ 15 | /tmp/data/test_log2.csv \ 16 | /tmp/data/result_local_log 17 | 18 | Worker execution command example: 19 | 20 | ~/usr/spark-4.0.1/work_dir/scala_apps/example$ spark-submit \ 21 | --master spark://master:7077 \ 22 | --driver-memory 10g \ 23 | --class stubs.Example \ 24 | target/example-1.0.jar \ 25 | 1 49999 \ 26 | /tmp/data/test_log1.csv \ 27 | /tmp/data/test_log2.csv \ 28 | /tmp/data/result_worker_log 29 | 30 | */ 31 | 32 | package stubs 33 | 34 | import org.apache.spark.SparkContext 35 | import org.apache.spark.sql.SparkSession 36 | import org.apache.spark.sql.expressions.Window 37 | import org.apache.spark.sql.types._ 38 | import org.apache.spark.sql.Row 39 | import org.apache.spark.sql.functions._ 40 | 41 | object Example { 42 | def main(args: Array[String]) { 43 | 44 | if (args.length < 4) { 45 | System.err.println("Usage: stubs.Example ") 46 | System.exit(1) 47 | } 48 | 49 | val minRangeId = args(0).toInt 50 | val maxRangeId = args(1).toInt 51 | val path_input_log1 = args(2) 52 | val path_input_log2 = args(3) 53 | val path_output_log = args(4) 54 | 55 | def rangeId (user_id: Int) = { 56 | if (user_id >= minRangeId & user_id <= maxRangeId) { 57 | true 58 | } 59 | else{ 60 | false 61 | } 62 | } 63 | 64 | val sc = new SparkContext() 65 | val spark = SparkSession.builder.appName("ExampleApp").getOrCreate() 66 | import spark.implicits._ 67 | 68 | // test_log1.csv input data format example 69 | // 70 | // userId;region;type;paid 71 | // 27103;0;0;true 72 | // 74637;2;1;false 73 | // ... 74 | 75 | val infoSchema = StructType(Array( 76 | StructField("userId",IntegerType,true), 77 | StructField("region",IntegerType,true), 78 | StructField("type",IntegerType,true), 79 | StructField("paid",BooleanType,true) 80 | )) 81 | 82 | val infoDF = spark.read.schema(infoSchema).option("header","true").option("delimiter", ";").csv(path_input_log1) 83 | 84 | // test_log2.csv input data format example 85 | // 86 | // hour;userId;songId;genderId;deviceId 87 | // 18-10-2017 00:00:25;27103;231990117;23;1_27103 88 | // 18-10-2017 00:02:00;74637;241781021;24;1_74637 89 | // ... 
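// Note (added comment): in the mapping further below, the raw "hour" timestamp (e.g. "18-10-2017 00:00:25")
// is reduced to an integer hour of day and deviceId keeps only the numeric prefix before '_'
// (e.g. "1_27103" -> 1), which is why every field of songSchema can be declared as IntegerType.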
90 | 91 | val songSchema = StructType(Array( 92 | StructField("userId",IntegerType,true), 93 | StructField("hour",IntegerType,true), 94 | StructField("songId",IntegerType,true), 95 | StructField("genderId",IntegerType,true), 96 | StructField("deviceId",IntegerType,true) 97 | )) 98 | 99 | val songRDD = sc.textFile(path_input_log2) 100 | 101 | val songHeader = songRDD.first 102 | 103 | val songRDDFiltered = songRDD.filter(record => record != songHeader).map(line => line.split(";")).filter( 104 | field => rangeId(field(1).toInt 105 | )).map( 106 | rec => Row( 107 | rec(1).toInt, 108 | rec(0).split(' ')(1).split(':')(0).toInt, 109 | rec(2).toInt, 110 | rec(3).toInt, 111 | rec(4).split('_')(0).toInt 112 | )) 113 | 114 | val songDF = spark.createDataFrame(songRDDFiltered,songSchema) 115 | 116 | val distSongDF = songDF.groupBy($"userId").agg(countDistinct($"songId")).withColumnRenamed("count(DISTINCT songId)","distSongIds") 117 | 118 | val genderSongDF = songDF.groupBy($"userId",$"genderId").count.withColumnRenamed("count","topGenderIdSongs") 119 | 120 | val genderWindow = Window.partitionBy("userId").orderBy($"topGenderIdSongs".desc) 121 | 122 | val windowGenderSongDF = genderSongDF.withColumn("rank", row_number().over(genderWindow)).where($"rank" === 1).drop($"rank" 123 | ).withColumnRenamed("genderId","topGenderId") 124 | 125 | val globalSongDF = windowGenderSongDF.join(distSongDF,"userId") 126 | 127 | // globalSongDF format example 128 | // 129 | // userId;topGenderId;topGenderIdSongs;distSongIds 130 | // 27103;23;1;2 131 | // 4052;27;1;2 132 | // ... 133 | 134 | // joining global song and info DFs and ordering output 135 | val resultDF = globalSongDF.join(infoDF ,"userId") 136 | 137 | // resultDF format example 138 | // 139 | // userId;topGenderId;topGenderIdSongs;distSongIds;region;type;paid 140 | // 27103;23;1;2;0;0;true 141 | // 4052;27;1;2;1;2;false 142 | // ... 143 | 144 | resultDF.coalesce(1).write.option("header","true").option("delimiter", ";").csv(path_output_log) 145 | 146 | spark.stop 147 | 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /master/work_dir/python_apps/example/example.py: -------------------------------------------------------------------------------- 1 | """ 2 | example.py: pyspark application example. 
3 | 4 | Local execution command example: 5 | 6 | ~/usr/spark-4.0.1/work_dir/python_apps/example$ spark-submit \ 7 | --master local[2] \ 8 | --driver-memory 10g \ 9 | example.py \ 10 | 1 49999 \ 11 | /tmp/data/test_log1.csv \ 12 | /tmp/data/test_log2.csv \ 13 | /tmp/data/result_local_log 14 | 15 | Worker execution command example: 16 | 17 | ~/usr/spark-4.0.1/work_dir/python_apps/example$ spark-submit \ 18 | --master spark://master:7077 \ 19 | --driver-memory 10g \ 20 | example.py \ 21 | 1 49999 \ 22 | /tmp/data/test_log1.csv \ 23 | /tmp/data/test_log2.csv \ 24 | /tmp/data/result_worker_log 25 | """ 26 | 27 | __author__ = "alvertogit" 28 | __copyright__ = "Copyright 2018-2025" 29 | 30 | 31 | import os 32 | import sys 33 | 34 | from pyspark.sql import Row, SparkSession, Window 35 | from pyspark.sql.functions import countDistinct, desc, row_number 36 | from pyspark.sql.types import BooleanType, IntegerType, StructField, StructType 37 | 38 | if __name__ == "__main__": 39 | if len(sys.argv) < 5: 40 | print(""" 41 | Usage error: example.py 42 | 43 | """) 44 | sys.exit(1) 45 | 46 | minRangeId = int(sys.argv[1]) 47 | maxRangeId = int(sys.argv[2]) 48 | path_input_log1 = sys.argv[3] 49 | path_input_log2 = sys.argv[4] 50 | path_output_log = sys.argv[5] 51 | 52 | def rangeId(user_id): 53 | if user_id >= minRangeId and user_id <= maxRangeId: 54 | return True 55 | return False 56 | 57 | os.environ["PYSPARK_PYTHON"] = sys.executable 58 | os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable 59 | spark = SparkSession.builder.appName("pyspark example").getOrCreate() 60 | # spark.conf.set("spark.driver.memory", "10g") 61 | # spark.sparkContext.setLogLevel("WARN") 62 | sc = spark.sparkContext 63 | 64 | # test_log1.csv input data format example 65 | # 66 | # userId;region;type;paid 67 | # 27103;0;0;true 68 | # 74637;2;1;false 69 | # ... 70 | 71 | infoSchema = StructType( 72 | [ 73 | StructField("userId", IntegerType(), True), 74 | StructField("region", IntegerType(), True), 75 | StructField("type", IntegerType(), True), 76 | StructField("paid", BooleanType(), True), 77 | ] 78 | ) 79 | 80 | infoDF = ( 81 | spark.read.schema(infoSchema) 82 | .option("header", "true") 83 | .option("delimiter", ";") 84 | .csv(path_input_log1) 85 | ) 86 | 87 | # test_log2.csv input data format example 88 | # 89 | # hour;userId;songId;genderId;deviceId 90 | # 18-10-2017 00:00:25;27103;231990117;23;1_27103 91 | # 18-10-2017 00:02:00;74637;241781021;24;1_74637 92 | # ... 
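# Note (added comment): in the mapping further below, the raw "hour" timestamp (e.g. "18-10-2017 00:00:25")
# is reduced to an integer hour of day and deviceId keeps only the numeric prefix before "_"
# (e.g. "1_27103" -> 1), which is why every field of songSchema can be declared as IntegerType().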
93 | 94 | songSchema = StructType( 95 | [ 96 | StructField("userId", IntegerType(), True), 97 | StructField("hour", IntegerType(), True), 98 | StructField("songId", IntegerType(), True), 99 | StructField("genderId", IntegerType(), True), 100 | StructField("deviceId", IntegerType(), True), 101 | ] 102 | ) 103 | 104 | songRDD = sc.textFile(path_input_log2) 105 | 106 | songHeader = songRDD.first() 107 | 108 | songRDDFiltered = ( 109 | songRDD.filter(lambda record: record != songHeader) 110 | .map(lambda line: line.split(";")) 111 | .filter(lambda field: rangeId(int(field[1]))) 112 | .map( 113 | lambda rec: Row( 114 | int(rec[1]), 115 | int(rec[0].split(" ")[1].split(":")[0]), 116 | int(rec[2]), 117 | int(rec[3]), 118 | int(rec[4].split("_")[0]), 119 | ) 120 | ) 121 | ) 122 | 123 | songDF = spark.createDataFrame(songRDDFiltered, songSchema) 124 | 125 | distSongDF = ( 126 | songDF.groupBy("userId") 127 | .agg(countDistinct("songId")) 128 | .withColumnRenamed("count(DISTINCT songId)", "distSongIds") 129 | ) 130 | 131 | genderSongDF = ( 132 | songDF.groupBy("userId", "genderId").count().withColumnRenamed("count", "topGenderIdSongs") 133 | ) 134 | 135 | genderWindow = Window.partitionBy("userId").orderBy(desc("topGenderIdSongs")) 136 | 137 | windowGenderSongDF = ( 138 | genderSongDF.withColumn("rank", row_number().over(genderWindow)) 139 | .where("rank = 1") 140 | .drop("rank") 141 | .withColumnRenamed("genderId", "topGenderId") 142 | ) 143 | 144 | globalSongDF = windowGenderSongDF.join(distSongDF, "userId") 145 | 146 | # globalSongDF format example 147 | # 148 | # userId;topGenderId;topGenderIdSongs;distSongIds 149 | # 27103;23;1;2 150 | # 4052;27;1;2 151 | # ... 152 | 153 | # joining global song and info DFs and ordering output 154 | resultDF = globalSongDF.join(infoDF, "userId") 155 | 156 | # resultDF format example 157 | # 158 | # userId;topGenderId;topGenderIdSongs;distSongIds;region;type;paid 159 | # 27103;23;1;2;0;0;true 160 | # 4052;27;1;2;1;2;false 161 | # ... 162 | 163 | resultDF.coalesce(1).write.option("header", "true").option("delimiter", ";").csv( 164 | path_output_log 165 | ) 166 | 167 | spark.stop() 168 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SPARK DOCKER CLUSTER FOR BIG DATA & DATA SCIENCE 2 | 3 | This repository stores all the required components to build a containerized cluster for [Big Data] and [Data Science] applications. It allows scalable production services using technologies such as [Machine Learning] [Python] libraries, [Apache Spark] analytics engine, [Scala] language, [HDFS] and [Docker] containers among others. 
4 | 5 | - [DEPENDENCIES](#dependencies) 6 | - [PYTHON VIRTUAL ENVIRONMENT](#python-virtual-environment) 7 | - [REPOSITORY CONTENT](#repository-content) 8 | - [WORK DIRECTORY CONTENT](#work-directory-content) 9 | - [ARCHITECTURE](#architecture) 10 | - [CONTAINERIZED BIG DATA CLUSTER](#containerized-big-data-cluster) 11 | - [HOW TO RUN CONTAINERIZED CLUSTER WITH DOCKER COMPOSE](#how-to-run-containerized-cluster-with-docker-compose) 12 | - [SCALA BIG DATA APPLICATIONS](#scala-big-data-applications) 13 | - [PYTHON DATA SCIENCE APPLICATIONS](#python-data-science-applications) 14 | - [JUPYTER LAB \& NOTEBOOKS](#jupyter-lab--notebooks) 15 | - [CREDITS](#credits) 16 | 17 | ## DEPENDENCIES 18 | 19 | The code has been tested using: 20 | 21 | - [Apache Spark] (4.0): an unified analytics engine for [Big Data] processing, with built-in modules for streaming, [SQL], [Machine Learning] and graph processing. It has high-level [API]s in [Scala] and [Python]. 22 | - [Hadoop] (3.4): an open-source software for reliable, scalable, distributed computing. It uses [Hadoop Distributed File System] ([HDFS]) which is suitable to work with large [RDD] (Resilient Distributed Datasets). 23 | - [Docker] (28.5): an open platform for developers and sysadmins to build, ship, and run distributed applications, whether on laptops, data center VMs, or the cloud. 24 | - [Docker Compose] (2.40): a tool for defining and running multi-container [Docker] applications. 25 | 26 | ### PYTHON VIRTUAL ENVIRONMENT 27 | 28 | The virtual environment employed for [Data Science] applications is generated from **requirements.txt** file located in the repository. 29 | 30 | The main components of this virtual environment are listed below: 31 | 32 | - [Python] (3.12): an interpreted high-level programming language for general-purpose programming. 33 | - [Jupyter Lab] (4.5): a web-based interactive development environment for [Jupyter Notebooks], code, and data. 34 | - [Keras] ([TensorFlow] built-in): a high-level neural networks [API], written in [Python] and capable of running on top of [TensorFlow], CNTK, or Theano. 35 | - [TensorFlow] (2.20): an open source [Deep Learning] library for high performance numerical computation using data flow graphs. 36 | - [Matplotlib] (3.10): a plotting library for [Python] and its numerical mathematics extension [NumPy]. 37 | - [NumPy] (2.3): a library for [Python], adding support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays. 38 | - [Pandas] (2.3): an open source library providing high-performance, easy-to-use data structures and data analysis tools for [Python]. 39 | - [scikit-learn] (1.7): a [machine learning] library for [Python]. It features various classification, regression and clustering algorithms including support vector machines, [random forest], [gradient boosting], k-means and DBSCAN. 40 | - [scikit-image] (0.25): a collection of algorithms for image processing with [Python]. 41 | - [TPOT] (1.1): a [Python] Automated [Machine Learning] tool that optimizes [machine learning] pipelines using genetic programming. 42 | - [XGBoost] (3.0): an optimized distributed [gradient boosting] library designed to be highly efficient, flexible and portable. 43 | - [Folium] (0.20): an open source library to visualize data that has been manipulated in [Python] on an interactive [Leaflet.js] map. 44 | - [ipyleaflet] (0.20): a [Jupyter] / [Leaflet.js] bridge enabling interactive maps in [Jupyter Notebook]. 
45 | - [Seaborn] (0.13): a [Python] visualization library based on [Matplotlib]. It provides a high-level interface for drawing attractive statistical graphics. 46 | - [imbalanced-learn] (0.14): a [Python] package offering a number of re-sampling techniques commonly used in datasets showing strong between-class imbalance. It is compatible with [scikit-learn] and it allows [SMOTE (Synthetic Minority Over-sampling Technique)]. 47 | - [joblib] (1.5): a set of tools to provide lightweight pipelining in [Python]. 48 | - [findspark] (2.0): a package to make [Spark] Context available in [Jupyter Notebook]. 49 | 50 | It is available in the [Spark] master node created with [Docker Compose]. 51 | 52 | Command to access [Spark] master node: 53 | 54 | ```bash 55 | ~/bigdata_docker/$ docker compose exec master bash 56 | ~/usr/spark-4.0.1/$ 57 | ``` 58 | 59 | ## REPOSITORY CONTENT 60 | 61 | The **bigdata_docker** main folder contains subfolders, application and data files needed to build [Big Data] and [Data Science] solutions: 62 | 63 | ```bash 64 | bigdata_docker 65 | ├── .gitignore 66 | ├── conf 67 | │   ├── master 68 | │   └── worker 69 | ├── data 70 | │   ├── test_log1.csv 71 | │   └── test_log2.csv 72 | ├── master 73 | │   ├── Dockerfile 74 | │   └── work_dir 75 | │   ├── notebooks 76 | │   ├── pyproject.toml 77 | │   ├── python_apps 78 | │   │   └── example 79 | │   ├── requirements.txt 80 | │   └── scala_apps 81 | │   └── example 82 | ├── docker-compose.yml 83 | └── README.md 84 | ``` 85 | 86 | - **conf**: stores [Spark] configuration files for master and worker nodes. These folders are mapped as volumes in the [Docker Compose] file and they can be accessed from containers through **conf/** path. 87 | - **data**: folder to contain raw, processed and test data. It is mapped as volume in [docker-compose] and it can be accessed from containers through **tmp/data/** path. 88 | - **docker-compose.yml**: creates the [Spark] cluster based on [Docker] in which the applications shall run. 89 | - **master**: stores all configuration and working files for the [Spark] master and worker nodes of the cluster created with [Docker Compose]. 90 | - **Dockerfile**: defines all required tools, virtual environment and work files to be installed in the [Spark] master and worker nodes. 91 | - **work_dir**: stores files employed for [Big Data] and [Data Science] applications. 92 | 93 | ### WORK DIRECTORY CONTENT 94 | 95 | The **work_dir** folder has the following structure: 96 | 97 | ```bash 98 | work_dir 99 | ├── pyproject.toml 100 | ├── requirements.txt 101 | ├── notebooks 102 | │   └── Example.ipynb 103 | ├── python_apps 104 | │   └── example 105 | └── scala_apps 106 | └── example 107 | ``` 108 | 109 | - **requirements.txt**: file which defines the dependencies for the virtual environment employed by [Python] [Data Science] applications and [Jupyter Notebooks]. 110 | - **notebooks**: [Jupyter Notebooks] for data analysis, elaboration and training of prediction models and testing. 111 | - **scala_apps**: used to contain [Spark] applications written in [Scala]. There is one example application compiled using [Maven]. 112 | - **python_apps**: folder to store [Python] applications. There is one example application. 113 | 114 | ## ARCHITECTURE 115 | 116 | The system has three main components: 117 | 118 | 1. Containerized [Big Data] cluster: It shall be the base of the system and it can allow to run large files processing and predictive applications. 119 | 2. 
[Scala] [Big Data] applications: It shall process the available large data files and extract the relevant information that will be used to train and feed the predictive models. 120 | 3. [Python] [Data Science] applications: It shall employ [Python] [Data Science] libraries to use [machine learning] models for tasks such as predictions. 121 | 122 | Apart from the three main components listed above, [Jupyter Notebooks] are also utilized for data analysis, modelling and testing of applications. 123 | 124 | ### CONTAINERIZED BIG DATA CLUSTER 125 | 126 | The system has to be a scalable solution. Thus, the applications shall be deployed in a [Big Data] cluster built on [Apache Spark], [Hadoop] and [Docker] containers. 127 | 128 | The reason for this choice is that [Docker] enables the use of container clustering systems to set up and scale the processing and predictive applications in production. It makes it easy to add new containers to handle additional load. 129 | 130 | The containers shall run [Spark] as the data engine and [HDFS] for storage on the master and worker nodes. The [Spark] master node also has [Maven] and the [Python] virtual environment installed. 131 | 132 | The number of worker nodes can be increased by modifying the docker-compose file. By default it creates one master and one worker node. 133 | 134 | The following diagram illustrates the [Big Data] cluster architecture in blocks: 135 | 136 | ```mermaid 137 | flowchart LR; 138 | Client<-->id1[["Master
--------
Python
Spark
HDFS"]]; 139 | subgraph Big Data Cluster; 140 | id1[["Master
--------
Python
Spark
HDFS"]]<-->id2[["Worker
--------
Spark
HDFS"]]; 141 | end; 142 | ``` 143 | 144 | Other possible improvements to the [Big Data] cluster, not implemented here, could be: 145 | 146 | - Using [Kubernetes] to manage the [Docker] containers. 147 | - Taking advantage of [Cloud Computing] services, such as [AWS EMR], to build up a [Spark] cluster with the desired amount of resources and only use them when required, for cost efficiency. 148 | 149 | ### HOW TO RUN CONTAINERIZED CLUSTER WITH DOCKER COMPOSE 150 | 151 | The steps and commands to run the [Spark] cluster with [Docker Compose] are described below. 152 | 153 | Before executing [Docker Compose], it is strongly recommended to close other applications to free up resources and ports and avoid potential issues. Then [Docker Compose] can be executed to build the services: 154 | 155 | ```bash 156 | ~/bigdata_docker/$ docker compose build 157 | ``` 158 | 159 | The next step is to execute the [Docker Compose] up command: 160 | 161 | ```bash 162 | ~/bigdata_docker/$ docker compose up 163 | ``` 164 | 165 | The first run is likely to take some time downloading [Docker] images and additional packages. If everything goes fine, at the end the cluster should be ready, showing something similar to: 166 | 167 | ```bash 168 | ... 169 | master_1 | 2018-10-19 09:59:53 INFO Master:54 - I have been elected leader! New state: ALIVE 170 | master_1 | 2018-10-19 09:59:53 INFO Master:54 - Registering worker 172.27.0.3:8881 with 2 cores, 2.0 GB RAM 171 | worker_1 | 2018-10-19 09:59:53 INFO Worker:54 - Successfully registered with master spark://master:7077 172 | ``` 173 | 174 | To shut down the cluster, simply press 'Control+C' and wait patiently to return to the shell. 175 | 176 | ### SCALA BIG DATA APPLICATIONS 177 | 178 | It is necessary to filter and prepare the data from [RDD]s to extract the relevant information that will be used by the [Python] [Data Science] applications. One approach to accomplish this task is to use [Spark] applications written in [Scala]. 179 | 180 | A [Scala] [Big Data] example application is stored in the **work_dir/scala_apps/example/** folder, and it must first be compiled with [Maven] to generate the *.jar* target file. This is done automatically by the [Dockerfile], but it can also be done manually using the following command: 181 | 182 | ```bash 183 | ~/usr/spark-4.0.1/work_dir/scala_apps/example$ mvn package 184 | ``` 185 | 186 | The application requires the parameters *min-range-Id*, *max-range-Id*, *path-input-log1*, *path-input-log2* and *path-output-log*. 187 | 188 | Command to run the **Example** application locally on the [Spark] master node with the test logs: 189 | 190 | ```bash 191 | ~/usr/spark-4.0.1/work_dir/scala_apps/example$ spark-submit \ 192 | --master local[2] \ 193 | --class stubs.Example \ 194 | target/example-1.0.jar \ 195 | 1 49999 \ 196 | /tmp/data/test_log1.csv \ 197 | /tmp/data/test_log2.csv \ 198 | /tmp/data/result_local_log 199 | ``` 200 | 201 | Command to run the **Example** application on the [Spark] worker node with the test logs: 202 | 203 | ```bash 204 | ~/usr/spark-4.0.1/work_dir/scala_apps/example$ spark-submit \ 205 | --master spark://master:7077 \ 206 | --class stubs.Example \ 207 | target/example-1.0.jar \ 208 | 1 49999 \ 209 | /tmp/data/test_log1.csv \ 210 | /tmp/data/test_log2.csv \ 211 | /tmp/data/result_worker_log 212 | ``` 213 | 214 | When using larger files it is recommended to tune additional parameters to provide more resources, e.g. "--driver-memory 10g". A quick way to check the generated output is sketched below.
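After either run completes, [Spark] writes the result as a folder of CSV part files (e.g. **/tmp/data/result_local_log/**) rather than a single file. The snippet below is a minimal sketch, run from the [Python] virtual environment on the master node, to load and inspect that output; it assumes the job above has already been executed and it reuses the delimiter and column layout shown in the example sources.

```python
# Minimal sketch: load the CSV part files written by the Example job and show a few rows.
import glob

import pandas as pd

# Spark writes a directory of part files; collect all of them.
parts = glob.glob("/tmp/data/result_local_log/part-*.csv")

# Same ';' separator used by the example applications.
result = pd.concat((pd.read_csv(p, sep=";") for p in parts), ignore_index=True)

# Expected columns: userId;topGenderId;topGenderIdSongs;distSongIds;region;type;paid
print(result.head())
```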
215 | 216 | ### PYTHON DATA SCIENCE APPLICATIONS 217 | 218 | The way to run the [Python] example application is simple. Just go to the **work_dir/python_apps/example/** folder and execute it: 219 | 220 | Command to access the [Spark] master node: 221 | 222 | ```bash 223 | ~/bigdata_docker/$ docker compose exec master bash 224 | ~/usr/spark-4.0.1/$ 225 | ``` 226 | 227 | Command to run the [Python] example application in the master node: 228 | 229 | ```bash 230 | ~/usr/spark-4.0.1/$ cd work_dir/python_apps/example 231 | ~/usr/spark-4.0.1/work_dir/python_apps/example$ python3 main.py 10000 232 | ``` 233 | 234 | ### JUPYTER LAB & NOTEBOOKS 235 | 236 | A good way to analyze data, build [machine learning] models and test them is through [Jupyter Lab]. An example [Jupyter Notebook] is stored in the **work_dir/notebooks/** folder. 237 | 238 | All the required packages to run [Jupyter Notebooks] remotely on the [Spark] master node are installed, so it is possible to run them through the web interface. To achieve this, use the commands shown below: 239 | 240 | Command to access the master node: 241 | 242 | ```bash 243 | ~/bigdata_docker/$ docker compose exec master bash 244 | ~/usr/spark-4.0.1$ 245 | ``` 246 | 247 | Launch the [Jupyter Lab] service in the master node: 248 | 249 | ```bash 250 | ~/usr/spark-4.0.1$ jupyter lab \ 251 | --notebook-dir=/usr/spark-4.0.1/work_dir/notebooks \ 252 | --ip='0.0.0.0' \ 253 | --port=8888 \ 254 | --no-browser \ 255 | --allow-root 256 | ``` 257 | 258 | Now [Jupyter Notebooks] stored in the master node can be run remotely. The next step is to open a local web browser and paste the URL printed after executing the launch command to access the [Jupyter Lab] interface, checking that the server is running fine. Output similar to the following will be shown: 259 | 260 | ```bash 261 | Copy/paste this URL into your browser when you connect for the first time, 262 | to login with a token: 263 | http://(master or 127.0.0.1):8888/?token= 264 | ``` 265 | 266 | Valid URL: 267 | 268 | ```bash 269 | http://localhost:8888/?token= 270 | ``` 271 | 272 | To shut down the [Jupyter Lab] service in the master node, simply press 'Control+C' and then confirm with 'y'.
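As a reference, the cell below sketches how a notebook can attach to the cluster and read one of the test logs. It is only a minimal sketch: it relies on packages already listed in **requirements.txt**, on the master URL and volume paths defined in **docker-compose.yml**, and the application name is arbitrary.

```python
# Minimal sketch for a first notebook cell: locate the Spark installation,
# start a session against the cluster master and read a test log.
import findspark

findspark.init()  # make the pyspark modules shipped with Spark importable

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.master("spark://master:7077")  # master defined in docker-compose.yml
    .appName("notebook-example")  # arbitrary application name
    .getOrCreate()
)

# /tmp/data is the ./data folder mounted as a volume on the master and worker nodes.
df = (
    spark.read.option("header", "true")
    .option("delimiter", ";")
    .csv("/tmp/data/test_log1.csv")
)
df.show()
```

Stopping the session with `spark.stop()` at the end of the notebook releases the cluster resources for other jobs.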
273 | 274 | ## CREDITS 275 | 276 | author: alvertogit 277 | copyright: 2018-2025 278 | 279 | [Data Science]: https://en.wikipedia.org/wiki/Data_science 280 | [Big Data]: https://en.wikipedia.org/wiki/Big_data 281 | [Python]: https://www.python.org/ 282 | [Machine Learning]: https://en.wikipedia.org/wiki/Machine_learning 283 | [Deep Learning]: https://en.wikipedia.org/wiki/Deep_learning 284 | [Random Forest]: https://en.wikipedia.org/wiki/Random_forest 285 | [Gradient Boosting]: https://en.wikipedia.org/wiki/Gradient_boosting 286 | [Scala]: https://www.scala-lang.org/ 287 | [Docker]: https://www.docker.com/ 288 | [Docker Compose]: https://github.com/docker/compose 289 | [Dockerfile]: https://docs.docker.com/engine/reference/builder/ 290 | [Apache Spark]: https://spark.apache.org/ 291 | [Spark]: https://spark.apache.org/ 292 | [API]: https://en.wikipedia.org/wiki/Application_programming_interface 293 | [SQL]: https://en.wikipedia.org/wiki/SQL 294 | [Hadoop]: https://hadoop.apache.org/ 295 | [Hadoop Distributed File System]: https://hadoop.apache.org/docs/r1.2.1/hdfs_design.html 296 | [HDFS]: https://hadoop.apache.org/docs/r1.2.1/hdfs_design.html 297 | [RDD]: https://spark.apache.org/docs/latest/rdd-programming-guide.html 298 | [Kubernetes]: https://kubernetes.io/ 299 | [Keras]: https://keras.io/ 300 | [TensorFlow]: https://www.tensorflow.org/ 301 | [Matplotlib]: https://matplotlib.org/ 302 | [NumPy]: https://numpy.org/ 303 | [Pandas]: https://pandas.pydata.org/ 304 | [scikit-learn]: https://scikit-learn.org/stable/ 305 | [scikit-image]: https://scikit-image.org/ 306 | [TPOT]: https://github.com/EpistasisLab/tpot 307 | [XGBoost]: https://github.com/dmlc/xgboost 308 | [Folium]: https://github.com/python-visualization/folium 309 | [Leaflet.js]: https://leafletjs.com/ 310 | [ipyleaflet]: https://github.com/jupyter-widgets/ipyleaflet 311 | [Seaborn]: http://seaborn.pydata.org/ 312 | [imbalanced-learn]: https://github.com/scikit-learn-contrib/imbalanced-learn 313 | [SMOTE (Synthetic Minority Over-sampling Technique)]: https://jair.org/index.php/jair/article/view/10302 314 | [joblib]: https://pypi.org/project/joblib/ 315 | [findspark]: https://github.com/minrk/findspark 316 | [Jupyter]: https://jupyter.org/ 317 | [Jupyter Lab]: https://jupyter.org/ 318 | [Jupyter Notebook]: https://jupyter.org/ 319 | [Jupyter Notebooks]: https://jupyter.org/ 320 | [Maven]: https://maven.apache.org/ 321 | [Cloud Computing]: https://en.wikipedia.org/wiki/Cloud_computing 322 | [AWS EMR]: https://aws.amazon.com/emr/ 323 | --------------------------------------------------------------------------------