├── .gitignore ├── INSTALL.md ├── LICENSE ├── README-batch.md ├── README-ml.md ├── README-pyspark.md ├── README-streaming.md ├── README-superset.md ├── README.md ├── airflow ├── .gitignore ├── dags │ ├── random_number_dag.py │ ├── requests_example.py │ ├── solution_random_number_dag.py │ ├── solution_stocks_dag.py │ ├── spark_job.py │ ├── sqlite_cli.py │ ├── stocks_dag.py │ └── word_count_dag.py └── docs │ ├── clase_airflow.pdf │ ├── clase_airflow.tex │ └── figures │ ├── airflow.png │ ├── airflow_architecture.png │ ├── airflow_ui.png │ ├── dag_graph_view.png │ ├── example_bash_operator.png │ ├── exercise_1.png │ ├── exercise_2.png │ ├── exercise_3.png │ ├── exercise_4.png │ └── logo_mutt.png ├── code ├── postgresql-42.1.4.jar ├── python │ ├── introduction │ │ ├── ejercicios │ │ │ ├── alice.txt │ │ │ ├── list.py │ │ │ ├── small.txt │ │ │ ├── string.py │ │ │ └── wordcount.py │ │ ├── hello.py │ │ └── introduccion_a_python.pdf │ └── us-stock-analysis │ │ ├── .gitignore │ │ ├── README.md │ │ ├── requirements.txt │ │ └── src │ │ ├── batch │ │ └── etl_steps.py │ │ ├── examples │ │ ├── first_example.py │ │ └── postgres_example.py │ │ └── stream │ │ ├── etl_stream.py │ │ └── fake_stock_price_generator.py └── scala │ ├── credit-risk-analysis │ ├── .gitignore │ ├── README.md │ ├── build.sbt │ ├── project │ │ ├── assembly.sbt │ │ └── build.properties │ └── src │ │ └── main │ │ └── scala │ │ └── es │ │ └── arjon │ │ ├── CreditRiskAnalysis.scala │ │ ├── CreditRiskTrain.scala │ │ └── DatasetUtil.scala │ └── us-stock-analysis │ ├── .gitignore │ ├── README.md │ ├── build.sbt │ ├── project │ ├── assembly.sbt │ └── build.properties │ └── src │ └── main │ ├── resources │ └── log4j.properties │ └── scala │ └── es │ └── arjon │ ├── EtlSteps.scala │ ├── FakeStockPriceGenerator.scala │ └── StreamingETL.scala ├── control-env.sh ├── dataset ├── .gitignore ├── credit-risk │ ├── germancredit-user-input.csv │ └── germancredit.csv ├── global-temperature-1880-2016.json ├── news │ └── huffingtonpost-news.json.gz ├── pyspark-df-overview │ ├── README.md │ └── census_income.csv.gz ├── stocks-small │ ├── aapl.us.txt │ ├── baba.us.txt │ ├── csco.us.txt │ ├── dhr.us.txt │ ├── ebay.us.txt │ ├── fb.us.txt │ ├── goog.us.txt │ ├── googl.us.txt │ ├── ibm.us.txt │ ├── intc.us.txt │ ├── jnj.us.txt │ ├── meli.us.txt │ ├── msft.us.txt │ ├── orcl.us.txt │ ├── qcom.us.txt │ ├── tsla.us.txt │ ├── txn.us.txt │ ├── wdc.us.txt │ └── xrx.us.txt ├── stocks │ └── README.md ├── titanic.csv └── yahoo-symbols-201709.csv ├── docker-compose.yml ├── images ├── docker-advanced-config.jpg ├── superset-01.png ├── superset-02.png ├── superset-03.png ├── superset-04.png ├── superset-05.png ├── superset-06.png ├── superset-07.png ├── superset-08.png ├── superset-09.png ├── superset-10.png ├── superset-11.png └── superset.png ├── jupyter └── notebook │ ├── README.md │ ├── batch_etl_steps.ipynb │ ├── pandas-json-sample.ipynb │ ├── pyspark-apache-arrow.ipynb │ ├── pyspark-check-install.ipynb │ ├── pyspark-dataframe-overview.ipynb │ ├── pyspark-intro.ipynb │ ├── pyspark-nlp.ipynb │ ├── pyspark-postgres.ipynb │ └── titanic │ ├── docs │ ├── clase_ml.bib │ ├── clase_ml.pdf │ ├── clase_ml.tex │ └── figures │ │ ├── bias_variance_tradeoff.png │ │ ├── bvt2.png │ │ ├── complexity.png │ │ ├── confusion_matrix.png │ │ ├── corr.png │ │ ├── facet.png │ │ ├── frontier.png │ │ ├── holdout.png │ │ ├── kde.png │ │ ├── logistic.png │ │ ├── logo_mutt.png │ │ ├── one_hot.png │ │ ├── overfitting.png │ │ ├── roc.png │ │ ├── run.png │ │ ├── sample_size.png │ │ ├── supervised.png │ 
│ ├── table_variables.pdf │ │ ├── table_variables.tex │ │ ├── titanic.jpg │ │ ├── tree.png │ │ ├── tree_regions.png │ │ ├── tvt.png │ │ ├── unbalance_class.png │ │ ├── underfitting.png │ │ ├── unsupervised.png │ │ └── whatido.jpg │ ├── titanic_spark_exercises.ipynb │ └── titanic_spark_solutions.ipynb ├── nginx └── html │ └── index.html ├── postgres └── scripts │ └── init.sql ├── scala ├── Day 1 - Scala Intro.html ├── README.md ├── databricks-import-notebook-1.png └── databricks-import-notebook-2.png ├── spark ├── Dockerfile ├── Dockerfile.pyspark ├── INSTALL.md ├── README.md └── requirements.txt ├── superset └── conf │ └── superset_config.py └── vm ├── README.md ├── install-docker.sh ├── install-script.sh ├── virtualbox-port-forwarding.png ├── vm-0.png ├── vm-1.png ├── vm-2.png ├── vm-3.png ├── vm-4.png └── vm-5.png /.gitignore: -------------------------------------------------------------------------------- 1 | # General 2 | .DS_Store 3 | mnt/* 4 | *.iso 5 | *.ova 6 | 7 | # Spark 8 | checkpoint 9 | derby.log 10 | metastore_db 11 | streaming.parquet 12 | dataset/output.parquet/ 13 | 14 | # Superset Dashboard 15 | superset/conf/.setup-complete 16 | superset/conf/superset.db 17 | 18 | # Python 19 | .ipynb_checkpoints 20 | __pycache__ 21 | dataset/titanic_* 22 | .idea/ 23 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | # Instrucciones de Instalación 2 | 3 | **IMPORTANTE:** Cualquier opción de instalación requeriere por lo menos 8Gb de RAM para un correcto funcionamiento. 4 | 5 | ## Utilizando Windows 6 | 7 | Si su computadora es Windows debe utilizar la Virtual Machine, algunas computadoras más antiguas no soporta virtualización, por lo cual [VirtualBox](https://www.virtualbox.org/) no funciona. Tampoco funciona adecuadamente **Docker on Windows**. 8 | 9 | Se puede generar una Virtual Machine de cero siguiendo las instrucciones acá: [Virtual Box - INSTALL](./vm). La virtual machine completa pesa 15Gb y 5.2Gb cuando comprimida con `gzip -9`. 
10 | 11 | ## Utilizando MacOSX 12 | 13 | Para compilar y correr el codigo adecuadamente en MacOSX es necesario instalar varias dependencias, acá pueden encontrar las instrucciones para instalar todas las dependencias necesarias: [Setting up Macbook Pro for Development](https://arjon.es/2019/setting-up-macbook-pro-for-development/) 14 | 15 | Al finalizar la instalación clonar el repositorio: 16 | 17 | ```shell 18 | git clone https://github.com/arjones/bigdata-workshop-es.git 19 | 20 | cd bigdata-workshop-es 21 | 22 | ./control-env.sh start 23 | ``` 24 | 25 | ## Sobre 26 | Gustavo Arjones © 2017-2020 27 | [arjon.es](https://arjon.es) | [LinkedIn](http://linkedin.com/in/arjones/) | [Twitter](https://twitter.com/arjones) 28 | -------------------------------------------------------------------------------- /README-batch.md: -------------------------------------------------------------------------------- 1 | # Workshop de Big Data con Apache Spark [🇪🇸] 2 | Material del Workshop de Big Data 3 | 4 | ## Batch Processing 5 | 6 | ## Codigo 7 | * [Analisis de acciones de EEUU](code/scala/us-stock-analysis) (Scala) 8 | * [Analisis de acciones de EEUU](code/python/us-stock-analysis) (Python) 9 | * [us-stock-analysis Jupyter Notebook](jupyter/notebook/batch_etl_steps.ipynb) (Python) 10 | 11 | ## Compilar el codigo 12 | Compilar y empaquetar el codigo para deploy en el cluster 13 | 14 | ```bash 15 | cd code/us-stock-analysis 16 | sbt clean assembly 17 | ``` 18 | 19 | ## Submit de un job 20 | Conectarse al Spark-Master y hacer submit del programa 21 | 22 | ```bash 23 | docker exec -it master bash 24 | 25 | cd /app/us-stock-analysis 26 | spark-submit --master 'spark://master:7077' \ 27 | --class "es.arjon.RunAll" \ 28 | --driver-class-path /app/postgresql-42.1.4.jar \ 29 | target/scala-2.11/us-stock-analysis-assembly-0.1.jar \ 30 | /dataset/stocks-small /dataset/yahoo-symbols-201709.csv /dataset/output.parquet 31 | ``` 32 | Acceder a http://localhost:8080 y http://localhost:4040 para ver la SPARK-UI 33 | 34 | Verificar el resultado del job en la carpeta `/dataset/output.parquet`: 35 | 36 | ```bash 37 | # Desde la maquina host 38 | $ tree ~/bigdata-workshop-es/dataset/output.parquet/ 39 | ``` 40 | 41 | ## Usando Spark-SQL 42 | Usando SparkSQL para acceder a los datos en Parquet y hacer analysis interactiva. 43 | 44 | ```bash 45 | docker exec -it master bash 46 | spark-shell 47 | ``` 48 | 49 | ```scala 50 | // reduce log noise 51 | sc.setLogLevel("ERROR") 52 | 53 | import spark.implicits._ 54 | val df = spark.read.parquet("/dataset/output.parquet") 55 | df.show 56 | df.printSchema 57 | 58 | df.createOrReplaceTempView("stocks") 59 | 60 | // No usando particiones 61 | val badHighestClosingPrice = spark.sql("SELECT symbol, MAX(close) AS price FROM stocks WHERE full_date >= '2017-09-01' AND full_date < '2017-10-01' GROUP BY symbol") 62 | badHighestClosingPrice.explain 63 | badHighestClosingPrice.show 64 | 65 | // Optimizando con particiones 66 | val highestClosingPrice = spark.sql("SELECT symbol, MAX(close) AS price FROM stocks WHERE year=2017 AND month=9 GROUP BY symbol") 67 | highestClosingPrice.explain 68 | highestClosingPrice.show 69 | ``` 70 | 71 | ## Ver los datos en Postgres 72 | El batch job también escribe una tabla `stocks` en Postgres que se puede acceder: 73 | 74 | ``` 75 | # abrir otra consola 76 | 77 | docker exec -it postgres bash 78 | 79 | psql -U workshop workshop 80 | workshop=# \d 81 | ... 82 | ... 
83 | 84 | workshop=# SELECT * FROM stocks LIMIT 10; 85 | ``` 86 | 87 | ## Creando un Dashboard con Superset 88 | 89 | * [Como configurar Superset](./README-superset.md) 90 | * [Sitio Oficial Superset](https://superset.apache.org/) 91 | 92 | 93 | ## Siga leyendo 94 | * [Structured Streaming Processing](README-streaming.md) 95 | 96 | 97 | ____ 98 | Gustavo Arjones © 2017-2020 99 | -------------------------------------------------------------------------------- /README-ml.md: -------------------------------------------------------------------------------- 1 | # Workshop de Big Data con Apache Spark [🇪🇸] 2 | Material del Workshop de Big Data 3 | 4 | ## Machine Learning Lib 5 | Usando un dataset de [Credito Alemán](https://archive.ics.uci.edu/ml/datasets/Statlog+(German+Credit+Data)) se entrenará un algoritmo de [Clasificación Random Forest](https://spark.apache.org/docs/2.4.4/ml-classification-regression.html#random-forest-classifier) y se buscará predecir el valor `Creditable`, que significa **brindar credito**. 6 | 7 | ## Codigo 8 | * [Analisis de riesgo de credito](code/scala/credit-risk-analysis) (credit-risk-analysis) 9 | 10 | ## Realizar el entrenamiento 11 | La clase [CreditRiskTrain.scala](code/scala/credit-risk-analysis/src/main/scala/es/arjon/CreditRiskTrain.scala) hace las transformaciones de los datos de entrada para generar el modelo de Random Forest. También intentamos mejorar el modelo utilizando [CrossValidator](https://spark.apache.org/docs/2.4.4/ml-tuning.html#cross-validation). 12 | 13 | ```bash 14 | # Compilar el proyecto 15 | cd code/scala/credit-risk-analysis 16 | sbt clean assembly 17 | 18 | # Conectarse al SparkMaster y hacer submit del proyecto de Entrenamiento 19 | docker exec -it master bash 20 | cd /app/credit-risk-analysis 21 | spark-submit \ 22 | --class es.arjon.CreditRiskTrain \ 23 | --master 'spark://master:7077' \ 24 | target/scala-2.11/credit-risk-analysis-assembly-0.1.jar \ 25 | /dataset/credit-risk/germancredit.csv \ 26 | /dataset/credit-risk.model 27 | 28 | # va a tomar 4+ minutos para concluir el entrenamiento 29 | ``` 30 | 31 | Acceder a http://localhost:8080 y http://localhost:4040 para ver la SPARK-UI. 32 | 33 | ## Chequeá el modelo entrenado 34 | ```bash 35 | ls -la /dataset/credit-risk.model 36 | ``` 37 | 38 | ## Realizando predicciones 39 | El archivo `/dataset/credit-risk/germancredit-user-input.csv` simula la entrada de usuarios con sus respectivos datos, que son enviados al modelo para obtener la predicción.
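A modo ilustrativo, un bosquejo mínimo en PySpark de lo que hace conceptualmente el paso de predicción: cargar el modelo entrenado y aplicarlo sobre los datos nuevos. El proyecto del workshop está implementado en Scala ([CreditRiskAnalysis.scala](code/scala/credit-risk-analysis/src/main/scala/es/arjon/CreditRiskAnalysis.scala)); acá se asume que el modelo guardado es un `PipelineModel` de Spark ML y que el CSV trae header, ambas cosas son una suposición.

```python
from pyspark.ml import PipelineModel
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("credit-risk-predict").getOrCreate()

# Cargar el modelo generado por el entrenamiento (se asume un PipelineModel)
model = PipelineModel.load("/dataset/credit-risk.model")

# Leer la entrada simulada de usuarios
user_input = spark.read.csv(
    "/dataset/credit-risk/germancredit-user-input.csv",
    header=True, inferSchema=True,
)

# transform() agrega las columnas de predicción del pipeline
predictions = model.transform(user_input)
predictions.select("prediction", "probability").show(truncate=False)
```

El submit del job de predicción se hace igual que el de entrenamiento: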
40 | 41 | ```bash 42 | spark-submit \ 43 | --class es.arjon.CreditRiskAnalysis \ 44 | --master 'spark://master:7077' \ 45 | target/scala-2.11/credit-risk-analysis-assembly-0.1.jar \ 46 | /dataset/credit-risk/germancredit-user-input.csv \ 47 | /dataset/credit-risk.model 48 | ``` 49 | 50 | Acceder a http://localhost:8080 y http://localhost:4040 para ver la SPARK-UI 51 | 52 | ### Desafío 🤓 53 | Modificar el codigo para tomar la entrada de **Kafka** y escribir en **Postgres** 54 | 55 | 56 | ## Más información 57 | * [Predicting Loan Credit Risk using Apache Spark Machine Learning Random Forests](https://mapr.com/blog/predicting-loan-credit-risk-using-apache-spark-machine-learning-random-forests/) 58 | * [Original: Analysis of German Credit Data](https://onlinecourses.science.psu.edu/stat857/node/215) 59 | 60 | ____ 61 | Gustavo Arjones © 2017-2020 62 | -------------------------------------------------------------------------------- /README-pyspark.md: -------------------------------------------------------------------------------- 1 | # Usando `pySpark`: 2 | 3 | ## Consola 4 | 5 | ```bash 6 | docker exec -it master bash 7 | root@588acf96a879:/app# pyspark 8 | ``` 9 | ```python 10 | file = spark.read.text("/dataset/yahoo-symbols-201709.csv") 11 | file.count() 12 | for line in file.take(10): 13 | print(line) 14 | ``` 15 | 16 | ## Usando Jupyter Notebook 17 | Acceda al [Jupyter Notebook aqui](http://localhost:8888/), los notebook disponibles en ese workshop [están en Github](https://github.com/arjones/bigdata-workshop-es/tree/master/jupyter/notebook) 18 | 19 | ## Material de lectura: 20 | 21 | * [Apache Spark in Python: Beginner's Guide](https://www.datacamp.com/community/tutorials/apache-spark-python) 22 | * [Introduction to PySpark](https://www.datacamp.com/courses/introduction-to-pyspark) 23 | * [pySpark: Evaluating the machine learning model](https://www.datacamp.com/community/tutorials/apache-spark-tutorial-machine-learning) 24 | 25 | 26 | ## Visualización de Datos 27 | 28 | * [Python Data Visualization with Matplotlib](https://stackabuse.com/python-data-visualization-with-matplotlib/) 29 | * [Top 50 matplotlib Visualizations](https://www.machinelearningplus.com/plots/top-50-matplotlib-visualizations-the-master-plots-python/) 30 | * [Seaborn Library for Data Visualization in Python: Part 1](https://stackabuse.com/seaborn-library-for-data-visualization-in-python-part-1/) 31 | 32 | 33 | ____ 34 | Gustavo Arjones © 2017-2020 35 | -------------------------------------------------------------------------------- /README-streaming.md: -------------------------------------------------------------------------------- 1 | # Workshop de Big Data con Apache Spark [🇪🇸] 2 | Material del Workshop de Big Data 3 | 4 | ## Structured Streaming Processing 5 | El simulador publica información sobre acciones y sus precios en una cola Kafka que es consumida por Spark. 
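Para fijar la idea antes de correr los pasos siguientes, un bosquejo mínimo (en PySpark, solo a modo ilustrativo; el job real del workshop es `StreamingETL.scala`, en Scala) de cómo se consume el tópico `stocks` de Kafka con Structured Streaming. El esquema del mensaje es una suposición, y el job requiere el paquete `spark-sql-kafka` al hacer el submit.

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import DoubleType, StringType, StructType, TimestampType

spark = SparkSession.builder.appName("streaming-sketch").getOrCreate()

# Esquema hipotético del JSON que publica el simulador
schema = (
    StructType()
    .add("symbol", StringType())
    .add("timestamp", TimestampType())
    .add("price", DoubleType())
)

# Leer el tópico `stocks` desde Kafka
raw = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "stocks")
    .load()
)

# value llega como bytes: castear a string y parsear el JSON
stocks = (
    raw.selectExpr("CAST(value AS STRING) AS json")
    .select(from_json(col("json"), schema).alias("data"))
    .select("data.*")
)

# Sink de consola, solo para verificar que llegan datos
query = stocks.writeStream.format("console").outputMode("append").start()
query.awaitTermination()
```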
6 | 7 | ## Codigo 8 | * [Analisis de acciones de EEUU](code/scala/us-stock-analysis) (US Stocks) 9 | 10 | ## Iniciar el simulador de acciones 11 | Dentro del mismo package tenemos la clase del simulador [FakeStockPriceGenerator](./code/scala/us-stock-analysis/src/main/scala/es/arjon/FakeStockPriceGenerator.scala). 12 | 13 | ```bash 14 | # Compilar el simulador 15 | cd code/scala/us-stock-analysis 16 | sbt clean assembly 17 | 18 | # Ejecutarlo dentro de un Worker 19 | docker exec -it worker1 bash 20 | cd /app/us-stock-analysis 21 | java -cp target/scala-2.11/us-stock-analysis-assembly-0.1.jar \ 22 | "es.arjon.FakeStockPriceGenerator" kafka:9092 stocks 23 | ``` 24 | 25 | ## Chequear el contenido de Kafka 26 | 27 | ```bash 28 | docker exec -it kafka bash 29 | 30 | /opt/kafka_2.11-0.10.1.0/bin/kafka-console-consumer.sh \ 31 | --bootstrap-server kafka:9092 --topic stocks --from-beginning 32 | 33 | # apretar CTRL+C para salir 34 | ``` 35 | 36 | ## Submit de un job 37 | Conectarse al Spark-Master y hacer submit del programa. 38 | 39 | **NOTA:** Utilizar `--total-executor-cores` con la mitad de los cores de tu computadora, ej: si tiene 4 cores, utilizar `2`. 40 | 41 | ```bash 42 | docker exec -it master bash 43 | 44 | cd /app/us-stock-analysis 45 | spark-submit --master 'spark://master:7077' \ 46 | --class "es.arjon.StreamingETL" \ 47 | --total-executor-cores 1 \ 48 | target/scala-2.11/us-stock-analysis-assembly-0.1.jar \ 49 | kafka:9092 stocks 50 | ``` 51 | Acceder a http://localhost:8080 y http://localhost:4040 para ver la SPARK-UI. 52 | 53 | ## En otra consola, acceder al dataset de Streaming 54 | ```bash 55 | docker exec -it master bash 56 | spark-shell --total-executor-cores 1 57 | ``` 58 | 59 | ```scala 60 | import spark.implicits._ 61 | val df = spark.read.parquet("/dataset/streaming.parquet") 62 | df.show 63 | ``` 64 | 65 | ## Utilizar Spark SQL y el Sink in Memory 66 | 67 | En el archivo `StreamingETL.scala` comentar las líneas 71 a la 85 para evitar que se escriba en el archivo de output Parquet y descomentar las líneas de código del 90 al 103. 68 | 69 | Compilar la aplicación de nuevo con: 70 | 71 | ```bash 72 | sbt assembly 73 | ``` 74 | 75 | Probar y observar el output por consola. 76 | 77 | Luego comentar las líneas 98 a 103 y descomentar de la 106 a la 121, compilar, ejecutar y probar. ¿Qué diferencia observa?
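La idea del sink in memory, en un bosquejo mínimo (PySpark, ilustrativo; en el proyecto esto corresponde a las líneas comentadas de `StreamingETL.scala`): cada micro-batch se vuelca a una tabla temporaria en memoria que después se consulta con Spark SQL desde el mismo proceso. Los nombres `stocks` y `stocks_mem` son supuestos.

```python
# `stocks` es un DataFrame de streaming como el del bosquejo anterior
query = (
    stocks.writeStream
    .format("memory")         # sink en memoria: solo para debugging / demos
    .queryName("stocks_mem")  # nombre de la tabla temporaria a consultar
    .outputMode("append")
    .start()
)

# Mientras el stream corre, la tabla se puede consultar con Spark SQL
spark.sql("""
    SELECT symbol, AVG(price) AS avg_price
    FROM stocks_mem
    GROUP BY symbol
""").show()
```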
78 | 79 | 80 | ## Streaming Spark SQL + Insert a Postgres 81 | 82 | Comentar las líneas 106 a 121 y descomentar la línea: 83 | 84 | ```scala 85 | AverageStocksToPostgres.process(spark, stocks) 86 | ``` 87 | 88 | En otra tab, ingresar al container de Postgres y luego al utilitario de línea de comando `psql`: 89 | 90 | ```bash 91 | docker exec -it postgres bash 92 | psql --host localhost --dbname workshop --username workshop 93 | ``` 94 | 95 | Crear la tabla para recibir los inserts: 96 | 97 | ```sql 98 | 99 | CREATE TABLE test_streaming_inserts_avg_price ( 100 | "window" varchar(128), 101 | symbol varchar(10), 102 | avg_price real 103 | ); 104 | ``` 105 | 106 | 107 | ## Más información 108 | * [Structured Streaming in PySpark](https://hackersandslackers.com/structured-streaming-in-pyspark/) 109 | * [Real-time Streaming ETL with Structured Streaming in Apache Spark 2.1](https://databricks.com/blog/2017/01/19/real-time-streaming-etl-structured-streaming-apache-spark-2-1.html) 110 | * [Processing Data in Apache Kafka with Structured Streaming in Apache Spark 2.2](https://databricks.com/blog/2017/04/26/processing-data-in-apache-kafka-with-structured-streaming-in-apache-spark-2-2.html) 111 | * [Real-Time End-to-End Integration with Apache Kafka in Apache Spark’s Structured Streaming](https://databricks.com/blog/2017/04/04/real-time-end-to-end-integration-with-apache-kafka-in-apache-sparks-structured-streaming.html) 112 | 113 | # Siga leyendo 114 | * [MLlib](README-ml.md) 115 | 116 | ____ 117 | Gustavo Arjones © 2017-2020 118 | -------------------------------------------------------------------------------- /README-superset.md: -------------------------------------------------------------------------------- 1 | # Workshop de Big Data con Apache Spark [🇪🇸] 2 | Material del Workshop de Big Data 3 | 4 | ## Creando un Dashboard con Superset 5 | 6 | ![Superset Dashboard Example](images/superset.png) 7 | 8 | * Antes de acceder por primera vez a Superset, inicializar la base de datos y crear las credenciales del usuario admin corriendo el siguiente comando: 9 | `./control-env.sh superset-init` 10 | * Acceder a http://localhost:8088/ (utilizar las credenciales creadas en el primer paso).
11 | * Agregar el database (Sources > Databases): 12 | - Database: `Workshop` 13 | - SQLAlchemy URI: `postgresql://workshop:w0rkzh0p@postgres/workshop` 14 | - OK 15 | * Agregar tabla (Sources > Tables): 16 | - Database: `workshop` 17 | - Table Name: `stocks` 18 | * Create Slices & Dashboard [official docs](https://superset.incubator.apache.org/tutorial.html#creating-a-slice-and-dashboard) 19 | 20 | ![](images/superset-01.png) 21 | 22 | ![](images/superset-02.png) 23 | 24 | ![](images/superset-03.png) 25 | 26 | ![](images/superset-04.png) 27 | 28 | ![](images/superset-05.png) 29 | 30 | ![](images/superset-06.png) 31 | 32 | ![](images/superset-07.png) 33 | 34 | ![](images/superset-08.png) 35 | 36 | ![](images/superset-09.png) 37 | 38 | ![](images/superset-10.png) 39 | 40 | ![](images/superset-11.png) 41 | 42 | 43 | ## Sobre 44 | Gustavo Arjones © 2017-2020 45 | [arjon.es](https://arjon.es) | [LinkedIn](http://linkedin.com/in/arjones/) | [Twitter](https://twitter.com/arjones) 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Workshop de Big Data con Apache Spark [🇪🇸] 2 | Material del Workshop de Big Data 3 | 4 | ## Contenidos 5 | * [Levantar el ambiente](#levantar-ambiente) 6 | * [Introducción a Scala](scala/README.md) 7 | * [Batch Processing (Scala)](README-batch.md) 8 | * [Structured Streaming Processing (Scala)](README-streaming.md) 9 | * [Machine Learning (Scala)](README-ml.md) 10 | * [Jupyter Notebook (Python / pySpark)](README-pyspark.md) 11 | * [Lista de Jupyter Notebook](jupyter/notebook/README.md) 12 | 13 | ## Infraestructura 14 | 15 | El workshop simula una instalación de producción utilizando containers de Docker. 16 | [docker-compose.yml](docker-compose.yml) contiene las definiciones y configuraciones para esos servicios y sus respectivas UIs: 17 | 18 | * Apache Spark: [Spark Master UI](http://localhost:8080) | [Job Progress](http://localhost:4040) 19 | * Apache Kafka: 20 | * Postgres: 21 | * [Superset](http://superset.incubator.apache.org): [Nuestro Dashboard](http://localhost:8088/) 22 | 23 | Los puertos de acceso a cada servicio quedaron en sus valores default. Ej: **spark master:7077**, **postgres: 5432** 24 | 25 | ## Levantar ambiente 26 | 27 | Instalar el ambiente [siguiendo las instrucciones acá](INSTALL.md). 28 | 29 | Correr el script que levanta el ambiente `Usage: control-env.sh (start|stop|cleanup)`: 30 | 31 | ```bash 32 | ./control-env.sh start 33 | ``` 34 | **IMPORTANTE:** el script `control-env.sh cleanup` borra cualquier dato que haya sido procesado anteriormente. 35 | 36 | ```bash 37 | # Access Spark-Master and run spark-shell 38 | docker exec -it master bash 39 | root@588acf96a879:/app# spark-shell 40 | ``` 41 | Probar: 42 | 43 | ```scala 44 | val file = sc.textFile("/dataset/yahoo-symbols-201709.csv") 45 | file.count 46 | file.take(10).foreach(println) 47 | ``` 48 | 49 | Acceder al [Spark Master: http://localhost:8080](http://localhost:8080) y [SPARK-UI: http://localhost:4040](http://localhost:4040).
50 | 51 | ### Troubleshooting 52 | 53 | Si los jobs mueren (`KILLED`) y no se completan puede ser debido a la memória disponible para Docker, **aumente la memoria > 8Gb** al proceso de Docker: 54 | 55 | ![](./images/docker-advanced-config.jpg) 56 | 57 | # Siga leyendo 58 | * [Introducción a Scala](scala/README.md) 59 | * [Jupyter Notebook (Python / pySpark)](README-pyspark.md) 60 | 61 | ## Agradecimientos 62 | * Juan Pampliega ([MuttData](https://www.muttdata.ai/)): expandir y actualizar el ejemplo de [Spark Streaming](README-streaming.md) 63 | * Pedro Ferrari ([MuttData](https://www.muttdata.ai/)): crear el notebook de [pySpark con Machine Learning](./jupyter/notebook/titanic/) 64 | 65 | ## Sobre 66 | Gustavo Arjones © 2017-2020 67 | [arjon.es](https://arjon.es) | [LinkedIn](http://linkedin.com/in/arjones/) | [Twitter](https://twitter.com/arjones) 68 | -------------------------------------------------------------------------------- /airflow/.gitignore: -------------------------------------------------------------------------------- 1 | # Tex 2 | */_minted* 3 | *.log 4 | -------------------------------------------------------------------------------- /airflow/dags/random_number_dag.py: -------------------------------------------------------------------------------- 1 | """Random number dag.""" 2 | from datetime import datetime 3 | from pathlib import Path 4 | 5 | from airflow.models import DAG 6 | from airflow.operators.bash_operator import BashOperator 7 | from airflow.operators.dummy_operator import DummyOperator 8 | from airflow.operators.python_operator import PythonOperator 9 | 10 | STORE_DIR = Path(__file__).resolve().parent / 'tmp-files' / 'random-num' 11 | Path.mkdir(STORE_DIR, exist_ok=True, parents=True) 12 | bash_cmd = f"echo $(( ( RANDOM % 10 ) + 1 )) > {str(STORE_DIR / 'random_number.txt')}" 13 | 14 | 15 | def _read_number_and_square(store_dir): 16 | fn = str(store_dir / 'random_number.txt') 17 | with open(fn, 'r') as f: 18 | n = f.readline() 19 | return int(n) ** 2 20 | 21 | 22 | default_args = {'owner': 'pedro', 'retries': 0, 'start_date': datetime(2020, 12, 14)} 23 | with DAG( 24 | 'random_number', default_args=default_args, schedule_interval='0 4 * * *' 25 | ) as dag: 26 | dummy_start_task = DummyOperator(task_id=f'dummy_start') 27 | generate_random_number = BashOperator( 28 | task_id='generate_random_number', bash_command=bash_cmd 29 | ) 30 | read_num_and_square = PythonOperator( 31 | task_id='read_number_and_square_it', 32 | python_callable=_read_number_and_square, 33 | op_args=[STORE_DIR], 34 | ) 35 | dummy_start_task >> generate_random_number >> read_num_and_square 36 | -------------------------------------------------------------------------------- /airflow/dags/requests_example.py: -------------------------------------------------------------------------------- 1 | """Get data from API.""" 2 | import json 3 | from datetime import datetime 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import requests 8 | 9 | BASE_URL = 'https://www.alphavantage.co/query' 10 | API_KEY = 'TFHNYCWBD71JBSON' 11 | STOCK_FN = 'TIME_SERIES_DAILY' 12 | 13 | 14 | def _get_stock_data(stock_symbol, date): 15 | date = f"{date:%Y-%m-%d}" # read execution date from context 16 | end_point = ( 17 | f"{BASE_URL}?function={STOCK_FN}&symbol={stock_symbol}" 18 | f"&apikey={API_KEY}&datatype=json" 19 | ) 20 | print(f"Getting data from {end_point}...") 21 | r = requests.get(end_point) 22 | data = json.loads(r.content) 23 | df = ( 24 | pd.DataFrame(data['Time Series (Daily)']) 25 | .T.reset_index() 
26 | .rename(columns={'index': 'date'}) 27 | ) 28 | df = df[df['date'] == date] 29 | if not df.empty: 30 | for c in df.columns: 31 | if c != 'date': 32 | df[c] = df[c].astype(float) 33 | df['avg_price'] = (df['2. high'] + df['3. low']) / 2 34 | df['avg_num_trades'] = df['5. volume'] / 1440 35 | else: 36 | df = pd.DataFrame( 37 | [[date, np.nan, np.nan]], columns=['date', 'avg_num_trades', 'avg_price'], 38 | ) 39 | df['symbol'] = stock_symbol 40 | df = df[['date', 'symbol', 'avg_num_trades', 'avg_price']] 41 | return df 42 | 43 | 44 | if __name__ == '__main__': 45 | yesterday = datetime(2020, 4, 13) 46 | df1 = _get_stock_data('aapl', yesterday) 47 | -------------------------------------------------------------------------------- /airflow/dags/solution_random_number_dag.py: -------------------------------------------------------------------------------- 1 | """Random number dag extended.""" 2 | import logging 3 | from datetime import datetime 4 | from pathlib import Path 5 | 6 | from airflow.models import DAG 7 | from airflow.operators.bash_operator import BashOperator 8 | from airflow.operators.dummy_operator import DummyOperator 9 | from airflow.operators.python_operator import BranchPythonOperator, PythonOperator 10 | 11 | STORE_DIR = Path(__file__).resolve().parent / 'tmp-files' / 'random-num' 12 | Path.mkdir(STORE_DIR, exist_ok=True, parents=True) 13 | # Add execution date to filename that stores random number 14 | bash_cmd = ( 15 | f'echo $(( ( RANDOM % 10 ) + 1 )) > {str(STORE_DIR)}/{{{{ ds_nodash }}}}.txt' 16 | ) 17 | 18 | 19 | def _read_number_and_square(store_dir, **context): 20 | date = context['execution_date'] # read execution date from context 21 | fn = str(store_dir / f'{date:%Y%m%d}.txt') 22 | print(f"Reading {fn}...") # add logging with print 23 | with open(fn, 'r') as f: 24 | n = f.readline() 25 | logging.info(f"Number read from file is: {n}") # also adds logging 26 | n_sqr = int(n) ** 2 27 | return 'print_high' if n_sqr > 30 else 'print_low' # return next task instance 28 | 29 | 30 | def _print_high(): 31 | return 'HIGH' 32 | 33 | 34 | default_args = {'owner': 'pedro', 'retries': 0, 'start_date': datetime(2020, 12, 14)} 35 | with DAG( 36 | 'random_number_extended', default_args=default_args, schedule_interval='0 4 * * *' 37 | ) as dag: 38 | dummy_start_task = DummyOperator(task_id=f'dummy_start') 39 | generate_random_number = BashOperator( 40 | task_id='generate_random_number', bash_command=bash_cmd 41 | ) 42 | # New branch operator 43 | read_num_and_square = BranchPythonOperator( 44 | task_id='read_number_and_square_it', 45 | python_callable=_read_number_and_square, 46 | op_args=[STORE_DIR], 47 | provide_context=True, # pass task instance params to python callable 48 | ) 49 | print_high = PythonOperator(task_id='print_high', python_callable=_print_high) 50 | print_low = BashOperator(task_id='print_low', bash_command='echo LOW') 51 | # Define tasks (normal path and then each branch) 52 | dummy_start_task >> generate_random_number >> read_num_and_square >> print_high 53 | read_num_and_square.set_downstream(print_low) 54 | -------------------------------------------------------------------------------- /airflow/dags/solution_stocks_dag.py: -------------------------------------------------------------------------------- 1 | """Stocks dag extended.""" 2 | import json 3 | from datetime import datetime 4 | from time import sleep 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import requests 9 | import sqlalchemy.exc 10 | from airflow.models import DAG 11 | from 
airflow.operators.email_operator import EmailOperator 12 | from airflow.operators.python_operator import PythonOperator 13 | from airflow.operators.sqlite_operator import SqliteOperator 14 | from sqlite_cli import SqLiteClient 15 | 16 | BASE_URL = 'https://www.alphavantage.co/query' 17 | API_KEY = 'TFHNYCWBD71JBSON' 18 | STOCK_FN = 'TIME_SERIES_DAILY' 19 | 20 | SQL_DB = '/tmp/sqlite_default.db' # This is defined in Admin/Connections 21 | SQL_TABLE = 'stocks_daily_extended' 22 | SQL_CREATE = f""" 23 | CREATE TABLE IF NOT EXISTS {SQL_TABLE} ( 24 | date TEXT, 25 | symbol TEXT, 26 | avg_num_trades REAL, 27 | avg_price REAL, 28 | UNIQUE(date,symbol) 29 | ) 30 | """ 31 | SQL_REPORT = f""" 32 | SELECT symbol, avg_num_trades 33 | FROM {SQL_TABLE} 34 | WHERE date = '{{date}}' 35 | ORDER BY avg_num_trades DESC 36 | LIMIT 1 37 | """ 38 | 39 | STOCKS = {'apple': 'aapl', 'tesla': 'tsla', 'facebook': 'fb'} 40 | 41 | 42 | def _get_stock_data(stock_symbol, **context): 43 | date = f"{context['execution_date']:%Y-%m-%d}" # read execution date from context 44 | end_point = ( 45 | f"{BASE_URL}?function={STOCK_FN}&symbol={stock_symbol}" 46 | f"&apikey={API_KEY}&datatype=json" 47 | ) 48 | print(f"Getting data from {end_point}...") 49 | r = requests.get(end_point) 50 | sleep(15) # To avoid api limits 51 | data = json.loads(r.content) 52 | df = ( 53 | pd.DataFrame(data['Time Series (Daily)']) 54 | .T.reset_index() 55 | .rename(columns={'index': 'date'}) 56 | ) 57 | df = df[df['date'] == date] 58 | if not df.empty: 59 | for c in df.columns: 60 | if c != 'date': 61 | df[c] = df[c].astype(float) 62 | df['avg_price'] = (df['2. high'] + df['3. low']) / 2 63 | df['avg_num_trades'] = df['5. volume'] / 1440 64 | else: 65 | df = pd.DataFrame( 66 | [[date, np.nan, np.nan]], columns=['date', 'avg_num_trades', 'avg_price'] 67 | ) 68 | df['symbol'] = stock_symbol 69 | df = df[['date', 'symbol', 'avg_num_trades', 'avg_price']] 70 | return df 71 | 72 | 73 | def _insert_daily_data(**context): 74 | task_instance = context['ti'] 75 | # Get xcom for each upstream task 76 | dfs = [] 77 | for company in STOCKS: 78 | dfs.append(task_instance.xcom_pull(task_ids=f'get_daily_data_{company}')) 79 | df = pd.concat(dfs, axis=0) 80 | sql_cli = SqLiteClient(SQL_DB) 81 | try: 82 | sql_cli.insert_from_frame(df, SQL_TABLE) 83 | print(f"Inserted {len(df)} records") 84 | except sqlalchemy.exc.IntegrityError: 85 | # You can avoid doing this by setting a trigger rule in the reports operator 86 | print("Data already exists! Nothing to do...") 87 | return 88 | 89 | 90 | def _perform_daily_report(**context): 91 | date = f"{context['execution_date']:%Y-%m-%d}" 92 | sql_cli = SqLiteClient(SQL_DB) 93 | sql = SQL_REPORT.format(date=date) 94 | df = sql_cli.to_frame(sql).squeeze() 95 | msg = ( 96 | f"Most traded action in {date} was {df['symbol']} with " 97 | f"an avg of {df['avg_num_trades']} trades per minute." 
98 | ) 99 | return msg 100 | 101 | 102 | default_args = { 103 | 'owner': 'pedro', 104 | 'retries': 0, 105 | 'start_date': datetime(2020, 12, 10), 106 | 'email_on_failure': True, 107 | 'email_on_retry': False, 108 | 'email': ['pedro@muttdata.ai'], 109 | } 110 | with DAG( 111 | 'stocks_extended', default_args=default_args, schedule_interval='0 4 * * *' 112 | ) as dag: 113 | 114 | create_table_if_not_exists = SqliteOperator( 115 | task_id='create_table_if_not_exists', 116 | sql=SQL_CREATE, 117 | sqlite_conn_id='sqlite_default', 118 | ) 119 | 120 | # Create several task in loop 121 | get_data_task = {} 122 | for company, symbol in STOCKS.items(): 123 | get_data_task[company] = PythonOperator( 124 | task_id=f'get_daily_data_{company}', 125 | python_callable=_get_stock_data, 126 | op_args=[symbol], 127 | provide_context=True, 128 | ) 129 | 130 | insert_daily_data = PythonOperator( 131 | task_id='insert_daily_data', 132 | python_callable=_insert_daily_data, 133 | provide_context=True, 134 | ) 135 | 136 | do_daily_report = PythonOperator( 137 | task_id='do_most_traded_report', 138 | python_callable=_perform_daily_report, 139 | provide_context=True, 140 | ) 141 | 142 | send_report_email = EmailOperator( 143 | task_id='send_report_email', 144 | to='pedro@muttdata.ai', 145 | subject='Airflow Stocks Report {{ ds }}', 146 | html_content="{{ ti.xcom_pull(task_ids='do_most_traded_report') }}", 147 | ) 148 | 149 | for company in STOCKS: 150 | upstream_task = create_table_if_not_exists 151 | task = get_data_task[company] 152 | upstream_task.set_downstream(task) 153 | task.set_downstream(insert_daily_data) 154 | insert_daily_data.set_downstream(do_daily_report) 155 | do_daily_report.set_downstream(send_report_email) 156 | -------------------------------------------------------------------------------- /airflow/dags/spark_job.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyspark import SparkConf, SparkContext 4 | 5 | BLACK_LIST = ["the", "be", "to", "of", "=", "=="] 6 | 7 | 8 | sc = SparkContext("local", "PySpark Word Count Exmaple") 9 | input_file = os.environ.get('INPUT_FILE', f"{os.environ['SPARK_HOME']}/README.md") 10 | output_file = os.environ.get('OUTPUT_FILE', '/spark-job/output.csv') 11 | log_file = f"{input_file}" # Should be some file on your system 12 | 13 | words = sc.textFile(log_file).flatMap(lambda line: line.split(" ")) 14 | word_counts = ( 15 | words.filter(lambda word: word != '' and len(word) > 1 and word not in BLACK_LIST) 16 | .map(lambda word: (word, 1)) 17 | .reduceByKey(lambda a, b: a + b) 18 | .max(lambda x: x[1]) 19 | ) 20 | 21 | with open(output_file, 'w') as output: 22 | output.write(f"{word_counts[0]},{word_counts[1]}") 23 | -------------------------------------------------------------------------------- /airflow/dags/sqlite_cli.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sqlalchemy import create_engine 3 | 4 | 5 | class SqLiteClient: 6 | def __init__(self, db): 7 | self.dialect = 'sqlite' 8 | self.db = db 9 | self._engine = None 10 | 11 | def _get_engine(self): 12 | db_uri = f'{self.dialect}:///{self.db}' 13 | if not self._engine: 14 | self._engine = create_engine(db_uri) 15 | return self._engine 16 | 17 | def _connect(self): 18 | return self._get_engine().connect() 19 | 20 | @staticmethod 21 | def _cursor_columns(cursor): 22 | if hasattr(cursor, 'keys'): 23 | return cursor.keys() 24 | else: 25 | return [c[0] for c in cursor.description] 26 
| 27 | def execute(self, sql, connection=None): 28 | if connection is None: 29 | connection = self._connect() 30 | return connection.execute(sql) 31 | 32 | def insert_from_frame(self, df, table, if_exists='append', index=False, **kwargs): 33 | connection = self._connect() 34 | with connection: 35 | df.to_sql(table, connection, if_exists=if_exists, index=index, **kwargs) 36 | 37 | def to_frame(self, *args, **kwargs): 38 | cursor = self.execute(*args, **kwargs) 39 | if not cursor: 40 | return 41 | data = cursor.fetchall() 42 | if data: 43 | df = pd.DataFrame(data, columns=self._cursor_columns(cursor)) 44 | else: 45 | df = pd.DataFrame() 46 | return df 47 | 48 | 49 | if __name__ == '__main__': 50 | db = '/tmp/sqlite_default.db' 51 | sqlite_cli = SqLiteClient(db) 52 | print(sqlite_cli.to_frame('SELECT * FROM stocks_daily')) 53 | -------------------------------------------------------------------------------- /airflow/dags/stocks_dag.py: -------------------------------------------------------------------------------- 1 | """Stocks dag.""" 2 | import json 3 | from datetime import datetime 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import requests 8 | from airflow.models import DAG 9 | from airflow.operators.python_operator import PythonOperator 10 | from airflow.operators.sqlite_operator import SqliteOperator 11 | from sqlite_cli import SqLiteClient 12 | 13 | BASE_URL = 'https://www.alphavantage.co/query' 14 | API_KEY = 'TFHNYCWBD71JBSON' 15 | STOCK_FN = 'TIME_SERIES_DAILY' 16 | 17 | SQL_DB = '/tmp/sqlite_default.db' # This is defined in Admin/Connections 18 | SQL_TABLE = 'stocks_daily' 19 | SQL_CREATE = f""" 20 | CREATE TABLE IF NOT EXISTS {SQL_TABLE} ( 21 | date TEXT, 22 | symbol TEXT, 23 | avg_num_trades REAL, 24 | avg_price REAL, 25 | UNIQUE(date,symbol) 26 | ) 27 | """ 28 | 29 | 30 | def _get_stock_data(stock_symbol, **context): 31 | date = f"{context['execution_date']:%Y-%m-%d}" # read execution date from context 32 | end_point = ( 33 | f"{BASE_URL}?function={STOCK_FN}&symbol={stock_symbol}" 34 | f"&apikey={API_KEY}&datatype=json" 35 | ) 36 | print(f"Getting data from {end_point}...") 37 | r = requests.get(end_point) 38 | data = json.loads(r.content) 39 | df = ( 40 | pd.DataFrame(data['Time Series (Daily)']) 41 | .T.reset_index() 42 | .rename(columns={'index': 'date'}) 43 | ) 44 | df = df[df['date'] == date] 45 | if not df.empty: 46 | for c in df.columns: 47 | if c != 'date': 48 | df[c] = df[c].astype(float) 49 | df['avg_price'] = (df['2. high'] + df['3. low']) / 2 50 | df['avg_num_trades'] = df['5. 
volume'] / 1440 51 | else: 52 | df = pd.DataFrame( 53 | [[date, np.nan, np.nan]], columns=['date', 'avg_num_trades', 'avg_price'], 54 | ) 55 | df['symbol'] = stock_symbol 56 | df = df[['date', 'symbol', 'avg_num_trades', 'avg_price']] 57 | return df 58 | 59 | 60 | def _insert_daily_data(**context): 61 | task_instance = context['ti'] 62 | df = task_instance.xcom_pull(task_ids='get_daily_data') 63 | sql_cli = SqLiteClient(SQL_DB) 64 | sql_cli.insert_from_frame(df, SQL_TABLE) 65 | return 66 | 67 | 68 | default_args = {'owner': 'pedro', 'retries': 0, 'start_date': datetime(2020, 12, 10)} 69 | with DAG('stocks', default_args=default_args, schedule_interval='0 4 * * *') as dag: 70 | create_table_if_not_exists = SqliteOperator( 71 | task_id='create_table_if_not_exists', 72 | sql=SQL_CREATE, 73 | sqlite_conn_id='sqlite_default', 74 | ) 75 | get_daily_data = PythonOperator( 76 | task_id='get_daily_data', 77 | python_callable=_get_stock_data, 78 | op_args=['aapl'], 79 | provide_context=True, 80 | ) 81 | # Add insert stock data 82 | insert_daily_data = PythonOperator( 83 | task_id='insert_daily_data', 84 | python_callable=_insert_daily_data, 85 | provide_context=True, 86 | ) 87 | create_table_if_not_exists >> get_daily_data >> insert_daily_data 88 | -------------------------------------------------------------------------------- /airflow/dags/word_count_dag.py: -------------------------------------------------------------------------------- 1 | """Word count dag.""" 2 | from datetime import datetime 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | from airflow.hooks.postgres_hook import PostgresHook 7 | from airflow.models import DAG 8 | 9 | # from airflow.operators.docker_operator import DockerOperator 10 | from airflow.operators.postgres_operator import PostgresOperator 11 | from airflow.operators.python_operator import PythonOperator 12 | 13 | STORE_DIR = Path(__file__).resolve().parent 14 | 15 | CONNECTION_ID = 'postgres_local' 16 | SQL_DB = "word_count" 17 | SQL_TABLE = 'word_count' 18 | SQL_CREATE = f""" 19 | CREATE TABLE IF NOT EXISTS {SQL_TABLE} ( 20 | date TEXT, 21 | word TEXT, 22 | count REAL, 23 | UNIQUE(date,word) 24 | ) 25 | """ 26 | 27 | 28 | def _insert_file_to_sql(**context): 29 | df_result = pd.read_csv(f"{STORE_DIR}/output.csv", names=["word", "count"]) 30 | df_result["date"] = context["ds"] 31 | if not df_result.empty: 32 | for c in df_result.columns: 33 | if c == 'count': 34 | df_result[c] = df_result[c].astype(float) 35 | df_result = df_result.squeeze() # squeezing single row dataframe 36 | 37 | df_tuple = [(df_result["date"], df_result["word"], df_result["count"])] 38 | 39 | hook = PostgresHook(postgres_conn_id=CONNECTION_ID) 40 | hook.insert_rows(SQL_TABLE, df_tuple) 41 | 42 | 43 | default_args = {'owner': 'pedro', 'retries': 0, 'start_date': datetime(2020, 12, 14)} 44 | with DAG('word_count', default_args=default_args, schedule_interval='0 0 * * *') as dag: 45 | create_table_if_not_exists = PostgresOperator( 46 | task_id='create_table_if_not_exists', 47 | sql=SQL_CREATE, 48 | postgres_conn_id=CONNECTION_ID, 49 | ) 50 | # spark_job = DockerOperator( 51 | # task_id='spark_job', 52 | # image='bde2020/spark-master:latest', 53 | # api_version='auto', 54 | # auto_remove=True, 55 | # environment={'PYSPARK_PYTHON': "python3", 'SPARK_HOME': "/spark"}, 56 | # volumes=[f'{STORE_DIR}:/spark-job'], 57 | # command='/spark/bin/spark-submit --master local[*] /spark-job/spark_job.py', 58 | # docker_url='unix://var/run/docker.sock', 59 | # network_mode='bridge', 60 | # ) 61 | 
insert_file_to_sql = PythonOperator( 62 | task_id='file_to_sql', 63 | python_callable=_insert_file_to_sql, 64 | provide_context=True, 65 | ) 66 | -------------------------------------------------------------------------------- /airflow/docs/clase_airflow.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/airflow/docs/clase_airflow.pdf -------------------------------------------------------------------------------- /airflow/docs/figures/airflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/airflow/docs/figures/airflow.png -------------------------------------------------------------------------------- /airflow/docs/figures/airflow_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/airflow/docs/figures/airflow_architecture.png -------------------------------------------------------------------------------- /airflow/docs/figures/airflow_ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/airflow/docs/figures/airflow_ui.png -------------------------------------------------------------------------------- /airflow/docs/figures/dag_graph_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/airflow/docs/figures/dag_graph_view.png -------------------------------------------------------------------------------- /airflow/docs/figures/example_bash_operator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/airflow/docs/figures/example_bash_operator.png -------------------------------------------------------------------------------- /airflow/docs/figures/exercise_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/airflow/docs/figures/exercise_1.png -------------------------------------------------------------------------------- /airflow/docs/figures/exercise_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/airflow/docs/figures/exercise_2.png -------------------------------------------------------------------------------- /airflow/docs/figures/exercise_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/airflow/docs/figures/exercise_3.png -------------------------------------------------------------------------------- /airflow/docs/figures/exercise_4.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/airflow/docs/figures/exercise_4.png -------------------------------------------------------------------------------- /airflow/docs/figures/logo_mutt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/airflow/docs/figures/logo_mutt.png -------------------------------------------------------------------------------- /code/postgresql-42.1.4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/code/postgresql-42.1.4.jar -------------------------------------------------------------------------------- /code/python/introduction/ejercicios/list.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | # Basic list exercises 4 | # Fill in the code for the functions below. main() is already set up 5 | # to call the functions with a few different inputs, 6 | # printing 'OK' when each function is correct. 7 | # The starter code for each function includes a 'return' 8 | # which is just a placeholder for your code. 9 | # It's ok if you do not complete all the functions, and there 10 | # are some additional functions to try in list2.py. 11 | 12 | # A. match_ends 13 | # Given a list of strings, return the count of the number of 14 | # strings where the string length is 2 or more and the first 15 | # and last chars of the string are the same. 16 | # Note: python does not have a ++ operator, but += works. 17 | def match_ends(words): 18 | # +++your code here+++ 19 | return 20 | 21 | 22 | # B. front_x 23 | # Given a list of strings, return a list with the strings 24 | # in sorted order, except group all the strings that begin with 'x' first. 25 | # e.g. ['mix', 'xyz', 'apple', 'xanadu', 'aardvark'] yields 26 | # ['xanadu', 'xyz', 'aardvark', 'apple', 'mix'] 27 | # Hint: this can be done by making 2 lists and sorting each of them 28 | # before combining them. 29 | def front_x(words): 30 | # +++your code here+++ 31 | return 32 | 33 | 34 | 35 | # C. sort_last 36 | # Given a list of non-empty tuples, return a list sorted in increasing 37 | # order by the last element in each tuple. 38 | # e.g. [(1, 7), (1, 3), (3, 4, 5), (2, 2)] yields 39 | # [(2, 2), (1, 3), (3, 4, 5), (1, 7)] 40 | # Hint: use a custom key= function to extract the last element form each tuple. 41 | def sort_last(tuples): 42 | # +++your code here+++ 43 | return 44 | 45 | 46 | # D. Given a list of numbers, return a list where 47 | # all adjacent == elements have been reduced to a single element, 48 | # so [1, 2, 2, 3] returns [1, 2, 3]. You may create a new list or 49 | # modify the passed in list. 50 | def remove_adjacent(nums): 51 | # +++your code here+++ 52 | return 53 | 54 | 55 | # Simple provided test() function used in main() to print 56 | # what each function returns vs. what it's supposed to return. 57 | def test(got, expected): 58 | if got == expected: 59 | prefix = ' OK ' 60 | else: 61 | prefix = ' X ' 62 | print(f'{prefix} got: {got} expected: {expected}') 63 | 64 | 65 | # Calls the above functions with interesting inputs. 
66 | def main(): 67 | print('match_ends') 68 | test(match_ends(['aba', 'xyz', 'aa', 'x', 'bbb']), 3) 69 | test(match_ends(['', 'x', 'xy', 'xyx', 'xx']), 2) 70 | test(match_ends(['aaa', 'be', 'abc', 'hello']), 1) 71 | 72 | print('\n') 73 | print('front_x') 74 | test(front_x(['bbb', 'ccc', 'axx', 'xzz', 'xaa']), 75 | ['xaa', 'xzz', 'axx', 'bbb', 'ccc']) 76 | test(front_x(['ccc', 'bbb', 'aaa', 'xcc', 'xaa']), 77 | ['xaa', 'xcc', 'aaa', 'bbb', 'ccc']) 78 | test(front_x(['mix', 'xyz', 'apple', 'xanadu', 'aardvark']), 79 | ['xanadu', 'xyz', 'aardvark', 'apple', 'mix']) 80 | 81 | 82 | print('\n') 83 | print('sort_last') 84 | test(sort_last([(1, 3), (3, 2), (2, 1)]), 85 | [(2, 1), (3, 2), (1, 3)]) 86 | test(sort_last([(2, 3), (1, 2), (3, 1)]), 87 | [(3, 1), (1, 2), (2, 3)]) 88 | test(sort_last([(1, 7), (1, 3), (3, 4, 5), (2, 2)]), 89 | [(2, 2), (1, 3), (3, 4, 5), (1, 7)]) 90 | 91 | print('\n') 92 | print('remove_adjacent') 93 | test(remove_adjacent([1, 2, 2, 3]), [1, 2, 3]) 94 | test(remove_adjacent([2, 2, 3, 3, 3]), [2, 3]) 95 | test(remove_adjacent([]), []) 96 | 97 | if __name__ == '__main__': 98 | main() 99 | -------------------------------------------------------------------------------- /code/python/introduction/ejercicios/small.txt: -------------------------------------------------------------------------------- 1 | We are not what we should be 2 | We are not what we need to be 3 | But at least we are not what we used to be 4 | -- Football Coach 5 | 6 | -------------------------------------------------------------------------------- /code/python/introduction/ejercicios/string.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # Copyright 2010 Google Inc. 3 | # Licensed under the Apache License, Version 2.0 4 | # http://www.apache.org/licenses/LICENSE-2.0 5 | 6 | # Google's Python Class 7 | # http://code.google.com/edu/languages/google-python-class/ 8 | 9 | # Basic string exercises 10 | # Fill in the code for the functions below. main() is already set up 11 | # to call the functions with a few different inputs, 12 | # printing 'OK' when each function is correct. 13 | # The starter code for each function includes a 'return' 14 | # which is just a placeholder for your code. 15 | # It's ok if you do not complete all the functions, and there 16 | # are some additional functions to try in string2.py. 17 | 18 | 19 | # A. donuts 20 | # Given an int count of a number of donuts, return a string 21 | # of the form 'Number of donuts: ', where is the number 22 | # passed in. However, if the count is 10 or more, then use the word 'many' 23 | # instead of the actual count. 24 | # So donuts(5) returns 'Number of donuts: 5' 25 | # and donuts(23) returns 'Number of donuts: many' 26 | def donuts(count): 27 | # +++your code here+++ 28 | return 29 | 30 | 31 | # B. both_ends 32 | # Given a string s, return a string made of the first 2 33 | # and the last 2 chars of the original string, 34 | # so 'spring' yields 'spng'. However, if the string length 35 | # is less than 2, return instead the empty string. 36 | def both_ends(s): 37 | # +++your code here+++ 38 | return 39 | 40 | 41 | # C. fix_start 42 | # Given a string s, return a string 43 | # where all occurences of its first char have 44 | # been changed to '*', except do not change 45 | # the first char itself. 46 | # e.g. 'babble' yields 'ba**le' 47 | # Assume that the string is length 1 or more. 
48 | # Hint: s.replace(stra, strb) returns a version of string s 49 | # where all instances of stra have been replaced by strb. 50 | def fix_start(s): 51 | # +++your code here+++ 52 | return 53 | 54 | 55 | # D. MixUp 56 | # Given strings a and b, return a single string with a and b separated 57 | # by a space ' ', except swap the first 2 chars of each string. 58 | # e.g. 59 | # 'mix', pod' -> 'pox mid' 60 | # 'dog', 'dinner' -> 'dig donner' 61 | # Assume a and b are length 2 or more. 62 | def mix_up(a, b): 63 | # +++your code here+++ 64 | return 65 | 66 | 67 | # Provided simple test() function used in main() to print 68 | # what each function returns vs. what it's supposed to return. 69 | def test(got, expected): 70 | if got == expected: 71 | prefix = ' OK ' 72 | else: 73 | prefix = ' X ' 74 | print(f'{prefix} got: {got} expected: {expected}') 75 | 76 | 77 | # Provided main() calls the above functions with interesting inputs, 78 | # using test() to check if each result is correct or not. 79 | def main(): 80 | print('donuts') 81 | # Each line calls donuts, compares its result to the expected for that call. 82 | test(donuts(4), 'Number of donuts: 4') 83 | test(donuts(9), 'Number of donuts: 9') 84 | test(donuts(10), 'Number of donuts: many') 85 | test(donuts(99), 'Number of donuts: many') 86 | 87 | print('\n') 88 | print('both_ends') 89 | test(both_ends('spring'), 'spng') 90 | test(both_ends('Hello'), 'Helo') 91 | test(both_ends('a'), '') 92 | test(both_ends('xyz'), 'xyyz') 93 | 94 | 95 | print('\n') 96 | print('fix_start') 97 | test(fix_start('babble'), 'ba**le') 98 | test(fix_start('aardvark'), 'a*rdv*rk') 99 | test(fix_start('google'), 'goo*le') 100 | test(fix_start('donut'), 'donut') 101 | 102 | print('\n') 103 | print('mix_up') 104 | test(mix_up('mix', 'pod'), 'pox mid') 105 | test(mix_up('dog', 'dinner'), 'dig donner') 106 | test(mix_up('gnash', 'sport'), 'spash gnort') 107 | test(mix_up('pezzy', 'firm'), 'fizzy perm') 108 | 109 | 110 | # Standard boilerplate to call the main() function. 111 | if __name__ == '__main__': 112 | main() 113 | -------------------------------------------------------------------------------- /code/python/introduction/ejercicios/wordcount.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # Copyright 2010 Google Inc. 3 | # Licensed under the Apache License, Version 2.0 4 | # http://www.apache.org/licenses/LICENSE-2.0 5 | 6 | # Google's Python Class 7 | # http://code.google.com/edu/languages/google-python-class/ 8 | 9 | """Wordcount exercise 10 | Google's Python class 11 | 12 | The main() below is already defined and complete. It calls print_words() 13 | and print_top() functions which you write. 14 | 15 | 1. For the --count flag, implement a print_words(filename) function that counts 16 | how often each word appears in the text and prints: 17 | word1 count1 18 | word2 count2 19 | ... 20 | 21 | Print the above list in order sorted by word (python will sort punctuation to 22 | come before letters -- that's fine). Store all the words as lowercase, 23 | so 'The' and 'the' count as the same word. 24 | 25 | 2. For the --topcount flag, implement a print_top(filename) which is similar 26 | to print_words() but which prints just the top 20 most common words sorted 27 | so the most common word is first, then the next most common, and so on. 28 | 29 | Use str.split() (no arguments) to split on all whitespace. 30 | 31 | Workflow: don't build the whole program at once. 
Get it to an intermediate 32 | milestone and print your data structure and sys.exit(0). 33 | When that's working, try for the next milestone. 34 | 35 | Optional: define a helper function to avoid code duplication inside 36 | print_words() and print_top(). 37 | 38 | """ 39 | 40 | import sys 41 | 42 | # +++your code here+++ 43 | # Define print_words(filename) and print_top(filename) functions. 44 | # You could write a helper utility function that reads a file 45 | # and builds and returns a word/count dict for it. 46 | # Then print_words() and print_top() can just call the utility function. 47 | 48 | ### 49 | 50 | # This basic command line argument parsing code is provided and 51 | # calls the print_words() and print_top() functions which you must define. 52 | def main(): 53 | if len(sys.argv) != 3: 54 | print('usage: ./wordcount.py {--count | --topcount} file') 55 | sys.exit(1) 56 | 57 | option = sys.argv[1] 58 | filename = sys.argv[2] 59 | if option == '--count': 60 | print_words(filename) 61 | elif option == '--topcount': 62 | print_top(filename) 63 | else: 64 | print(f'unknown option: {option}') 65 | sys.exit(1) 66 | 67 | if __name__ == '__main__': 68 | main() 69 | -------------------------------------------------------------------------------- /code/python/introduction/hello.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | """A tiny Python program to check that Python is working. 4 | Try running this program from the command line like this: 5 | python hello.py 6 | python hello.py Alice 7 | That should print: 8 | Hello World -or- Hello Alice 9 | Try changing the 'Hello' to 'Howdy' and run again. 10 | Once you have that working, you're ready for class -- you can edit 11 | and run Python code; now you just need to learn Python! 12 | """ 13 | 14 | import sys 15 | 16 | # Define a main() function that prints a little greeting. 17 | def main(): 18 | # Get the name from the command line, using 'World' as a fallback. 19 | if len(sys.argv) >= 2: 20 | name = sys.argv[1] 21 | else: 22 | name = 'World' 23 | print(f'Hello {name}') 24 | 25 | # This is the standard boilerplate that calls the main() function. 
26 | if __name__ == '__main__': 27 | main() 28 | -------------------------------------------------------------------------------- /code/python/introduction/introduccion_a_python.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/code/python/introduction/introduccion_a_python.pdf -------------------------------------------------------------------------------- /code/python/us-stock-analysis/.gitignore: -------------------------------------------------------------------------------- 1 | .cache 2 | __pycache__ 3 | build/ 4 | dist/ 5 | *.egg-info/ 6 | .idea/ 7 | .vscode/ 8 | venv 9 | -------------------------------------------------------------------------------- /code/python/us-stock-analysis/README.md: -------------------------------------------------------------------------------- 1 | # ETL: US stocks analysis (BATCH) 2 | 3 | ## How to run our app 4 | 5 | ```bash 6 | # Start an interactive shell in the Spark master container 7 | docker exec -it master bash 8 | 9 | # Go to the base folder of the code 10 | cd /app/python/us-stock-analysis 11 | 12 | # Submit the job for execution 13 | spark-submit \ 14 | --master 'spark://master:7077' \ 15 | --jars /app/postgresql-42.1.4.jar \ 16 | src/batch/etl_steps.py \ 17 | /dataset/stocks-small \ 18 | /dataset/yahoo-symbols-201709.csv \ 19 | /dataset/output.parquet 20 | 21 | # Console 22 | pyspark \ 23 | --master 'spark://master:7077' \ 24 | --jars /app/postgresql-42.1.4.jar 25 | ``` 26 | 27 | ## More examples 28 | 29 | ```bash 30 | spark-submit \ 31 | --master 'spark://master:7077' \ 32 | src/examples/first_example.py 33 | 34 | spark-submit \ 35 | --master 'spark://master:7077' \ 36 | --jars /app/postgresql-42.1.4.jar \ 37 | src/examples/postgres_example.py 38 | ``` 39 | # Create a Project using `venv` 40 | 41 | ```bash 42 | mkdir project1 43 | cd project1 44 | 45 | # Create virtualenv 46 | python3 -m venv venv 47 | source venv/bin/activate 48 | 49 | # Upgrade pip & Install deps 50 | pip install --upgrade pip 51 | pip install -r requirements.txt 52 | 53 | charm . 54 | ``` 55 | 56 | # ETL: US stocks analysis (STREAMING) 57 | 58 | ### Start the fake generator 59 | ```bash 60 | docker exec -it worker1 bash 61 | 62 | cd /app/python/us-stock-analysis/ 63 | 64 | # generate stream data 65 | python src/stream/fake_stock_price_generator.py kafka:9092 stocks 2017-11-11T10:00:00Z 66 | ``` 67 | 68 | ### Process using the Spark Structured Streaming API 69 | [Structured Streaming + Kafka Integration Guide](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html#deploying) 70 | 71 | Open another tab and log back into the server where the containers are running.
72 | Then, to run the Spark application, connect to a worker, go to the directory with the code and run `spark-submit` as follows: 73 | 74 | ```bash 75 | docker exec -it worker1 bash 76 | 77 | cd /app/python/us-stock-analysis/ 78 | 79 | spark-submit \ 80 | --master 'spark://master:7077' \ 81 | --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5 \ 82 | --jars /app/postgresql-42.1.4.jar \ 83 | src/stream/etl_stream.py \ 84 | kafka:9092 stocks 85 | ``` 86 | 87 | (To stop the command, press `Ctrl + c`) 88 | 89 | ### Writing to Postgres 90 | 91 | In a new terminal tab, open the Postgres command line with: 92 | 93 | ```bash 94 | ./control-env.sh psql 95 | ``` 96 | 97 | Create the tables we will use for the exercise with the following commands (copy each whole command, paste it and press enter, one at a time): 98 | 99 | ```sql 100 | CREATE TABLE streaming_inserts ( 101 | "timestamp" timestamptz NOT NULL, 102 | symbol varchar(10), 103 | price real 104 | ); 105 | ``` 106 | 107 | ```sql 108 | CREATE TABLE streaming_inserts_avg_price ( 109 | "window" varchar(128), 110 | symbol varchar(10), 111 | avg_price real 112 | ); 113 | ``` 114 | 115 | ```sql 116 | CREATE TABLE streaming_inserts_avg_price_final ( 117 | window_start timestamp, 118 | window_end timestamp, 119 | symbol varchar(10), 120 | avg_price real 121 | ); 122 | ``` 123 | 124 | Make sure that all lines from 59 to 114 of `etl_stream.py` are commented out. 125 | 126 | Uncomment the first Postgres insert job in the following lines: 127 | ```python 128 | # Simple insert 129 | query = stream_to_postgres(stocks) 130 | query.awaitTermination() 131 | ``` 132 | 133 | Make sure the data generator is running and launch the streaming job with the same command as before. 134 | 135 | Comment out the lines of the first job (`stream_to_postgres`) and uncomment those of the `stream_aggregation_to_postgres` job. 136 | Review the code of the new function and compare it with the previous one. What differences do you observe? 137 | After running it, inspect the rows inserted into the table with `psql`. What do you notice about the start dates of the windows? 138 | 139 | Finally, comment out the `stream_aggregation_to_postgres` job and uncomment `stream_aggregation_to_postgres_final`. 140 | Add a visualization in Superset so you can watch the rows being inserted into this new table. 141 | 142 | Once the previous steps are complete, try some of the following modifications: 143 | 144 | 1. Add logic to the final job so that, besides avg_price, it also computes the max price of each window (a minimal sketch is included at the end of this README). 145 | 2. Add new visualizations to the Superset dashboard and make it refresh every 10 seconds. 146 | 3. Add the code needed to the batch ETL so that it also stores the volume of each stock (see the sketch after `src/examples/postgres_example.py` below). 147 | 4. Add logic to `fake_stock_price_generator.py` to artificially generate a volume for each stock in addition to the price, and modify the streaming jobs to process this new field.
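A minimal sketch for modification 1 (not part of the original workshop code; it assumes the `summarize_stocks` helper in `src/stream/etl_stream.py` and an extra `max_price real` column added to the target table):

```python
# Hypothetical variant of summarize_stocks(): also compute the max price per window.
from pyspark.sql import functions as F

def summarize_stocks_with_max(stocks):
    return (
        stocks
        .withWatermark("timestamp", "60 seconds")
        .groupBy(F.window("timestamp", "30 seconds"), stocks.symbol)
        .agg(
            F.avg("price").alias("avg_price"),
            F.max("price").alias("max_price"),  # new aggregate for modification 1
        )
    )
```

The rest of the job (window-to-timestamp conversion and the `foreachBatch` write) can stay as in `stream_aggregation_to_postgres_final`, as long as the destination table has the matching column.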
148 | -------------------------------------------------------------------------------- /code/python/us-stock-analysis/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==2.4.5 2 | 3 | # Deps for fake_stock_price_gen 4 | kafka-python==2.0.1 5 | -------------------------------------------------------------------------------- /code/python/us-stock-analysis/src/batch/etl_steps.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from pyspark.sql import SparkSession 4 | 5 | # UDF 6 | from pyspark.sql.types import StringType 7 | 8 | # https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.functions 9 | from pyspark.sql import functions as F 10 | from pyspark.sql.window import Window 11 | 12 | # Initialization 13 | args = sys.argv 14 | 15 | if len(args) != 4: 16 | print(f""" 17 | |Usage: {args[0]} 18 | | folder where stocks data are located 19 | | file containing lookup information 20 | | folder to write parquet files 21 | | 22 | | {args[0]} /dataset/stocks-small /dataset/yahoo-symbols-201709.csv /dataset/output.parquet 23 | """) 24 | sys.exit(1) 25 | 26 | _, stocks_dir, lookup_file, output_dir = args 27 | 28 | spark = SparkSession \ 29 | .builder \ 30 | .appName("Stocks:ETL") \ 31 | .getOrCreate() 32 | 33 | 34 | # 35 | def csv_stocks_df(stocks_folder): 36 | # Create a function and define it as a UDF 37 | # UDF 38 | def extract_symbol_from(filename): 39 | return filename.split('/')[-1].split('.')[0].upper() 40 | 41 | extract_symbol = F.udf(lambda filename: extract_symbol_from(filename), StringType()) 42 | 43 | df = spark.read \ 44 | .option("header", True) \ 45 | .option("inferSchema", True) \ 46 | .csv(stocks_folder) \ 47 | .withColumn("name", extract_symbol(F.input_file_name())) \ 48 | .withColumnRenamed("Date", "dateTime") \ 49 | .withColumnRenamed("Open", "open") \ 50 | .withColumnRenamed("High", "high") \ 51 | .withColumnRenamed("Low", "low") \ 52 | .withColumnRenamed("Close", "close") \ 53 | .drop("Volume", "OpenInt") 54 | 55 | return df 56 | 57 | 58 | # Load lookup CSV and convert into DataFrame 59 | def load_lookup_data(filename): 60 | # df.filter("Country = \"USA\""). \ 61 | # df.filter("Country" === "USA"). 62 | df = spark.read. \ 63 | option("header", True). \ 64 | option("inferSchema", True). \ 65 | csv(filename). \ 66 | select("Ticker", "Category Name"). \ 67 | withColumnRenamed("Ticker", "symbol"). 
\ 68 | withColumnRenamed("Category Name", "category") 69 | 70 | return df 71 | 72 | 73 | df_stocks = csv_stocks_df(stocks_dir) 74 | print("Sample of df_stocks data:") 75 | df_stocks.show(3) 76 | 77 | symbols_lookup = load_lookup_data(lookup_file) 78 | print("Sample of symbols_lookup data:") 79 | symbols_lookup.show(3) 80 | 81 | joined_df = df_stocks \ 82 | .withColumnRenamed('dateTime', "full_date") \ 83 | .filter("full_date >= \"2017-09-01\"") \ 84 | .withColumn("year", F.year("full_date")) \ 85 | .withColumn("month", F.month("full_date")) \ 86 | .withColumn("day", F.dayofmonth("full_date")) \ 87 | .withColumnRenamed("name", "symbol") \ 88 | .join(symbols_lookup, ["symbol"]) 89 | 90 | print("Sample of joined_df data:") 91 | joined_df.show() 92 | 93 | # Calculate Moving Average 94 | # https://stackoverflow.com/questions/45806194/pyspark-rolling-average-using-timeseries-data 95 | 96 | window20 = (Window.partitionBy(F.col('symbol')).orderBy(F.col("full_date")).rowsBetween(-20, 0)) 97 | window50 = (Window.partitionBy(F.col('symbol')).orderBy(F.col("full_date")).rowsBetween(-50, 0)) 98 | window100 = (Window.partitionBy(F.col('symbol')).orderBy(F.col("full_date")).rowsBetween(-100, 0)) 99 | 100 | # // Calculate the moving average 101 | stocks_moving_avg_df = joined_df \ 102 | .withColumn("ma20", F.avg("close").over(window20)) \ 103 | .withColumn("ma50", F.avg("close").over(window50)) \ 104 | .withColumn("ma100", F.avg("close").over(window100)) 105 | 106 | print("Sample of stocks_moving_avg_df data:") 107 | stocks_moving_avg_df.show() 108 | 109 | # Write to Parquet 110 | stocks_moving_avg_df \ 111 | .write \ 112 | .mode('overwrite') \ 113 | .partitionBy("year", "month", "day") \ 114 | .parquet(output_dir) 115 | 116 | # Write to Postgres 117 | stocks_moving_avg_df \ 118 | .drop("year", "month", "day") \ 119 | .write \ 120 | .format("jdbc") \ 121 | .option("url", "jdbc:postgresql://postgres/workshop") \ 122 | .option("dbtable", "workshop.stocks") \ 123 | .option("user", "workshop") \ 124 | .option("password", "w0rkzh0p") \ 125 | .option("driver", "org.postgresql.Driver") \ 126 | .mode('append') \ 127 | .save() 128 | 129 | print("All done") 130 | -------------------------------------------------------------------------------- /code/python/us-stock-analysis/src/examples/first_example.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | spark = SparkSession \ 4 | .builder \ 5 | .appName("first_example") \ 6 | .getOrCreate() 7 | 8 | df = spark.read.csv("/dataset/yahoo-symbols-201709.csv") 9 | 10 | df.show() 11 | 12 | spark.stop 13 | -------------------------------------------------------------------------------- /code/python/us-stock-analysis/src/examples/postgres_example.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | spark = SparkSession \ 4 | .builder \ 5 | .appName("first_example") \ 6 | .getOrCreate() 7 | 8 | df = spark.read \ 9 | .format("jdbc") \ 10 | .option("url", "jdbc:postgresql://postgres/workshop") \ 11 | .option("dbtable", "workshop.stocks") \ 12 | .option("user", "workshop") \ 13 | .option("password", "w0rkzh0p") \ 14 | .option("driver", "org.postgresql.Driver") \ 15 | .load() 16 | 17 | df.printSchema() 18 | 19 | elems_count = df.count() 20 | 21 | print(f'Count: {elems_count}\n\n') 22 | 23 | df.show() 24 | -------------------------------------------------------------------------------- 
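Relating to modification 3 in the streaming README above (keeping the volume of each stock through the batch ETL), here is a hedged sketch of how `csv_stocks_df()` in `src/batch/etl_steps.py` could retain the column. The function name is hypothetical, and it assumes the surrounding script's context (`spark` session, `F` and `StringType` imports) plus a `volume` column added to the downstream table:

```python
# Hypothetical variant of csv_stocks_df(): keep Volume (renamed to "volume") instead of dropping it.
def csv_stocks_df_with_volume(stocks_folder):
    extract_symbol = F.udf(lambda f: f.split('/')[-1].split('.')[0].upper(), StringType())
    return (
        spark.read
        .option("header", True)
        .option("inferSchema", True)
        .csv(stocks_folder)
        .withColumn("name", extract_symbol(F.input_file_name()))
        .withColumnRenamed("Date", "dateTime")
        .withColumnRenamed("Open", "open")
        .withColumnRenamed("High", "high")
        .withColumnRenamed("Low", "low")
        .withColumnRenamed("Close", "close")
        .withColumnRenamed("Volume", "volume")  # keep the volume column
        .drop("OpenInt")
    )
```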
/code/python/us-stock-analysis/src/stream/etl_stream.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from time import sleep 4 | 5 | from pyspark.sql import SparkSession 6 | from pyspark.sql.functions import from_json, year, month, dayofmonth, hour, minute 7 | from pyspark.sql import functions as F # col doesn't import correctly 8 | from pyspark.sql.types import TimestampType, StringType, StructType, StructField, DoubleType 9 | 10 | 11 | def validate_params(args): 12 | if len(args) != 3: 13 | print(f""" 14 | |Usage: {args[0]} 15 | | is a list of one or more Kafka brokers 16 | | is a a kafka topic to consume from 17 | | 18 | | {args[0]} kafka:9092 stocks 19 | """) 20 | sys.exit(1) 21 | pass 22 | 23 | 24 | def create_spark_session(): 25 | return SparkSession \ 26 | .builder \ 27 | .appName("Stocks:Stream:ETL") \ 28 | .getOrCreate() 29 | 30 | 31 | def start_stream(args): 32 | validate_params(args) 33 | _, brokers, topic = args 34 | 35 | spark = create_spark_session() 36 | 37 | json = spark \ 38 | .readStream \ 39 | .format("kafka") \ 40 | .option("kafka.bootstrap.servers", brokers) \ 41 | .option("subscribe", topic) \ 42 | .load() 43 | 44 | json.printSchema() 45 | 46 | # Explicitly set schema 47 | schema = StructType([StructField("symbol", StringType(), False), 48 | StructField("timestamp", TimestampType(), False), 49 | StructField("price", DoubleType(), False)]) 50 | 51 | json_options = {"timestampFormat": "yyyy-MM-dd'T'HH:mm'Z'"} 52 | stocks_json = json \ 53 | .select(from_json(F.col("value").cast("string"), schema, json_options).alias("content")) 54 | 55 | stocks_json.printSchema 56 | 57 | stocks = stocks_json.select("content.*") 58 | 59 | #################################### 60 | # Stream to Parquet 61 | #################################### 62 | query = stocks \ 63 | .withColumn('year', year(F.col('timestamp'))) \ 64 | .withColumn('month', month(F.col('timestamp'))) \ 65 | .withColumn('day', dayofmonth(F.col('timestamp'))) \ 66 | .withColumn('hour', hour(F.col('timestamp'))) \ 67 | .withColumn('minute', minute(F.col('timestamp'))) \ 68 | .writeStream \ 69 | .format('parquet') \ 70 | .partitionBy('year', 'month', 'day', 'hour', 'minute') \ 71 | .option('startingOffsets', 'earliest') \ 72 | .option('checkpointLocation', '/dataset/checkpoint') \ 73 | .option('path', '/dataset/streaming.parquet') \ 74 | .trigger(processingTime='30 seconds') \ 75 | .start() 76 | 77 | query.awaitTermination() 78 | 79 | 80 | # avg_pricing = stocks \ 81 | # .groupBy(F.col("symbol")) \ 82 | # .agg(F.avg(F.col("price")).alias("avg_price")) 83 | 84 | #################################### 85 | # Console Output 86 | #################################### 87 | # query2 = avg_pricing.writeStream \ 88 | # .outputMode('complete') \ 89 | # .format("console") \ 90 | # .trigger(processingTime="10 seconds") \ 91 | # .start() 92 | 93 | # query2.awaitTermination() 94 | 95 | #################################### 96 | # Table in Memory 97 | #################################### 98 | # query3 = avg_pricing \ 99 | # .writeStream \ 100 | # .queryName("avgPricing") \ 101 | # .outputMode("complete") \ 102 | # .format("memory") \ 103 | # .trigger(processingTime="10 seconds") \ 104 | # .start() 105 | # 106 | # while True: 107 | # print('\n' + '_' * 30) 108 | # # interactively query in-memory table 109 | # spark.sql('SELECT * FROM avgPricing').show() 110 | # print(query3.lastProgress) 111 | # sleep(10) 112 | 113 | # query3.awaitTermination() 114 | 115 | 
#################################### 116 | # Writing to Postgres 117 | #################################### 118 | 119 | # Simple insert 120 | # query = stream_to_postgres(stocks) 121 | # query.awaitTermination() 122 | 123 | # Average Price Aggregation 124 | # query = stream_aggregation_to_postgres(stocks) 125 | # query.awaitTermination() 126 | 127 | # Final Average Price Aggregation with Timestamp columns 128 | # query = stream_aggregation_to_postgres_final(stocks) 129 | # query.awaitTermination() 130 | 131 | pass 132 | 133 | 134 | def define_write_to_postgres(table_name): 135 | 136 | def write_to_postgres(df, epochId): 137 | return ( 138 | df.write 139 | .format("jdbc") 140 | .option("url", "jdbc:postgresql://postgres/workshop") 141 | .option("dbtable", f"workshop.{table_name}") 142 | .option("user", "workshop") 143 | .option("password", "w0rkzh0p") 144 | .option("driver", "org.postgresql.Driver") 145 | .mode('append') 146 | .save() 147 | ) 148 | return write_to_postgres 149 | 150 | 151 | def stream_to_postgres(stocks, output_table="streaming_inserts"): 152 | wstocks = ( 153 | stocks 154 | .withWatermark("timestamp", "60 seconds") 155 | .select("timestamp", "symbol", "price") 156 | ) 157 | 158 | write_to_postgres_fn = define_write_to_postgres("streaming_inserts") 159 | 160 | query = ( 161 | wstocks.writeStream 162 | .foreachBatch(write_to_postgres_fn) 163 | .outputMode("append") 164 | .trigger(processingTime="10 seconds") 165 | .start() 166 | ) 167 | 168 | return query 169 | 170 | 171 | def summarize_stocks(stocks): 172 | avg_pricing = ( 173 | stocks 174 | .withWatermark("timestamp", "60 seconds") 175 | .groupBy( 176 | F.window("timestamp", "30 seconds"), 177 | stocks.symbol) 178 | .agg(F.avg("price").alias('avg_price')) 179 | ) 180 | avg_pricing.printSchema() 181 | return avg_pricing 182 | 183 | 184 | def stream_aggregation_to_postgres(stocks, output_table="streaming_inserts_avg_price"): 185 | 186 | avg_pricing = summarize_stocks(stocks) 187 | 188 | window_to_string = F.udf(lambda w: str(w.start) + ' - ' + str(w.end), StringType()) 189 | 190 | write_to_postgres_fn = define_write_to_postgres(output_table) 191 | 192 | query = ( 193 | avg_pricing\ 194 | .withColumn("window", window_to_string("window")) 195 | .writeStream 196 | .foreachBatch(write_to_postgres_fn) 197 | .outputMode("append") 198 | .trigger(processingTime="10 seconds") 199 | .start() 200 | ) 201 | 202 | return query 203 | 204 | 205 | def stream_aggregation_to_postgres_final(stocks, output_table="streaming_inserts_avg_price_final"): 206 | 207 | avg_pricing = summarize_stocks(stocks) 208 | 209 | window_start_ts_fn = F.udf(lambda w: w.start, TimestampType()) 210 | 211 | window_end_ts_fn = F.udf(lambda w: w.end, TimestampType()) 212 | 213 | write_to_postgres_fn = define_write_to_postgres(output_table) 214 | 215 | query = ( 216 | avg_pricing\ 217 | .withColumn("window_start", window_start_ts_fn("window")) 218 | .withColumn("window_end", window_end_ts_fn("window")) 219 | .drop("window") 220 | .writeStream 221 | .foreachBatch(write_to_postgres_fn) 222 | .outputMode("append") 223 | .trigger(processingTime="10 seconds") 224 | .start() 225 | ) 226 | 227 | return query 228 | 229 | 230 | if __name__ == '__main__': 231 | start_stream(sys.argv) 232 | -------------------------------------------------------------------------------- /code/python/us-stock-analysis/src/stream/fake_stock_price_generator.py: -------------------------------------------------------------------------------- 1 | from random import randrange, random 2 | from datetime 
import datetime, timedelta 3 | from time import sleep 4 | 5 | from kafka import KafkaProducer 6 | import json 7 | import sys 8 | 9 | 10 | class QuoteGenerator: 11 | # Using as SEED for the generator last 90 days of stocks 12 | # price: Max Closing Price 13 | # volatility: StdDev of Closing Pricing 14 | # df.groupBy($"symbol") 15 | # .agg(stddev_pop($"close").as("volatility"), max($"close").as("price")) 16 | # .orderBy($"symbol") 17 | quotes_list = [("AAPL", 175.61, 6.739169981533334), 18 | ("BABA", 188.51, 5.637335242825282), 19 | ("CSCO", 34.62, 0.9673997717593282), 20 | ("DHR", 93.24, 2.949284608917899), 21 | ("EBAY", 38.99, 0.8110024414266584), 22 | ("FB", 182.66, 4.14292553638126), 23 | ("GOOG", 1039.85, 37.960859608812854), 24 | ("GOOGL", 1058.29, 39.11749241707603), 25 | ("IBM", 160.47, 4.8367462989079755), 26 | ("INTC", 46.826, 3.678237311321825), 27 | ("JNJ", 143.62, 4.336597380435497), 28 | ("MELI", 292.05, 19.703519789367583), 29 | ("MSFT", 84.56, 3.7745700470384693), 30 | ("ORCL", 52.593, 1.4026418724678085), 31 | ("QCOM", 65.49, 3.962328548164577), 32 | ("TSLA", 385.0, 21.667055079857995), 33 | ("TXN", 98.54, 5.545761038090265), 34 | ("WDC", 89.9, 1.7196676293981952), 35 | ("XRX", 33.86, 1.4466726098188216)] 36 | 37 | def __init__(self, trading_start_at): 38 | self.trading_start_datetime = trading_start_at 39 | 40 | # a very naive impl of marketing hours 41 | # not consider weekends nor holidays 42 | def __nextMarketTime(self): 43 | # Sometimes it substracts 1 and generates late arriving tickers 44 | tick = randrange(5) - 1 45 | next_time = self.trading_start_datetime + timedelta(minutes=tick) 46 | # Market should be closed, bump to next day 47 | if next_time.hour > 15: 48 | next_time = (next_time + timedelta(days=1)).replace(hour=10, minute=0) 49 | 50 | self.trading_start_datetime = next_time 51 | return next_time 52 | 53 | def __signal(self): 54 | if randrange(2) == 0: 55 | return 1 56 | else: 57 | return -1 58 | 59 | def next_symbol(self): 60 | quote_idx = randrange(len(self.quotes_list) - 1) 61 | quote = self.quotes_list[quote_idx] 62 | 63 | # price = quote.price + (signal * rnd.nextDouble * quote.volatility * 3) 64 | price = quote[1] + (self.__signal() * random() * quote[2] * 3) 65 | 66 | return { 67 | 'symbol': quote[0], 68 | 'timestamp': self.__nextMarketTime().isoformat(), 69 | 'price': float(f'{price:2.3f}') 70 | } 71 | 72 | 73 | if __name__ == '__main__': 74 | # Initialization 75 | args = sys.argv 76 | 77 | if len(args) != 4: 78 | print(f""" 79 | |Usage: {args[0]} 80 | | is a list of one or more Kafka brokers 81 | | one kafka topic to produce to 82 | | [OPTIONAL] iso timestamp from when to start producing data 83 | | 84 | | {args[0]} kafka:9092 stocks 2017-11-11T10:00:00Z 85 | """) 86 | sys.exit(1) 87 | 88 | _, brokers, topic, start_date = args 89 | trading_start_datetime = datetime.strptime(start_date, '%Y-%m-%dT%H:%M:%S%z') 90 | 91 | quote_gen = QuoteGenerator(trading_start_datetime) 92 | 93 | producer = KafkaProducer( 94 | bootstrap_servers=brokers, 95 | value_serializer=lambda v: json.dumps(v).encode('utf-8')) 96 | 97 | while True: 98 | stock_data = quote_gen.next_symbol() 99 | producer.send(topic, stock_data) 100 | print(stock_data) 101 | sleep(.5) 102 | -------------------------------------------------------------------------------- /code/scala/credit-risk-analysis/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | .DS_Store 3 | derby.log 4 | metastore_db 5 | spark-warehouse 6 | data/credit.model 7 | .idea 
-------------------------------------------------------------------------------- /code/scala/credit-risk-analysis/README.md: -------------------------------------------------------------------------------- 1 | # Credit Risk Analysis 2 | ## Spark Machine Learning (Random Forest) 3 | 4 | 5 | 6 | ```bash 7 | sbt clean assembly 8 | 9 | spark-submit \ 10 | --class es.arjon.CreditRiskTrain \ 11 | --master 'spark://master:7077' \ 12 | target/scala-2.11/credit-risk-analysis-assembly-0.1.jar \ 13 | /dataset/credit-risk/germancredit.csv \ 14 | /dataset/credit-risk.model 15 | 16 | 17 | 18 | spark-submit \ 19 | --class es.arjon.CreditRiskAnalysis \ 20 | --master 'spark://master:7077' \ 21 | target/scala-2.11/credit-risk-analysis-assembly-0.1.jar \ 22 | /dataset/credit-risk/germancredit-user-input.csv \ 23 | /dataset/credit-risk.model 24 | ``` 25 | 26 | # Acknowledge 27 | The original author of this tutorial is **Carol McDonald ** for the MapR article: [Predicting Loan Credit Risk using Apache Spark Machine Learning Random Forests](https://mapr.com/blog/predicting-loan-credit-risk-using-apache-spark-machine-learning-random-forests/), 28 | I updated the API version (Spark 2.4.4) and made changes on the code to clarify/reduce duplication. 29 | -------------------------------------------------------------------------------- /code/scala/credit-risk-analysis/build.sbt: -------------------------------------------------------------------------------- 1 | import sbt.Keys._ 2 | 3 | name := "credit-risk-analysis" 4 | 5 | version := "0.1" 6 | 7 | scalaVersion := "2.11.12" 8 | 9 | scalacOptions += "-target:jvm-1.8" 10 | 11 | libraryDependencies ++= Seq( 12 | "org.apache.spark" %% "spark-sql" % "2.4.4" % "provided", 13 | "org.apache.spark" %% "spark-mllib" % "2.4.4" % "provided", 14 | 15 | "com.github.fommil.netlib" % "all" % "1.1.2" pomOnly() 16 | ) 17 | 18 | assemblyMergeStrategy in assembly := { 19 | case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard 20 | case m if m.toLowerCase.matches("meta-inf.*\\.sf$") => MergeStrategy.discard 21 | case "log4j.properties" => MergeStrategy.first 22 | case "reference.conf" => MergeStrategy.concat 23 | case _ => MergeStrategy.first 24 | } 25 | -------------------------------------------------------------------------------- /code/scala/credit-risk-analysis/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6") 2 | -------------------------------------------------------------------------------- /code/scala/credit-risk-analysis/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.16 2 | -------------------------------------------------------------------------------- /code/scala/credit-risk-analysis/src/main/scala/es/arjon/CreditRiskAnalysis.scala: -------------------------------------------------------------------------------- 1 | package es.arjon 2 | 3 | import org.apache.spark.ml.classification.RandomForestClassificationModel 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object CreditRiskAnalysis extends DatasetUtil { 7 | 8 | def main(args: Array[String]): Unit = { 9 | if (args.length < 2) { 10 | System.err.println( 11 | s""" 12 | |Usage: CreditRiskAnalysis 13 | | CSV dataset to PREDICT credit 14 | | path to the model 15 | | 16 | | CreditRiskAnalysis /dataset/credit-risk/germancredit-user-input.csv /dataset/credit-risk.model 17 | """.stripMargin) 18 | System.exit(1) 19 
| } 20 | 21 | // val Array(datasource, modelPath) = Array("/dataset/credit-risk/germancredit-user-input.csv", 22 | // "/dataset/credit-risk.model") 23 | val Array(datasource, modelPath) = args 24 | 25 | // implicit val ss = spark 26 | implicit val spark = SparkSession. 27 | builder. 28 | appName("CreditRisk"). 29 | getOrCreate() 30 | 31 | val df = loadUserInputData(datasource) 32 | val dfVector = vectorizeInput(df) 33 | 34 | val model = RandomForestClassificationModel.load(modelPath) 35 | val predictions = model.transform(dfVector) 36 | 37 | import spark.implicits._ 38 | 39 | println("=" * 30) 40 | println("Prediction are:") 41 | predictions.select($"userId", $"amount", $"prediction").show(false) 42 | } 43 | 44 | 45 | } 46 | -------------------------------------------------------------------------------- /code/scala/credit-risk-analysis/src/main/scala/es/arjon/CreditRiskTrain.scala: -------------------------------------------------------------------------------- 1 | package es.arjon 2 | 3 | import es.arjon.CreditRiskAnalysis.vectorizeInput 4 | import org.apache.spark.ml.classification._ 5 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 6 | import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} 7 | import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder} 8 | import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage} 9 | import org.apache.spark.mllib.evaluation.RegressionMetrics 10 | import org.apache.spark.sql._ 11 | import org.apache.spark.sql.functions._ 12 | 13 | // Heavily inspired on 14 | // https://mapr.com/blog/predicting-loan-credit-risk-using-apache-spark-machine-learning-random-forests/ 15 | object CreditRiskTrain extends DatasetUtil { 16 | def main(args: Array[String]) { 17 | if (args.length < 2) { 18 | System.err.println( 19 | s""" 20 | |Usage: CreditRiskTrain 21 | | CSV dataset to learn from 22 | | path to save model to 23 | | 24 | | CreditRiskTrain /dataset/credit-risk/germancredit.csv /dataset/credit-risk.model 25 | """.stripMargin) 26 | System.exit(1) 27 | } 28 | 29 | val Array(datasource, modelPath) = args 30 | 31 | 32 | // When using Spark-Shell: 33 | // implicit val ss = spark 34 | implicit val spark = SparkSession. 35 | builder. 36 | appName("CreditRisk"). 
37 | getOrCreate() 38 | 39 | import spark.implicits._ 40 | 41 | val creditDF = loadTrainData(datasource) 42 | creditDF.printSchema 43 | creditDF.show 44 | 45 | // creditDF.createOrReplaceTempView("credit") 46 | // spark.sql("SELECT creditability, avg(balance) as avg_balance, avg(amount) as avg_amount, 47 | // avg(duration) as avg_duration FROM credit GROUP BY creditability").show 48 | 49 | creditDF.describe("balance").show 50 | creditDF.groupBy("creditability").agg(avg('balance), avg('amount), avg('duration)).show 51 | 52 | val dfVector = vectorizeInput(creditDF) 53 | 54 | // Convert Strings into Label Identifiers (Double) 55 | val labelIndexer = new StringIndexer().setInputCol("creditability").setOutputCol("label") 56 | 57 | // Add Label Identifiers field to the DF 58 | val dfLabeled = labelIndexer.fit(dfVector).transform(dfVector) 59 | 60 | // Manually transforming 61 | // def convertCreditability(v: String) = if (v =="YES") 1.0 else 0.0 62 | // val convertCreditabilityUDF = udf(convertCreditability _) 63 | // val dfLabeled = dfVector.withColumn("label2", convertCreditabilityUDF($"creditability")) 64 | 65 | dfLabeled.select($"features", $"label", $"creditability").show(30, false) 66 | 67 | // remove unused fields 68 | val dfInput = dfLabeled.select($"features", $"label") 69 | 70 | 71 | val splitSeed = 5043 72 | val Array(trainingDataUncached, testData) = dfInput.randomSplit(Array(0.7, 0.3), splitSeed) 73 | 74 | // Try to run with & without cache() 75 | // val trainingData = trainingDataUncached.cache() 76 | val trainingData = trainingDataUncached 77 | 78 | val classifier = new RandomForestClassifier(). 79 | setImpurity("gini"). 80 | setMaxDepth(3). 81 | setNumTrees(20). 82 | setFeatureSubsetStrategy("auto"). 83 | setSeed(5043) 84 | 85 | val model = classifier.fit(trainingData) 86 | println(model.toDebugString) 87 | 88 | println("=" * 30) 89 | println("Before pipeline fitting\n") 90 | val predictions = model.transform(testData) 91 | 92 | val evaluator = new BinaryClassificationEvaluator().setLabelCol("label") 93 | val accuracy = evaluator.evaluate(predictions) 94 | println(f"Accuracy: $accuracy%2.3f") 95 | printPredictionMetrics(predictions) 96 | 97 | // Save the model to latter use 98 | model.write.overwrite().save(modelPath) 99 | 100 | // Let's try to do better 101 | val paramGrid = new ParamGridBuilder(). 102 | addGrid(classifier.maxBins, Array(20, 40)). 103 | addGrid(classifier.maxDepth, Array(2, 10)). 104 | addGrid(classifier.numTrees, Array(10, 60)). 105 | addGrid(classifier.impurity, Array("entropy", "gini")). 106 | build() 107 | 108 | val steps: Array[PipelineStage] = Array(classifier) 109 | val pipeline = new Pipeline().setStages(steps) 110 | 111 | val cv = new CrossValidator(). 112 | setEstimator(pipeline). 113 | setEvaluator(evaluator). 114 | setEstimatorParamMaps(paramGrid). 115 | setNumFolds(10) 116 | 117 | val pipelineFittedModel = cv.fit(trainingData) 118 | 119 | val predictions2 = pipelineFittedModel.transform(testData) 120 | val accuracy2 = evaluator.evaluate(predictions2) 121 | println("=" * 30) 122 | println("AFTER pipeline fitting\n") 123 | println(f"Accuracy: $accuracy2%2.3f") 124 | 125 | val bestModel = pipelineFittedModel.bestModel.asInstanceOf[PipelineModel].stages(0) 126 | val params = bestModel.extractParamMap 127 | 128 | println( 129 | s""" 130 | |The best model found was: 131 | |${bestModel} 132 | | 133 | |Using params: 134 | |${params} 135 | | 136 | """.stripMargin) 137 | 138 | printPredictionMetrics(predictions2) 139 | 140 | // Not saving the final model... 
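// A hedged sketch (not in the original workshop code): the tuned pipeline could also be persisted,
// e.g. under a hypothetical path next to the first model, and reloaded later with PipelineModel.load:
// pipelineFittedModel.bestModel.asInstanceOf[PipelineModel].write.overwrite().save(modelPath + ".tuned")
// val reloaded = PipelineModel.load(modelPath + ".tuned")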
141 | // 142 | } 143 | 144 | def printPredictionMetrics(predictions: DataFrame)(implicit spark: SparkSession) { 145 | // Extract PREDICTED and CORRECT (label) values 146 | import spark.implicits._ 147 | val predictionAndObservations = predictions.select('prediction, 'label) 148 | val rdd = predictionAndObservations.rdd.map(r => (r.getDouble(0), r.getDouble(1))) 149 | 150 | // Calculate the Quality Metrics 151 | val rm = new RegressionMetrics(rdd) 152 | val msg = 153 | s""" 154 | |MSE: ${rm.meanSquaredError} 155 | |MAE: ${rm.meanAbsoluteError} 156 | |RMSE Squared: ${rm.rootMeanSquaredError} 157 | |R Squared: ${rm.r2} 158 | |Exp. Variance: ${rm.explainedVariance} 159 | | 160 | """.stripMargin 161 | 162 | println(msg) 163 | } 164 | } 165 | 166 | -------------------------------------------------------------------------------- /code/scala/credit-risk-analysis/src/main/scala/es/arjon/DatasetUtil.scala: -------------------------------------------------------------------------------- 1 | package es.arjon 2 | 3 | import org.apache.spark.ml.feature.VectorAssembler 4 | import org.apache.spark.sql.{DataFrame, SparkSession} 5 | 6 | trait DatasetUtil { 7 | 8 | // when using console add this 9 | // implicit val ss = spark 10 | def loadTrainData(csv: String)(implicit spark: SparkSession) = { 11 | import org.apache.spark.sql.types._ 12 | 13 | val schema = StructType(Seq( 14 | StructField("creditability", StringType, nullable = false), 15 | StructField("balance", DoubleType, nullable = false), 16 | StructField("duration", DoubleType, nullable = false), 17 | StructField("history", DoubleType, nullable = false), 18 | StructField("purpose", DoubleType, nullable = false), 19 | StructField("amount", DoubleType, nullable = false), 20 | StructField("savings", DoubleType, nullable = false), 21 | StructField("employment", DoubleType, nullable = false), 22 | StructField("instPercent", DoubleType, nullable = false), 23 | StructField("sexMarried", DoubleType, nullable = false), 24 | StructField("guarantors", DoubleType, nullable = false), 25 | StructField("residenceDuration", DoubleType, nullable = false), 26 | StructField("assets", DoubleType, nullable = false), 27 | StructField("age", DoubleType, nullable = false), 28 | StructField("concCredit", DoubleType, nullable = false), 29 | StructField("apartment", DoubleType, nullable = false), 30 | StructField("credits", DoubleType, nullable = false), 31 | StructField("occupation", DoubleType, nullable = false), 32 | StructField("dependents", DoubleType, nullable = false), 33 | StructField("hasPhone", DoubleType, nullable = false), 34 | StructField("foreign", DoubleType, nullable = false) 35 | )) 36 | 37 | spark.read. 38 | option("header", false). 39 | schema(schema). 
40 | csv(csv) 41 | } 42 | 43 | def loadUserInputData(csv: String)(implicit spark: SparkSession) = { 44 | import org.apache.spark.sql.types._ 45 | val schema = StructType(Seq( 46 | StructField("userId", StringType, nullable = false), // USER ID to identify the PREDICTED ANSWER 47 | StructField("balance", DoubleType, nullable = false), 48 | StructField("duration", DoubleType, nullable = false), 49 | StructField("history", DoubleType, nullable = false), 50 | StructField("purpose", DoubleType, nullable = false), 51 | StructField("amount", DoubleType, nullable = false), 52 | StructField("savings", DoubleType, nullable = false), 53 | StructField("employment", DoubleType, nullable = false), 54 | StructField("instPercent", DoubleType, nullable = false), 55 | StructField("sexMarried", DoubleType, nullable = false), 56 | StructField("guarantors", DoubleType, nullable = false), 57 | StructField("residenceDuration", DoubleType, nullable = false), 58 | StructField("assets", DoubleType, nullable = false), 59 | StructField("age", DoubleType, nullable = false), 60 | StructField("concCredit", DoubleType, nullable = false), 61 | StructField("apartment", DoubleType, nullable = false), 62 | StructField("credits", DoubleType, nullable = false), 63 | StructField("occupation", DoubleType, nullable = false), 64 | StructField("dependents", DoubleType, nullable = false), 65 | StructField("hasPhone", DoubleType, nullable = false), 66 | StructField("foreign", DoubleType, nullable = false) 67 | )) 68 | 69 | spark.read. 70 | option("header", false). 71 | schema(schema). 72 | csv(csv) 73 | } 74 | 75 | def vectorizeInput(df: DataFrame)(implicit spark: SparkSession): DataFrame = { 76 | import spark.implicits._ 77 | 78 | val featureCols = Array("balance", "duration", "history", "purpose", "amount", 79 | "savings", "employment", "instPercent", "sexMarried", "guarantors", 80 | "residenceDuration", "assets", "age", "concCredit", "apartment", 81 | "credits", "occupation", "dependents", "hasPhone", "foreign") 82 | 83 | val assembler = new VectorAssembler().setInputCols(featureCols).setOutputCol("features") 84 | val out = assembler.transform(df) 85 | out.select('features).show(truncate = false) 86 | 87 | out 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /code/scala/us-stock-analysis/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | .DS_Store 3 | spark-warehouse 4 | .idea 5 | dataset/output.parquet/ 6 | derby.log 7 | metastore_db 8 | -------------------------------------------------------------------------------- /code/scala/us-stock-analysis/README.md: -------------------------------------------------------------------------------- 1 | # ETL: US stocks analysis 2 | 3 | 4 | 5 | ### Create a jar containing your application and its deps 6 | ```bash 7 | $ sbt clean assembly 8 | ``` 9 | 10 | ### Use spark-submit to run your application 11 | 12 | ```bash 13 | $ spark-submit \ 14 | --class "es.arjon.FromCsvToParquet" \ 15 | --master 'local[*]' \ 16 | target/scala-2.11/us-stock-analysis-assembly-0.1.jar 17 | ``` 18 | 19 | ```bash 20 | $ spark-submit \ 21 | --class "es.arjon.RunAll" \ 22 | --master 'spark://master:7077' \ 23 | --driver-class-path /app/postgresql-42.1.4.jar \ 24 | target/scala-2.11/us-stock-analysis-assembly-0.1.jar 25 | ``` 26 | -------------------------------------------------------------------------------- /code/scala/us-stock-analysis/build.sbt: 
-------------------------------------------------------------------------------- 1 | import sbt.Keys._ 2 | 3 | name := "us-stock-analysis" 4 | 5 | version := "0.1" 6 | 7 | scalaVersion := "2.11.12" 8 | 9 | scalacOptions += "-target:jvm-1.8" 10 | 11 | libraryDependencies ++= Seq( 12 | "org.apache.spark" %% "spark-sql" % "2.4.4" % "provided", 13 | "org.postgresql" % "postgresql" % "42.1.1", 14 | 15 | "org.apache.spark" %% "spark-streaming" % "2.4.4" % "provided", 16 | "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.4.4", 17 | "org.apache.spark" %% "spark-sql-kafka-0-10" % "2.4.4" 18 | ) 19 | 20 | assemblyMergeStrategy in assembly := { 21 | case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard 22 | case m if m.toLowerCase.matches("meta-inf.*\\.sf$") => MergeStrategy.discard 23 | case "log4j.properties" => MergeStrategy.first 24 | case "reference.conf" => MergeStrategy.concat 25 | case _ => MergeStrategy.first 26 | } -------------------------------------------------------------------------------- /code/scala/us-stock-analysis/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6") 2 | -------------------------------------------------------------------------------- /code/scala/us-stock-analysis/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.17 2 | -------------------------------------------------------------------------------- /code/scala/us-stock-analysis/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Define the root logger with appender file 2 | log4j.rootCategory=ERROR, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.eclipse.jetty=WARN 10 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR 11 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 12 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 13 | -------------------------------------------------------------------------------- /code/scala/us-stock-analysis/src/main/scala/es/arjon/EtlSteps.scala: -------------------------------------------------------------------------------- 1 | package es.arjon 2 | 3 | import org.apache.spark.sql.expressions.Window 4 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} 5 | 6 | case class Stock(name: String, 7 | dateTime: String, 8 | open: Double, 9 | high: Double, 10 | low: Double, 11 | close: Double) 12 | 13 | object Stock { 14 | def fromCSV(symbol: String, line: String): Option[Stock] = { 15 | val v = line.split(",") 16 | 17 | try { 18 | Some( 19 | Stock( 20 | symbol, 21 | dateTime = v(0), 22 | open = v(1).toDouble, 23 | high = v(2).toDouble, 24 | low = v(3).toDouble, 25 | close = v(4).toDouble 26 | ) 27 | ) 28 | 29 | } catch { 30 | case ex: Exception => { 31 | println(s"Failed to process $symbol, with input $line, with ${ex.toString}") 32 | None 33 | } 34 | } 35 | 36 | } 37 | } 38 | 39 | 40 | object RunAll { 41 | def main(args: Array[String]): Unit = { 42 | if (args.length < 3) { 43 | System.err.println( 44 | s""" 45 | |Usage: RunAll 46 | | 
folder where stocks data is located 47 | | file containing lookup information 48 | | folder to write parquet data 49 | | 50 | |RunAll /dataset/stocks-small /dataset/yahoo-symbols-201709.csv /dataset/output.parquet 51 | """.stripMargin) 52 | System.exit(1) 53 | } 54 | 55 | val Array(stocksFolder, lookupSymbol, outputFolder) = args 56 | 57 | 58 | val spark = SparkSession. 59 | builder. 60 | appName("Stocks:ETL"). 61 | getOrCreate() 62 | 63 | val stocksDS = ReadStockCSV.processDS(spark, stocksFolder) 64 | val lookup = ReadSymbolLookup.process(spark, lookupSymbol) 65 | 66 | // For implicit conversions like converting RDDs to DataFrames 67 | import org.apache.spark.sql.functions._ 68 | import spark.implicits._ 69 | 70 | val ds = stocksDS. 71 | withColumn("full_date", unix_timestamp($"dateTime", "yyyy-MM-dd").cast("timestamp")). 72 | filter("full_date >= \"2017-09-01\""). 73 | withColumn("year", year($"full_date")). 74 | withColumn("month", month($"full_date")). 75 | withColumn("day", dayofmonth($"full_date")). 76 | drop($"dateTime"). 77 | withColumnRenamed("name", "symbol"). 78 | join(lookup, Seq("symbol")) 79 | 80 | // https://weishungchung.com/2016/08/21/spark-analyzing-stock-price/ 81 | val movingAverageWindow20 = Window.partitionBy($"symbol").orderBy("full_date").rowsBetween(-20, 0) 82 | val movingAverageWindow50 = Window.partitionBy($"symbol").orderBy("full_date").rowsBetween(-50, 0) 83 | val movingAverageWindow100 = Window.partitionBy($"symbol").orderBy("full_date").rowsBetween(-100, 0) 84 | 85 | // Calculate the moving average 86 | val stocksMA = ds. 87 | withColumn("ma20", avg($"close").over(movingAverageWindow20)). 88 | withColumn("ma50", avg($"close").over(movingAverageWindow50)). 89 | withColumn("ma100", avg($"close").over(movingAverageWindow100)) 90 | 91 | stocksMA.show(100) 92 | 93 | DatasetToParquet.process(spark, stocksMA, outputFolder) 94 | 95 | DatasetToPostgres.process(spark, stocksMA) 96 | 97 | spark.stop() 98 | } 99 | } 100 | 101 | object ReadStockCSV { 102 | 103 | def extractSymbolFromFilename(filename: String) = { 104 | val arr = filename.split("/") 105 | arr(arr.size - 1).split("\\.")(0).toUpperCase 106 | } 107 | 108 | def processDS(spark: SparkSession, originFolder: String) = { 109 | import org.apache.spark.sql.functions._ 110 | import spark.implicits._ 111 | 112 | val symbolFromFilename = udf(extractSymbolFromFilename _) 113 | 114 | spark.read. 115 | option("header", true). 116 | option("inferSchema", true). 117 | csv(originFolder). 118 | withColumn("name", symbolFromFilename(input_file_name())). 119 | withColumnRenamed("Date", "dateTime"). 120 | withColumnRenamed("Open", "open"). 121 | withColumnRenamed("High", "high"). 122 | withColumnRenamed("Low", "low"). 123 | withColumnRenamed("Close", "close"). 124 | drop("Volume", "OpenInt"). 125 | as[Stock] 126 | } 127 | 128 | 129 | def processRDD(spark: SparkSession, originFolder: String) = { 130 | 131 | // Using SparkContext to use RDD 132 | val sc = spark.sparkContext 133 | val files = sc.wholeTextFiles(originFolder, minPartitions = 40) 134 | 135 | val stocks = files.map { case (filename, content) => 136 | val symbol = extractSymbolFromFilename(filename) 137 | 138 | content.split("\n").flatMap { line => 139 | Stock.fromCSV(symbol, line) 140 | } 141 | }. 142 | flatMap(e => e). 143 | cache 144 | 145 | import spark.implicits._ 146 | 147 | stocks.toDS.as[Stock] 148 | } 149 | } 150 | 151 | object ReadSymbolLookup { 152 | def process(spark: SparkSession, file: String) = { 153 | import spark.implicits._ 154 | spark.read. 
155 | option("header", true). 156 | option("inferSchema", true). 157 | csv(file). 158 | // filter("Country = \"USA\""). 159 | // filter($"Country" === "USA"). 160 | select($"Ticker", $"Category Name"). 161 | withColumnRenamed("Ticker", "symbol"). 162 | withColumnRenamed("Category Name", "category") 163 | } 164 | } 165 | 166 | object DatasetToParquet { 167 | def process(spark: SparkSession, df: DataFrame, destinationFolder: String): Unit = { 168 | // https://stackoverflow.com/questions/43731679/how-to-save-a-partitioned-parquet-file-in-spark-2-1 169 | df. 170 | write. 171 | mode("overwrite"). 172 | partitionBy("year", "month", "day"). 173 | parquet(destinationFolder) 174 | } 175 | } 176 | 177 | object DatasetToPostgres { 178 | 179 | def process(spark: SparkSession, df: DataFrame): Unit = { 180 | // Write to Postgres 181 | val connectionProperties = new java.util.Properties 182 | connectionProperties.put("user", "workshop") 183 | connectionProperties.put("password", "w0rkzh0p") 184 | val jdbcUrl = s"jdbc:postgresql://postgres:5432/workshop" 185 | 186 | df. 187 | drop("year", "month", "day"). // drop unused columns 188 | write. 189 | mode(SaveMode.Append). 190 | jdbc(jdbcUrl, "stocks", connectionProperties) 191 | 192 | } 193 | } 194 | 195 | // TODO: Read compressed 196 | // option("codec", "org.apache.hadoop.io.compress.GzipCodec"). 197 | -------------------------------------------------------------------------------- /code/scala/us-stock-analysis/src/main/scala/es/arjon/FakeStockPriceGenerator.scala: -------------------------------------------------------------------------------- 1 | package es.arjon 2 | 3 | import java.time.ZonedDateTime 4 | import java.util.Properties 5 | 6 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 7 | 8 | object FakeStockPriceGenerator extends App { 9 | val rnd = new scala.util.Random(42) 10 | 11 | if (args.length < 2 || args.length > 3) { 12 | System.err.println( 13 | s""" 14 | |Usage: FakeStockPriceGenerator 15 | | is a list of one or more Kafka brokers 16 | | one kafka topic to produce to 17 | | [OPTIONAL] iso timestamp from when to start producing data 18 | | 19 | | FakeStockPriceGenerator kafka:9092 stocks 2017-11-11T10:00:00Z 20 | """.stripMargin) 21 | System.exit(1) 22 | } 23 | 24 | val brokers = args(0) 25 | val topic = args(1) 26 | 27 | # The default vauel is when the batch sample data ends 28 | val tradingStartParam = if (args.length == 3) args(2) else "2017-11-11T10:00:00Z" 29 | 30 | var tradingBeginOfTime = ZonedDateTime.parse(tradingStartParam) 31 | 32 | println( 33 | s""" 34 | |Generating faking stocks prices at $brokers/$topic 35 | |Each tick (300ms) represents 3min in clock time 36 | """.stripMargin) 37 | 38 | val props = new Properties() 39 | props.put("bootstrap.servers", brokers) 40 | props.put("client.id", "FakeStockPriceGenerator") 41 | props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer") 42 | props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer") 43 | 44 | val producer = new KafkaProducer[String, String](props) 45 | var counter = 0 46 | 47 | while (true) { 48 | val stock = nextSymbol() 49 | val data = new ProducerRecord[String, String](topic, null, stock) 50 | 51 | producer.send(data) 52 | Thread.sleep(300) 53 | 54 | counter += 1 55 | println(s"# $counter: $stock") 56 | } 57 | 58 | producer.close() 59 | 60 | 61 | def nextSymbol(): String = { 62 | # a very naive impl of marketing hours 63 | # not consider weekends nor holidays 64 | def nextMarketTime = { 
65 | // val tick = 3 66 | // Sometimes it subtracts 1 and generates late arriving tickers 67 | val tick = rnd.nextInt(5)-1 68 | val proposedNextTime = tradingBeginOfTime.plusMinutes(tick) 69 | val nextTime = if (proposedNextTime.getHour > 15) 70 | proposedNextTime.plusDays(1).withHour(10).withMinute(0) 71 | else 72 | proposedNextTime 73 | 74 | tradingBeginOfTime = nextTime 75 | nextTime 76 | } 77 | 78 | 79 | case class StockConf(symbol: String, price: Double, volatility: Double) 80 | 81 | // Using as SEED for the generator last 90 days of stocks 82 | // price: Max Closing Price 83 | // volatility: StdDev of Closing Pricing 84 | // df.groupBy($"symbol") 85 | // .agg(stddev_pop($"close").as("volatility"), max($"close").as("price")) 86 | // .orderBy($"symbol") 87 | // 88 | val quotes = List( 89 | StockConf("AAPL", 175.61, 6.739169981533334), 90 | StockConf("BABA", 188.51, 5.637335242825282), 91 | StockConf("CSCO", 34.62, 0.9673997717593282), 92 | StockConf("DHR", 93.24, 2.949284608917899), 93 | StockConf("EBAY", 38.99, 0.8110024414266584), 94 | StockConf("FB", 182.66, 4.14292553638126), 95 | StockConf("GOOG", 1039.85, 37.960859608812854), 96 | StockConf("GOOGL", 1058.29, 39.11749241707603), 97 | StockConf("IBM", 160.47, 4.8367462989079755), 98 | StockConf("INTC", 46.826, 3.678237311321825), 99 | StockConf("JNJ", 143.62, 4.336597380435497), 100 | StockConf("MELI", 292.05, 19.703519789367583), 101 | StockConf("MSFT", 84.56, 3.7745700470384693), 102 | StockConf("ORCL", 52.593, 1.4026418724678085), 103 | StockConf("QCOM", 65.49, 3.962328548164577), 104 | StockConf("TSLA", 385.0, 21.667055079857995), 105 | StockConf("TXN", 98.54, 5.545761038090265), 106 | StockConf("WDC", 89.9, 1.7196676293981952), 107 | StockConf("XRX", 33.86, 1.4466726098188216) 108 | ) 109 | 110 | def signal = if (rnd.nextInt(2) == 0) 1 else -1 111 | 112 | val quote = quotes(rnd.nextInt(quotes.size)) 113 | 114 | val price = quote.price + (signal * rnd.nextDouble * quote.volatility * 3) 115 | 116 | // 117 | f"""{"symbol":"${quote.symbol}","timestamp":"${nextMarketTime}","price":$price%2.3f}""" 118 | } 119 | 120 | } 121 | -------------------------------------------------------------------------------- /code/scala/us-stock-analysis/src/main/scala/es/arjon/StreamingETL.scala: -------------------------------------------------------------------------------- 1 | package es.arjon 2 | 3 | import java.util.Properties 4 | 5 | import org.apache.spark.sql.DataFrame 6 | import org.apache.spark.sql.SparkSession 7 | import org.apache.spark.sql.functions.udf 8 | import org.apache.spark.sql.streaming.{OutputMode, Trigger} 9 | import org.apache.spark.sql.types._ 10 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 11 | import org.apache.spark.sql.SaveMode 12 | 13 | 14 | 15 | object StreamingETL extends App { 16 | if (args.length < 2) { 17 | System.err.println( 18 | s""" 19 | |Usage: StreamingETL 20 | | is a list of one or more Kafka brokers 21 | | is a list of one or more kafka topics to consume from 22 | | 23 | | StreamingETL kafka:9092 stocks 24 | """.stripMargin) 25 | System.exit(1) 26 | } 27 | 28 | val Array(brokers, topics) = args 29 | val spark = SparkSession. 30 | builder. 31 | appName("Stocks:StreamingETL").
32 | getOrCreate() 33 | 34 | // val brokers = "kafka:9092" 35 | // val topics = "stocks" 36 | 37 | 38 | // Create DataSet representing the stream of input lines from kafka 39 | // https://databricks.com/blog/2017/04/26/processing-data-in-apache-kafka-with-structured-streaming-in-apache-spark-2-2.html 40 | val jsons = spark. 41 | readStream. 42 | format("kafka"). 43 | option("kafka.bootstrap.servers", brokers). 44 | option("subscribe", topics). 45 | //option("startingOffsets", "earliest"). 46 | load() 47 | 48 | 49 | jsons.printSchema 50 | 51 | val schema = StructType(Seq( 52 | StructField("symbol", StringType, nullable = false), 53 | StructField("timestamp", TimestampType, nullable = false), 54 | StructField("price", DoubleType, nullable = false) 55 | )) 56 | 57 | import org.apache.spark.sql.functions._ 58 | import spark.implicits._ 59 | 60 | val jsonOptions = Map("timestampFormat" -> "yyyy-MM-dd'T'HH:mm'Z'") 61 | val stocksJson = jsons. 62 | select(from_json($"value".cast("string"), schema, jsonOptions).as("content")) 63 | 64 | stocksJson.printSchema 65 | 66 | val stocks = stocksJson.select($"content.*") 67 | 68 | stocks.printSchema 69 | 70 | // Write to Parquet 71 | val query = stocks. 72 | withColumn("year", year($"timestamp")). 73 | withColumn("month", month($"timestamp")). 74 | withColumn("day", dayofmonth($"timestamp")). 75 | withColumn("hour", hour($"timestamp")). 76 | withColumn("minute", minute($"timestamp")). 77 | writeStream. 78 | format("parquet"). 79 | partitionBy("year", "month", "day", "hour", "minute"). 80 | option("startingOffsets", "earliest"). 81 | option("checkpointLocation", "/dataset/checkpoint"). 82 | option("path", "/dataset/streaming.parquet"). 83 | trigger(Trigger.ProcessingTime("30 seconds")). 84 | start() 85 | query.awaitTermination() 86 | 87 | // AverageStocksToPostgres.process(spark, stocks) 88 | 89 | // Using as an ordinary DF 90 | // val avgPricing = stocks. 91 | // groupBy($"symbol"). 92 | // agg(avg($"price").as("avg_price")) 93 | 94 | 95 | // avgPricing.printSchema 96 | 97 | // Start running the query that prints the running results to the console 98 | // val query = avgPricing.writeStream. 99 | // outputMode(OutputMode.Complete). 100 | // format("console"). 101 | // trigger(Trigger.ProcessingTime("10 seconds")). 102 | // start() 103 | // query.awaitTermination() 104 | 105 | // // Have all the aggregates in an in-memory table 106 | // val query = avgPricing 107 | // .writeStream 108 | // .queryName("avgPricing") // this query name will be the table name 109 | // .outputMode("complete") 110 | // .format("memory") 111 | // .trigger(Trigger.ProcessingTime("10 seconds")) 112 | // .start() 113 | 114 | // while (true) { 115 | // Thread.sleep(10 * 1000) 116 | // // interactively query in-memory table 117 | // spark.sql("select * from avgPricing").show() 118 | // //println(query.lastProgress) 119 | // } 120 | 121 | // query.awaitTermination() 122 | } 123 | 124 | 125 | object AverageStocksToPostgres { 126 | 127 | def process(spark: SparkSession, stocks: DataFrame): Unit = { 128 | 129 | import org.apache.spark.sql.functions._ 130 | import spark.implicits._ 131 | 132 | val avgPricing = stocks. 133 | withWatermark("timestamp", "60 seconds"). 134 | groupBy( window($"timestamp", "30 seconds"), 135 | $"symbol"). 
136 | agg(avg($"price").as("avg_price")) 137 | 138 | 139 | avgPricing.printSchema 140 | 141 | val connectionProperties = new Properties() 142 | connectionProperties.put("user", "workshop") 143 | connectionProperties.put("password", "w0rkzh0p") 144 | connectionProperties.put("driver", "org.postgresql.Driver") 145 | 146 | 147 | val winToString = udf{(window:GenericRowWithSchema) => window.mkString("-")} 148 | 149 | val processAvgTickers = avgPricing. 150 | withColumn("window", winToString($"window")). 151 | writeStream. 152 | foreachBatch { (batchDF: DataFrame, batchId: Long) => 153 | batchDF.write.mode(SaveMode.Append).jdbc(s"jdbc:postgresql://postgres:5432/workshop", "workshop.test_streaming_inserts_avg_price", connectionProperties) 154 | }. 155 | trigger(Trigger.ProcessingTime("10 seconds")). 156 | start() 157 | 158 | processAvgTickers.awaitTermination() 159 | 160 | } 161 | } -------------------------------------------------------------------------------- /control-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | function stop { 4 | echo "Stopping and removing containers" 5 | docker-compose --project-name wksp down 6 | } 7 | 8 | function cleanup { 9 | echo "Removing volume" 10 | docker volume rm wksp_postgres-data 11 | docker volume rm wksp_superset 12 | docker volume rm wksp_postgres-airflow-data 13 | } 14 | 15 | function start { 16 | echo "Starting up" 17 | docker-compose --project-name wksp up -d 18 | } 19 | 20 | function update { 21 | echo "Updating code ..." 22 | git pull --all 23 | 24 | echo "Updating docker images ..." 25 | docker-compose --project-name wksp pull 26 | 27 | echo "You probably should restart" 28 | } 29 | 30 | function info { 31 | echo ' 32 | Everything is ready, access your host to learn more (ie: http://localhost/) 33 | ' 34 | } 35 | 36 | function token { 37 | echo 'Your TOKEN for Jupyter Notebook is:' 38 | SERVER=$(docker exec -it jupyter jupyter notebook list) 39 | echo "${SERVER}" | grep '/notebook' | sed -E 's/^.*=([a-z0-9]+).*$/\1/' 40 | } 41 | 42 | function superset-init { 43 | echo 'Initializing Superset database using sqlite' 44 | docker exec -it superset superset-init 45 | } 46 | 47 | function psql { 48 | docker exec -it postgres psql -U workshop workshop 49 | } 50 | 51 | case $1 in 52 | start ) 53 | start 54 | info 55 | ;; 56 | 57 | stop ) 58 | stop 59 | ;; 60 | 61 | cleanup ) 62 | stop 63 | cleanup 64 | ;; 65 | 66 | update ) 67 | update 68 | ;; 69 | 70 | logs ) 71 | docker-compose --project-name wksp logs -f 72 | ;; 73 | 74 | token ) 75 | token 76 | ;; 77 | 78 | superset-init ) 79 | superset-init 80 | ;; 81 | 82 | psql ) 83 | psql 84 | ;; 85 | 86 | * ) 87 | printf "ERROR: Missing command\n Usage: `basename $0` (start|stop|cleanup|token|logs|update)\n" 88 | exit 1 89 | ;; 90 | esac 91 | -------------------------------------------------------------------------------- /dataset/.gitignore: -------------------------------------------------------------------------------- 1 | out 2 | credit-risk.model 3 | -------------------------------------------------------------------------------- /dataset/credit-risk/germancredit-user-input.csv: -------------------------------------------------------------------------------- 1 | user389076,4,12,2,3,3059,4,4,2,1,1,4,1,61,3,2,1,2,1,1,1 2 | user123456,1,24,2,2,2996,5,3,2,4,1,4,3,20,3,2,1,3,1,1,1 3 | user789012,2,6,2,0,14555,5,1,1,3,1,2,2,23,3,2,1,1,1,2,1 4 | user234567,1,48,2,3,6758,1,3,3,2,1,2,3,31,3,2,1,3,1,2,1 5 | 
user345678,1,48,2,0,7763,1,5,4,3,1,4,4,42,1,3,1,4,1,1,1 6 | user456789,1,36,4,6,8065,1,3,3,2,1,2,4,25,3,2,2,4,1,2,1 7 | -------------------------------------------------------------------------------- /dataset/global-temperature-1880-2016.json: -------------------------------------------------------------------------------- 1 | {"description":{"title":"Global Land and Ocean Temperature Anomalies, January-December","units":"Degrees Celsius","base_period":"1901-2000","missing":-999},"data":{"1880":"-0.12","1881":"-0.08","1882":"-0.10","1883":"-0.18","1884":"-0.27","1885":"-0.25","1886":"-0.24","1887":"-0.29","1888":"-0.13","1889":"-0.09","1890":"-0.35","1891":"-0.25","1892":"-0.30","1893":"-0.33","1894":"-0.31","1895":"-0.24","1896":"-0.09","1897":"-0.10","1898":"-0.27","1899":"-0.15","1900":"-0.07","1901":"-0.15","1902":"-0.25","1903":"-0.37","1904":"-0.45","1905":"-0.28","1906":"-0.21","1907":"-0.38","1908":"-0.43","1909":"-0.44","1910":"-0.40","1911":"-0.44","1912":"-0.34","1913":"-0.32","1914":"-0.14","1915":"-0.09","1916":"-0.32","1917":"-0.40","1918":"-0.31","1919":"-0.25","1920":"-0.23","1921":"-0.16","1922":"-0.24","1923":"-0.25","1924":"-0.24","1925":"-0.18","1926":"-0.07","1927":"-0.17","1928":"-0.18","1929":"-0.33","1930":"-0.11","1931":"-0.06","1932":"-0.13","1933":"-0.26","1934":"-0.11","1935":"-0.16","1936":"-0.12","1937":"-0.01","1938":"-0.02","1939":"0.01","1940":"0.15","1941":"0.27","1942":"0.10","1943":"0.10","1944":"0.27","1945":"0.17","1946":"-0.01","1947":"-0.04","1948":"-0.06","1949":"-0.08","1950":"-0.16","1951":"0.00","1952":"0.04","1953":"0.13","1954":"-0.10","1955":"-0.13","1956":"-0.18","1957":"0.07","1958":"0.12","1959":"0.08","1960":"0.05","1961":"0.09","1962":"0.10","1963":"0.12","1964":"-0.14","1965":"-0.07","1966":"-0.01","1967":"0.00","1968":"-0.03","1969":"0.11","1970":"0.06","1971":"-0.07","1972":"0.04","1973":"0.19","1974":"-0.06","1975":"0.01","1976":"-0.07","1977":"0.21","1978":"0.12","1979":"0.23","1980":"0.28","1981":"0.32","1982":"0.19","1983":"0.36","1984":"0.17","1985":"0.16","1986":"0.24","1987":"0.38","1988":"0.39","1989":"0.30","1990":"0.45","1991":"0.39","1992":"0.24","1993":"0.28","1994":"0.35","1995":"0.47","1996":"0.33","1997":"0.52","1998":"0.65","1999":"0.44","2000":"0.43","2001":"0.57","2002":"0.62","2003":"0.64","2004":"0.59","2005":"0.67","2006":"0.64","2007":"0.62","2008":"0.55","2009":"0.65","2010":"0.73","2011":"0.58","2012":"0.64","2013":"0.68","2014":"0.74","2015":"0.93","2016":"0.99"}} -------------------------------------------------------------------------------- /dataset/news/huffingtonpost-news.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/dataset/news/huffingtonpost-news.json.gz -------------------------------------------------------------------------------- /dataset/pyspark-df-overview/README.md: -------------------------------------------------------------------------------- 1 | # Adult Census Income Datase 2 | 3 | https://www.kaggle.com/uciml/adult-census-income/home -------------------------------------------------------------------------------- /dataset/pyspark-df-overview/census_income.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/dataset/pyspark-df-overview/census_income.csv.gz 
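The `global-temperature-1880-2016.json` dataset above is self-describing: its `description` block records the units (degrees Celsius), the 1901-2000 base period, and `-999` as the missing-value sentinel, while the yearly anomalies are stored as strings under `data`. The following is a hypothetical sketch (not part of the repo) of loading that series with pandas, assuming the `/dataset` mount path used by the workshop containers:

```python
# Hypothetical sketch, not part of the repo: load the self-describing
# temperature-anomaly JSON with pandas. Assumes the /dataset mount path
# used by the workshop containers.
import json

import pandas as pd

with open("/dataset/global-temperature-1880-2016.json") as f:
    doc = json.load(f)

meta = doc["description"]  # title, units, base_period, missing sentinel (-999)

series = (
    pd.Series(doc["data"], name="anomaly_c")  # values arrive as strings
      .astype(float)
      .rename_axis("year")
)
# The description block declares -999 as its missing-value sentinel; mask it defensively.
series = series.mask(series == meta["missing"])

print(f'{meta["title"]} ({meta["units"]}, base period {meta["base_period"]})')
print(series.tail())
```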
-------------------------------------------------------------------------------- /dataset/stocks/README.md: -------------------------------------------------------------------------------- 1 | # Huge Stock Market Dataset 2 | ## Full Historical Daily Price + Volume Data For All U.S. Stocks & ETFs 3 | High-quality financial data is expensive to acquire and is therefore rarely shared for free. Here I provide the full historical daily price and volume data for all U.S.-based stocks and ETFs trading on the NYSE, NASDAQ, and AMEX (NYSE MKT). It's one of the best datasets of its kind you can obtain. 4 | 5 | [Download the complete Dataset from kaggle.com](https://www.kaggle.com/borismarjanovic/price-volume-data-for-all-us-stocks-etfs/) 6 | 7 | Acknowledge/Thanks for [Boris Marjanovic](https://www.kaggle.com/borismarjanovic) 8 | 9 | --- 10 | 11 | ## `stocks-small` folder Inspired on Fortune 500 Tech list 12 | http://fortune.com/2015/06/13/fortune-500-tech/ 13 | -------------------------------------------------------------------------------- /dataset/yahoo-symbols-201709.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/dataset/yahoo-symbols-201709.csv -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | docs: 4 | container_name: docs 5 | image: nginx 6 | ports: 7 | - "80:80" 8 | volumes: 9 | - "./nginx/html:/usr/share/nginx/html:ro" 10 | 11 | master: 12 | container_name: master 13 | image: arjones/pyspark:2.4.5 14 | restart: always 15 | command: ["/opt/spark/sbin/start-master.sh"] 16 | environment: 17 | MASTER: spark://master:7077 18 | SPARK_NO_DAEMONIZE: 1 19 | ports: 20 | - 4040:4040 21 | - 6066:6066 22 | - 7077:7077 23 | - 8080:8080 24 | volumes: 25 | - ./code:/app 26 | - ./dataset:/dataset 27 | 28 | worker1: 29 | container_name: worker1 30 | image: arjones/pyspark:2.4.5 31 | restart: always 32 | command: ["/opt/spark/sbin/start-slave.sh", "spark://master:7077"] 33 | environment: 34 | MASTER: spark://master:7077 35 | SPARK_NO_DAEMONIZE: 1 36 | depends_on: 37 | - master 38 | ports: 39 | - 4041:4040 40 | - "6066" 41 | - "7077" 42 | - 8081:8080 43 | volumes: 44 | - ./code:/app 45 | - ./dataset:/dataset 46 | 47 | worker2: 48 | container_name: worker2 49 | image: arjones/pyspark:2.4.5 50 | restart: always 51 | command: ["/opt/spark/sbin/start-slave.sh", "spark://master:7077"] 52 | environment: 53 | MASTER: spark://master:7077 54 | SPARK_NO_DAEMONIZE: 1 55 | depends_on: 56 | - master 57 | ports: 58 | - 4042:4040 59 | - "6066" 60 | - "7077" 61 | - 8082:8080 62 | volumes: 63 | - ./code:/app 64 | - ./dataset:/dataset 65 | 66 | jupyter: 67 | container_name: jupyter 68 | image: arjones/pyspark:2.4.5 69 | restart: always 70 | environment: 71 | MASTER: spark://master:7077 72 | depends_on: 73 | - master 74 | ports: 75 | - "8888:8888" 76 | volumes: 77 | - ./jupyter/notebook:/notebook 78 | - ./dataset:/dataset 79 | - ./code:/app 80 | 81 | kafka: 82 | container_name: kafka 83 | image: spotify/kafka 84 | restart: always 85 | ports: 86 | - "2181:2181" 87 | - "9092:9092" 88 | environment: 89 | ADVERTISED_HOST: kafka 90 | ADVERTISED_PORT: 9092 91 | 92 | postgres: 93 | container_name: postgres 94 | image: postgres:11 95 | restart: always 96 | volumes: 97 | - postgres-data:/var/lib/postgresql/data 98 | - 
./postgres/scripts:/docker-entrypoint-initdb.d 99 | environment: 100 | POSTGRES_DB: workshop 101 | POSTGRES_USER: workshop 102 | POSTGRES_PASSWORD: w0rkzh0p 103 | ports: 104 | - "5432:5432" 105 | 106 | redis: 107 | container_name: redis 108 | image: redis 109 | restart: always 110 | 111 | superset: 112 | container_name: superset 113 | image: amancevice/superset 114 | restart: always 115 | depends_on: 116 | - redis 117 | - postgres 118 | environment: 119 | MAPBOX_API_KEY: ${MAPBOX_API_KEY} 120 | SUPERSET_HOME: /etc/superset 121 | ports: 122 | - "8088:8088" 123 | volumes: 124 | - ./superset/conf/superset_config.py:/etc/superset/superset_config.py 125 | - superset:/var/lib/superset 126 | 127 | postgres-airflow: 128 | container_name: postgres-airflow 129 | image: postgres:11 130 | restart: always 131 | volumes: 132 | - postgres-airflow-data:/var/lib/postgresql/data 133 | environment: 134 | POSTGRES_DB: airflow 135 | POSTGRES_USER: airflow 136 | POSTGRES_PASSWORD: airflow 137 | ports: 138 | - "5434:5432" 139 | 140 | airflow: 141 | container_name: airflow 142 | image: puckel/docker-airflow 143 | restart: always 144 | depends_on: 145 | - postgres-airflow 146 | environment: 147 | EXECUTOR: Local 148 | LOAD_EX: n 149 | AIRFLOW__WEBSERVER__WEB_SERVER_PORT: 9090 150 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres-airflow:5432/airflow 151 | AIRFLOW__CORE__FERNET_KEY: "Eff80poJxv6LE4432pDC6OmD6N449KCSuhUAMLXiq4U=" 152 | ports: 153 | - "9090:9090" 154 | volumes: 155 | - ./airflow/dags:/usr/local/airflow/dags 156 | 157 | volumes: 158 | postgres-data: 159 | superset: 160 | postgres-airflow-data: 161 | -------------------------------------------------------------------------------- /images/docker-advanced-config.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/docker-advanced-config.jpg -------------------------------------------------------------------------------- /images/superset-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/superset-01.png -------------------------------------------------------------------------------- /images/superset-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/superset-02.png -------------------------------------------------------------------------------- /images/superset-03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/superset-03.png -------------------------------------------------------------------------------- /images/superset-04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/superset-04.png -------------------------------------------------------------------------------- /images/superset-05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/superset-05.png 
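The `docker-compose.yml` above publishes the workshop Postgres on the host's port 5432 (and the Airflow metadata database on 5434), with the `workshop`/`w0rkzh0p` credentials set in the `postgres` service. As a quick connectivity check after starting the stack, something like the following could be run from the host; this is a hypothetical sketch assuming `psycopg2-binary` is installed locally (it is not part of the repo):

```python
# Hypothetical connectivity check, not part of the repo.
# Assumes: the stack is up (./control-env.sh start) and psycopg2-binary is
# installed on the host. Host, port and credentials come from docker-compose.yml.
import psycopg2

conn = psycopg2.connect(
    host="localhost",
    port=5432,              # "postgres" service, published as 5432:5432
    dbname="workshop",
    user="workshop",
    password="w0rkzh0p",
)
try:
    with conn.cursor() as cur:
        cur.execute("SELECT current_database(), version()")
        print(cur.fetchone())
finally:
    conn.close()
```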
-------------------------------------------------------------------------------- /images/superset-06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/superset-06.png -------------------------------------------------------------------------------- /images/superset-07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/superset-07.png -------------------------------------------------------------------------------- /images/superset-08.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/superset-08.png -------------------------------------------------------------------------------- /images/superset-09.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/superset-09.png -------------------------------------------------------------------------------- /images/superset-10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/superset-10.png -------------------------------------------------------------------------------- /images/superset-11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/superset-11.png -------------------------------------------------------------------------------- /images/superset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/superset.png -------------------------------------------------------------------------------- /jupyter/notebook/README.md: -------------------------------------------------------------------------------- 1 | # Notebook 2 | 3 | ## Pandas (without Spark integration) 4 | 5 | * [pandas-json-sample](pandas-json-sample.ipynb) 6 | 7 | ## pySpark 8 | 9 | ### Check Installation 10 | 11 | * [pyspark-intro](pyspark-intro.ipynb): basic pySpark operations 12 | * [pyspark-check-install](pyspark-check-install.ipynb): checks the pySpark installation; this notebook must run without errors. 13 | * [pyspark-apache-arrow](pyspark-apache-arrow.ipynb): Apache Arrow to integrate Pandas/NumPy data with pySpark.
14 | * [pyspark-postgres](pyspark-postgres.ipynb): Reading/Writing data from Postgres 15 | 16 | ### Basic commands 17 | 18 | * [pyspark-dataframe-overview](pyspark-dataframe-overview.ipynb): Spark Dataframe operations 19 | 20 | 21 | ### Machine Learning 22 | 23 | * **Titanic** [Exercise](titanic/titanic_spark_exercises.ipynb) | [Solution](titanic/titanic_spark_solutions.ipynb): [Kaggle Competition](https://www.kaggle.com/c/titanic) solved using pySpark 24 | * [pyspark-nlp](pyspark-nlp.ipynb): Multi-Class Text Classification Using PySpark, MLlib & Doc2Vec 25 | -------------------------------------------------------------------------------- /jupyter/notebook/batch_etl_steps.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import findspark\n", 10 | "\n", 11 | "findspark.add_jars('/app/postgresql-42.1.4.jar')\n", 12 | "findspark.init()" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "from pyspark.sql import SparkSession\n", 22 | "\n", 23 | "spark = SparkSession.builder.appName(\"Stocks:ETL\").getOrCreate()" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/plain": [ 34 | "'2.4.5'" 35 | ] 36 | }, 37 | "execution_count": 3, 38 | "metadata": {}, 39 | "output_type": "execute_result" 40 | } 41 | ], 42 | "source": [ 43 | "spark.version" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 4, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "stocks_dir = '/dataset/stocks-small'" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 5, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "import sys\n", 62 | "\n", 63 | "from pyspark.sql import SparkSession\n", 64 | "\n", 65 | "# UDF\n", 66 | "from pyspark.sql.types import StringType\n", 67 | "#\n", 68 | "from pyspark.sql import functions as F\n", 69 | "from pyspark.sql.window import Window" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 6, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "df = spark.read \\\n", 79 | " .option(\"header\", True) \\\n", 80 | " .option(\"inferSchema\", True) \\\n", 81 | " .csv(stocks_dir)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 7, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "root\n", 94 | " |-- Date: timestamp (nullable = true)\n", 95 | " |-- Open: double (nullable = true)\n", 96 | " |-- High: double (nullable = true)\n", 97 | " |-- Low: double (nullable = true)\n", 98 | " |-- Close: double (nullable = true)\n", 99 | " |-- Volume: integer (nullable = true)\n", 100 | " |-- OpenInt: integer (nullable = true)\n", 101 | "\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "df.count()\n", 107 | "df.printSchema()" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 8, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "+-------------------+------+------+------+------+------+-------+\n", 120 | "| Date| Open| High| Low| Close|Volume|OpenInt|\n", 121 | "+-------------------+------+------+------+------+------+-------+\n", 122 | "|1962-01-02 00:00:00| 6.413| 
6.413|6.3378|6.3378|467056| 0|\n", 123 | "|1962-01-03 00:00:00|6.3378|6.3963|6.3378|6.3963|350294| 0|\n", 124 | "|1962-01-04 00:00:00|6.3963|6.3963|6.3295|6.3295|314365| 0|\n", 125 | "|1962-01-05 00:00:00|6.3211|6.3211|6.1958|6.2041|440112| 0|\n", 126 | "|1962-01-08 00:00:00|6.2041|6.2041|6.0373| 6.087|655676| 0|\n", 127 | "|1962-01-09 00:00:00|6.1208|6.2376|6.1208|6.1621|592806| 0|\n", 128 | "|1962-01-10 00:00:00|6.1707|6.2041|6.1707|6.1707|359274| 0|\n", 129 | "|1962-01-11 00:00:00|6.1875|6.2376|6.1875|6.2376|386220| 0|\n", 130 | "|1962-01-12 00:00:00|6.2543|6.2962|6.2543|6.2543|529933| 0|\n", 131 | "|1962-01-15 00:00:00|6.2708|6.2962|6.2708|6.2792|305383| 0|\n", 132 | "|1962-01-16 00:00:00|6.2708|6.2708|6.2128|6.2128|305383| 0|\n", 133 | "|1962-01-17 00:00:00|6.1875|6.1875|6.0956|6.1125|502984| 0|\n", 134 | "|1962-01-18 00:00:00|6.1291|6.1875|6.1291|6.1291|449093| 0|\n", 135 | "|1962-01-19 00:00:00|6.1291|6.1457|6.0624|6.1374|485021| 0|\n", 136 | "|1962-01-22 00:00:00|6.1374|6.1958|6.1208|6.1208|332329| 0|\n", 137 | "|1962-01-23 00:00:00|6.1208|6.1291|6.0538|6.0624|449093| 0|\n", 138 | "|1962-01-24 00:00:00|6.0624|6.0956|6.0287|6.0956|494001| 0|\n", 139 | "|1962-01-25 00:00:00|6.0956|6.1457|6.0208|6.0287|386220| 0|\n", 140 | "|1962-01-26 00:00:00|6.0287|6.0538|5.9951|5.9951|296401| 0|\n", 141 | "|1962-01-29 00:00:00|5.9951|6.0373|5.8952|5.8952|700585| 0|\n", 142 | "+-------------------+------+------+------+------+------+-------+\n", 143 | "only showing top 20 rows\n", 144 | "\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "df.show()" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 9, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "df = df.withColumn('filename', F.input_file_name())" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 10, 164 | "metadata": { 165 | "scrolled": false 166 | }, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "+-------------------+------+------+------+------+------+-------+---------------------------------------+\n", 173 | "|Date |Open |High |Low |Close |Volume|OpenInt|filename |\n", 174 | "+-------------------+------+------+------+------+------+-------+---------------------------------------+\n", 175 | "|1962-01-02 00:00:00|6.413 |6.413 |6.3378|6.3378|467056|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 176 | "|1962-01-03 00:00:00|6.3378|6.3963|6.3378|6.3963|350294|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 177 | "|1962-01-04 00:00:00|6.3963|6.3963|6.3295|6.3295|314365|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 178 | "|1962-01-05 00:00:00|6.3211|6.3211|6.1958|6.2041|440112|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 179 | "|1962-01-08 00:00:00|6.2041|6.2041|6.0373|6.087 |655676|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 180 | "|1962-01-09 00:00:00|6.1208|6.2376|6.1208|6.1621|592806|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 181 | "|1962-01-10 00:00:00|6.1707|6.2041|6.1707|6.1707|359274|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 182 | "|1962-01-11 00:00:00|6.1875|6.2376|6.1875|6.2376|386220|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 183 | "|1962-01-12 00:00:00|6.2543|6.2962|6.2543|6.2543|529933|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 184 | "|1962-01-15 00:00:00|6.2708|6.2962|6.2708|6.2792|305383|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 185 | "|1962-01-16 00:00:00|6.2708|6.2708|6.2128|6.2128|305383|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 186 | "|1962-01-17 
00:00:00|6.1875|6.1875|6.0956|6.1125|502984|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 187 | "|1962-01-18 00:00:00|6.1291|6.1875|6.1291|6.1291|449093|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 188 | "|1962-01-19 00:00:00|6.1291|6.1457|6.0624|6.1374|485021|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 189 | "|1962-01-22 00:00:00|6.1374|6.1958|6.1208|6.1208|332329|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 190 | "|1962-01-23 00:00:00|6.1208|6.1291|6.0538|6.0624|449093|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 191 | "|1962-01-24 00:00:00|6.0624|6.0956|6.0287|6.0956|494001|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 192 | "|1962-01-25 00:00:00|6.0956|6.1457|6.0208|6.0287|386220|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 193 | "|1962-01-26 00:00:00|6.0287|6.0538|5.9951|5.9951|296401|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 194 | "|1962-01-29 00:00:00|5.9951|6.0373|5.8952|5.8952|700585|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 195 | "+-------------------+------+------+------+------+------+-------+---------------------------------------+\n", 196 | "only showing top 20 rows\n", 197 | "\n" 198 | ] 199 | } 200 | ], 201 | "source": [ 202 | "df.show(truncate=False)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 11, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "df_lookup = spark.read.csv('/dataset/yahoo-symbols-201709.csv')" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 12, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "name": "stdout", 221 | "output_type": "stream", 222 | "text": [ 223 | "+------+--------------------+--------+--------------------+-------+\n", 224 | "| _c0| _c1| _c2| _c3| _c4|\n", 225 | "+------+--------------------+--------+--------------------+-------+\n", 226 | "|Ticker| Name|Exchange| Category Name|Country|\n", 227 | "| OEDV|Osage Exploration...| PNK| null| USA|\n", 228 | "| AAPL| Apple Inc.| NMS|Electronic Equipment| USA|\n", 229 | "| BAC|Bank of America C...| NYQ| Money Center Banks| USA|\n", 230 | "| AMZN| Amazon.com, Inc.| NMS|Catalog & Mail Or...| USA|\n", 231 | "| T| AT&T Inc.| NYQ|Telecom Services ...| USA|\n", 232 | "| GOOG| Alphabet Inc.| NMS|Internet Informat...| USA|\n", 233 | "| MO| Altria Group, Inc.| NYQ| Cigarettes| USA|\n", 234 | "| DAL|Delta Air Lines, ...| NYQ| Major Airlines| USA|\n", 235 | "| AA| Alcoa Corporation| NYQ| Aluminum| USA|\n", 236 | "| AXP|American Express ...| NYQ| Credit Services| USA|\n", 237 | "| DD|E. I. 
du Pont de ...| NYQ|Agricultural Chem...| USA|\n", 238 | "| BABA|Alibaba Group Hol...| NYQ|Specialty Retail,...| USA|\n", 239 | "| ABT| Abbott Laboratories| NYQ|Medical Appliance...| USA|\n", 240 | "| UA| Under Armour, Inc.| NYQ|Textile - Apparel...| USA|\n", 241 | "| AMAT|Applied Materials...| NMS|Semiconductor Equ...| USA|\n", 242 | "| AMGN| Amgen Inc.| NMS| Biotechnology| USA|\n", 243 | "| AAL|American Airlines...| NMS| Major Airlines| USA|\n", 244 | "| AIG|American Internat...| NYQ|Property & Casual...| USA|\n", 245 | "| ALL|The Allstate Corp...| NYQ|Property & Casual...| USA|\n", 246 | "+------+--------------------+--------+--------------------+-------+\n", 247 | "only showing top 20 rows\n", 248 | "\n" 249 | ] 250 | } 251 | ], 252 | "source": [ 253 | "df_lookup.show()" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 13, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "def extract_symbol_from(filename):\n", 263 | " return filename.split('/')[-1].split('.')[0].upper()" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 14, 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/plain": [ 274 | "'IBM'" 275 | ] 276 | }, 277 | "execution_count": 14, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [ 283 | "# filename = 'file:///dataset/stocks-small/ibm.us.txt' # => IBM\n", 284 | "extract_symbol_from('file:///dataset/stocks-small/ibm.us.txt')" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 15, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "extract_symbol = F.udf(lambda filename: extract_symbol_from(filename), StringType())" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 16, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "stocks_folder = stocks_dir\n", 303 | "df = spark.read \\\n", 304 | " .option(\"header\", True) \\\n", 305 | " .option(\"inferSchema\", True) \\\n", 306 | " .csv(stocks_folder) \\\n", 307 | " .withColumn(\"name\", extract_symbol(F.input_file_name()))" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 17, 313 | "metadata": {}, 314 | "outputs": [ 315 | { 316 | "name": "stdout", 317 | "output_type": "stream", 318 | "text": [ 319 | "+-------------------+------+------+------+------+------+-------+----+\n", 320 | "| Date| Open| High| Low| Close|Volume|OpenInt|name|\n", 321 | "+-------------------+------+------+------+------+------+-------+----+\n", 322 | "|1962-01-02 00:00:00| 6.413| 6.413|6.3378|6.3378|467056| 0| IBM|\n", 323 | "|1962-01-03 00:00:00|6.3378|6.3963|6.3378|6.3963|350294| 0| IBM|\n", 324 | "|1962-01-04 00:00:00|6.3963|6.3963|6.3295|6.3295|314365| 0| IBM|\n", 325 | "|1962-01-05 00:00:00|6.3211|6.3211|6.1958|6.2041|440112| 0| IBM|\n", 326 | "|1962-01-08 00:00:00|6.2041|6.2041|6.0373| 6.087|655676| 0| IBM|\n", 327 | "+-------------------+------+------+------+------+------+-------+----+\n", 328 | "only showing top 5 rows\n", 329 | "\n" 330 | ] 331 | } 332 | ], 333 | "source": [ 334 | "df.show(5)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 18, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "df = spark.read \\\n", 344 | " .option(\"header\", True) \\\n", 345 | " .option(\"inferSchema\", True) \\\n", 346 | " .csv(stocks_folder) \\\n", 347 | " .withColumn(\"name\", extract_symbol(F.input_file_name())) \\\n", 348 | " .withColumnRenamed(\"Date\", \"dateTime\") 
\\\n", 349 | " .withColumnRenamed(\"Open\", \"open\") \\\n", 350 | " .withColumnRenamed(\"High\", \"high\") \\\n", 351 | " .withColumnRenamed(\"Low\", \"low\") \\\n", 352 | " .withColumnRenamed(\"Close\", \"close\") \\\n", 353 | " .drop(\"Volume\", \"OpenInt\")" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 19, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "df_stocks = df" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 20, 368 | "metadata": {}, 369 | "outputs": [ 370 | { 371 | "name": "stdout", 372 | "output_type": "stream", 373 | "text": [ 374 | "+-------------------+------+------+------+------+----+\n", 375 | "| dateTime| open| high| low| close|name|\n", 376 | "+-------------------+------+------+------+------+----+\n", 377 | "|1962-01-02 00:00:00| 6.413| 6.413|6.3378|6.3378| IBM|\n", 378 | "|1962-01-03 00:00:00|6.3378|6.3963|6.3378|6.3963| IBM|\n", 379 | "|1962-01-04 00:00:00|6.3963|6.3963|6.3295|6.3295| IBM|\n", 380 | "|1962-01-05 00:00:00|6.3211|6.3211|6.1958|6.2041| IBM|\n", 381 | "|1962-01-08 00:00:00|6.2041|6.2041|6.0373| 6.087| IBM|\n", 382 | "+-------------------+------+------+------+------+----+\n", 383 | "only showing top 5 rows\n", 384 | "\n" 385 | ] 386 | } 387 | ], 388 | "source": [ 389 | "df_stocks.show(5)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 21, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "lookup_file = '/dataset/yahoo-symbols-201709.csv'" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 22, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "symbols_lookup = spark.read. \\\n", 408 | " option(\"header\", True). \\\n", 409 | " option(\"inferSchema\", True). \\\n", 410 | " csv(lookup_file). \\\n", 411 | " select(\"Ticker\", \"Category Name\"). \\\n", 412 | " withColumnRenamed(\"Ticker\", \"symbol\"). 
\\\n", 413 | " withColumnRenamed(\"Category Name\", \"category\")" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 23, 419 | "metadata": {}, 420 | "outputs": [ 421 | { 422 | "name": "stdout", 423 | "output_type": "stream", 424 | "text": [ 425 | "+-------------------+------+------+------+------+----+\n", 426 | "| dateTime| open| high| low| close|name|\n", 427 | "+-------------------+------+------+------+------+----+\n", 428 | "|1962-01-02 00:00:00| 6.413| 6.413|6.3378|6.3378| IBM|\n", 429 | "|1962-01-03 00:00:00|6.3378|6.3963|6.3378|6.3963| IBM|\n", 430 | "|1962-01-04 00:00:00|6.3963|6.3963|6.3295|6.3295| IBM|\n", 431 | "+-------------------+------+------+------+------+----+\n", 432 | "only showing top 3 rows\n", 433 | "\n", 434 | "+------+--------------------+\n", 435 | "|symbol| category|\n", 436 | "+------+--------------------+\n", 437 | "| OEDV| null|\n", 438 | "| AAPL|Electronic Equipment|\n", 439 | "| BAC| Money Center Banks|\n", 440 | "+------+--------------------+\n", 441 | "only showing top 3 rows\n", 442 | "\n" 443 | ] 444 | } 445 | ], 446 | "source": [ 447 | "df_stocks.show(3)\n", 448 | "symbols_lookup.show(3)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 24, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "joined_df = df_stocks \\\n", 458 | " .withColumnRenamed('dateTime', \"full_date\") \\\n", 459 | " .filter(\"full_date >= \\\"2017-09-01\\\"\") \\\n", 460 | " .withColumn(\"year\", F.year(\"full_date\")) \\\n", 461 | " .withColumn(\"month\", F.month(\"full_date\")) \\\n", 462 | " .withColumn(\"day\", F.dayofmonth(\"full_date\")) \\\n", 463 | " .withColumnRenamed(\"name\", \"symbol\") \\\n", 464 | " .join(symbols_lookup, [\"symbol\"])" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 25, 470 | "metadata": {}, 471 | "outputs": [ 472 | { 473 | "name": "stdout", 474 | "output_type": "stream", 475 | "text": [ 476 | "+------+-------------------+------+------+------+------+----+-----+---+--------------------+\n", 477 | "|symbol| full_date| open| high| low| close|year|month|day| category|\n", 478 | "+------+-------------------+------+------+------+------+----+-----+---+--------------------+\n", 479 | "| IBM|2017-01-03 00:00:00|160.76| 161.6|159.81|160.95|2017| 1| 3|Information Techn...|\n", 480 | "| IBM|2017-01-04 00:00:00|161.51|163.53|161.11|162.94|2017| 1| 4|Information Techn...|\n", 481 | "| IBM|2017-01-05 00:00:00|162.93|163.06|161.01|162.41|2017| 1| 5|Information Techn...|\n", 482 | "+------+-------------------+------+------+------+------+----+-----+---+--------------------+\n", 483 | "only showing top 3 rows\n", 484 | "\n" 485 | ] 486 | } 487 | ], 488 | "source": [ 489 | "joined_df.show(3)" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 26, 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [ 498 | "window20 = (Window.partitionBy(F.col('symbol')).orderBy(F.col(\"full_date\")).rowsBetween(-20, 0))\n", 499 | "window50 = (Window.partitionBy(F.col('symbol')).orderBy(F.col(\"full_date\")).rowsBetween(-50, 0))\n", 500 | "window100 = (Window.partitionBy(F.col('symbol')).orderBy(F.col(\"full_date\")).rowsBetween(-100, 0))" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 27, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "stocks_moving_avg_df = joined_df \\\n", 510 | " .withColumn(\"ma20\", F.avg(\"close\").over(window20)) \\\n", 511 | " .withColumn(\"ma50\", F.avg(\"close\").over(window50)) 
\\\n", 512 | " .withColumn(\"ma100\", F.avg(\"close\").over(window100))" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 28, 518 | "metadata": {}, 519 | "outputs": [ 520 | { 521 | "name": "stdout", 522 | "output_type": "stream", 523 | "text": [ 524 | "+------+------+------------------+\n", 525 | "|symbol| close| ma20|\n", 526 | "+------+------+------------------+\n", 527 | "| AAPL|114.31| 114.31|\n", 528 | "| AAPL|114.19| 114.25|\n", 529 | "| AAPL|114.77|114.42333333333333|\n", 530 | "| AAPL|116.04| 114.8275|\n", 531 | "| AAPL|117.11|115.28399999999999|\n", 532 | "| AAPL|117.23|115.60833333333333|\n", 533 | "| AAPL|117.86|115.92999999999999|\n", 534 | "| AAPL|117.37| 116.11|\n", 535 | "| AAPL|117.16|116.22666666666666|\n", 536 | "| AAPL| 118.1|116.41399999999999|\n", 537 | "| AAPL|118.09|116.56636363636362|\n", 538 | "| AAPL|117.89|116.67666666666666|\n", 539 | "| AAPL| 118.1|116.78615384615384|\n", 540 | "| AAPL|118.19|116.88642857142857|\n", 541 | "| AAPL|118.07|116.96533333333332|\n", 542 | "| AAPL|119.95|117.15187499999999|\n", 543 | "| AAPL|120.01| 117.32|\n", 544 | "| AAPL|120.02| 117.47|\n", 545 | "| AAPL| 119.7|117.58736842105263|\n", 546 | "| AAPL|119.43|117.67949999999999|\n", 547 | "| AAPL| 126.7| 118.1090476190476|\n", 548 | "| AAPL| 126.5|118.68952380952379|\n", 549 | "| AAPL|127.03| 119.3009523809524|\n", 550 | "| AAPL|128.23|119.94190476190477|\n", 551 | "| AAPL|129.44|120.58000000000004|\n", 552 | "+------+------+------------------+\n", 553 | "only showing top 25 rows\n", 554 | "\n" 555 | ] 556 | } 557 | ], 558 | "source": [ 559 | "# Moving Average\n", 560 | "stocks_moving_avg_df.select('symbol', 'close', 'ma20').show(25)" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 29, 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "output_dir = '/dataset/output.parquet'" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": 30, 575 | "metadata": {}, 576 | "outputs": [], 577 | "source": [ 578 | "stocks_moving_avg_df \\\n", 579 | " .write \\\n", 580 | " .mode('overwrite') \\\n", 581 | " .partitionBy(\"year\", \"month\", \"day\") \\\n", 582 | " .parquet(output_dir)" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": 31, 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [ 591 | "df_parquet = spark.read.parquet(output_dir)" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 32, 597 | "metadata": {}, 598 | "outputs": [ 599 | { 600 | "data": { 601 | "text/plain": [ 602 | "4142" 603 | ] 604 | }, 605 | "execution_count": 32, 606 | "metadata": {}, 607 | "output_type": "execute_result" 608 | } 609 | ], 610 | "source": [ 611 | "df_parquet.count()" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": 33, 617 | "metadata": {}, 618 | "outputs": [], 619 | "source": [ 620 | "df_parquet.createOrReplaceTempView(\"stocks\")" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": 34, 626 | "metadata": {}, 627 | "outputs": [ 628 | { 629 | "name": "stdout", 630 | "output_type": "stream", 631 | "text": [ 632 | "== Physical Plan ==\n", 633 | "*(2) HashAggregate(keys=[symbol#559], functions=[max(close#564)])\n", 634 | "+- Exchange hashpartitioning(symbol#559, 200)\n", 635 | " +- *(1) HashAggregate(keys=[symbol#559], functions=[partial_max(close#564)])\n", 636 | " +- *(1) Project [symbol#559, close#564]\n", 637 | " +- *(1) Filter ((isnotnull(full_date#560) && (cast(full_date#560 as string) >= 2017-09-01)) 
&& (cast(full_date#560 as string) < 2017-10-01))\n", 638 | " +- *(1) FileScan parquet [symbol#559,full_date#560,close#564,year#569,month#570,day#571] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/dataset/output.parquet], PartitionCount: 218, PartitionFilters: [], PushedFilters: [IsNotNull(full_date)], ReadSchema: struct\n" 639 | ] 640 | } 641 | ], 642 | "source": [ 643 | "badHighestClosingPrice = spark.sql(\"SELECT symbol, MAX(close) AS price FROM stocks WHERE full_date >= '2017-09-01' AND full_date < '2017-10-01' GROUP BY symbol\")\n", 644 | "badHighestClosingPrice.explain()" 645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": 35, 650 | "metadata": {}, 651 | "outputs": [ 652 | { 653 | "name": "stdout", 654 | "output_type": "stream", 655 | "text": [ 656 | "== Physical Plan ==\n", 657 | "*(2) HashAggregate(keys=[symbol#559], functions=[max(close#564)])\n", 658 | "+- Exchange hashpartitioning(symbol#559, 200)\n", 659 | " +- *(1) HashAggregate(keys=[symbol#559], functions=[partial_max(close#564)])\n", 660 | " +- *(1) Project [symbol#559, close#564]\n", 661 | " +- *(1) FileScan parquet [symbol#559,close#564,year#569,month#570,day#571] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/dataset/output.parquet], PartitionCount: 20, PartitionFilters: [isnotnull(year#569), isnotnull(month#570), (year#569 = 2017), (month#570 = 9)], PushedFilters: [], ReadSchema: struct\n" 662 | ] 663 | } 664 | ], 665 | "source": [ 666 | "highestClosingPrice = spark.sql(\"SELECT symbol, MAX(close) AS price FROM stocks WHERE year=2017 AND month=9 GROUP BY symbol\")\n", 667 | "highestClosingPrice.explain()" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": 36, 673 | "metadata": {}, 674 | "outputs": [], 675 | "source": [ 676 | "# Write to Postgres\n", 677 | "stocks_moving_avg_df \\\n", 678 | " .drop(\"year\", \"month\", \"day\") \\\n", 679 | " .write \\\n", 680 | " .format(\"jdbc\") \\\n", 681 | " .option(\"url\", \"jdbc:postgresql://postgres/workshop\") \\\n", 682 | " .option(\"dbtable\", \"workshop.stocks\") \\\n", 683 | " .option(\"user\", \"workshop\") \\\n", 684 | " .option(\"password\", \"w0rkzh0p\") \\\n", 685 | " .option(\"driver\", \"org.postgresql.Driver\") \\\n", 686 | " .mode('append') \\\n", 687 | " .save()" 688 | ] 689 | }, 690 | { 691 | "cell_type": "code", 692 | "execution_count": null, 693 | "metadata": {}, 694 | "outputs": [], 695 | "source": [] 696 | } 697 | ], 698 | "metadata": { 699 | "kernelspec": { 700 | "display_name": "Python 3", 701 | "language": "python", 702 | "name": "python3" 703 | }, 704 | "language_info": { 705 | "codemirror_mode": { 706 | "name": "ipython", 707 | "version": 3 708 | }, 709 | "file_extension": ".py", 710 | "mimetype": "text/x-python", 711 | "name": "python", 712 | "nbconvert_exporter": "python", 713 | "pygments_lexer": "ipython3", 714 | "version": "3.7.3" 715 | } 716 | }, 717 | "nbformat": 4, 718 | "nbformat_minor": 4 719 | } 720 | -------------------------------------------------------------------------------- /jupyter/notebook/pyspark-apache-arrow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# PySpark: Pandas with Apache Arrow\n", 8 | "[Reference](https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html#apache-arrow-in-spark)" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | 
"outputs": [], 16 | "source": [ 17 | "import findspark\n", 18 | "findspark.init()" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from pyspark.sql import SparkSession\n", 28 | "\n", 29 | "spark = SparkSession.builder. \\\n", 30 | " appName(\"pyspark-arrow\"). \\\n", 31 | " getOrCreate()" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import numpy as np\n", 41 | "import pandas as pd\n", 42 | "\n", 43 | "# Enable Arrow-based columnar data transfers\n", 44 | "spark.conf.set(\"spark.sql.execution.arrow.enabled\", \"true\")" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 4, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# Generate a Pandas DataFrame\n", 54 | "pdf = pd.DataFrame(np.random.rand(100, 3))\n", 55 | "\n", 56 | "# Create a Spark DataFrame from a Pandas DataFrame using Arrow\n", 57 | "df = spark.createDataFrame(pdf)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 5, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stderr", 67 | "output_type": "stream", 68 | "text": [ 69 | "/usr/local/lib/python3.7/dist-packages/pyarrow/__init__.py:157: UserWarning: pyarrow.open_stream is deprecated, please use pyarrow.ipc.open_stream\n", 70 | " warnings.warn(\"pyarrow.open_stream is deprecated, please use \"\n" 71 | ] 72 | }, 73 | { 74 | "data": { 75 | "text/html": [ 76 | "
\n", 77 | "\n", 90 | "\n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | "
012
00.5817650.4217540.746082
10.4500960.8381850.650798
20.9060610.9025530.582332
30.0271340.3671070.342978
40.2714630.6580560.881614
............
950.8942910.6215590.434179
960.3363940.3824790.723049
970.9400940.6935280.695185
980.5712440.7932910.476467
990.5477590.5316970.638495
\n", 168 | "

100 rows × 3 columns

\n", 169 | "
" 170 | ], 171 | "text/plain": [ 172 | " 0 1 2\n", 173 | "0 0.581765 0.421754 0.746082\n", 174 | "1 0.450096 0.838185 0.650798\n", 175 | "2 0.906061 0.902553 0.582332\n", 176 | "3 0.027134 0.367107 0.342978\n", 177 | "4 0.271463 0.658056 0.881614\n", 178 | ".. ... ... ...\n", 179 | "95 0.894291 0.621559 0.434179\n", 180 | "96 0.336394 0.382479 0.723049\n", 181 | "97 0.940094 0.693528 0.695185\n", 182 | "98 0.571244 0.793291 0.476467\n", 183 | "99 0.547759 0.531697 0.638495\n", 184 | "\n", 185 | "[100 rows x 3 columns]" 186 | ] 187 | }, 188 | "execution_count": 5, 189 | "metadata": {}, 190 | "output_type": "execute_result" 191 | } 192 | ], 193 | "source": [ 194 | "# Convert the Spark DataFrame back to a Pandas DataFrame using Arrow\n", 195 | "result_pdf = df.select(\"*\").toPandas()\n", 196 | "\n", 197 | "result_pdf" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 6, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "spark.stop()" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [] 215 | } 216 | ], 217 | "metadata": { 218 | "kernelspec": { 219 | "display_name": "Python 3", 220 | "language": "python", 221 | "name": "python3" 222 | }, 223 | "language_info": { 224 | "codemirror_mode": { 225 | "name": "ipython", 226 | "version": 3 227 | }, 228 | "file_extension": ".py", 229 | "mimetype": "text/x-python", 230 | "name": "python", 231 | "nbconvert_exporter": "python", 232 | "pygments_lexer": "ipython3", 233 | "version": "3.7.3" 234 | } 235 | }, 236 | "nbformat": 4, 237 | "nbformat_minor": 2 238 | } 239 | -------------------------------------------------------------------------------- /jupyter/notebook/pyspark-check-install.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Test notebook\n", 8 | "It should run without errors when all Worker nodes contains python deps. 
From [OneHotEncoderEstimator](https://spark.apache.org/docs/latest/ml-features#onehotencoderestimator)" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import findspark\n", 18 | "findspark.init()" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from pyspark.sql import SparkSession\n", 28 | "spark = SparkSession.builder.appName(\"OneHotEncoderEstimator\").getOrCreate()" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "from pyspark.ml.feature import OneHotEncoderEstimator\n", 38 | "\n", 39 | "df = spark.createDataFrame([\n", 40 | " (0.0, 1.0),\n", 41 | " (1.0, 0.0),\n", 42 | " (2.0, 1.0),\n", 43 | " (0.0, 2.0),\n", 44 | " (0.0, 1.0),\n", 45 | " (2.0, 0.0)\n", 46 | "], [\"categoryIndex1\", \"categoryIndex2\"])\n", 47 | "\n", 48 | "encoder = OneHotEncoderEstimator(inputCols=[\"categoryIndex1\", \"categoryIndex2\"],\n", 49 | " outputCols=[\"categoryVec1\", \"categoryVec2\"])\n", 50 | "model = encoder.fit(df)\n", 51 | "encoded = model.transform(df)\n", 52 | "encoded.show()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "spark.stop()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [] 70 | } 71 | ], 72 | "metadata": { 73 | "kernelspec": { 74 | "display_name": "Python 3", 75 | "language": "python", 76 | "name": "python3" 77 | }, 78 | "language_info": { 79 | "codemirror_mode": { 80 | "name": "ipython", 81 | "version": 3 82 | }, 83 | "file_extension": ".py", 84 | "mimetype": "text/x-python", 85 | "name": "python", 86 | "nbconvert_exporter": "python", 87 | "pygments_lexer": "ipython3", 88 | "version": "3.7.3" 89 | } 90 | }, 91 | "nbformat": 4, 92 | "nbformat_minor": 2 93 | } -------------------------------------------------------------------------------- /jupyter/notebook/pyspark-dataframe-overview.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# pySpark Commands Reference\n", 8 | "https://spark.apache.org/docs/2.4.4/api/python/index.html\n", 9 | "\n", 10 | "\n", 11 | "## Connect to Spark Cluster\n", 12 | "https://github.com/minrk/findspark" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import findspark\n", 22 | "findspark.init()" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "from pyspark.sql import SparkSession\n", 32 | "spark = SparkSession.builder.appName(\"pyspark-df-overview\").getOrCreate()" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "spark.version" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "### Create Spark DataFrame\n", 49 | "Dataset from: https://www.kaggle.com/uciml/adult-census-income/home" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "df = 
spark.read.csv(\"/dataset/pyspark-df-overview/census_income.csv.gz\", header=True)\n", 59 | "df.printSchema()" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "### Define a schema" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "import pyspark.sql.types as t\n", 76 | "\n", 77 | "census_schema = t.StructType([\n", 78 | " t.StructField('age', t.IntegerType(), True)\n", 79 | " , t.StructField('workclass', t.StringType(), True)\n", 80 | " , t.StructField('fnlwgt', t.IntegerType(), True)\n", 81 | " , t.StructField('education', t.StringType(), True)\n", 82 | " , t.StructField('education-num', t.IntegerType(), True)\n", 83 | " , t.StructField('marital-status', t.StringType(), True)\n", 84 | " , t.StructField('occupation', t.StringType(), True)\n", 85 | " , t.StructField('relationship', t.StringType(), True)\n", 86 | " , t.StructField('race', t.StringType(), True)\n", 87 | " , t.StructField('sex', t.StringType(), True)\n", 88 | " , t.StructField('capital-gain', t.DoubleType(), True)\n", 89 | " , t.StructField('capital-loss', t.DoubleType(), True)\n", 90 | " , t.StructField('hours-per-week', t.IntegerType(), True)\n", 91 | " , t.StructField('native-country', t.StringType(), True)\n", 92 | " , t.StructField('label', t.StringType(), True)\n", 93 | "])" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "# Support for compressed (gziped) payload\n", 103 | "df = spark.read.csv(\"/dataset/pyspark-df-overview/census_income.csv.gz\", header=True, schema=census_schema)\n", 104 | "df.printSchema()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "df.count()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "### Drop unused column" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "df = df.drop('fnlwgt')\n", 130 | "df.printSchema()" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "### Few operations" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "from pyspark.sql.functions import count, avg, desc\n", 147 | "\n", 148 | "df.groupBy(['education']). \\\n", 149 | "agg(\n", 150 | " count('*').alias('qty'), \n", 151 | " avg('age').alias('avg_age')\n", 152 | ").orderBy(desc('qty')). 
\\\n", 153 | "show()" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "### Using SQL\n", 161 | "Same operation with SQL syntax" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "df.createOrReplaceTempView(\"census\")\n", 171 | "s = spark.sql(\"\"\"\n", 172 | "SELECT \n", 173 | " education, \n", 174 | " COUNT(*) AS qty, \n", 175 | " AVG(age) AS avg_age\n", 176 | "FROM census\n", 177 | "GROUP BY education\n", 178 | "\"\"\")\n", 179 | "s.show()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "# a transformation can be exposed as function\n", 189 | "def my_query(field):\n", 190 | " return df.groupBy([field]). \\\n", 191 | " agg(\n", 192 | " count('*').alias('qty'), \n", 193 | " avg('age').alias('avg_age')\n", 194 | " ).orderBy(desc('qty'))\n", 195 | " \n", 196 | "\n", 197 | " \n", 198 | "print(my_query('workclass').show())" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "df.select('age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week').describe().show()" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "df.select('workclass', 'education', 'marital-status').describe().show()" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "df.freqItems(['marital-status']).show(truncate=False)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "df.crosstab('age', 'label').sort(\"age_label\").show()\n" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "df.groupby('native-country').agg({'native-country': 'count'}).sort('count(native-country)').show()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "### Check if there is missing data" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "from pyspark.sql.functions import isnan, when, count, col\n", 260 | "\n", 261 | "# All columns\n", 262 | "# cols = df.columns\n", 263 | "# Selected columns\n", 264 | "cols = ['workclass', 'education-num', 'occupation', 'hours-per-week', 'native-country']\n", 265 | "\n", 266 | "# https://stackoverflow.com/a/44631639/570393\n", 267 | "df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in cols]).show()" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "### Remove rows with missing data" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "# Total rows\n", 284 | "print('total rows: %s' % df.count())\n", 285 | "\n", 286 | "# After droping NA records\n", 287 | "print('only complete rows: %s' % df.dropna().count())" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "### Fill rows that contains missing 
data" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "def show_df(df, field='occupation'):\n", 304 | " df.groupBy(field).count().show()" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "show_df(df)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "# Fill with a fixed value\n", 323 | "new_df = df.fillna({'occupation': 'Other-service'})\n", 324 | "\n", 325 | "# Count \n", 326 | "show_df(new_df)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "### Better way\n", 334 | "\n", 335 | "Calc the `mean()` value of a column and use it on missing values.\n", 336 | "Also use a static string for categorical data " 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "metadata": {}, 343 | "outputs": [], 344 | "source": [ 345 | "from pyspark.sql.functions import mean\n", 346 | "df.groupBy().agg(mean('hours-per-week').alias('hours-per-week')).show()" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "from pyspark.sql.functions import mean\n", 356 | "import pandas as pd\n", 357 | "\n", 358 | "data_to_fill = \\\n", 359 | " df.groupBy().agg(mean('hours-per-week').alias('hours-per-week')).toPandas().to_dict('records')[0]\n", 360 | "\n", 361 | "# Simple Python Dict Update\n", 362 | "data_to_fill.update({'occupation': 'Other-service'})\n", 363 | "\n", 364 | "data_to_fill" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "metadata": {}, 371 | "outputs": [], 372 | "source": [ 373 | "df.fillna(data_to_fill).select('hours-per-week', 'occupation').show(50)" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "### Creating charts with pandas & matplotlib\n", 381 | "https://pandas.pydata.org/pandas-docs/stable/api.html#api-dataframe-plotting\n", 382 | "\n", 383 | "**Important:** possible only when data become small enough to driver program" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "# This is distributed\n", 393 | "df_spark = df.groupBy('workclass').agg(count('*').alias('counts')).orderBy('counts')\n", 394 | "# df_spark.show()\n", 395 | "\n", 396 | "# This is running on driver\n", 397 | "df_wk = df_spark.toPandas()" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": {}, 404 | "outputs": [], 405 | "source": [ 406 | "# Check Pandas DF content\n", 407 | "df_wk" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": null, 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "import matplotlib.pyplot as plt\n", 417 | "%matplotlib inline\n", 418 | "\n", 419 | "df_wk.plot.bar(x='workclass', y='counts', figsize=(20,6));" 420 | ] 421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "metadata": {}, 425 | "source": [ 426 | "### Stop Drive Program\n", 427 | "Release resources from Spark Cluster" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "spark.stop()" 437 | 
] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "# Continue Learning\n", 444 | "\n", 445 | "* [Kaggle Learn](https://www.kaggle.com/learn/overview)\n", 446 | "* [PySpark Cookbook](https://www.safaribooksonline.com/library/view/pyspark-cookbook/9781788835367/)\n", 447 | "\n", 448 | "## Other references\n", 449 | "\n", 450 | "* [PySpark Tutorial for Beginners: Machine Learning Example](https://www.guru99.com/pyspark-tutorial.html)" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [] 459 | } 460 | ], 461 | "metadata": { 462 | "kernelspec": { 463 | "display_name": "Python 3", 464 | "language": "python", 465 | "name": "python3" 466 | }, 467 | "language_info": { 468 | "codemirror_mode": { 469 | "name": "ipython", 470 | "version": 3 471 | }, 472 | "file_extension": ".py", 473 | "mimetype": "text/x-python", 474 | "name": "python", 475 | "nbconvert_exporter": "python", 476 | "pygments_lexer": "ipython3", 477 | "version": "3.7.3" 478 | } 479 | }, 480 | "nbformat": 4, 481 | "nbformat_minor": 2 482 | } 483 | -------------------------------------------------------------------------------- /jupyter/notebook/pyspark-intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import findspark\n", 10 | "findspark.init()" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from pyspark.sql import SparkSession\n", 20 | "spark = SparkSession.builder.appName(\"pyspark-intro\").getOrCreate()" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "spark.version" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "### Create Spark DataFrame" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "df = spark.read.csv(\"/dataset/yahoo-symbols-201709.csv\", header=True)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "df.count()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "df.printSchema()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "df.show()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "### DataFrame operations\n", 80 | "Show the top 20 categories by number of stocks" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "cats = df.groupby(df['Category Name']).count()\n", 90 | "cats.orderBy(cats['count'].desc()).show(truncate=False)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "### Stop the Driver Program\n", 98 | "Release resources on the Spark cluster" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "spark.stop()" 108 | ] 109 | } 110 | ],
111 | "metadata": { 112 | "kernelspec": { 113 | "display_name": "Python 3", 114 | "language": "python", 115 | "name": "python3" 116 | }, 117 | "language_info": { 118 | "codemirror_mode": { 119 | "name": "ipython", 120 | "version": 3 121 | }, 122 | "file_extension": ".py", 123 | "mimetype": "text/x-python", 124 | "name": "python", 125 | "nbconvert_exporter": "python", 126 | "pygments_lexer": "ipython3", 127 | "version": "3.5.3" 128 | } 129 | }, 130 | "nbformat": 4, 131 | "nbformat_minor": 2 132 | } 133 | -------------------------------------------------------------------------------- /jupyter/notebook/pyspark-postgres.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Connecting to Postgres\n", 8 | "This notebook shows how to pass JDBC driver and connect to our Postgres" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import findspark\n", 18 | "\n", 19 | "findspark.add_jars('/app/postgresql-42.1.4.jar')\n", 20 | "findspark.init()" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "from pyspark.sql import SparkSession\n", 30 | "\n", 31 | "spark = SparkSession.builder.appName(\"pyspark-postgres\").getOrCreate()" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 13, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "df = spark.read \\\n", 41 | " .format(\"jdbc\") \\\n", 42 | " .option(\"url\", \"jdbc:postgresql://postgres/workshop\") \\\n", 43 | " .option(\"dbtable\", \"workshop.stocks\") \\\n", 44 | " .option(\"user\", \"workshop\") \\\n", 45 | " .option(\"password\", \"w0rkzh0p\") \\\n", 46 | " .option(\"driver\", \"org.postgresql.Driver\") \\\n", 47 | " .load()" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 15, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "root\n", 60 | " |-- full_date: timestamp (nullable = true)\n", 61 | " |-- symbol: string (nullable = true)\n", 62 | " |-- category: string (nullable = true)\n", 63 | " |-- open: double (nullable = true)\n", 64 | " |-- high: double (nullable = true)\n", 65 | " |-- low: double (nullable = true)\n", 66 | " |-- close: double (nullable = true)\n", 67 | " |-- ma20: double (nullable = true)\n", 68 | " |-- ma50: double (nullable = true)\n", 69 | " |-- ma100: double (nullable = true)\n", 70 | "\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "df.printSchema()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [] 84 | } 85 | ], 86 | "metadata": { 87 | "kernelspec": { 88 | "display_name": "Python 3", 89 | "language": "python", 90 | "name": "python3" 91 | }, 92 | "language_info": { 93 | "codemirror_mode": { 94 | "name": "ipython", 95 | "version": 3 96 | }, 97 | "file_extension": ".py", 98 | "mimetype": "text/x-python", 99 | "name": "python", 100 | "nbconvert_exporter": "python", 101 | "pygments_lexer": "ipython3", 102 | "version": "3.7.3" 103 | } 104 | }, 105 | "nbformat": 4, 106 | "nbformat_minor": 2 107 | } 108 | -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/clase_ml.bib: -------------------------------------------------------------------------------- 1 | 
@Book{mitchell97, 2 | author = {Tom M Mitchell}, 3 | title = {Machine Learning}, 4 | publisher = {McGraw-Hill}, 5 | address = {New York, NY}, 6 | year = {1997} 7 | } 8 | 9 | @Book{james13, 10 | author = {Gareth James and Daniela Witten and Trevor Hastie and Robert Tibshirani}, 11 | title = {Introduction to Statistical Learning : with Applications in R}, 12 | publisher = {Springer}, 13 | address = {New York, NY}, 14 | year = {2013} 15 | } 16 | 17 | @book{efron16, 18 | author = {Efron, Bradley and Hastie, Trevor}, 19 | title = {Computer Age Statistical Inference: Algorithms, Evidence, and Data Science}, 20 | publisher = {Cambridge University Press}, 21 | address = {New York, NY, USA}, 22 | year = {2016}, 23 | } 24 | 25 | @Book{tukey77, 26 | author = {John W Tukey}, 27 | title = {Exploratory Data Analysis}, 28 | edition = {7}, 29 | publisher = {Addison Wesley}, 30 | address = {Reading, Massachusetts}, 31 | year = {1977} 32 | } 33 | -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/clase_ml.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/clase_ml.pdf -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/bias_variance_tradeoff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/bias_variance_tradeoff.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/bvt2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/bvt2.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/complexity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/complexity.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/confusion_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/confusion_matrix.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/corr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/corr.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/facet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/facet.png 
-------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/frontier.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/frontier.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/holdout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/holdout.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/kde.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/kde.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/logistic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/logistic.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/logo_mutt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/logo_mutt.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/one_hot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/one_hot.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/overfitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/overfitting.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/roc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/roc.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/run.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/sample_size.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/sample_size.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/supervised.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/supervised.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/table_variables.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/table_variables.pdf -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/table_variables.tex: -------------------------------------------------------------------------------- 1 | \documentclass{standalone} 2 | 3 | %-----------------------+ 4 | % Clean auxiliary files | 5 | %-----------------------+ 6 | % arara: clean: {files: [table_variables.aux, table_variables.log, table_variables.synctex.gz]} 7 | 8 | %------------------------------------+ 9 | % Language, hyphenation and encoding | 10 | %------------------------------------+ 11 | \usepackage{lmodern} % Use Latin Modern fonts 12 | % \renewcommand{\rmdefault}{\sfdefault} % Use beamer sans-serif font family 13 | \usepackage[T1]{fontenc} % Better output when a diacritic/accent is used 14 | \usepackage[utf8]{inputenc} % Allows to input accented characters 15 | 16 | %----------------+ 17 | % Table packages | 18 | %----------------+ 19 | \usepackage{array} % Flexible column formatting 20 | % \usepackage{spreadtab} % Spreadsheet features 21 | \usepackage{multirow} % Allows table cells that span more than one row 22 | \usepackage{booktabs} % Enhance quality of tables 23 | \setlength{\heavyrulewidth}{1pt} 24 | 25 | \usepackage{siunitx} % Typeset units correctly and define new column (S) 26 | \sisetup{detect-all,table-auto-round,input-symbols = {()}} 27 | % \robustify{\bfseries} % Correct alignment of bold numbers in tables 28 | 29 | % Table colors 30 | \usepackage[table,x11names]{xcolor} 31 | 32 | \begin{document} 33 | \begin{tabular}{lll} 34 | \toprule 35 | Variable & Descripción & Notas \\ 36 | \midrule 37 | survived & Condición de supervivencia & 1 = Sí, 0 = No\\ 38 | pclass & Tipo de ticket & 1 = Alto, 2 = Medio, 3 = Bajo \\ 39 | name & Nombre del pasajero & \\ 40 | sex & Sexo & \\ 41 | age & Edad en años & Fracción si < 1 y xx.5 si estimada\\ 42 | sibsp & \# hermanos y cónyuges a bordo & \\ 43 | parch & \# padres e hijos a bordo & Hijos con niñera tienen parch=0\\ 44 | ticket & \# de boleto & \\ 45 | fare & Precio del boleto & \\ 46 | cabin & \# de cabina & \\ 47 | embarked & Puerto de embarque & C=Cherbourg, Q=Queenstown, S=Southampton\\ 48 | boat & \# bote de rescate & \\ 49 | home.dest & Ciudad de origen & \\ 50 | body & \# de identificación del cadáver & \\ 51 | \bottomrule 52 | \end{tabular} 53 | \end{document} 54 | -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/titanic.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/titanic.jpg -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/tree.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/tree_regions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/tree_regions.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/tvt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/tvt.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/unbalance_class.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/unbalance_class.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/underfitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/underfitting.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/unsupervised.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/unsupervised.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/whatido.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/whatido.jpg -------------------------------------------------------------------------------- /nginx/html/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Workshop de Big Data con Apache Spark 4 | 5 | 6 | 7 | # Material del Workshop de Big Data 8 | 9 | ## Documentaci&#xF3;n 10 | Todo el material del curso est&#xE1; [disponible en Github](https://github.com/arjones/bigdata-workshop-es) 11 | 12 | ## Puertos y Servicios 13 | El listado abajo contiene los puertos para el acceso a las interfaces gr&#xE1;ficas de los servicios instalados.
14 | 15 | <table> 16 | <thead> 17 | <tr> 18 | <th>Servicio</th> 19 | <th>Acceso</th> 20 | <th>Notas</th> 21 | </tr> 22 | </thead> 23 | <tbody> 24 | <tr> 25 | <td>Spark</td> 26 | <td> 27 | <a href="/" onclick="javascript:event.target.port=8080" target="_blank">Spark Master</a> 28 | <br /><br /> 29 | <a href="/" onclick="javascript:event.target.port=4040" target="_blank">Job Progress</a> 30 | </td> 31 | <td><br /><br /><b>Job Progress</b> solo funciona cuando el job este activo</td> 32 | 33 | </tr> 34 | <tr> 35 | <td>pySpark</td> 36 | <td><a href="/" onclick="javascript:event.target.port=8888" target="_blank">Jupyter Notebook</a></td> 37 | <td>Token required, correr <code>./control-env.sh token</code></td> 38 | </tr> 39 | <tr> 40 | <td>Superset</td> 41 | <td><a href="/" onclick="javascript:event.target.port=8088" target="_blank">Dashboard</a></td> 42 | <td>Username/Password creado durante superset-init </td> 43 | </tr> 44 | </tbody> 45 | </table> 46 | 47 | --- 48 | ## Sobre 49 | Gustavo Arjones &copy; 2017-2020 50 | [arjon.es](https://arjon.es) | [LinkedIn](https://linkedin.com/in/arjones/) | [Twitter](https://twitter.com/arjones) 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /postgres/scripts/init.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA workshop; 2 | 3 | DROP TABLE IF EXISTS stocks; 4 | CREATE TABLE stocks ( 5 | full_date timestamptz NOT NULL, 6 | symbol varchar(10) NOT NULL, 7 | category varchar(64) NOT NULL, 8 | open double precision NOT NULL, 9 | high double precision NOT NULL, 10 | low double precision NOT NULL, 11 | close double precision NOT NULL, 12 | MA20 double precision NOT NULL, 13 | MA50 double precision NOT NULL, 14 | MA100 double precision NOT NULL, 15 | PRIMARY KEY(full_date, symbol) 16 | ); 17 | -------------------------------------------------------------------------------- /scala/README.md: -------------------------------------------------------------------------------- 1 | # Databricks Notebook 2 | 3 | 1. Crear una cuenta en [Databricks | COMMUNITY EDITION](https://databricks.com/try-databricks) 4 | 5 | 2. Importar el Notebook: 6 | 7 | ![](databricks-import-notebook-1.png) 8 | 9 | ![](databricks-import-notebook-2.png) 10 | 11 | 3. 
Agregar la URL: `https://raw.githubusercontent.com/arjones/bigdata-workshop-es/master/scala/Day%201%20-%20Scala%20Intro.html` 12 | 13 | # Siga leyendo 14 | * [Batch Processing](README-batch.md) 15 | 16 | ## Sobre 17 | Gustavo Arjones © 2017-2020 18 | [arjon.es](https://arjon.es) | [LinkedIn](http://linkedin.com/in/arjones/) | [Twitter](https://twitter.com/arjones) 19 | -------------------------------------------------------------------------------- /scala/databricks-import-notebook-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/scala/databricks-import-notebook-1.png -------------------------------------------------------------------------------- /scala/databricks-import-notebook-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/scala/databricks-import-notebook-2.png -------------------------------------------------------------------------------- /spark/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM openjdk:8-jre-slim 2 | 3 | WORKDIR /app 4 | 5 | ARG SPARK_VERSION=2.4.5 6 | 7 | ENV SPARK_HOME /opt/spark 8 | 9 | RUN apt-get update && \ 10 | apt-get install -y wget ca-certificates procps && \ 11 | wget http://apache.dattatec.com/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop2.7.tgz -O - | tar zx -C /opt && \ 12 | ln -s /opt/spark-${SPARK_VERSION}-bin-hadoop2.7 /opt/spark && \ 13 | sed 's/INFO/ERROR/g' /opt/spark/conf/log4j.properties.template > /opt/spark/conf/log4j.properties && \ 14 | echo "\nexport PATH=\${PATH}:/opt/spark/bin" >> /etc/bash.bashrc && \ 15 | echo "\nexport SPARK_NO_DAEMONIZE=1" >> /etc/bash.bashrc && \ 16 | rm -rf /var/lib/apt/lists/* 17 | 18 | CMD ["/opt/spark/bin/spark-shell"] 19 | 20 | EXPOSE 8080 8081 4040 4041 21 | -------------------------------------------------------------------------------- /spark/Dockerfile.pyspark: -------------------------------------------------------------------------------- 1 | ARG SPARK_VERSION=2.4.5 2 | FROM arjones/spark:${SPARK_VERSION} 3 | 4 | WORKDIR /notebook 5 | 6 | COPY requirements.txt /tmp/ 7 | 8 | RUN apt-get update && \ 9 | apt-get --no-install-recommends --no-install-suggests install -y \ 10 | python3 python3-pip python3-setuptools python3-distutils && \ 11 | update-alternatives --install /usr/bin/python python /usr/bin/python3.7 10 && \ 12 | pip3 install --no-cache-dir --default-timeout=120 -r /tmp/requirements.txt && \ 13 | apt-get autoremove -y && \ 14 | rm -rvf /tmp/requirements.txt /var/lib/apt/lists/* 15 | 16 | CMD [ "/usr/local/bin/jupyter", "notebook", "--allow-root", "--no-browser", "--ip=0.0.0.0"] 17 | 18 | EXPOSE 8888 19 | -------------------------------------------------------------------------------- /spark/INSTALL.md: -------------------------------------------------------------------------------- 1 | # Install notes 2 | 3 | ## Trying new libs 4 | A quick & dirty way to try new libs in pySpark would be: 5 | 6 | ``` 7 | cd jupyter/notebook 8 | docker run -it -p8888:8888 -w /notebook -v $PWD:/notebook arjones/pyspark:2.4.4 bash 9 | 10 | apt-get update && \ 11 | apt-get --no-install-recommends --no-install-suggests install -y \ 12 | python3-pip && \ 13 | pip3 install gensim 14 | 15 | ``` 16 | 17 | After detecting all dependencies, you can include it on `Dockerfile` 
definitions and rebuild the image. -------------------------------------------------------------------------------- /spark/README.md: -------------------------------------------------------------------------------- 1 | # Apache Spark Image 2 | 3 | How to build and push the Spark Image: 4 | 5 | ```bash 6 | export SPARK_VERSION=2.4.5 7 | 8 | docker build \ 9 | --build-arg SPARK_VERSION=${SPARK_VERSION} \ 10 | -t arjones/spark:${SPARK_VERSION} . 11 | 12 | docker build \ 13 | -f Dockerfile.pyspark \ 14 | --build-arg SPARK_VERSION=${SPARK_VERSION} \ 15 | -t arjones/pyspark:${SPARK_VERSION} . 16 | 17 | docker push arjones/spark:${SPARK_VERSION} 18 | docker push arjones/pyspark:${SPARK_VERSION} 19 | ``` 20 | 21 | ## Sobre 22 | Gustavo Arjones © 2017-2020 23 | [arjon.es](https://arjon.es) | [LinkedIn](http://linkedin.com/in/arjones/) | [Twitter](https://twitter.com/arjones) 24 | -------------------------------------------------------------------------------- /spark/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | findspark 3 | pandas 4 | matplotlib 5 | seaborn 6 | 7 | # gensim install 8 | wheel 9 | gensim 10 | 11 | # https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html#apache-arrow-in-spark 12 | pyarrow==0.14.1 13 | 14 | # Deps for fake_stock_price_gen 15 | kafka-python==2.0.1 -------------------------------------------------------------------------------- /superset/conf/superset_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | ROW_LIMIT = 5000 4 | WEBSERVER_THREADS = 8 5 | SUPERSET_WEBSERVER_PORT = 8088 6 | SUPERSET_WEBSERVER_TIMEOUT = 60 7 | SECRET_KEY = '1dBL8QOCromAwD0nEZijWL7vvgGJHm0WPNxpUlJFGQA6fSQaG4dhj5sPvCZ7KX' 8 | CACHE_CONFIG = { 9 | 'CACHE_TYPE': 'redis', 10 | 'CACHE_DEFAULT_TIMEOUT': 300, 11 | 'CACHE_KEY_PREFIX': 'superset_', 12 | 'CACHE_REDIS_HOST': 'redis', 13 | 'CACHE_REDIS_PORT': 6379, 14 | 'CACHE_REDIS_DB': 1, 15 | 'CACHE_REDIS_URL': 'redis://redis:6379/1'} 16 | SQLALCHEMY_DATABASE_URI = 'sqlite:////var/lib/superset/superset.db' 17 | SQLALCHEMY_TRACK_MODIFICATIONS = True 18 | WTF_CSRF_ENABLED = True 19 | WTF_CSRF_EXEMPT_LIST = [] 20 | MAPBOX_API_KEY = os.getenv('MAPBOX_API_KEY', '') 21 | -------------------------------------------------------------------------------- /vm/README.md: -------------------------------------------------------------------------------- 1 | # Creando VM de cero 2 | 3 | * Download una version de Ubuntu Desktop 4 | 5 | `wget -c http://releases.ubuntu.com/18.04/ubuntu-18.04.3-desktop-amd64.iso` 6 | 7 | ## Configurar VirtualBox 8 | 9 | * Instalar [VirtualBox](https://www.virtualbox.org) 10 | * Configurar disco de **>= 20Gb** y **8Gb RAM** 11 | * Configurar red: Settings > Network > **Port Forwarding** 12 | 13 | ![virtualbox-port-forwarding](virtualbox-port-forwarding.png) 14 | 15 | * Instalar la VM 16 | * Abrir la terminal y ejecutar los comandos de [install-script.sh](install-script.sh) 17 | 18 | ## Acceso por SSH 19 | 20 | * Despues de habilitar SSHD (corriendo script arriba) se puede acceder a la VM por SSH: `ssh analyst@localhost -p 2222` 21 | 22 | 23 | ![](vm-0.png) 24 | 25 | ![](vm-1.png) 26 | 27 | ![](vm-2.png) 28 | 29 | ![](vm-3.png) 30 | 31 | ![](vm-4.png) 32 | 33 | ![](vm-5.png) 34 | 35 | ## Sobre 36 | Gustavo Arjones © 2017-2020 37 | [arjon.es](https://arjon.es) | [LinkedIn](http://linkedin.com/in/arjones/) | [Twitter](https://twitter.com/arjones) 38 | 
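## Extra: port forwarding from the host CLI (optional)

The port-forwarding rule shown in the screenshot above can also be created with `VBoxManage` from the host machine. This is only a minimal sketch under assumptions: the VM is powered off and is registered under the placeholder name `bigdata-vm` (use the real name reported by `VBoxManage list vms`).

```bash
# List the registered VMs to find the exact name to use below
VBoxManage list vms

# NAT port-forwarding rule: host port 2222 -> guest port 22 (SSH)
# "bigdata-vm" is a placeholder name; run this while the VM is powered off
VBoxManage modifyvm "bigdata-vm" --natpf1 "guestssh,tcp,,2222,,22"

# Verify the rule, then connect as described above
VBoxManage showvminfo "bigdata-vm" | grep -i rule
ssh analyst@localhost -p 2222
```

If the VM is already running, `VBoxManage controlvm "bigdata-vm" natpf1 "guestssh,tcp,,2222,,22"` should add the same rule without a reboot (again, an assumption to verify against your VirtualBox version).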
-------------------------------------------------------------------------------- /vm/install-docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # SSHD 4 | sudo apt-get update 5 | 6 | # Docker 7 | sudo apt-get install -y \ 8 | apt-transport-https \ 9 | ca-certificates \ 10 | curl \ 11 | software-properties-common 12 | 13 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - 14 | 15 | sudo apt-key fingerprint 0EBFCD88 16 | 17 | sudo add-apt-repository \ 18 | "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ 19 | $(lsb_release -cs) \ 20 | stable" 21 | 22 | sudo apt-get update 23 | sudo apt-get install -y docker-ce 24 | 25 | sudo groupadd docker 26 | sudo usermod -aG docker "${USER}" 27 | 28 | sudo systemctl enable docker 29 | 30 | # Docker Compose 31 | sudo curl -L https://github.com/docker/compose/releases/download/1.24.1/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose 32 | sudo chmod +x /usr/local/bin/docker-compose 33 | 34 | # Cleanup 35 | sudo apt-get -y autoremove 36 | 37 | ############################################# 38 | # 39 | echo '====================================' 40 | echo 41 | echo 'You need to logout and restart again' 42 | echo 43 | echo '====================================' 44 | -------------------------------------------------------------------------------- /vm/install-script.sh: -------------------------------------------------------------------------------- 1 | # Enables sudo without passwd 2 | echo "%sudo ALL=(ALL) NOPASSWD: ALL" | sudo tee -a /etc/sudoers 3 | 4 | # SSHD 5 | sudo apt-get update 6 | sudo apt-get install -y openssh-server 7 | 8 | # Remove/Install JVM 9 | sudo apt-get -y remove openjdk* 10 | sudo apt-get -y remove --auto-remove openjdk* 11 | sudo apt-get -y purge openjdk* 12 | 13 | sudo apt-get install -y openjdk-8-jdk 14 | 15 | # Git 16 | sudo apt-get install -y git \ 17 | maven 18 | 19 | ## Scala 20 | # IMPORTANT: Make sure scala version is the same as Spark 21 | # have been compiled to. 
Run spark-shell 22 | # 23 | sudo apt-get -y remove --auto-remove scala-library scala 24 | sudo apt-get -y purge scala-library* scala* 25 | 26 | sudo wget https://downloads.lightbend.com/scala/2.11.12/scala-2.11.12.deb 27 | sudo dpkg -i scala-2.11.12.deb 28 | sudo apt-get update 29 | sudo apt-get -y install scala 30 | 31 | # SBT 32 | echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list 33 | sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823 34 | sudo apt-get update 35 | sudo apt-get install -y sbt 36 | 37 | # Docker 38 | sudo apt-get install -y \ 39 | apt-transport-https \ 40 | ca-certificates \ 41 | curl \ 42 | software-properties-common 43 | 44 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - 45 | 46 | sudo apt-key fingerprint 0EBFCD88 47 | 48 | sudo add-apt-repository \ 49 | "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ 50 | $(lsb_release -cs) \ 51 | stable" 52 | 53 | sudo apt-get update 54 | sudo apt-get install -y docker-ce 55 | 56 | sudo groupadd docker 57 | sudo usermod -aG docker "${USER}" 58 | 59 | # Docker Compose 60 | sudo curl -L https://github.com/docker/compose/releases/download/1.24.1/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose 61 | sudo chmod +x /usr/local/bin/docker-compose 62 | 63 | # Cleanup 64 | sudo apt-get -y autoremove 65 | 66 | 67 | ############################################# 68 | # 69 | # Course Material 70 | cd ~ 71 | git clone https://github.com/arjones/bigdata-workshop-es.git 72 | 73 | cd bigdata-workshop-es 74 | docker-compose pull 75 | -------------------------------------------------------------------------------- /vm/virtualbox-port-forwarding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/vm/virtualbox-port-forwarding.png -------------------------------------------------------------------------------- /vm/vm-0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/vm/vm-0.png -------------------------------------------------------------------------------- /vm/vm-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/vm/vm-1.png -------------------------------------------------------------------------------- /vm/vm-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/vm/vm-2.png -------------------------------------------------------------------------------- /vm/vm-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/vm/vm-3.png -------------------------------------------------------------------------------- /vm/vm-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/vm/vm-4.png -------------------------------------------------------------------------------- /vm/vm-5.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/vm/vm-5.png --------------------------------------------------------------------------------