├── .gitignore ├── INSTALL.md ├── LICENSE ├── README-batch.md ├── README-ml.md ├── README-pyspark.md ├── README-streaming.md ├── README-superset.md ├── README.md ├── airflow ├── .gitignore ├── dags │ ├── random_number_dag.py │ ├── requests_example.py │ ├── solution_random_number_dag.py │ ├── solution_stocks_dag.py │ ├── spark_job.py │ ├── sqlite_cli.py │ ├── stocks_dag.py │ └── word_count_dag.py └── docs │ ├── clase_airflow.pdf │ ├── clase_airflow.tex │ └── figures │ ├── airflow.png │ ├── airflow_architecture.png │ ├── airflow_ui.png │ ├── dag_graph_view.png │ ├── example_bash_operator.png │ ├── exercise_1.png │ ├── exercise_2.png │ ├── exercise_3.png │ ├── exercise_4.png │ └── logo_mutt.png ├── code ├── postgresql-42.1.4.jar ├── python │ ├── introduction │ │ ├── ejercicios │ │ │ ├── alice.txt │ │ │ ├── list.py │ │ │ ├── small.txt │ │ │ ├── string.py │ │ │ └── wordcount.py │ │ ├── hello.py │ │ └── introduccion_a_python.pdf │ └── us-stock-analysis │ │ ├── .gitignore │ │ ├── README.md │ │ ├── requirements.txt │ │ └── src │ │ ├── batch │ │ └── etl_steps.py │ │ ├── examples │ │ ├── first_example.py │ │ └── postgres_example.py │ │ └── stream │ │ ├── etl_stream.py │ │ └── fake_stock_price_generator.py └── scala │ ├── credit-risk-analysis │ ├── .gitignore │ ├── README.md │ ├── build.sbt │ ├── project │ │ ├── assembly.sbt │ │ └── build.properties │ └── src │ │ └── main │ │ └── scala │ │ └── es │ │ └── arjon │ │ ├── CreditRiskAnalysis.scala │ │ ├── CreditRiskTrain.scala │ │ └── DatasetUtil.scala │ └── us-stock-analysis │ ├── .gitignore │ ├── README.md │ ├── build.sbt │ ├── project │ ├── assembly.sbt │ └── build.properties │ └── src │ └── main │ ├── resources │ └── log4j.properties │ └── scala │ └── es │ └── arjon │ ├── EtlSteps.scala │ ├── FakeStockPriceGenerator.scala │ └── StreamingETL.scala ├── control-env.sh ├── dataset ├── .gitignore ├── credit-risk │ ├── germancredit-user-input.csv │ └── germancredit.csv ├── global-temperature-1880-2016.json ├── news │ └── huffingtonpost-news.json.gz ├── pyspark-df-overview │ ├── README.md │ └── census_income.csv.gz ├── stocks-small │ ├── aapl.us.txt │ ├── baba.us.txt │ ├── csco.us.txt │ ├── dhr.us.txt │ ├── ebay.us.txt │ ├── fb.us.txt │ ├── goog.us.txt │ ├── googl.us.txt │ ├── ibm.us.txt │ ├── intc.us.txt │ ├── jnj.us.txt │ ├── meli.us.txt │ ├── msft.us.txt │ ├── orcl.us.txt │ ├── qcom.us.txt │ ├── tsla.us.txt │ ├── txn.us.txt │ ├── wdc.us.txt │ └── xrx.us.txt ├── stocks │ └── README.md ├── titanic.csv └── yahoo-symbols-201709.csv ├── docker-compose.yml ├── images ├── docker-advanced-config.jpg ├── superset-01.png ├── superset-02.png ├── superset-03.png ├── superset-04.png ├── superset-05.png ├── superset-06.png ├── superset-07.png ├── superset-08.png ├── superset-09.png ├── superset-10.png ├── superset-11.png └── superset.png ├── jupyter └── notebook │ ├── README.md │ ├── batch_etl_steps.ipynb │ ├── pandas-json-sample.ipynb │ ├── pyspark-apache-arrow.ipynb │ ├── pyspark-check-install.ipynb │ ├── pyspark-dataframe-overview.ipynb │ ├── pyspark-intro.ipynb │ ├── pyspark-nlp.ipynb │ ├── pyspark-postgres.ipynb │ └── titanic │ ├── docs │ ├── clase_ml.bib │ ├── clase_ml.pdf │ ├── clase_ml.tex │ └── figures │ │ ├── bias_variance_tradeoff.png │ │ ├── bvt2.png │ │ ├── complexity.png │ │ ├── confusion_matrix.png │ │ ├── corr.png │ │ ├── facet.png │ │ ├── frontier.png │ │ ├── holdout.png │ │ ├── kde.png │ │ ├── logistic.png │ │ ├── logo_mutt.png │ │ ├── one_hot.png │ │ ├── overfitting.png │ │ ├── roc.png │ │ ├── run.png │ │ ├── sample_size.png │ │ ├── supervised.png │ 
│ ├── table_variables.pdf │ │ ├── table_variables.tex │ │ ├── titanic.jpg │ │ ├── tree.png │ │ ├── tree_regions.png │ │ ├── tvt.png │ │ ├── unbalance_class.png │ │ ├── underfitting.png │ │ ├── unsupervised.png │ │ └── whatido.jpg │ ├── titanic_spark_exercises.ipynb │ └── titanic_spark_solutions.ipynb ├── nginx └── html │ └── index.html ├── postgres └── scripts │ └── init.sql ├── scala ├── Day 1 - Scala Intro.html ├── README.md ├── databricks-import-notebook-1.png └── databricks-import-notebook-2.png ├── spark ├── Dockerfile ├── Dockerfile.pyspark ├── INSTALL.md ├── README.md └── requirements.txt ├── superset └── conf │ └── superset_config.py └── vm ├── README.md ├── install-docker.sh ├── install-script.sh ├── virtualbox-port-forwarding.png ├── vm-0.png ├── vm-1.png ├── vm-2.png ├── vm-3.png ├── vm-4.png └── vm-5.png /.gitignore: -------------------------------------------------------------------------------- 1 | # General 2 | .DS_Store 3 | mnt/* 4 | *.iso 5 | *.ova 6 | 7 | # Spark 8 | checkpoint 9 | derby.log 10 | metastore_db 11 | streaming.parquet 12 | dataset/output.parquet/ 13 | 14 | # Superset Dashboard 15 | superset/conf/.setup-complete 16 | superset/conf/superset.db 17 | 18 | # Python 19 | .ipynb_checkpoints 20 | __pycache__ 21 | dataset/titanic_* 22 | .idea/ 23 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | # Instrucciones de Instalación 2 | 3 | **IMPORTANTE:** Cualquier opción de instalación requeriere por lo menos 8Gb de RAM para un correcto funcionamiento. 4 | 5 | ## Utilizando Windows 6 | 7 | Si su computadora es Windows debe utilizar la Virtual Machine, algunas computadoras más antiguas no soporta virtualización, por lo cual [VirtualBox](https://www.virtualbox.org/) no funciona. Tampoco funciona adecuadamente **Docker on Windows**. 8 | 9 | Se puede generar una Virtual Machine de cero siguiendo las instrucciones acá: [Virtual Box - INSTALL](./vm). La virtual machine completa pesa 15Gb y 5.2Gb cuando comprimida con `gzip -9`. 
10 | 11 | ## Utilizando MacOSX 12 | 13 | Para compilar y correr el codigo adecuadamente en MacOSX es necesario instalar varias dependencias, acá pueden encontrar las instrucciones para instalar todas las dependencias necesarias: [Setting up Macbook Pro for Development](https://arjon.es/2019/setting-up-macbook-pro-for-development/) 14 | 15 | Al finalizar la instalación clonar el repositorio: 16 | 17 | ```shell 18 | git clone https://github.com/arjones/bigdata-workshop-es.git 19 | 20 | cd bigdata-workshop-es 21 | 22 | ./control-env.sh start 23 | ``` 24 | 25 | ## Sobre 26 | Gustavo Arjones © 2017-2020 27 | [arjon.es](https://arjon.es) | [LinkedIn](http://linkedin.com/in/arjones/) | [Twitter](https://twitter.com/arjones) 28 | -------------------------------------------------------------------------------- /README-batch.md: -------------------------------------------------------------------------------- 1 | # Workshop de Big Data con Apache Spark [🇪🇸] 2 | Material del Workshop de Big Data 3 | 4 | ## Batch Processing 5 | 6 | ## Codigo 7 | * [Analisis de acciones de EEUU](code/scala/us-stock-analysis) (Scala) 8 | * [Analisis de acciones de EEUU](code/python/us-stock-analysis) (Python) 9 | * [us-stock-analysis Jupyter Notebook](jupyter/notebook/batch_etl_steps.ipynb) (Python) 10 | 11 | ## Compilar el codigo 12 | Compilar y empaquetar el codigo para deploy en el cluster 13 | 14 | ```bash 15 | cd code/us-stock-analysis 16 | sbt clean assembly 17 | ``` 18 | 19 | ## Submit de un job 20 | Conectarse al Spark-Master y hacer submit del programa 21 | 22 | ```bash 23 | docker exec -it master bash 24 | 25 | cd /app/us-stock-analysis 26 | spark-submit --master 'spark://master:7077' \ 27 | --class "es.arjon.RunAll" \ 28 | --driver-class-path /app/postgresql-42.1.4.jar \ 29 | target/scala-2.11/us-stock-analysis-assembly-0.1.jar \ 30 | /dataset/stocks-small /dataset/yahoo-symbols-201709.csv /dataset/output.parquet 31 | ``` 32 | Acceder a http://localhost:8080 y http://localhost:4040 para ver la SPARK-UI 33 | 34 | Verificar el resultado del job en la carpeta `/dataset/output.parquet`: 35 | 36 | ```bash 37 | # Desde la maquina host 38 | $ tree ~/bigdata-workshop-es/dataset/output.parquet/ 39 | ``` 40 | 41 | ## Usando Spark-SQL 42 | Usando SparkSQL para acceder a los datos en Parquet y hacer analysis interactiva. 43 | 44 | ```bash 45 | docker exec -it master bash 46 | spark-shell 47 | ``` 48 | 49 | ```scala 50 | // reduce log noise 51 | sc.setLogLevel("ERROR") 52 | 53 | import spark.implicits._ 54 | val df = spark.read.parquet("/dataset/output.parquet") 55 | df.show 56 | df.printSchema 57 | 58 | df.createOrReplaceTempView("stocks") 59 | 60 | // No usando particiones 61 | val badHighestClosingPrice = spark.sql("SELECT symbol, MAX(close) AS price FROM stocks WHERE full_date >= '2017-09-01' AND full_date < '2017-10-01' GROUP BY symbol") 62 | badHighestClosingPrice.explain 63 | badHighestClosingPrice.show 64 | 65 | // Optimizando con particiones 66 | val highestClosingPrice = spark.sql("SELECT symbol, MAX(close) AS price FROM stocks WHERE year=2017 AND month=9 GROUP BY symbol") 67 | highestClosingPrice.explain 68 | highestClosingPrice.show 69 | ``` 70 | 71 | ## Ver los datos en Postgres 72 | El batch job también escribe una tabla `stocks` en Postgres que se puede acceder: 73 | 74 | ``` 75 | # abrir otra consola 76 | 77 | docker exec -it postgres bash 78 | 79 | psql -U workshop workshop 80 | workshop=# \d 81 | ... 82 | ... 
83 | 84 | workshop=# SELECT * FROM stocks LIMIT 10; 85 | ``` 86 | 87 | ## Creando un Dashboard con Superset 88 | 89 | * [Como configurar Superset](./README-superset.md) 90 | * [Sitio Oficial Superset](https://superset.apache.org/) 91 | 92 | 93 | ## Siga leyendo 94 | * [Structured Streaming Processing](README-streaming.md) 95 | 96 | 97 | ____ 98 | Gustavo Arjones © 2017-2020 99 | -------------------------------------------------------------------------------- /README-ml.md: -------------------------------------------------------------------------------- 1 | # Workshop de Big Data con Apache Spark [🇪🇸] 2 | Material del Workshop de Big Data 3 | 4 | ## Machine Learning Lib 5 | Usando un dataset de [Credito Alemán](https://archive.ics.uci.edu/ml/datasets/Statlog+(German+Credit+Data)) se entrenará un algoritmo de [Clasificación Random Forest](https://spark.apache.org/docs/2.4.4/ml-classification-regression.html#random-forest-classifier) y se buscará predecir el valor `Creditable`, que significa **brindar credito**. 6 | 7 | ## Codigo 8 | * [Analisis de riesgo de credito](code/scala/credit-risk-analysis) (credit-risk-analysis) 9 | 10 | ## Realizar el entrenamiento 11 | La clase [CreditRiskTrain.scala](code/scala/credit-risk-analysis/src/main/scala/es/arjon/CreditRiskTrain.scala) hace las transformaciones de los datos de entrada para generar el modelo de Random Forest. También intentamos mejorar el modelo utilizando [CrossValidator](https://spark.apache.org/docs/2.4.4/ml-tuning.html#cross-validation). 12 | 13 | ```bash 14 | # Compilar el proyecto 15 | cd code/scala/credit-risk-analysis 16 | sbt clean assembly 17 | 18 | # Conectarse al SparkMaster y hacer submit del proyecto de Entrenamiento 19 | docker exec -it master bash 20 | cd /app/credit-risk-analysis 21 | spark-submit \ 22 | --class es.arjon.CreditRiskTrain \ 23 | --master 'spark://master:7077' \ 24 | target/scala-2.11/credit-risk-analysis-assembly-0.1.jar \ 25 | /dataset/credit-risk/germancredit.csv \ 26 | /dataset/credit-risk.model 27 | 28 | # va a tomar 4+ minutos para concluir el entrenamiento 29 | ``` 30 | 31 | Acceder a http://localhost:8080 y http://localhost:4040 para ver la SPARK-UI. 32 | 33 | ## Chequeá el modelo entrenado 34 | ```bash 35 | ls -la /dataset/credit-risk.model 36 | ``` 37 | 38 | ## Realizando predicciones 39 | El archivo `/dataset/credit-risk/germancredit-user-input.csv` simula la entrada de usuarios con sus respectivos datos, que son enviados al modelo para obtener la predicción.
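A modo ilustrativo, un bosquejo mínimo en PySpark de lo que hace conceptualmente el paso de predicción: cargar el modelo entrenado y aplicarlo sobre los datos nuevos. El proyecto del workshop está implementado en Scala ([CreditRiskAnalysis.scala](code/scala/credit-risk-analysis/src/main/scala/es/arjon/CreditRiskAnalysis.scala)); acá se asume que el modelo guardado es un `PipelineModel` de Spark ML y que el CSV trae header, ambas cosas son una suposición.

```python
from pyspark.ml import PipelineModel
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("credit-risk-predict").getOrCreate()

# Cargar el modelo generado por el entrenamiento (se asume un PipelineModel)
model = PipelineModel.load("/dataset/credit-risk.model")

# Leer la entrada simulada de usuarios
user_input = spark.read.csv(
    "/dataset/credit-risk/germancredit-user-input.csv",
    header=True, inferSchema=True,
)

# transform() agrega las columnas de predicción del pipeline
predictions = model.transform(user_input)
predictions.select("prediction", "probability").show(truncate=False)
```

El submit del job de predicción se hace igual que el de entrenamiento: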
40 | 41 | ```bash 42 | spark-submit \ 43 | --class es.arjon.CreditRiskAnalysis \ 44 | --master 'spark://master:7077' \ 45 | target/scala-2.11/credit-risk-analysis-assembly-0.1.jar \ 46 | /dataset/credit-risk/germancredit-user-input.csv \ 47 | /dataset/credit-risk.model 48 | ``` 49 | 50 | Acceder a http://localhost:8080 y http://localhost:4040 para ver la SPARK-UI 51 | 52 | ### Desafío 🤓 53 | Modificar el codigo para tomar la entrada de **Kafka** y escribir en **Postgres** 54 | 55 | 56 | ## Más información 57 | * [Predicting Loan Credit Risk using Apache Spark Machine Learning Random Forests](https://mapr.com/blog/predicting-loan-credit-risk-using-apache-spark-machine-learning-random-forests/) 58 | * [Original: Analysis of German Credit Data](https://onlinecourses.science.psu.edu/stat857/node/215) 59 | 60 | ____ 61 | Gustavo Arjones © 2017-2020 62 | -------------------------------------------------------------------------------- /README-pyspark.md: -------------------------------------------------------------------------------- 1 | # Usando `pySpark`: 2 | 3 | ## Consola 4 | 5 | ```bash 6 | docker exec -it master bash 7 | root@588acf96a879:/app# pyspark 8 | ``` 9 | ```python 10 | file = spark.read.text("/dataset/yahoo-symbols-201709.csv") 11 | file.count() 12 | for line in file.take(10): 13 | print(line) 14 | ``` 15 | 16 | ## Usando Jupyter Notebook 17 | Acceda al [Jupyter Notebook aqui](http://localhost:8888/), los notebook disponibles en ese workshop [están en Github](https://github.com/arjones/bigdata-workshop-es/tree/master/jupyter/notebook) 18 | 19 | ## Material de lectura: 20 | 21 | * [Apache Spark in Python: Beginner's Guide](https://www.datacamp.com/community/tutorials/apache-spark-python) 22 | * [Introduction to PySpark](https://www.datacamp.com/courses/introduction-to-pyspark) 23 | * [pySpark: Evaluating the machine learning model](https://www.datacamp.com/community/tutorials/apache-spark-tutorial-machine-learning) 24 | 25 | 26 | ## Visualización de Datos 27 | 28 | * [Python Data Visualization with Matplotlib](https://stackabuse.com/python-data-visualization-with-matplotlib/) 29 | * [Top 50 matplotlib Visualizations](https://www.machinelearningplus.com/plots/top-50-matplotlib-visualizations-the-master-plots-python/) 30 | * [Seaborn Library for Data Visualization in Python: Part 1](https://stackabuse.com/seaborn-library-for-data-visualization-in-python-part-1/) 31 | 32 | 33 | ____ 34 | Gustavo Arjones © 2017-2020 35 | -------------------------------------------------------------------------------- /README-streaming.md: -------------------------------------------------------------------------------- 1 | # Workshop de Big Data con Apache Spark [🇪🇸] 2 | Material del Workshop de Big Data 3 | 4 | ## Structured Streaming Processing 5 | El simulador publica información sobre acciones y sus precios en una cola Kafka que es consumida por Spark. 
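Para fijar la idea antes de correr los pasos siguientes, un bosquejo mínimo (en PySpark, solo a modo ilustrativo; el job real del workshop es `StreamingETL.scala`, en Scala) de cómo se consume el tópico `stocks` de Kafka con Structured Streaming. El esquema del mensaje es una suposición, y el job requiere el paquete `spark-sql-kafka` al hacer el submit.

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import DoubleType, StringType, StructType, TimestampType

spark = SparkSession.builder.appName("streaming-sketch").getOrCreate()

# Esquema hipotético del JSON que publica el simulador
schema = (
    StructType()
    .add("symbol", StringType())
    .add("timestamp", TimestampType())
    .add("price", DoubleType())
)

# Leer el tópico `stocks` desde Kafka
raw = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "stocks")
    .load()
)

# value llega como bytes: castear a string y parsear el JSON
stocks = (
    raw.selectExpr("CAST(value AS STRING) AS json")
    .select(from_json(col("json"), schema).alias("data"))
    .select("data.*")
)

# Sink de consola, solo para verificar que llegan datos
query = stocks.writeStream.format("console").outputMode("append").start()
query.awaitTermination()
```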
6 | 7 | ## Codigo 8 | * [Analisis de acciones de EEUU](code/scala/us-stock-analysis) (US Stocks) 9 | 10 | ## Iniciar el simulador de acciones 11 | Dentro del mismo package tenemos la clase del simulador [FakeStockPriceGenerator](./code/scala/us-stock-analysis/src/main/scala/es/arjon/FakeStockPriceGenerator.scala). 12 | 13 | ```bash 14 | # Compilar el simulador 15 | cd code/scala/us-stock-analysis 16 | sbt clean assembly 17 | 18 | # Ejecutarlo dentro de un Worker 19 | docker exec -it worker1 bash 20 | cd /app/us-stock-analysis 21 | java -cp target/scala-2.11/us-stock-analysis-assembly-0.1.jar \ 22 | "es.arjon.FakeStockPriceGenerator" kafka:9092 stocks 23 | ``` 24 | 25 | ## Chequear el contenido de Kafka 26 | 27 | ```bash 28 | docker exec -it kafka bash 29 | 30 | /opt/kafka_2.11-0.10.1.0/bin/kafka-console-consumer.sh \ 31 | --bootstrap-server kafka:9092 --topic stocks --from-beginning 32 | 33 | # apretar CTRL+C para salir 34 | ``` 35 | 36 | ## Submit de un job 37 | Conectarse al Spark-Master y hacer submit del programa. 38 | 39 | **NOTA:** Utilizar `--total-executor-cores` con la mitad de los cores de tu computadora, ej: si tiene 4 cores, utilizar `2`. 40 | 41 | ```bash 42 | docker exec -it master bash 43 | 44 | cd /app/us-stock-analysis 45 | spark-submit --master 'spark://master:7077' \ 46 | --class "es.arjon.StreamingETL" \ 47 | --total-executor-cores 1 \ 48 | target/scala-2.11/us-stock-analysis-assembly-0.1.jar \ 49 | kafka:9092 stocks 50 | ``` 51 | Acceder a http://localhost:8080 y http://localhost:4040 para ver la SPARK-UI. 52 | 53 | ## En otra consola, acceder al dataset de Streaming 54 | ```bash 55 | docker exec -it master bash 56 | spark-shell --total-executor-cores 1 57 | ``` 58 | 59 | ```scala 60 | import spark.implicits._ 61 | val df = spark.read.parquet("/dataset/streaming.parquet") 62 | df.show 63 | ``` 64 | 65 | ## Utilizar Spark SQL y el Sink in Memory 66 | 67 | En el archivo `StreamingETL.scala` comentar las líneas 71 a la 85 para evitar que se escriba en el archivo de output Parquet y descomentar las líneas de código del 90 al 103. 68 | 69 | Compilar la aplicación de nuevo con: 70 | 71 | ```bash 72 | sbt assembly 73 | ``` 74 | 75 | Probar y observar el output por consola. 76 | 77 | Luego comentar las líneas 98 a 103 y descomentar de la 106 a la 121, compilar, ejecutar y probar. ¿Qué diferencia observa?
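La idea del sink in memory, en un bosquejo mínimo (PySpark, ilustrativo; en el proyecto esto corresponde a las líneas comentadas de `StreamingETL.scala`): cada micro-batch se vuelca a una tabla temporaria en memoria que después se consulta con Spark SQL desde el mismo proceso. Los nombres `stocks` y `stocks_mem` son supuestos.

```python
# `stocks` es un DataFrame de streaming como el del bosquejo anterior
query = (
    stocks.writeStream
    .format("memory")         # sink en memoria: solo para debugging / demos
    .queryName("stocks_mem")  # nombre de la tabla temporaria a consultar
    .outputMode("append")
    .start()
)

# Mientras el stream corre, la tabla se puede consultar con Spark SQL
spark.sql("""
    SELECT symbol, AVG(price) AS avg_price
    FROM stocks_mem
    GROUP BY symbol
""").show()
```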
78 | 79 | 80 | ## Streaming Spark SQL + Insert a Postgres 81 | 82 | Comentar las líneas 106 a 121 y descomentar la línea: 83 | 84 | ```scala 85 | AverageStocksToPostgres.process(spark, stocks) 86 | ``` 87 | 88 | En otra tab, ingresar al container de Postgres y luego al utilitario de línea de comando `psql`: 89 | 90 | ```bash 91 | docker exec -it postgres bash 92 | psql --host localhost --dbname workshop --username workshop 93 | ``` 94 | 95 | Crear la tabla para recibir los inserts: 96 | 97 | ```sql 98 | 99 | CREATE TABLE test_streaming_inserts_avg_price ( 100 | "window" varchar(128), 101 | symbol varchar(10), 102 | avg_price real 103 | ); 104 | ``` 105 | 106 | 107 | ## Más información 108 | * [Structured Streaming in PySpark](https://hackersandslackers.com/structured-streaming-in-pyspark/) 109 | * [Real-time Streaming ETL with Structured Streaming in Apache Spark 2.1](https://databricks.com/blog/2017/01/19/real-time-streaming-etl-structured-streaming-apache-spark-2-1.html) 110 | * [Processing Data in Apache Kafka with Structured Streaming in Apache Spark 2.2](https://databricks.com/blog/2017/04/26/processing-data-in-apache-kafka-with-structured-streaming-in-apache-spark-2-2.html) 111 | * [Real-Time End-to-End Integration with Apache Kafka in Apache Spark’s Structured Streaming](https://databricks.com/blog/2017/04/04/real-time-end-to-end-integration-with-apache-kafka-in-apache-sparks-structured-streaming.html) 112 | 113 | # Siga leyendo 114 | * [MLlib](README-ml.md) 115 | 116 | ____ 117 | Gustavo Arjones © 2017-2020 118 | -------------------------------------------------------------------------------- /README-superset.md: -------------------------------------------------------------------------------- 1 | # Workshop de Big Data con Apache Spark [🇪🇸] 2 | Material del Workshop de Big Data 3 | 4 | ## Creando un Dashboard con Superset 5 | 6 | ![Superset Dashboard Example](images/superset.png) 7 | 8 | * Antes de acceder por primera vez a Superset, inicializar la base de datos y crear las credenciales del usuario admin corriendo el siguiente comando: 9 | `./control-env.sh superset-init` 10 | * Acceder a http://localhost:8088/ (utilizar las credenciales creadas en el primer paso).
11 | * Agregar el database (Sources > Databases): 12 | - Database: `Workshop` 13 | - SQLAlchemy URI: `postgresql://workshop:w0rkzh0p@postgres/workshop` 14 | - OK 15 | * Agregar tabla (Sources > Tables): 16 | - Database: `workshop` 17 | - Table Name: `stocks` 18 | * Create Slices & Dashboard [official docs](https://superset.incubator.apache.org/tutorial.html#creating-a-slice-and-dashboard) 19 | 20 | ![](images/superset-01.png) 21 | 22 | ![](images/superset-02.png) 23 | 24 | ![](images/superset-03.png) 25 | 26 | ![](images/superset-04.png) 27 | 28 | ![](images/superset-05.png) 29 | 30 | ![](images/superset-06.png) 31 | 32 | ![](images/superset-07.png) 33 | 34 | ![](images/superset-08.png) 35 | 36 | ![](images/superset-09.png) 37 | 38 | ![](images/superset-10.png) 39 | 40 | ![](images/superset-11.png) 41 | 42 | 43 | ## Sobre 44 | Gustavo Arjones © 2017-2020 45 | [arjon.es](https://arjon.es) | [LinkedIn](http://linkedin.com/in/arjones/) | [Twitter](https://twitter.com/arjones) 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Workshop de Big Data con Apache Spark [🇪🇸] 2 | Material del Workshop de Big Data 3 | 4 | ## Contenidos 5 | * [Levantar el ambiente](#levantar-ambiente) 6 | * [Introducción a Scala](scala/README.md) 7 | * [Batch Processing (Scala)](README-batch.md) 8 | * [Structured Streaming Processing (Scala)](README-streaming.md) 9 | * [Machine Learning (Scala)](README-ml.md) 10 | * [Jupyter Notebook (Python / pySpark)](README-pyspark.md) 11 | * [Lista de Jupyter Notebook](jupyter/notebook/README.md) 12 | 13 | ## Infraestructura 14 | 15 | El workshop simula una instalación de producción utilizando containers de Docker. 16 | [docker-compose.yml](docker-compose.yml) contiene las definiciones y configuraciones para esos servicios y sus respectivas UIs: 17 | 18 | * Apache Spark: [Spark Master UI](http://localhost:8080) | [Job Progress](http://localhost:4040) 19 | * Apache Kafka: 20 | * Postgres: 21 | * [Superset](http://superset.incubator.apache.org): [Nuestro Dashboard](http://localhost:8088/) 22 | 23 | Los puertos de acceso a cada servicio quedaron en sus valores default. Ej: **spark master:7077**, **postgres: 5432** 24 | 25 | ## Levantar ambiente 26 | 27 | Instalar el ambiente [siguiendo las instrucciones acá](INSTALL.md). 28 | 29 | Correr el script que levanta el ambiente `Usage: control-env.sh (start|stop|cleanup)`: 30 | 31 | ```bash 32 | ./control-env.sh start 33 | ``` 34 | **IMPORTANTE:** el script `control-env.sh cleanup` borra cualquier dato que haya sido procesado anteriormente. 35 | 36 | ```bash 37 | # Access Spark-Master and run spark-shell 38 | docker exec -it master bash 39 | root@588acf96a879:/app# spark-shell 40 | ``` 41 | Probar: 42 | 43 | ```scala 44 | val file = sc.textFile("/dataset/yahoo-symbols-201709.csv") 45 | file.count 46 | file.take(10).foreach(println) 47 | ``` 48 | 49 | Acceder al [Spark Master: http://localhost:8080](http://localhost:8080) y [SPARK-UI: http://localhost:4040](http://localhost:4040).
50 | 51 | ### Troubleshooting 52 | 53 | Si los jobs mueren (`KILLED`) y no se completan puede ser debido a la memória disponible para Docker, **aumente la memoria > 8Gb** al proceso de Docker: 54 | 55 | ![](./images/docker-advanced-config.jpg) 56 | 57 | # Siga leyendo 58 | * [Introducción a Scala](scala/README.md) 59 | * [Jupyter Notebook (Python / pySpark)](README-pyspark.md) 60 | 61 | ## Agradecimientos 62 | * Juan Pampliega ([MuttData](https://www.muttdata.ai/)): expandir y actualizar el ejemplo de [Spark Streaming](README-streaming.md) 63 | * Pedro Ferrari ([MuttData](https://www.muttdata.ai/)): crear el notebook de [pySpark con Machine Learning](./jupyter/notebook/titanic/) 64 | 65 | ## Sobre 66 | Gustavo Arjones © 2017-2020 67 | [arjon.es](https://arjon.es) | [LinkedIn](http://linkedin.com/in/arjones/) | [Twitter](https://twitter.com/arjones) 68 | -------------------------------------------------------------------------------- /airflow/.gitignore: -------------------------------------------------------------------------------- 1 | # Tex 2 | */_minted* 3 | *.log 4 | -------------------------------------------------------------------------------- /airflow/dags/random_number_dag.py: -------------------------------------------------------------------------------- 1 | """Random number dag.""" 2 | from datetime import datetime 3 | from pathlib import Path 4 | 5 | from airflow.models import DAG 6 | from airflow.operators.bash_operator import BashOperator 7 | from airflow.operators.dummy_operator import DummyOperator 8 | from airflow.operators.python_operator import PythonOperator 9 | 10 | STORE_DIR = Path(__file__).resolve().parent / 'tmp-files' / 'random-num' 11 | Path.mkdir(STORE_DIR, exist_ok=True, parents=True) 12 | bash_cmd = f"echo $(( ( RANDOM % 10 ) + 1 )) > {str(STORE_DIR / 'random_number.txt')}" 13 | 14 | 15 | def _read_number_and_square(store_dir): 16 | fn = str(store_dir / 'random_number.txt') 17 | with open(fn, 'r') as f: 18 | n = f.readline() 19 | return int(n) ** 2 20 | 21 | 22 | default_args = {'owner': 'pedro', 'retries': 0, 'start_date': datetime(2020, 12, 14)} 23 | with DAG( 24 | 'random_number', default_args=default_args, schedule_interval='0 4 * * *' 25 | ) as dag: 26 | dummy_start_task = DummyOperator(task_id=f'dummy_start') 27 | generate_random_number = BashOperator( 28 | task_id='generate_random_number', bash_command=bash_cmd 29 | ) 30 | read_num_and_square = PythonOperator( 31 | task_id='read_number_and_square_it', 32 | python_callable=_read_number_and_square, 33 | op_args=[STORE_DIR], 34 | ) 35 | dummy_start_task >> generate_random_number >> read_num_and_square 36 | -------------------------------------------------------------------------------- /airflow/dags/requests_example.py: -------------------------------------------------------------------------------- 1 | """Get data from API.""" 2 | import json 3 | from datetime import datetime 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import requests 8 | 9 | BASE_URL = 'https://www.alphavantage.co/query' 10 | API_KEY = 'TFHNYCWBD71JBSON' 11 | STOCK_FN = 'TIME_SERIES_DAILY' 12 | 13 | 14 | def _get_stock_data(stock_symbol, date): 15 | date = f"{date:%Y-%m-%d}" # read execution date from context 16 | end_point = ( 17 | f"{BASE_URL}?function={STOCK_FN}&symbol={stock_symbol}" 18 | f"&apikey={API_KEY}&datatype=json" 19 | ) 20 | print(f"Getting data from {end_point}...") 21 | r = requests.get(end_point) 22 | data = json.loads(r.content) 23 | df = ( 24 | pd.DataFrame(data['Time Series (Daily)']) 25 | .T.reset_index() 
26 | .rename(columns={'index': 'date'}) 27 | ) 28 | df = df[df['date'] == date] 29 | if not df.empty: 30 | for c in df.columns: 31 | if c != 'date': 32 | df[c] = df[c].astype(float) 33 | df['avg_price'] = (df['2. high'] + df['3. low']) / 2 34 | df['avg_num_trades'] = df['5. volume'] / 1440 35 | else: 36 | df = pd.DataFrame( 37 | [[date, np.nan, np.nan]], columns=['date', 'avg_num_trades', 'avg_price'], 38 | ) 39 | df['symbol'] = stock_symbol 40 | df = df[['date', 'symbol', 'avg_num_trades', 'avg_price']] 41 | return df 42 | 43 | 44 | if __name__ == '__main__': 45 | yesterday = datetime(2020, 4, 13) 46 | df1 = _get_stock_data('aapl', yesterday) 47 | -------------------------------------------------------------------------------- /airflow/dags/solution_random_number_dag.py: -------------------------------------------------------------------------------- 1 | """Random number dag extended.""" 2 | import logging 3 | from datetime import datetime 4 | from pathlib import Path 5 | 6 | from airflow.models import DAG 7 | from airflow.operators.bash_operator import BashOperator 8 | from airflow.operators.dummy_operator import DummyOperator 9 | from airflow.operators.python_operator import BranchPythonOperator, PythonOperator 10 | 11 | STORE_DIR = Path(__file__).resolve().parent / 'tmp-files' / 'random-num' 12 | Path.mkdir(STORE_DIR, exist_ok=True, parents=True) 13 | # Add execution date to filename that stores random number 14 | bash_cmd = ( 15 | f'echo $(( ( RANDOM % 10 ) + 1 )) > {str(STORE_DIR)}/{{{{ ds_nodash }}}}.txt' 16 | ) 17 | 18 | 19 | def _read_number_and_square(store_dir, **context): 20 | date = context['execution_date'] # read execution date from context 21 | fn = str(store_dir / f'{date:%Y%m%d}.txt') 22 | print(f"Reading {fn}...") # add logging with print 23 | with open(fn, 'r') as f: 24 | n = f.readline() 25 | logging.info(f"Number read from file is: {n}") # also adds logging 26 | n_sqr = int(n) ** 2 27 | return 'print_high' if n_sqr > 30 else 'print_low' # return next task instance 28 | 29 | 30 | def _print_high(): 31 | return 'HIGH' 32 | 33 | 34 | default_args = {'owner': 'pedro', 'retries': 0, 'start_date': datetime(2020, 12, 14)} 35 | with DAG( 36 | 'random_number_extended', default_args=default_args, schedule_interval='0 4 * * *' 37 | ) as dag: 38 | dummy_start_task = DummyOperator(task_id=f'dummy_start') 39 | generate_random_number = BashOperator( 40 | task_id='generate_random_number', bash_command=bash_cmd 41 | ) 42 | # New branch operator 43 | read_num_and_square = BranchPythonOperator( 44 | task_id='read_number_and_square_it', 45 | python_callable=_read_number_and_square, 46 | op_args=[STORE_DIR], 47 | provide_context=True, # pass task instance params to python callable 48 | ) 49 | print_high = PythonOperator(task_id='print_high', python_callable=_print_high) 50 | print_low = BashOperator(task_id='print_low', bash_command='echo LOW') 51 | # Define tasks (normal path and then each branch) 52 | dummy_start_task >> generate_random_number >> read_num_and_square >> print_high 53 | read_num_and_square.set_downstream(print_low) 54 | -------------------------------------------------------------------------------- /airflow/dags/solution_stocks_dag.py: -------------------------------------------------------------------------------- 1 | """Stocks dag extended.""" 2 | import json 3 | from datetime import datetime 4 | from time import sleep 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import requests 9 | import sqlalchemy.exc 10 | from airflow.models import DAG 11 | from 
airflow.operators.email_operator import EmailOperator 12 | from airflow.operators.python_operator import PythonOperator 13 | from airflow.operators.sqlite_operator import SqliteOperator 14 | from sqlite_cli import SqLiteClient 15 | 16 | BASE_URL = 'https://www.alphavantage.co/query' 17 | API_KEY = 'TFHNYCWBD71JBSON' 18 | STOCK_FN = 'TIME_SERIES_DAILY' 19 | 20 | SQL_DB = '/tmp/sqlite_default.db' # This is defined in Admin/Connections 21 | SQL_TABLE = 'stocks_daily_extended' 22 | SQL_CREATE = f""" 23 | CREATE TABLE IF NOT EXISTS {SQL_TABLE} ( 24 | date TEXT, 25 | symbol TEXT, 26 | avg_num_trades REAL, 27 | avg_price REAL, 28 | UNIQUE(date,symbol) 29 | ) 30 | """ 31 | SQL_REPORT = f""" 32 | SELECT symbol, avg_num_trades 33 | FROM {SQL_TABLE} 34 | WHERE date = '{{date}}' 35 | ORDER BY avg_num_trades DESC 36 | LIMIT 1 37 | """ 38 | 39 | STOCKS = {'apple': 'aapl', 'tesla': 'tsla', 'facebook': 'fb'} 40 | 41 | 42 | def _get_stock_data(stock_symbol, **context): 43 | date = f"{context['execution_date']:%Y-%m-%d}" # read execution date from context 44 | end_point = ( 45 | f"{BASE_URL}?function={STOCK_FN}&symbol={stock_symbol}" 46 | f"&apikey={API_KEY}&datatype=json" 47 | ) 48 | print(f"Getting data from {end_point}...") 49 | r = requests.get(end_point) 50 | sleep(15) # To avoid api limits 51 | data = json.loads(r.content) 52 | df = ( 53 | pd.DataFrame(data['Time Series (Daily)']) 54 | .T.reset_index() 55 | .rename(columns={'index': 'date'}) 56 | ) 57 | df = df[df['date'] == date] 58 | if not df.empty: 59 | for c in df.columns: 60 | if c != 'date': 61 | df[c] = df[c].astype(float) 62 | df['avg_price'] = (df['2. high'] + df['3. low']) / 2 63 | df['avg_num_trades'] = df['5. volume'] / 1440 64 | else: 65 | df = pd.DataFrame( 66 | [[date, np.nan, np.nan]], columns=['date', 'avg_num_trades', 'avg_price'] 67 | ) 68 | df['symbol'] = stock_symbol 69 | df = df[['date', 'symbol', 'avg_num_trades', 'avg_price']] 70 | return df 71 | 72 | 73 | def _insert_daily_data(**context): 74 | task_instance = context['ti'] 75 | # Get xcom for each upstream task 76 | dfs = [] 77 | for company in STOCKS: 78 | dfs.append(task_instance.xcom_pull(task_ids=f'get_daily_data_{company}')) 79 | df = pd.concat(dfs, axis=0) 80 | sql_cli = SqLiteClient(SQL_DB) 81 | try: 82 | sql_cli.insert_from_frame(df, SQL_TABLE) 83 | print(f"Inserted {len(df)} records") 84 | except sqlalchemy.exc.IntegrityError: 85 | # You can avoid doing this by setting a trigger rule in the reports operator 86 | print("Data already exists! Nothing to do...") 87 | return 88 | 89 | 90 | def _perform_daily_report(**context): 91 | date = f"{context['execution_date']:%Y-%m-%d}" 92 | sql_cli = SqLiteClient(SQL_DB) 93 | sql = SQL_REPORT.format(date=date) 94 | df = sql_cli.to_frame(sql).squeeze() 95 | msg = ( 96 | f"Most traded action in {date} was {df['symbol']} with " 97 | f"an avg of {df['avg_num_trades']} trades per minute." 
98 | ) 99 | return msg 100 | 101 | 102 | default_args = { 103 | 'owner': 'pedro', 104 | 'retries': 0, 105 | 'start_date': datetime(2020, 12, 10), 106 | 'email_on_failure': True, 107 | 'email_on_retry': False, 108 | 'email': ['pedro@muttdata.ai'], 109 | } 110 | with DAG( 111 | 'stocks_extended', default_args=default_args, schedule_interval='0 4 * * *' 112 | ) as dag: 113 | 114 | create_table_if_not_exists = SqliteOperator( 115 | task_id='create_table_if_not_exists', 116 | sql=SQL_CREATE, 117 | sqlite_conn_id='sqlite_default', 118 | ) 119 | 120 | # Create several task in loop 121 | get_data_task = {} 122 | for company, symbol in STOCKS.items(): 123 | get_data_task[company] = PythonOperator( 124 | task_id=f'get_daily_data_{company}', 125 | python_callable=_get_stock_data, 126 | op_args=[symbol], 127 | provide_context=True, 128 | ) 129 | 130 | insert_daily_data = PythonOperator( 131 | task_id='insert_daily_data', 132 | python_callable=_insert_daily_data, 133 | provide_context=True, 134 | ) 135 | 136 | do_daily_report = PythonOperator( 137 | task_id='do_most_traded_report', 138 | python_callable=_perform_daily_report, 139 | provide_context=True, 140 | ) 141 | 142 | send_report_email = EmailOperator( 143 | task_id='send_report_email', 144 | to='pedro@muttdata.ai', 145 | subject='Airflow Stocks Report {{ ds }}', 146 | html_content="{{ ti.xcom_pull(task_ids='do_most_traded_report') }}", 147 | ) 148 | 149 | for company in STOCKS: 150 | upstream_task = create_table_if_not_exists 151 | task = get_data_task[company] 152 | upstream_task.set_downstream(task) 153 | task.set_downstream(insert_daily_data) 154 | insert_daily_data.set_downstream(do_daily_report) 155 | do_daily_report.set_downstream(send_report_email) 156 | -------------------------------------------------------------------------------- /airflow/dags/spark_job.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyspark import SparkConf, SparkContext 4 | 5 | BLACK_LIST = ["the", "be", "to", "of", "=", "=="] 6 | 7 | 8 | sc = SparkContext("local", "PySpark Word Count Exmaple") 9 | input_file = os.environ.get('INPUT_FILE', f"{os.environ['SPARK_HOME']}/README.md") 10 | output_file = os.environ.get('OUTPUT_FILE', '/spark-job/output.csv') 11 | log_file = f"{input_file}" # Should be some file on your system 12 | 13 | words = sc.textFile(log_file).flatMap(lambda line: line.split(" ")) 14 | word_counts = ( 15 | words.filter(lambda word: word != '' and len(word) > 1 and word not in BLACK_LIST) 16 | .map(lambda word: (word, 1)) 17 | .reduceByKey(lambda a, b: a + b) 18 | .max(lambda x: x[1]) 19 | ) 20 | 21 | with open(output_file, 'w') as output: 22 | output.write(f"{word_counts[0]},{word_counts[1]}") 23 | -------------------------------------------------------------------------------- /airflow/dags/sqlite_cli.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sqlalchemy import create_engine 3 | 4 | 5 | class SqLiteClient: 6 | def __init__(self, db): 7 | self.dialect = 'sqlite' 8 | self.db = db 9 | self._engine = None 10 | 11 | def _get_engine(self): 12 | db_uri = f'{self.dialect}:///{self.db}' 13 | if not self._engine: 14 | self._engine = create_engine(db_uri) 15 | return self._engine 16 | 17 | def _connect(self): 18 | return self._get_engine().connect() 19 | 20 | @staticmethod 21 | def _cursor_columns(cursor): 22 | if hasattr(cursor, 'keys'): 23 | return cursor.keys() 24 | else: 25 | return [c[0] for c in cursor.description] 26 
| 27 | def execute(self, sql, connection=None): 28 | if connection is None: 29 | connection = self._connect() 30 | return connection.execute(sql) 31 | 32 | def insert_from_frame(self, df, table, if_exists='append', index=False, **kwargs): 33 | connection = self._connect() 34 | with connection: 35 | df.to_sql(table, connection, if_exists=if_exists, index=index, **kwargs) 36 | 37 | def to_frame(self, *args, **kwargs): 38 | cursor = self.execute(*args, **kwargs) 39 | if not cursor: 40 | return 41 | data = cursor.fetchall() 42 | if data: 43 | df = pd.DataFrame(data, columns=self._cursor_columns(cursor)) 44 | else: 45 | df = pd.DataFrame() 46 | return df 47 | 48 | 49 | if __name__ == '__main__': 50 | db = '/tmp/sqlite_default.db' 51 | sqlite_cli = SqLiteClient(db) 52 | print(sqlite_cli.to_frame('SELECT * FROM stocks_daily')) 53 | -------------------------------------------------------------------------------- /airflow/dags/stocks_dag.py: -------------------------------------------------------------------------------- 1 | """Stocks dag.""" 2 | import json 3 | from datetime import datetime 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import requests 8 | from airflow.models import DAG 9 | from airflow.operators.python_operator import PythonOperator 10 | from airflow.operators.sqlite_operator import SqliteOperator 11 | from sqlite_cli import SqLiteClient 12 | 13 | BASE_URL = 'https://www.alphavantage.co/query' 14 | API_KEY = 'TFHNYCWBD71JBSON' 15 | STOCK_FN = 'TIME_SERIES_DAILY' 16 | 17 | SQL_DB = '/tmp/sqlite_default.db' # This is defined in Admin/Connections 18 | SQL_TABLE = 'stocks_daily' 19 | SQL_CREATE = f""" 20 | CREATE TABLE IF NOT EXISTS {SQL_TABLE} ( 21 | date TEXT, 22 | symbol TEXT, 23 | avg_num_trades REAL, 24 | avg_price REAL, 25 | UNIQUE(date,symbol) 26 | ) 27 | """ 28 | 29 | 30 | def _get_stock_data(stock_symbol, **context): 31 | date = f"{context['execution_date']:%Y-%m-%d}" # read execution date from context 32 | end_point = ( 33 | f"{BASE_URL}?function={STOCK_FN}&symbol={stock_symbol}" 34 | f"&apikey={API_KEY}&datatype=json" 35 | ) 36 | print(f"Getting data from {end_point}...") 37 | r = requests.get(end_point) 38 | data = json.loads(r.content) 39 | df = ( 40 | pd.DataFrame(data['Time Series (Daily)']) 41 | .T.reset_index() 42 | .rename(columns={'index': 'date'}) 43 | ) 44 | df = df[df['date'] == date] 45 | if not df.empty: 46 | for c in df.columns: 47 | if c != 'date': 48 | df[c] = df[c].astype(float) 49 | df['avg_price'] = (df['2. high'] + df['3. low']) / 2 50 | df['avg_num_trades'] = df['5. 
volume'] / 1440 51 | else: 52 | df = pd.DataFrame( 53 | [[date, np.nan, np.nan]], columns=['date', 'avg_num_trades', 'avg_price'], 54 | ) 55 | df['symbol'] = stock_symbol 56 | df = df[['date', 'symbol', 'avg_num_trades', 'avg_price']] 57 | return df 58 | 59 | 60 | def _insert_daily_data(**context): 61 | task_instance = context['ti'] 62 | df = task_instance.xcom_pull(task_ids='get_daily_data') 63 | sql_cli = SqLiteClient(SQL_DB) 64 | sql_cli.insert_from_frame(df, SQL_TABLE) 65 | return 66 | 67 | 68 | default_args = {'owner': 'pedro', 'retries': 0, 'start_date': datetime(2020, 12, 10)} 69 | with DAG('stocks', default_args=default_args, schedule_interval='0 4 * * *') as dag: 70 | create_table_if_not_exists = SqliteOperator( 71 | task_id='create_table_if_not_exists', 72 | sql=SQL_CREATE, 73 | sqlite_conn_id='sqlite_default', 74 | ) 75 | get_daily_data = PythonOperator( 76 | task_id='get_daily_data', 77 | python_callable=_get_stock_data, 78 | op_args=['aapl'], 79 | provide_context=True, 80 | ) 81 | # Add insert stock data 82 | insert_daily_data = PythonOperator( 83 | task_id='insert_daily_data', 84 | python_callable=_insert_daily_data, 85 | provide_context=True, 86 | ) 87 | create_table_if_not_exists >> get_daily_data >> insert_daily_data 88 | -------------------------------------------------------------------------------- /airflow/dags/word_count_dag.py: -------------------------------------------------------------------------------- 1 | """Word count dag.""" 2 | from datetime import datetime 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | from airflow.hooks.postgres_hook import PostgresHook 7 | from airflow.models import DAG 8 | 9 | # from airflow.operators.docker_operator import DockerOperator 10 | from airflow.operators.postgres_operator import PostgresOperator 11 | from airflow.operators.python_operator import PythonOperator 12 | 13 | STORE_DIR = Path(__file__).resolve().parent 14 | 15 | CONNECTION_ID = 'postgres_local' 16 | SQL_DB = "word_count" 17 | SQL_TABLE = 'word_count' 18 | SQL_CREATE = f""" 19 | CREATE TABLE IF NOT EXISTS {SQL_TABLE} ( 20 | date TEXT, 21 | word TEXT, 22 | count REAL, 23 | UNIQUE(date,word) 24 | ) 25 | """ 26 | 27 | 28 | def _insert_file_to_sql(**context): 29 | df_result = pd.read_csv(f"{STORE_DIR}/output.csv", names=["word", "count"]) 30 | df_result["date"] = context["ds"] 31 | if not df_result.empty: 32 | for c in df_result.columns: 33 | if c == 'count': 34 | df_result[c] = df_result[c].astype(float) 35 | df_result = df_result.squeeze() # squeezing single row dataframe 36 | 37 | df_tuple = [(df_result["date"], df_result["word"], df_result["count"])] 38 | 39 | hook = PostgresHook(postgres_conn_id=CONNECTION_ID) 40 | hook.insert_rows(SQL_TABLE, df_tuple) 41 | 42 | 43 | default_args = {'owner': 'pedro', 'retries': 0, 'start_date': datetime(2020, 12, 14)} 44 | with DAG('word_count', default_args=default_args, schedule_interval='0 0 * * *') as dag: 45 | create_table_if_not_exists = PostgresOperator( 46 | task_id='create_table_if_not_exists', 47 | sql=SQL_CREATE, 48 | postgres_conn_id=CONNECTION_ID, 49 | ) 50 | # spark_job = DockerOperator( 51 | # task_id='spark_job', 52 | # image='bde2020/spark-master:latest', 53 | # api_version='auto', 54 | # auto_remove=True, 55 | # environment={'PYSPARK_PYTHON': "python3", 'SPARK_HOME': "/spark"}, 56 | # volumes=[f'{STORE_DIR}:/spark-job'], 57 | # command='/spark/bin/spark-submit --master local[*] /spark-job/spark_job.py', 58 | # docker_url='unix://var/run/docker.sock', 59 | # network_mode='bridge', 60 | # ) 61 | 
insert_file_to_sql = PythonOperator( 62 | task_id='file_to_sql', 63 | python_callable=_insert_file_to_sql, 64 | provide_context=True, 65 | ) 66 | -------------------------------------------------------------------------------- /airflow/docs/clase_airflow.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/airflow/docs/clase_airflow.pdf -------------------------------------------------------------------------------- /airflow/docs/figures/airflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/airflow/docs/figures/airflow.png -------------------------------------------------------------------------------- /airflow/docs/figures/airflow_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/airflow/docs/figures/airflow_architecture.png -------------------------------------------------------------------------------- /airflow/docs/figures/airflow_ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/airflow/docs/figures/airflow_ui.png -------------------------------------------------------------------------------- /airflow/docs/figures/dag_graph_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/airflow/docs/figures/dag_graph_view.png -------------------------------------------------------------------------------- /airflow/docs/figures/example_bash_operator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/airflow/docs/figures/example_bash_operator.png -------------------------------------------------------------------------------- /airflow/docs/figures/exercise_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/airflow/docs/figures/exercise_1.png -------------------------------------------------------------------------------- /airflow/docs/figures/exercise_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/airflow/docs/figures/exercise_2.png -------------------------------------------------------------------------------- /airflow/docs/figures/exercise_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/airflow/docs/figures/exercise_3.png -------------------------------------------------------------------------------- /airflow/docs/figures/exercise_4.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/airflow/docs/figures/exercise_4.png -------------------------------------------------------------------------------- /airflow/docs/figures/logo_mutt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/airflow/docs/figures/logo_mutt.png -------------------------------------------------------------------------------- /code/postgresql-42.1.4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/code/postgresql-42.1.4.jar -------------------------------------------------------------------------------- /code/python/introduction/ejercicios/list.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | # Basic list exercises 4 | # Fill in the code for the functions below. main() is already set up 5 | # to call the functions with a few different inputs, 6 | # printing 'OK' when each function is correct. 7 | # The starter code for each function includes a 'return' 8 | # which is just a placeholder for your code. 9 | # It's ok if you do not complete all the functions, and there 10 | # are some additional functions to try in list2.py. 11 | 12 | # A. match_ends 13 | # Given a list of strings, return the count of the number of 14 | # strings where the string length is 2 or more and the first 15 | # and last chars of the string are the same. 16 | # Note: python does not have a ++ operator, but += works. 17 | def match_ends(words): 18 | # +++your code here+++ 19 | return 20 | 21 | 22 | # B. front_x 23 | # Given a list of strings, return a list with the strings 24 | # in sorted order, except group all the strings that begin with 'x' first. 25 | # e.g. ['mix', 'xyz', 'apple', 'xanadu', 'aardvark'] yields 26 | # ['xanadu', 'xyz', 'aardvark', 'apple', 'mix'] 27 | # Hint: this can be done by making 2 lists and sorting each of them 28 | # before combining them. 29 | def front_x(words): 30 | # +++your code here+++ 31 | return 32 | 33 | 34 | 35 | # C. sort_last 36 | # Given a list of non-empty tuples, return a list sorted in increasing 37 | # order by the last element in each tuple. 38 | # e.g. [(1, 7), (1, 3), (3, 4, 5), (2, 2)] yields 39 | # [(2, 2), (1, 3), (3, 4, 5), (1, 7)] 40 | # Hint: use a custom key= function to extract the last element form each tuple. 41 | def sort_last(tuples): 42 | # +++your code here+++ 43 | return 44 | 45 | 46 | # D. Given a list of numbers, return a list where 47 | # all adjacent == elements have been reduced to a single element, 48 | # so [1, 2, 2, 3] returns [1, 2, 3]. You may create a new list or 49 | # modify the passed in list. 50 | def remove_adjacent(nums): 51 | # +++your code here+++ 52 | return 53 | 54 | 55 | # Simple provided test() function used in main() to print 56 | # what each function returns vs. what it's supposed to return. 57 | def test(got, expected): 58 | if got == expected: 59 | prefix = ' OK ' 60 | else: 61 | prefix = ' X ' 62 | print(f'{prefix} got: {got} expected: {expected}') 63 | 64 | 65 | # Calls the above functions with interesting inputs. 
66 | def main(): 67 | print('match_ends') 68 | test(match_ends(['aba', 'xyz', 'aa', 'x', 'bbb']), 3) 69 | test(match_ends(['', 'x', 'xy', 'xyx', 'xx']), 2) 70 | test(match_ends(['aaa', 'be', 'abc', 'hello']), 1) 71 | 72 | print('\n') 73 | print('front_x') 74 | test(front_x(['bbb', 'ccc', 'axx', 'xzz', 'xaa']), 75 | ['xaa', 'xzz', 'axx', 'bbb', 'ccc']) 76 | test(front_x(['ccc', 'bbb', 'aaa', 'xcc', 'xaa']), 77 | ['xaa', 'xcc', 'aaa', 'bbb', 'ccc']) 78 | test(front_x(['mix', 'xyz', 'apple', 'xanadu', 'aardvark']), 79 | ['xanadu', 'xyz', 'aardvark', 'apple', 'mix']) 80 | 81 | 82 | print('\n') 83 | print('sort_last') 84 | test(sort_last([(1, 3), (3, 2), (2, 1)]), 85 | [(2, 1), (3, 2), (1, 3)]) 86 | test(sort_last([(2, 3), (1, 2), (3, 1)]), 87 | [(3, 1), (1, 2), (2, 3)]) 88 | test(sort_last([(1, 7), (1, 3), (3, 4, 5), (2, 2)]), 89 | [(2, 2), (1, 3), (3, 4, 5), (1, 7)]) 90 | 91 | print('\n') 92 | print('remove_adjacent') 93 | test(remove_adjacent([1, 2, 2, 3]), [1, 2, 3]) 94 | test(remove_adjacent([2, 2, 3, 3, 3]), [2, 3]) 95 | test(remove_adjacent([]), []) 96 | 97 | if __name__ == '__main__': 98 | main() 99 | -------------------------------------------------------------------------------- /code/python/introduction/ejercicios/small.txt: -------------------------------------------------------------------------------- 1 | We are not what we should be 2 | We are not what we need to be 3 | But at least we are not what we used to be 4 | -- Football Coach 5 | 6 | -------------------------------------------------------------------------------- /code/python/introduction/ejercicios/string.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # Copyright 2010 Google Inc. 3 | # Licensed under the Apache License, Version 2.0 4 | # http://www.apache.org/licenses/LICENSE-2.0 5 | 6 | # Google's Python Class 7 | # http://code.google.com/edu/languages/google-python-class/ 8 | 9 | # Basic string exercises 10 | # Fill in the code for the functions below. main() is already set up 11 | # to call the functions with a few different inputs, 12 | # printing 'OK' when each function is correct. 13 | # The starter code for each function includes a 'return' 14 | # which is just a placeholder for your code. 15 | # It's ok if you do not complete all the functions, and there 16 | # are some additional functions to try in string2.py. 17 | 18 | 19 | # A. donuts 20 | # Given an int count of a number of donuts, return a string 21 | # of the form 'Number of donuts: ', where is the number 22 | # passed in. However, if the count is 10 or more, then use the word 'many' 23 | # instead of the actual count. 24 | # So donuts(5) returns 'Number of donuts: 5' 25 | # and donuts(23) returns 'Number of donuts: many' 26 | def donuts(count): 27 | # +++your code here+++ 28 | return 29 | 30 | 31 | # B. both_ends 32 | # Given a string s, return a string made of the first 2 33 | # and the last 2 chars of the original string, 34 | # so 'spring' yields 'spng'. However, if the string length 35 | # is less than 2, return instead the empty string. 36 | def both_ends(s): 37 | # +++your code here+++ 38 | return 39 | 40 | 41 | # C. fix_start 42 | # Given a string s, return a string 43 | # where all occurences of its first char have 44 | # been changed to '*', except do not change 45 | # the first char itself. 46 | # e.g. 'babble' yields 'ba**le' 47 | # Assume that the string is length 1 or more. 
48 | # Hint: s.replace(stra, strb) returns a version of string s 49 | # where all instances of stra have been replaced by strb. 50 | def fix_start(s): 51 | # +++your code here+++ 52 | return 53 | 54 | 55 | # D. MixUp 56 | # Given strings a and b, return a single string with a and b separated 57 | # by a space ' ', except swap the first 2 chars of each string. 58 | # e.g. 59 | # 'mix', pod' -> 'pox mid' 60 | # 'dog', 'dinner' -> 'dig donner' 61 | # Assume a and b are length 2 or more. 62 | def mix_up(a, b): 63 | # +++your code here+++ 64 | return 65 | 66 | 67 | # Provided simple test() function used in main() to print 68 | # what each function returns vs. what it's supposed to return. 69 | def test(got, expected): 70 | if got == expected: 71 | prefix = ' OK ' 72 | else: 73 | prefix = ' X ' 74 | print(f'{prefix} got: {got} expected: {expected}') 75 | 76 | 77 | # Provided main() calls the above functions with interesting inputs, 78 | # using test() to check if each result is correct or not. 79 | def main(): 80 | print('donuts') 81 | # Each line calls donuts, compares its result to the expected for that call. 82 | test(donuts(4), 'Number of donuts: 4') 83 | test(donuts(9), 'Number of donuts: 9') 84 | test(donuts(10), 'Number of donuts: many') 85 | test(donuts(99), 'Number of donuts: many') 86 | 87 | print('\n') 88 | print('both_ends') 89 | test(both_ends('spring'), 'spng') 90 | test(both_ends('Hello'), 'Helo') 91 | test(both_ends('a'), '') 92 | test(both_ends('xyz'), 'xyyz') 93 | 94 | 95 | print('\n') 96 | print('fix_start') 97 | test(fix_start('babble'), 'ba**le') 98 | test(fix_start('aardvark'), 'a*rdv*rk') 99 | test(fix_start('google'), 'goo*le') 100 | test(fix_start('donut'), 'donut') 101 | 102 | print('\n') 103 | print('mix_up') 104 | test(mix_up('mix', 'pod'), 'pox mid') 105 | test(mix_up('dog', 'dinner'), 'dig donner') 106 | test(mix_up('gnash', 'sport'), 'spash gnort') 107 | test(mix_up('pezzy', 'firm'), 'fizzy perm') 108 | 109 | 110 | # Standard boilerplate to call the main() function. 111 | if __name__ == '__main__': 112 | main() 113 | -------------------------------------------------------------------------------- /code/python/introduction/ejercicios/wordcount.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # Copyright 2010 Google Inc. 3 | # Licensed under the Apache License, Version 2.0 4 | # http://www.apache.org/licenses/LICENSE-2.0 5 | 6 | # Google's Python Class 7 | # http://code.google.com/edu/languages/google-python-class/ 8 | 9 | """Wordcount exercise 10 | Google's Python class 11 | 12 | The main() below is already defined and complete. It calls print_words() 13 | and print_top() functions which you write. 14 | 15 | 1. For the --count flag, implement a print_words(filename) function that counts 16 | how often each word appears in the text and prints: 17 | word1 count1 18 | word2 count2 19 | ... 20 | 21 | Print the above list in order sorted by word (python will sort punctuation to 22 | come before letters -- that's fine). Store all the words as lowercase, 23 | so 'The' and 'the' count as the same word. 24 | 25 | 2. For the --topcount flag, implement a print_top(filename) which is similar 26 | to print_words() but which prints just the top 20 most common words sorted 27 | so the most common word is first, then the next most common, and so on. 28 | 29 | Use str.split() (no arguments) to split on all whitespace. 30 | 31 | Workflow: don't build the whole program at once. 
Get it to an intermediate 32 | milestone and print your data structure and sys.exit(0). 33 | When that's working, try for the next milestone. 34 | 35 | Optional: define a helper function to avoid code duplication inside 36 | print_words() and print_top(). 37 | 38 | """ 39 | 40 | import sys 41 | 42 | # +++your code here+++ 43 | # Define print_words(filename) and print_top(filename) functions. 44 | # You could write a helper utility function that reads a file 45 | # and builds and returns a word/count dict for it. 46 | # Then print_words() and print_top() can just call the utility function. 47 | 48 | ### 49 | 50 | # This basic command line argument parsing code is provided and 51 | # calls the print_words() and print_top() functions which you must define. 52 | def main(): 53 | if len(sys.argv) != 3: 54 | print('usage: ./wordcount.py {--count | --topcount} file') 55 | sys.exit(1) 56 | 57 | option = sys.argv[1] 58 | filename = sys.argv[2] 59 | if option == '--count': 60 | print_words(filename) 61 | elif option == '--topcount': 62 | print_top(filename) 63 | else: 64 | print(f'unknown option: {option}') 65 | sys.exit(1) 66 | 67 | if __name__ == '__main__': 68 | main() 69 | -------------------------------------------------------------------------------- /code/python/introduction/hello.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | """A tiny Python program to check that Python is working. 4 | Try running this program from the command line like this: 5 | python hello.py 6 | python hello.py Alice 7 | That should print: 8 | Hello World -or- Hello Alice 9 | Try changing the 'Hello' to 'Howdy' and run again. 10 | Once you have that working, you're ready for class -- you can edit 11 | and run Python code; now you just need to learn Python! 12 | """ 13 | 14 | import sys 15 | 16 | # Define a main() function that prints a little greeting. 17 | def main(): 18 | # Get the name from the command line, using 'World' as a fallback. 19 | if len(sys.argv) >= 2: 20 | name = sys.argv[1] 21 | else: 22 | name = 'World' 23 | print(f'Hello {name}') 24 | 25 | # This is the standard boilerplate that calls the main() function. 
26 | if __name__ == '__main__': 27 | main() 28 | -------------------------------------------------------------------------------- /code/python/introduction/introduccion_a_python.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/code/python/introduction/introduccion_a_python.pdf -------------------------------------------------------------------------------- /code/python/us-stock-analysis/.gitignore: -------------------------------------------------------------------------------- 1 | .cache 2 | __pycache__ 3 | build/ 4 | dist/ 5 | *.egg-info/ 6 | .idea/ 7 | .vscode/ 8 | venv 9 | -------------------------------------------------------------------------------- /code/python/us-stock-analysis/README.md: -------------------------------------------------------------------------------- 1 | # ETL: US stocks analysis (BATCH) 2 | 3 | ## How to run our app 4 | 5 | ```bash 6 | # Start an interactive shell in the Spark master container 7 | docker exec -it master bash 8 | 9 | # Go to the base folder of the code 10 | cd /app/python/us-stock-analysis 11 | 12 | # Submit the job for execution 13 | spark-submit \ 14 | --master 'spark://master:7077' \ 15 | --jars /app/postgresql-42.1.4.jar \ 16 | src/batch/etl_steps.py \ 17 | /dataset/stocks-small \ 18 | /dataset/yahoo-symbols-201709.csv \ 19 | /dataset/output.parquet 20 | 21 | # Console 22 | pyspark \ 23 | --master 'spark://master:7077' \ 24 | --jars /app/postgresql-42.1.4.jar 25 | ``` 26 | 27 | ## More examples 28 | 29 | ```bash 30 | spark-submit \ 31 | --master 'spark://master:7077' \ 32 | src/examples/first_example.py 33 | 34 | spark-submit \ 35 | --master 'spark://master:7077' \ 36 | --jars /app/postgresql-42.1.4.jar \ 37 | src/examples/postgres_example.py 38 | ``` 39 | # Create a Project using `venv` 40 | 41 | ```bash 42 | mkdir project1 43 | cd project1 44 | 45 | # Create virtualenv 46 | python3 -m venv venv 47 | source venv/bin/activate 48 | 49 | # Upgrade pip & Install deps 50 | pip install --upgrade pip 51 | pip install -r requirements.txt 52 | 53 | charm . 54 | ``` 55 | 56 | # ETL: US stocks analysis (STREAMING) 57 | 58 | ### Start the fake generator 59 | ```bash 60 | docker exec -it worker1 bash 61 | 62 | cd /app/python/us-stock-analysis/ 63 | 64 | # generate stream data 65 | python src/stream/fake_stock_price_generator.py kafka:9092 stocks 2017-11-11T10:00:00Z 66 | ``` 67 | 68 | ### Process using the Spark Structured Streaming API 69 | [Structured Streaming + Kafka Integration Guide](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html#deploying) 70 | 71 | Open another tab and log back into the server where the containers are running.
72 | Then, to run the Spark application, connect to a worker, go to the directory with the code and run `spark-submit` as follows: 73 | 74 | ```bash 75 | docker exec -it worker1 bash 76 | 77 | cd /app/python/us-stock-analysis/ 78 | 79 | spark-submit \ 80 | --master 'spark://master:7077' \ 81 | --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5 \ 82 | --jars /app/postgresql-42.1.4.jar \ 83 | src/stream/etl_stream.py \ 84 | kafka:9092 stocks 85 | ``` 86 | 87 | (To stop the command, press `Ctrl + c`) 88 | 89 | ### Writing to Postgres 90 | 91 | In a new terminal tab, open the Postgres command line with: 92 | 93 | ```bash 94 | ./control-env.sh psql 95 | ``` 96 | 97 | Create the tables we will use for the exercise with the following commands (copy each whole command, paste it and press enter, one at a time): 98 | 99 | ```sql 100 | CREATE TABLE streaming_inserts ( 101 | "timestamp" timestamptz NOT NULL, 102 | symbol varchar(10), 103 | price real 104 | ); 105 | ``` 106 | 107 | ```sql 108 | CREATE TABLE streaming_inserts_avg_price ( 109 | "window" varchar(128), 110 | symbol varchar(10), 111 | avg_price real 112 | ); 113 | ``` 114 | 115 | ```sql 116 | CREATE TABLE streaming_inserts_avg_price_final ( 117 | window_start timestamp, 118 | window_end timestamp, 119 | symbol varchar(10), 120 | avg_price real 121 | ); 122 | ``` 123 | 124 | Make sure that all lines from 59 to 114 of `etl_stream.py` are commented out. 125 | 126 | Uncomment the first Postgres insert job in the following lines: 127 | ```python 128 | # Simple insert 129 | query = stream_to_postgres(stocks) 130 | query.awaitTermination() 131 | ``` 132 | 133 | Make sure the data generator is running and launch the streaming job with the same command as before. 134 | 135 | Comment out the lines of the first job (`stream_to_postgres`) and uncomment those of the `stream_aggregation_to_postgres` job. 136 | Review the code of the new function and compare it with the previous one. What differences do you observe? 137 | After running it, inspect the rows inserted into the table with `psql`. What do you notice about the start dates of the windows? 138 | 139 | Finally, comment out the `stream_aggregation_to_postgres` job and uncomment `stream_aggregation_to_postgres_final`. 140 | Add a visualization in Superset so you can watch the rows being inserted into this new table. 141 | 142 | Once the previous steps are complete, try some of the following modifications: 143 | 144 | 1. Add logic to the final job so that, besides avg_price, it also computes the max price of each window (a minimal sketch is included at the end of this README). 145 | 2. Add new visualizations to the Superset dashboard and make it refresh every 10 seconds. 146 | 3. Add the code needed to the batch ETL so that it also stores the volume of each stock (see the sketch after `src/examples/postgres_example.py` below). 147 | 4. Add logic to `fake_stock_price_generator.py` to artificially generate a volume for each stock in addition to the price, and modify the streaming jobs to process this new field.
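A minimal sketch for modification 1 (not part of the original workshop code; it assumes the `summarize_stocks` helper in `src/stream/etl_stream.py` and an extra `max_price real` column added to the target table):

```python
# Hypothetical variant of summarize_stocks(): also compute the max price per window.
from pyspark.sql import functions as F

def summarize_stocks_with_max(stocks):
    return (
        stocks
        .withWatermark("timestamp", "60 seconds")
        .groupBy(F.window("timestamp", "30 seconds"), stocks.symbol)
        .agg(
            F.avg("price").alias("avg_price"),
            F.max("price").alias("max_price"),  # new aggregate for modification 1
        )
    )
```

The rest of the job (window-to-timestamp conversion and the `foreachBatch` write) can stay as in `stream_aggregation_to_postgres_final`, as long as the destination table has the matching column.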
148 | -------------------------------------------------------------------------------- /code/python/us-stock-analysis/requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==2.4.5 2 | 3 | # Deps for fake_stock_price_gen 4 | kafka-python==2.0.1 5 | -------------------------------------------------------------------------------- /code/python/us-stock-analysis/src/batch/etl_steps.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from pyspark.sql import SparkSession 4 | 5 | # UDF 6 | from pyspark.sql.types import StringType 7 | 8 | # https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.functions 9 | from pyspark.sql import functions as F 10 | from pyspark.sql.window import Window 11 | 12 | # Initialization 13 | args = sys.argv 14 | 15 | if len(args) != 4: 16 | print(f""" 17 | |Usage: {args[0]} 18 | | folder where stocks data are located 19 | | file containing lookup information 20 | | folder to write parquet files 21 | | 22 | | {args[0]} /dataset/stocks-small /dataset/yahoo-symbols-201709.csv /dataset/output.parquet 23 | """) 24 | sys.exit(1) 25 | 26 | _, stocks_dir, lookup_file, output_dir = args 27 | 28 | spark = SparkSession \ 29 | .builder \ 30 | .appName("Stocks:ETL") \ 31 | .getOrCreate() 32 | 33 | 34 | # 35 | def csv_stocks_df(stocks_folder): 36 | # Create a function and define it as a UDF 37 | # UDF 38 | def extract_symbol_from(filename): 39 | return filename.split('/')[-1].split('.')[0].upper() 40 | 41 | extract_symbol = F.udf(lambda filename: extract_symbol_from(filename), StringType()) 42 | 43 | df = spark.read \ 44 | .option("header", True) \ 45 | .option("inferSchema", True) \ 46 | .csv(stocks_folder) \ 47 | .withColumn("name", extract_symbol(F.input_file_name())) \ 48 | .withColumnRenamed("Date", "dateTime") \ 49 | .withColumnRenamed("Open", "open") \ 50 | .withColumnRenamed("High", "high") \ 51 | .withColumnRenamed("Low", "low") \ 52 | .withColumnRenamed("Close", "close") \ 53 | .drop("Volume", "OpenInt") 54 | 55 | return df 56 | 57 | 58 | # Load lookup CSV and convert into DataFrame 59 | def load_lookup_data(filename): 60 | # df.filter("Country = \"USA\""). \ 61 | # df.filter("Country" === "USA"). 62 | df = spark.read. \ 63 | option("header", True). \ 64 | option("inferSchema", True). \ 65 | csv(filename). \ 66 | select("Ticker", "Category Name"). \ 67 | withColumnRenamed("Ticker", "symbol"). 
\ 68 | withColumnRenamed("Category Name", "category") 69 | 70 | return df 71 | 72 | 73 | df_stocks = csv_stocks_df(stocks_dir) 74 | print("Sample of df_stocks data:") 75 | df_stocks.show(3) 76 | 77 | symbols_lookup = load_lookup_data(lookup_file) 78 | print("Sample of symbols_lookup data:") 79 | symbols_lookup.show(3) 80 | 81 | joined_df = df_stocks \ 82 | .withColumnRenamed('dateTime', "full_date") \ 83 | .filter("full_date >= \"2017-09-01\"") \ 84 | .withColumn("year", F.year("full_date")) \ 85 | .withColumn("month", F.month("full_date")) \ 86 | .withColumn("day", F.dayofmonth("full_date")) \ 87 | .withColumnRenamed("name", "symbol") \ 88 | .join(symbols_lookup, ["symbol"]) 89 | 90 | print("Sample of joined_df data:") 91 | joined_df.show() 92 | 93 | # Calculate Moving Average 94 | # https://stackoverflow.com/questions/45806194/pyspark-rolling-average-using-timeseries-data 95 | 96 | window20 = (Window.partitionBy(F.col('symbol')).orderBy(F.col("full_date")).rowsBetween(-20, 0)) 97 | window50 = (Window.partitionBy(F.col('symbol')).orderBy(F.col("full_date")).rowsBetween(-50, 0)) 98 | window100 = (Window.partitionBy(F.col('symbol')).orderBy(F.col("full_date")).rowsBetween(-100, 0)) 99 | 100 | # // Calculate the moving average 101 | stocks_moving_avg_df = joined_df \ 102 | .withColumn("ma20", F.avg("close").over(window20)) \ 103 | .withColumn("ma50", F.avg("close").over(window50)) \ 104 | .withColumn("ma100", F.avg("close").over(window100)) 105 | 106 | print("Sample of stocks_moving_avg_df data:") 107 | stocks_moving_avg_df.show() 108 | 109 | # Write to Parquet 110 | stocks_moving_avg_df \ 111 | .write \ 112 | .mode('overwrite') \ 113 | .partitionBy("year", "month", "day") \ 114 | .parquet(output_dir) 115 | 116 | # Write to Postgres 117 | stocks_moving_avg_df \ 118 | .drop("year", "month", "day") \ 119 | .write \ 120 | .format("jdbc") \ 121 | .option("url", "jdbc:postgresql://postgres/workshop") \ 122 | .option("dbtable", "workshop.stocks") \ 123 | .option("user", "workshop") \ 124 | .option("password", "w0rkzh0p") \ 125 | .option("driver", "org.postgresql.Driver") \ 126 | .mode('append') \ 127 | .save() 128 | 129 | print("All done") 130 | -------------------------------------------------------------------------------- /code/python/us-stock-analysis/src/examples/first_example.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | spark = SparkSession \ 4 | .builder \ 5 | .appName("first_example") \ 6 | .getOrCreate() 7 | 8 | df = spark.read.csv("/dataset/yahoo-symbols-201709.csv") 9 | 10 | df.show() 11 | 12 | spark.stop 13 | -------------------------------------------------------------------------------- /code/python/us-stock-analysis/src/examples/postgres_example.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | spark = SparkSession \ 4 | .builder \ 5 | .appName("first_example") \ 6 | .getOrCreate() 7 | 8 | df = spark.read \ 9 | .format("jdbc") \ 10 | .option("url", "jdbc:postgresql://postgres/workshop") \ 11 | .option("dbtable", "workshop.stocks") \ 12 | .option("user", "workshop") \ 13 | .option("password", "w0rkzh0p") \ 14 | .option("driver", "org.postgresql.Driver") \ 15 | .load() 16 | 17 | df.printSchema() 18 | 19 | elems_count = df.count() 20 | 21 | print(f'Count: {elems_count}\n\n') 22 | 23 | df.show() 24 | -------------------------------------------------------------------------------- 
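Relating to modification 3 in the streaming README above (keeping the volume of each stock through the batch ETL), here is a hedged sketch of how `csv_stocks_df()` in `src/batch/etl_steps.py` could retain the column. The function name is hypothetical, and it assumes the surrounding script's context (`spark` session, `F` and `StringType` imports) plus a `volume` column added to the downstream table:

```python
# Hypothetical variant of csv_stocks_df(): keep Volume (renamed to "volume") instead of dropping it.
def csv_stocks_df_with_volume(stocks_folder):
    extract_symbol = F.udf(lambda f: f.split('/')[-1].split('.')[0].upper(), StringType())
    return (
        spark.read
        .option("header", True)
        .option("inferSchema", True)
        .csv(stocks_folder)
        .withColumn("name", extract_symbol(F.input_file_name()))
        .withColumnRenamed("Date", "dateTime")
        .withColumnRenamed("Open", "open")
        .withColumnRenamed("High", "high")
        .withColumnRenamed("Low", "low")
        .withColumnRenamed("Close", "close")
        .withColumnRenamed("Volume", "volume")  # keep the volume column
        .drop("OpenInt")
    )
```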
/code/python/us-stock-analysis/src/stream/etl_stream.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from time import sleep 4 | 5 | from pyspark.sql import SparkSession 6 | from pyspark.sql.functions import from_json, year, month, dayofmonth, hour, minute 7 | from pyspark.sql import functions as F # col doesn't import correctly 8 | from pyspark.sql.types import TimestampType, StringType, StructType, StructField, DoubleType 9 | 10 | 11 | def validate_params(args): 12 | if len(args) != 3: 13 | print(f""" 14 | |Usage: {args[0]} 15 | | is a list of one or more Kafka brokers 16 | | is a a kafka topic to consume from 17 | | 18 | | {args[0]} kafka:9092 stocks 19 | """) 20 | sys.exit(1) 21 | pass 22 | 23 | 24 | def create_spark_session(): 25 | return SparkSession \ 26 | .builder \ 27 | .appName("Stocks:Stream:ETL") \ 28 | .getOrCreate() 29 | 30 | 31 | def start_stream(args): 32 | validate_params(args) 33 | _, brokers, topic = args 34 | 35 | spark = create_spark_session() 36 | 37 | json = spark \ 38 | .readStream \ 39 | .format("kafka") \ 40 | .option("kafka.bootstrap.servers", brokers) \ 41 | .option("subscribe", topic) \ 42 | .load() 43 | 44 | json.printSchema() 45 | 46 | # Explicitly set schema 47 | schema = StructType([StructField("symbol", StringType(), False), 48 | StructField("timestamp", TimestampType(), False), 49 | StructField("price", DoubleType(), False)]) 50 | 51 | json_options = {"timestampFormat": "yyyy-MM-dd'T'HH:mm'Z'"} 52 | stocks_json = json \ 53 | .select(from_json(F.col("value").cast("string"), schema, json_options).alias("content")) 54 | 55 | stocks_json.printSchema 56 | 57 | stocks = stocks_json.select("content.*") 58 | 59 | #################################### 60 | # Stream to Parquet 61 | #################################### 62 | query = stocks \ 63 | .withColumn('year', year(F.col('timestamp'))) \ 64 | .withColumn('month', month(F.col('timestamp'))) \ 65 | .withColumn('day', dayofmonth(F.col('timestamp'))) \ 66 | .withColumn('hour', hour(F.col('timestamp'))) \ 67 | .withColumn('minute', minute(F.col('timestamp'))) \ 68 | .writeStream \ 69 | .format('parquet') \ 70 | .partitionBy('year', 'month', 'day', 'hour', 'minute') \ 71 | .option('startingOffsets', 'earliest') \ 72 | .option('checkpointLocation', '/dataset/checkpoint') \ 73 | .option('path', '/dataset/streaming.parquet') \ 74 | .trigger(processingTime='30 seconds') \ 75 | .start() 76 | 77 | query.awaitTermination() 78 | 79 | 80 | # avg_pricing = stocks \ 81 | # .groupBy(F.col("symbol")) \ 82 | # .agg(F.avg(F.col("price")).alias("avg_price")) 83 | 84 | #################################### 85 | # Console Output 86 | #################################### 87 | # query2 = avg_pricing.writeStream \ 88 | # .outputMode('complete') \ 89 | # .format("console") \ 90 | # .trigger(processingTime="10 seconds") \ 91 | # .start() 92 | 93 | # query2.awaitTermination() 94 | 95 | #################################### 96 | # Table in Memory 97 | #################################### 98 | # query3 = avg_pricing \ 99 | # .writeStream \ 100 | # .queryName("avgPricing") \ 101 | # .outputMode("complete") \ 102 | # .format("memory") \ 103 | # .trigger(processingTime="10 seconds") \ 104 | # .start() 105 | # 106 | # while True: 107 | # print('\n' + '_' * 30) 108 | # # interactively query in-memory table 109 | # spark.sql('SELECT * FROM avgPricing').show() 110 | # print(query3.lastProgress) 111 | # sleep(10) 112 | 113 | # query3.awaitTermination() 114 | 115 | 
#################################### 116 | # Writing to Postgres 117 | #################################### 118 | 119 | # Simple insert 120 | # query = stream_to_postgres(stocks) 121 | # query.awaitTermination() 122 | 123 | # Average Price Aggregation 124 | # query = stream_aggregation_to_postgres(stocks) 125 | # query.awaitTermination() 126 | 127 | # Final Average Price Aggregation with Timestamp columns 128 | # query = stream_aggregation_to_postgres_final(stocks) 129 | # query.awaitTermination() 130 | 131 | pass 132 | 133 | 134 | def define_write_to_postgres(table_name): 135 | 136 | def write_to_postgres(df, epochId): 137 | return ( 138 | df.write 139 | .format("jdbc") 140 | .option("url", "jdbc:postgresql://postgres/workshop") 141 | .option("dbtable", f"workshop.{table_name}") 142 | .option("user", "workshop") 143 | .option("password", "w0rkzh0p") 144 | .option("driver", "org.postgresql.Driver") 145 | .mode('append') 146 | .save() 147 | ) 148 | return write_to_postgres 149 | 150 | 151 | def stream_to_postgres(stocks, output_table="streaming_inserts"): 152 | wstocks = ( 153 | stocks 154 | .withWatermark("timestamp", "60 seconds") 155 | .select("timestamp", "symbol", "price") 156 | ) 157 | 158 | write_to_postgres_fn = define_write_to_postgres("streaming_inserts") 159 | 160 | query = ( 161 | wstocks.writeStream 162 | .foreachBatch(write_to_postgres_fn) 163 | .outputMode("append") 164 | .trigger(processingTime="10 seconds") 165 | .start() 166 | ) 167 | 168 | return query 169 | 170 | 171 | def summarize_stocks(stocks): 172 | avg_pricing = ( 173 | stocks 174 | .withWatermark("timestamp", "60 seconds") 175 | .groupBy( 176 | F.window("timestamp", "30 seconds"), 177 | stocks.symbol) 178 | .agg(F.avg("price").alias('avg_price')) 179 | ) 180 | avg_pricing.printSchema() 181 | return avg_pricing 182 | 183 | 184 | def stream_aggregation_to_postgres(stocks, output_table="streaming_inserts_avg_price"): 185 | 186 | avg_pricing = summarize_stocks(stocks) 187 | 188 | window_to_string = F.udf(lambda w: str(w.start) + ' - ' + str(w.end), StringType()) 189 | 190 | write_to_postgres_fn = define_write_to_postgres(output_table) 191 | 192 | query = ( 193 | avg_pricing\ 194 | .withColumn("window", window_to_string("window")) 195 | .writeStream 196 | .foreachBatch(write_to_postgres_fn) 197 | .outputMode("append") 198 | .trigger(processingTime="10 seconds") 199 | .start() 200 | ) 201 | 202 | return query 203 | 204 | 205 | def stream_aggregation_to_postgres_final(stocks, output_table="streaming_inserts_avg_price_final"): 206 | 207 | avg_pricing = summarize_stocks(stocks) 208 | 209 | window_start_ts_fn = F.udf(lambda w: w.start, TimestampType()) 210 | 211 | window_end_ts_fn = F.udf(lambda w: w.end, TimestampType()) 212 | 213 | write_to_postgres_fn = define_write_to_postgres(output_table) 214 | 215 | query = ( 216 | avg_pricing\ 217 | .withColumn("window_start", window_start_ts_fn("window")) 218 | .withColumn("window_end", window_end_ts_fn("window")) 219 | .drop("window") 220 | .writeStream 221 | .foreachBatch(write_to_postgres_fn) 222 | .outputMode("append") 223 | .trigger(processingTime="10 seconds") 224 | .start() 225 | ) 226 | 227 | return query 228 | 229 | 230 | if __name__ == '__main__': 231 | start_stream(sys.argv) 232 | -------------------------------------------------------------------------------- /code/python/us-stock-analysis/src/stream/fake_stock_price_generator.py: -------------------------------------------------------------------------------- 1 | from random import randrange, random 2 | from datetime 
import datetime, timedelta 3 | from time import sleep 4 | 5 | from kafka import KafkaProducer 6 | import json 7 | import sys 8 | 9 | 10 | class QuoteGenerator: 11 | # Using as SEED for the generator last 90 days of stocks 12 | # price: Max Closing Price 13 | # volatility: StdDev of Closing Pricing 14 | # df.groupBy($"symbol") 15 | # .agg(stddev_pop($"close").as("volatility"), max($"close").as("price")) 16 | # .orderBy($"symbol") 17 | quotes_list = [("AAPL", 175.61, 6.739169981533334), 18 | ("BABA", 188.51, 5.637335242825282), 19 | ("CSCO", 34.62, 0.9673997717593282), 20 | ("DHR", 93.24, 2.949284608917899), 21 | ("EBAY", 38.99, 0.8110024414266584), 22 | ("FB", 182.66, 4.14292553638126), 23 | ("GOOG", 1039.85, 37.960859608812854), 24 | ("GOOGL", 1058.29, 39.11749241707603), 25 | ("IBM", 160.47, 4.8367462989079755), 26 | ("INTC", 46.826, 3.678237311321825), 27 | ("JNJ", 143.62, 4.336597380435497), 28 | ("MELI", 292.05, 19.703519789367583), 29 | ("MSFT", 84.56, 3.7745700470384693), 30 | ("ORCL", 52.593, 1.4026418724678085), 31 | ("QCOM", 65.49, 3.962328548164577), 32 | ("TSLA", 385.0, 21.667055079857995), 33 | ("TXN", 98.54, 5.545761038090265), 34 | ("WDC", 89.9, 1.7196676293981952), 35 | ("XRX", 33.86, 1.4466726098188216)] 36 | 37 | def __init__(self, trading_start_at): 38 | self.trading_start_datetime = trading_start_at 39 | 40 | # a very naive impl of marketing hours 41 | # not consider weekends nor holidays 42 | def __nextMarketTime(self): 43 | # Sometimes it substracts 1 and generates late arriving tickers 44 | tick = randrange(5) - 1 45 | next_time = self.trading_start_datetime + timedelta(minutes=tick) 46 | # Market should be closed, bump to next day 47 | if next_time.hour > 15: 48 | next_time = (next_time + timedelta(days=1)).replace(hour=10, minute=0) 49 | 50 | self.trading_start_datetime = next_time 51 | return next_time 52 | 53 | def __signal(self): 54 | if randrange(2) == 0: 55 | return 1 56 | else: 57 | return -1 58 | 59 | def next_symbol(self): 60 | quote_idx = randrange(len(self.quotes_list) - 1) 61 | quote = self.quotes_list[quote_idx] 62 | 63 | # price = quote.price + (signal * rnd.nextDouble * quote.volatility * 3) 64 | price = quote[1] + (self.__signal() * random() * quote[2] * 3) 65 | 66 | return { 67 | 'symbol': quote[0], 68 | 'timestamp': self.__nextMarketTime().isoformat(), 69 | 'price': float(f'{price:2.3f}') 70 | } 71 | 72 | 73 | if __name__ == '__main__': 74 | # Initialization 75 | args = sys.argv 76 | 77 | if len(args) != 4: 78 | print(f""" 79 | |Usage: {args[0]} 80 | | is a list of one or more Kafka brokers 81 | | one kafka topic to produce to 82 | | [OPTIONAL] iso timestamp from when to start producing data 83 | | 84 | | {args[0]} kafka:9092 stocks 2017-11-11T10:00:00Z 85 | """) 86 | sys.exit(1) 87 | 88 | _, brokers, topic, start_date = args 89 | trading_start_datetime = datetime.strptime(start_date, '%Y-%m-%dT%H:%M:%S%z') 90 | 91 | quote_gen = QuoteGenerator(trading_start_datetime) 92 | 93 | producer = KafkaProducer( 94 | bootstrap_servers=brokers, 95 | value_serializer=lambda v: json.dumps(v).encode('utf-8')) 96 | 97 | while True: 98 | stock_data = quote_gen.next_symbol() 99 | producer.send(topic, stock_data) 100 | print(stock_data) 101 | sleep(.5) 102 | -------------------------------------------------------------------------------- /code/scala/credit-risk-analysis/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | .DS_Store 3 | derby.log 4 | metastore_db 5 | spark-warehouse 6 | data/credit.model 7 | .idea 
-------------------------------------------------------------------------------- /code/scala/credit-risk-analysis/README.md: -------------------------------------------------------------------------------- 1 | # Credit Risk Analysis 2 | ## Spark Machine Learning (Random Forest) 3 | 4 | 5 | 6 | ```bash 7 | sbt clean assembly 8 | 9 | spark-submit \ 10 | --class es.arjon.CreditRiskTrain \ 11 | --master 'spark://master:7077' \ 12 | target/scala-2.11/credit-risk-analysis-assembly-0.1.jar \ 13 | /dataset/credit-risk/germancredit.csv \ 14 | /dataset/credit-risk.model 15 | 16 | 17 | 18 | spark-submit \ 19 | --class es.arjon.CreditRiskAnalysis \ 20 | --master 'spark://master:7077' \ 21 | target/scala-2.11/credit-risk-analysis-assembly-0.1.jar \ 22 | /dataset/credit-risk/germancredit-user-input.csv \ 23 | /dataset/credit-risk.model 24 | ``` 25 | 26 | # Acknowledge 27 | The original author of this tutorial is **Carol McDonald ** for the MapR article: [Predicting Loan Credit Risk using Apache Spark Machine Learning Random Forests](https://mapr.com/blog/predicting-loan-credit-risk-using-apache-spark-machine-learning-random-forests/), 28 | I updated the API version (Spark 2.4.4) and made changes on the code to clarify/reduce duplication. 29 | -------------------------------------------------------------------------------- /code/scala/credit-risk-analysis/build.sbt: -------------------------------------------------------------------------------- 1 | import sbt.Keys._ 2 | 3 | name := "credit-risk-analysis" 4 | 5 | version := "0.1" 6 | 7 | scalaVersion := "2.11.12" 8 | 9 | scalacOptions += "-target:jvm-1.8" 10 | 11 | libraryDependencies ++= Seq( 12 | "org.apache.spark" %% "spark-sql" % "2.4.4" % "provided", 13 | "org.apache.spark" %% "spark-mllib" % "2.4.4" % "provided", 14 | 15 | "com.github.fommil.netlib" % "all" % "1.1.2" pomOnly() 16 | ) 17 | 18 | assemblyMergeStrategy in assembly := { 19 | case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard 20 | case m if m.toLowerCase.matches("meta-inf.*\\.sf$") => MergeStrategy.discard 21 | case "log4j.properties" => MergeStrategy.first 22 | case "reference.conf" => MergeStrategy.concat 23 | case _ => MergeStrategy.first 24 | } 25 | -------------------------------------------------------------------------------- /code/scala/credit-risk-analysis/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6") 2 | -------------------------------------------------------------------------------- /code/scala/credit-risk-analysis/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.16 2 | -------------------------------------------------------------------------------- /code/scala/credit-risk-analysis/src/main/scala/es/arjon/CreditRiskAnalysis.scala: -------------------------------------------------------------------------------- 1 | package es.arjon 2 | 3 | import org.apache.spark.ml.classification.RandomForestClassificationModel 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object CreditRiskAnalysis extends DatasetUtil { 7 | 8 | def main(args: Array[String]): Unit = { 9 | if (args.length < 2) { 10 | System.err.println( 11 | s""" 12 | |Usage: CreditRiskAnalysis 13 | | CSV dataset to PREDICT credit 14 | | path to the model 15 | | 16 | | CreditRiskAnalysis /dataset/credit-risk/germancredit-user-input.csv /dataset/credit-risk.model 17 | """.stripMargin) 18 | System.exit(1) 19 
| } 20 | 21 | // val Array(datasource, modelPath) = Array("/dataset/credit-risk/germancredit-user-input.csv", 22 | // "/dataset/credit-risk.model") 23 | val Array(datasource, modelPath) = args 24 | 25 | // implicit val ss = spark 26 | implicit val spark = SparkSession. 27 | builder. 28 | appName("CreditRisk"). 29 | getOrCreate() 30 | 31 | val df = loadUserInputData(datasource) 32 | val dfVector = vectorizeInput(df) 33 | 34 | val model = RandomForestClassificationModel.load(modelPath) 35 | val predictions = model.transform(dfVector) 36 | 37 | import spark.implicits._ 38 | 39 | println("=" * 30) 40 | println("Prediction are:") 41 | predictions.select($"userId", $"amount", $"prediction").show(false) 42 | } 43 | 44 | 45 | } 46 | -------------------------------------------------------------------------------- /code/scala/credit-risk-analysis/src/main/scala/es/arjon/CreditRiskTrain.scala: -------------------------------------------------------------------------------- 1 | package es.arjon 2 | 3 | import es.arjon.CreditRiskAnalysis.vectorizeInput 4 | import org.apache.spark.ml.classification._ 5 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 6 | import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} 7 | import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder} 8 | import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage} 9 | import org.apache.spark.mllib.evaluation.RegressionMetrics 10 | import org.apache.spark.sql._ 11 | import org.apache.spark.sql.functions._ 12 | 13 | // Heavily inspired on 14 | // https://mapr.com/blog/predicting-loan-credit-risk-using-apache-spark-machine-learning-random-forests/ 15 | object CreditRiskTrain extends DatasetUtil { 16 | def main(args: Array[String]) { 17 | if (args.length < 2) { 18 | System.err.println( 19 | s""" 20 | |Usage: CreditRiskTrain 21 | | CSV dataset to learn from 22 | | path to save model to 23 | | 24 | | CreditRiskTrain /dataset/credit-risk/germancredit.csv /dataset/credit-risk.model 25 | """.stripMargin) 26 | System.exit(1) 27 | } 28 | 29 | val Array(datasource, modelPath) = args 30 | 31 | 32 | // When using Spark-Shell: 33 | // implicit val ss = spark 34 | implicit val spark = SparkSession. 35 | builder. 36 | appName("CreditRisk"). 
37 | getOrCreate() 38 | 39 | import spark.implicits._ 40 | 41 | val creditDF = loadTrainData(datasource) 42 | creditDF.printSchema 43 | creditDF.show 44 | 45 | // creditDF.createOrReplaceTempView("credit") 46 | // spark.sql("SELECT creditability, avg(balance) as avg_balance, avg(amount) as avg_amount, 47 | // avg(duration) as avg_duration FROM credit GROUP BY creditability").show 48 | 49 | creditDF.describe("balance").show 50 | creditDF.groupBy("creditability").agg(avg('balance), avg('amount), avg('duration)).show 51 | 52 | val dfVector = vectorizeInput(creditDF) 53 | 54 | // Convert Strings into Label Identifiers (Double) 55 | val labelIndexer = new StringIndexer().setInputCol("creditability").setOutputCol("label") 56 | 57 | // Add Label Identifiers field to the DF 58 | val dfLabeled = labelIndexer.fit(dfVector).transform(dfVector) 59 | 60 | // Manually transforming 61 | // def convertCreditability(v: String) = if (v =="YES") 1.0 else 0.0 62 | // val convertCreditabilityUDF = udf(convertCreditability _) 63 | // val dfLabeled = dfVector.withColumn("label2", convertCreditabilityUDF($"creditability")) 64 | 65 | dfLabeled.select($"features", $"label", $"creditability").show(30, false) 66 | 67 | // remove unused fields 68 | val dfInput = dfLabeled.select($"features", $"label") 69 | 70 | 71 | val splitSeed = 5043 72 | val Array(trainingDataUncached, testData) = dfInput.randomSplit(Array(0.7, 0.3), splitSeed) 73 | 74 | // Try to run with & without cache() 75 | // val trainingData = trainingDataUncached.cache() 76 | val trainingData = trainingDataUncached 77 | 78 | val classifier = new RandomForestClassifier(). 79 | setImpurity("gini"). 80 | setMaxDepth(3). 81 | setNumTrees(20). 82 | setFeatureSubsetStrategy("auto"). 83 | setSeed(5043) 84 | 85 | val model = classifier.fit(trainingData) 86 | println(model.toDebugString) 87 | 88 | println("=" * 30) 89 | println("Before pipeline fitting\n") 90 | val predictions = model.transform(testData) 91 | 92 | val evaluator = new BinaryClassificationEvaluator().setLabelCol("label") 93 | val accuracy = evaluator.evaluate(predictions) 94 | println(f"Accuracy: $accuracy%2.3f") 95 | printPredictionMetrics(predictions) 96 | 97 | // Save the model to latter use 98 | model.write.overwrite().save(modelPath) 99 | 100 | // Let's try to do better 101 | val paramGrid = new ParamGridBuilder(). 102 | addGrid(classifier.maxBins, Array(20, 40)). 103 | addGrid(classifier.maxDepth, Array(2, 10)). 104 | addGrid(classifier.numTrees, Array(10, 60)). 105 | addGrid(classifier.impurity, Array("entropy", "gini")). 106 | build() 107 | 108 | val steps: Array[PipelineStage] = Array(classifier) 109 | val pipeline = new Pipeline().setStages(steps) 110 | 111 | val cv = new CrossValidator(). 112 | setEstimator(pipeline). 113 | setEvaluator(evaluator). 114 | setEstimatorParamMaps(paramGrid). 115 | setNumFolds(10) 116 | 117 | val pipelineFittedModel = cv.fit(trainingData) 118 | 119 | val predictions2 = pipelineFittedModel.transform(testData) 120 | val accuracy2 = evaluator.evaluate(predictions2) 121 | println("=" * 30) 122 | println("AFTER pipeline fitting\n") 123 | println(f"Accuracy: $accuracy2%2.3f") 124 | 125 | val bestModel = pipelineFittedModel.bestModel.asInstanceOf[PipelineModel].stages(0) 126 | val params = bestModel.extractParamMap 127 | 128 | println( 129 | s""" 130 | |The best model found was: 131 | |${bestModel} 132 | | 133 | |Using params: 134 | |${params} 135 | | 136 | """.stripMargin) 137 | 138 | printPredictionMetrics(predictions2) 139 | 140 | // Not saving the final model... 
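// A hedged sketch (not in the original workshop code): the tuned pipeline could also be persisted,
// e.g. under a hypothetical path next to the first model, and reloaded later with PipelineModel.load:
// pipelineFittedModel.bestModel.asInstanceOf[PipelineModel].write.overwrite().save(modelPath + ".tuned")
// val reloaded = PipelineModel.load(modelPath + ".tuned")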
141 | // 142 | } 143 | 144 | def printPredictionMetrics(predictions: DataFrame)(implicit spark: SparkSession) { 145 | // Extract PREDICTED and CORRECT (label) values 146 | import spark.implicits._ 147 | val predictionAndObservations = predictions.select('prediction, 'label) 148 | val rdd = predictionAndObservations.rdd.map(r => (r.getDouble(0), r.getDouble(1))) 149 | 150 | // Calculate the Quality Metrics 151 | val rm = new RegressionMetrics(rdd) 152 | val msg = 153 | s""" 154 | |MSE: ${rm.meanSquaredError} 155 | |MAE: ${rm.meanAbsoluteError} 156 | |RMSE Squared: ${rm.rootMeanSquaredError} 157 | |R Squared: ${rm.r2} 158 | |Exp. Variance: ${rm.explainedVariance} 159 | | 160 | """.stripMargin 161 | 162 | println(msg) 163 | } 164 | } 165 | 166 | -------------------------------------------------------------------------------- /code/scala/credit-risk-analysis/src/main/scala/es/arjon/DatasetUtil.scala: -------------------------------------------------------------------------------- 1 | package es.arjon 2 | 3 | import org.apache.spark.ml.feature.VectorAssembler 4 | import org.apache.spark.sql.{DataFrame, SparkSession} 5 | 6 | trait DatasetUtil { 7 | 8 | // when using console add this 9 | // implicit val ss = spark 10 | def loadTrainData(csv: String)(implicit spark: SparkSession) = { 11 | import org.apache.spark.sql.types._ 12 | 13 | val schema = StructType(Seq( 14 | StructField("creditability", StringType, nullable = false), 15 | StructField("balance", DoubleType, nullable = false), 16 | StructField("duration", DoubleType, nullable = false), 17 | StructField("history", DoubleType, nullable = false), 18 | StructField("purpose", DoubleType, nullable = false), 19 | StructField("amount", DoubleType, nullable = false), 20 | StructField("savings", DoubleType, nullable = false), 21 | StructField("employment", DoubleType, nullable = false), 22 | StructField("instPercent", DoubleType, nullable = false), 23 | StructField("sexMarried", DoubleType, nullable = false), 24 | StructField("guarantors", DoubleType, nullable = false), 25 | StructField("residenceDuration", DoubleType, nullable = false), 26 | StructField("assets", DoubleType, nullable = false), 27 | StructField("age", DoubleType, nullable = false), 28 | StructField("concCredit", DoubleType, nullable = false), 29 | StructField("apartment", DoubleType, nullable = false), 30 | StructField("credits", DoubleType, nullable = false), 31 | StructField("occupation", DoubleType, nullable = false), 32 | StructField("dependents", DoubleType, nullable = false), 33 | StructField("hasPhone", DoubleType, nullable = false), 34 | StructField("foreign", DoubleType, nullable = false) 35 | )) 36 | 37 | spark.read. 38 | option("header", false). 39 | schema(schema). 
40 | csv(csv) 41 | } 42 | 43 | def loadUserInputData(csv: String)(implicit spark: SparkSession) = { 44 | import org.apache.spark.sql.types._ 45 | val schema = StructType(Seq( 46 | StructField("userId", StringType, nullable = false), // USER ID to identify the PREDICTED ANSWER 47 | StructField("balance", DoubleType, nullable = false), 48 | StructField("duration", DoubleType, nullable = false), 49 | StructField("history", DoubleType, nullable = false), 50 | StructField("purpose", DoubleType, nullable = false), 51 | StructField("amount", DoubleType, nullable = false), 52 | StructField("savings", DoubleType, nullable = false), 53 | StructField("employment", DoubleType, nullable = false), 54 | StructField("instPercent", DoubleType, nullable = false), 55 | StructField("sexMarried", DoubleType, nullable = false), 56 | StructField("guarantors", DoubleType, nullable = false), 57 | StructField("residenceDuration", DoubleType, nullable = false), 58 | StructField("assets", DoubleType, nullable = false), 59 | StructField("age", DoubleType, nullable = false), 60 | StructField("concCredit", DoubleType, nullable = false), 61 | StructField("apartment", DoubleType, nullable = false), 62 | StructField("credits", DoubleType, nullable = false), 63 | StructField("occupation", DoubleType, nullable = false), 64 | StructField("dependents", DoubleType, nullable = false), 65 | StructField("hasPhone", DoubleType, nullable = false), 66 | StructField("foreign", DoubleType, nullable = false) 67 | )) 68 | 69 | spark.read. 70 | option("header", false). 71 | schema(schema). 72 | csv(csv) 73 | } 74 | 75 | def vectorizeInput(df: DataFrame)(implicit spark: SparkSession): DataFrame = { 76 | import spark.implicits._ 77 | 78 | val featureCols = Array("balance", "duration", "history", "purpose", "amount", 79 | "savings", "employment", "instPercent", "sexMarried", "guarantors", 80 | "residenceDuration", "assets", "age", "concCredit", "apartment", 81 | "credits", "occupation", "dependents", "hasPhone", "foreign") 82 | 83 | val assembler = new VectorAssembler().setInputCols(featureCols).setOutputCol("features") 84 | val out = assembler.transform(df) 85 | out.select('features).show(truncate = false) 86 | 87 | out 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /code/scala/us-stock-analysis/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | .DS_Store 3 | spark-warehouse 4 | .idea 5 | dataset/output.parquet/ 6 | derby.log 7 | metastore_db 8 | -------------------------------------------------------------------------------- /code/scala/us-stock-analysis/README.md: -------------------------------------------------------------------------------- 1 | # ETL: US stocks analysis 2 | 3 | 4 | 5 | ### Create a jar containing your application and its deps 6 | ```bash 7 | $ sbt clean assembly 8 | ``` 9 | 10 | ### Use spark-submit to run your application 11 | 12 | ```bash 13 | $ spark-submit \ 14 | --class "es.arjon.FromCsvToParquet" \ 15 | --master 'local[*]' \ 16 | target/scala-2.11/us-stock-analysis-assembly-0.1.jar 17 | ``` 18 | 19 | ```bash 20 | $ spark-submit \ 21 | --class "es.arjon.RunAll" \ 22 | --master 'spark://master:7077' \ 23 | --driver-class-path /app/postgresql-42.1.4.jar \ 24 | target/scala-2.11/us-stock-analysis-assembly-0.1.jar 25 | ``` 26 | -------------------------------------------------------------------------------- /code/scala/us-stock-analysis/build.sbt: 
-------------------------------------------------------------------------------- 1 | import sbt.Keys._ 2 | 3 | name := "us-stock-analysis" 4 | 5 | version := "0.1" 6 | 7 | scalaVersion := "2.11.12" 8 | 9 | scalacOptions += "-target:jvm-1.8" 10 | 11 | libraryDependencies ++= Seq( 12 | "org.apache.spark" %% "spark-sql" % "2.4.4" % "provided", 13 | "org.postgresql" % "postgresql" % "42.1.1", 14 | 15 | "org.apache.spark" %% "spark-streaming" % "2.4.4" % "provided", 16 | "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.4.4", 17 | "org.apache.spark" %% "spark-sql-kafka-0-10" % "2.4.4" 18 | ) 19 | 20 | assemblyMergeStrategy in assembly := { 21 | case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard 22 | case m if m.toLowerCase.matches("meta-inf.*\\.sf$") => MergeStrategy.discard 23 | case "log4j.properties" => MergeStrategy.first 24 | case "reference.conf" => MergeStrategy.concat 25 | case _ => MergeStrategy.first 26 | } -------------------------------------------------------------------------------- /code/scala/us-stock-analysis/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6") 2 | -------------------------------------------------------------------------------- /code/scala/us-stock-analysis/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.17 2 | -------------------------------------------------------------------------------- /code/scala/us-stock-analysis/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Define the root logger with appender file 2 | log4j.rootCategory=ERROR, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.eclipse.jetty=WARN 10 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR 11 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 12 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 13 | -------------------------------------------------------------------------------- /code/scala/us-stock-analysis/src/main/scala/es/arjon/EtlSteps.scala: -------------------------------------------------------------------------------- 1 | package es.arjon 2 | 3 | import org.apache.spark.sql.expressions.Window 4 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} 5 | 6 | case class Stock(name: String, 7 | dateTime: String, 8 | open: Double, 9 | high: Double, 10 | low: Double, 11 | close: Double) 12 | 13 | object Stock { 14 | def fromCSV(symbol: String, line: String): Option[Stock] = { 15 | val v = line.split(",") 16 | 17 | try { 18 | Some( 19 | Stock( 20 | symbol, 21 | dateTime = v(0), 22 | open = v(1).toDouble, 23 | high = v(2).toDouble, 24 | low = v(3).toDouble, 25 | close = v(4).toDouble 26 | ) 27 | ) 28 | 29 | } catch { 30 | case ex: Exception => { 31 | println(s"Failed to process $symbol, with input $line, with ${ex.toString}") 32 | None 33 | } 34 | } 35 | 36 | } 37 | } 38 | 39 | 40 | object RunAll { 41 | def main(args: Array[String]): Unit = { 42 | if (args.length < 3) { 43 | System.err.println( 44 | s""" 45 | |Usage: RunAll 46 | | 
folder where stocks data is located 47 | | file containing lookup information 48 | | folder to write parquet data 49 | | 50 | |RunAll /dataset/stocks-small /dataset/yahoo-symbols-201709.csv /dataset/output.parquet 51 | """.stripMargin) 52 | System.exit(1) 53 | } 54 | 55 | val Array(stocksFolder, lookupSymbol, outputFolder) = args 56 | 57 | 58 | val spark = SparkSession. 59 | builder. 60 | appName("Stocks:ETL"). 61 | getOrCreate() 62 | 63 | val stocksDS = ReadStockCSV.processDS(spark, stocksFolder) 64 | val lookup = ReadSymbolLookup.process(spark, lookupSymbol) 65 | 66 | // For implicit conversions like converting RDDs to DataFrames 67 | import org.apache.spark.sql.functions._ 68 | import spark.implicits._ 69 | 70 | val ds = stocksDS. 71 | withColumn("full_date", unix_timestamp($"dateTime", "yyyy-MM-dd").cast("timestamp")). 72 | filter("full_date >= \"2017-09-01\""). 73 | withColumn("year", year($"full_date")). 74 | withColumn("month", month($"full_date")). 75 | withColumn("day", dayofmonth($"full_date")). 76 | drop($"dateTime"). 77 | withColumnRenamed("name", "symbol"). 78 | join(lookup, Seq("symbol")) 79 | 80 | // https://weishungchung.com/2016/08/21/spark-analyzing-stock-price/ 81 | val movingAverageWindow20 = Window.partitionBy($"symbol").orderBy("full_date").rowsBetween(-20, 0) 82 | val movingAverageWindow50 = Window.partitionBy($"symbol").orderBy("full_date").rowsBetween(-50, 0) 83 | val movingAverageWindow100 = Window.partitionBy($"symbol").orderBy("full_date").rowsBetween(-100, 0) 84 | 85 | // Calculate the moving average 86 | val stocksMA = ds. 87 | withColumn("ma20", avg($"close").over(movingAverageWindow20)). 88 | withColumn("ma50", avg($"close").over(movingAverageWindow50)). 89 | withColumn("ma100", avg($"close").over(movingAverageWindow100)) 90 | 91 | stocksMA.show(100) 92 | 93 | DatasetToParquet.process(spark, stocksMA, outputFolder) 94 | 95 | DatasetToPostgres.process(spark, stocksMA) 96 | 97 | spark.stop() 98 | } 99 | } 100 | 101 | object ReadStockCSV { 102 | 103 | def extractSymbolFromFilename(filename: String) = { 104 | val arr = filename.split("/") 105 | arr(arr.size - 1).split("\\.")(0).toUpperCase 106 | } 107 | 108 | def processDS(spark: SparkSession, originFolder: String) = { 109 | import org.apache.spark.sql.functions._ 110 | import spark.implicits._ 111 | 112 | val symbolFromFilename = udf(extractSymbolFromFilename _) 113 | 114 | spark.read. 115 | option("header", true). 116 | option("inferSchema", true). 117 | csv(originFolder). 118 | withColumn("name", symbolFromFilename(input_file_name())). 119 | withColumnRenamed("Date", "dateTime"). 120 | withColumnRenamed("Open", "open"). 121 | withColumnRenamed("High", "high"). 122 | withColumnRenamed("Low", "low"). 123 | withColumnRenamed("Close", "close"). 124 | drop("Volume", "OpenInt"). 125 | as[Stock] 126 | } 127 | 128 | 129 | def processRDD(spark: SparkSession, originFolder: String) = { 130 | 131 | // Using SparkContext to use RDD 132 | val sc = spark.sparkContext 133 | val files = sc.wholeTextFiles(originFolder, minPartitions = 40) 134 | 135 | val stocks = files.map { case (filename, content) => 136 | val symbol = extractSymbolFromFilename(filename) 137 | 138 | content.split("\n").flatMap { line => 139 | Stock.fromCSV(symbol, line) 140 | } 141 | }. 142 | flatMap(e => e). 143 | cache 144 | 145 | import spark.implicits._ 146 | 147 | stocks.toDS.as[Stock] 148 | } 149 | } 150 | 151 | object ReadSymbolLookup { 152 | def process(spark: SparkSession, file: String) = { 153 | import spark.implicits._ 154 | spark.read. 
155 | option("header", true). 156 | option("inferSchema", true). 157 | csv(file). 158 | // filter("Country = \"USA\""). 159 | // filter($"Country" === "USA"). 160 | select($"Ticker", $"Category Name"). 161 | withColumnRenamed("Ticker", "symbol"). 162 | withColumnRenamed("Category Name", "category") 163 | } 164 | } 165 | 166 | object DatasetToParquet { 167 | def process(spark: SparkSession, df: DataFrame, destinationFolder: String): Unit = { 168 | // https://stackoverflow.com/questions/43731679/how-to-save-a-partitioned-parquet-file-in-spark-2-1 169 | df. 170 | write. 171 | mode("overwrite"). 172 | partitionBy("year", "month", "day"). 173 | parquet(destinationFolder) 174 | } 175 | } 176 | 177 | object DatasetToPostgres { 178 | 179 | def process(spark: SparkSession, df: DataFrame): Unit = { 180 | // Write to Postgres 181 | val connectionProperties = new java.util.Properties 182 | connectionProperties.put("user", "workshop") 183 | connectionProperties.put("password", "w0rkzh0p") 184 | val jdbcUrl = s"jdbc:postgresql://postgres:5432/workshop" 185 | 186 | df. 187 | drop("year", "month", "day"). // drop unused columns 188 | write. 189 | mode(SaveMode.Append). 190 | jdbc(jdbcUrl, "stocks", connectionProperties) 191 | 192 | } 193 | } 194 | 195 | // TODO: Read compressed 196 | // option("codec", "org.apache.hadoop.io.compress.GzipCodec"). 197 | -------------------------------------------------------------------------------- /code/scala/us-stock-analysis/src/main/scala/es/arjon/FakeStockPriceGenerator.scala: -------------------------------------------------------------------------------- 1 | package es.arjon 2 | 3 | import java.time.ZonedDateTime 4 | import java.util.Properties 5 | 6 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 7 | 8 | object FakeStockPriceGenerator extends App { 9 | val rnd = new scala.util.Random(42) 10 | 11 | if (args.length < 2 || args.length > 3) { 12 | System.err.println( 13 | s""" 14 | |Usage: FakeStockPriceGenerator 15 | | is a list of one or more Kafka brokers 16 | | one kafka topic to produce to 17 | | [OPTIONAL] iso timestamp from when to start producing data 18 | | 19 | | FakeStockPriceGenerator kafka:9092 stocks 2017-11-11T10:00:00Z 20 | """.stripMargin) 21 | System.exit(1) 22 | } 23 | 24 | val brokers = args(0) 25 | val topic = args(1) 26 | 27 | # The default vauel is when the batch sample data ends 28 | val tradingStartParam = if (args.length == 3) args(2) else "2017-11-11T10:00:00Z" 29 | 30 | var tradingBeginOfTime = ZonedDateTime.parse(tradingStartParam) 31 | 32 | println( 33 | s""" 34 | |Generating faking stocks prices at $brokers/$topic 35 | |Each tick (300ms) represents 3min in clock time 36 | """.stripMargin) 37 | 38 | val props = new Properties() 39 | props.put("bootstrap.servers", brokers) 40 | props.put("client.id", "FakeStockPriceGenerator") 41 | props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer") 42 | props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer") 43 | 44 | val producer = new KafkaProducer[String, String](props) 45 | var counter = 0 46 | 47 | while (true) { 48 | val stock = nextSymbol() 49 | val data = new ProducerRecord[String, String](topic, null, stock) 50 | 51 | producer.send(data) 52 | Thread.sleep(300) 53 | 54 | counter += 1 55 | println(s"# $counter: $stock") 56 | } 57 | 58 | producer.close() 59 | 60 | 61 | def nextSymbol(): String = { 62 | # a very naive impl of marketing hours 63 | # not consider weekends nor holidays 64 | def nextMarketTime = { 
65 | // val tick = 3 66 | // Sometimes it subtracts 1 and generates late arriving tickers 67 | val tick = rnd.nextInt(5)-1 68 | val proposedNextTime = tradingBeginOfTime.plusMinutes(tick) 69 | val nextTime = if (proposedNextTime.getHour > 15) 70 | proposedNextTime.plusDays(1).withHour(10).withMinute(0) 71 | else 72 | proposedNextTime 73 | 74 | tradingBeginOfTime = nextTime 75 | nextTime 76 | } 77 | 78 | 79 | case class StockConf(symbol: String, price: Double, volatility: Double) 80 | 81 | // Using as SEED for the generator last 90 days of stocks 82 | // price: Max Closing Price 83 | // volatility: StdDev of Closing Pricing 84 | // df.groupBy($"symbol") 85 | // .agg(stddev_pop($"close").as("volatility"), max($"close").as("price")) 86 | // .orderBy($"symbol") 87 | // 88 | val quotes = List( 89 | StockConf("AAPL", 175.61, 6.739169981533334), 90 | StockConf("BABA", 188.51, 5.637335242825282), 91 | StockConf("CSCO", 34.62, 0.9673997717593282), 92 | StockConf("DHR", 93.24, 2.949284608917899), 93 | StockConf("EBAY", 38.99, 0.8110024414266584), 94 | StockConf("FB", 182.66, 4.14292553638126), 95 | StockConf("GOOG", 1039.85, 37.960859608812854), 96 | StockConf("GOOGL", 1058.29, 39.11749241707603), 97 | StockConf("IBM", 160.47, 4.8367462989079755), 98 | StockConf("INTC", 46.826, 3.678237311321825), 99 | StockConf("JNJ", 143.62, 4.336597380435497), 100 | StockConf("MELI", 292.05, 19.703519789367583), 101 | StockConf("MSFT", 84.56, 3.7745700470384693), 102 | StockConf("ORCL", 52.593, 1.4026418724678085), 103 | StockConf("QCOM", 65.49, 3.962328548164577), 104 | StockConf("TSLA", 385.0, 21.667055079857995), 105 | StockConf("TXN", 98.54, 5.545761038090265), 106 | StockConf("WDC", 89.9, 1.7196676293981952), 107 | StockConf("XRX", 33.86, 1.4466726098188216) 108 | ) 109 | 110 | def signal = if (rnd.nextInt(2) == 0) 1 else -1 111 | 112 | val quote = quotes(rnd.nextInt(quotes.size)) 113 | 114 | val price = quote.price + (signal * rnd.nextDouble * quote.volatility * 3) 115 | 116 | // 117 | f"""{"symbol":"${quote.symbol}","timestamp":"${nextMarketTime}","price":$price%2.3f}""" 118 | } 119 | 120 | } 121 | -------------------------------------------------------------------------------- /code/scala/us-stock-analysis/src/main/scala/es/arjon/StreamingETL.scala: -------------------------------------------------------------------------------- 1 | package es.arjon 2 | 3 | import java.util.Properties 4 | 5 | import org.apache.spark.sql.DataFrame 6 | import org.apache.spark.sql.SparkSession 7 | import org.apache.spark.sql.functions.udf 8 | import org.apache.spark.sql.streaming.{OutputMode, Trigger} 9 | import org.apache.spark.sql.types._ 10 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 11 | import org.apache.spark.sql.SaveMode 12 | 13 | 14 | 15 | object StreamingETL extends App { 16 | if (args.length < 2) { 17 | System.err.println( 18 | s""" 19 | |Usage: StreamingETL 20 | | is a list of one or more Kafka brokers 21 | | is a list of one or more kafka topics to consume from 22 | | 23 | | StreamingETL kafka:9092 stocks 24 | """.stripMargin) 25 | System.exit(1) 26 | } 27 | 28 | val Array(brokers, topics) = args 29 | val spark = SparkSession. 30 | builder. 31 | appName("Stocks:StreamingETL").
32 | getOrCreate() 33 | 34 | // val brokers = "kafka:9092" 35 | // val topics = "stocks" 36 | 37 | 38 | // Create DataSet representing the stream of input lines from kafka 39 | // https://databricks.com/blog/2017/04/26/processing-data-in-apache-kafka-with-structured-streaming-in-apache-spark-2-2.html 40 | val jsons = spark. 41 | readStream. 42 | format("kafka"). 43 | option("kafka.bootstrap.servers", brokers). 44 | option("subscribe", topics). 45 | //option("startingOffsets", "earliest"). 46 | load() 47 | 48 | 49 | jsons.printSchema 50 | 51 | val schema = StructType(Seq( 52 | StructField("symbol", StringType, nullable = false), 53 | StructField("timestamp", TimestampType, nullable = false), 54 | StructField("price", DoubleType, nullable = false) 55 | )) 56 | 57 | import org.apache.spark.sql.functions._ 58 | import spark.implicits._ 59 | 60 | val jsonOptions = Map("timestampFormat" -> "yyyy-MM-dd'T'HH:mm'Z'") 61 | val stocksJson = jsons. 62 | select(from_json($"value".cast("string"), schema, jsonOptions).as("content")) 63 | 64 | stocksJson.printSchema 65 | 66 | val stocks = stocksJson.select($"content.*") 67 | 68 | stocks.printSchema 69 | 70 | // Write to Parquet 71 | val query = stocks. 72 | withColumn("year", year($"timestamp")). 73 | withColumn("month", month($"timestamp")). 74 | withColumn("day", dayofmonth($"timestamp")). 75 | withColumn("hour", hour($"timestamp")). 76 | withColumn("minute", minute($"timestamp")). 77 | writeStream. 78 | format("parquet"). 79 | partitionBy("year", "month", "day", "hour", "minute"). 80 | option("startingOffsets", "earliest"). 81 | option("checkpointLocation", "/dataset/checkpoint"). 82 | option("path", "/dataset/streaming.parquet"). 83 | trigger(Trigger.ProcessingTime("30 seconds")). 84 | start() 85 | query.awaitTermination() 86 | 87 | // AverageStocksToPostgres.process(spark, stocks) 88 | 89 | // Using as an ordinary DF 90 | // val avgPricing = stocks. 91 | // groupBy($"symbol"). 92 | // agg(avg($"price").as("avg_price")) 93 | 94 | 95 | // avgPricing.printSchema 96 | 97 | // Start running the query that prints the running results to the console 98 | // val query = avgPricing.writeStream. 99 | // outputMode(OutputMode.Complete). 100 | // format("console"). 101 | // trigger(Trigger.ProcessingTime("10 seconds")). 102 | // start() 103 | // query.awaitTermination() 104 | 105 | // // Have all the aggregates in an in-memory table 106 | // val query = avgPricing 107 | // .writeStream 108 | // .queryName("avgPricing") // this query name will be the table name 109 | // .outputMode("complete") 110 | // .format("memory") 111 | // .trigger(Trigger.ProcessingTime("10 seconds")) 112 | // .start() 113 | 114 | // while (true) { 115 | // Thread.sleep(10 * 1000) 116 | // // interactively query in-memory table 117 | // spark.sql("select * from avgPricing").show() 118 | // //println(query.lastProgress) 119 | // } 120 | 121 | // query.awaitTermination() 122 | } 123 | 124 | 125 | object AverageStocksToPostgres { 126 | 127 | def process(spark: SparkSession, stocks: DataFrame): Unit = { 128 | 129 | import org.apache.spark.sql.functions._ 130 | import spark.implicits._ 131 | 132 | val avgPricing = stocks. 133 | withWatermark("timestamp", "60 seconds"). 134 | groupBy( window($"timestamp", "30 seconds"), 135 | $"symbol"). 
136 | agg(avg($"price").as("avg_price")) 137 | 138 | 139 | avgPricing.printSchema 140 | 141 | val connectionProperties = new Properties() 142 | connectionProperties.put("user", "workshop") 143 | connectionProperties.put("password", "w0rkzh0p") 144 | connectionProperties.put("driver", "org.postgresql.Driver") 145 | 146 | 147 | val winToString = udf{(window:GenericRowWithSchema) => window.mkString("-")} 148 | 149 | val processAvgTickers = avgPricing. 150 | withColumn("window", winToString($"window")). 151 | writeStream. 152 | foreachBatch { (batchDF: DataFrame, batchId: Long) => 153 | batchDF.write.mode(SaveMode.Append).jdbc(s"jdbc:postgresql://postgres:5432/workshop", "workshop.test_streaming_inserts_avg_price", connectionProperties) 154 | }. 155 | trigger(Trigger.ProcessingTime("10 seconds")). 156 | start() 157 | 158 | processAvgTickers.awaitTermination() 159 | 160 | } 161 | } -------------------------------------------------------------------------------- /control-env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | function stop { 4 | echo "Stopping and removing containers" 5 | docker-compose --project-name wksp down 6 | } 7 | 8 | function cleanup { 9 | echo "Removing volume" 10 | docker volume rm wksp_postgres-data 11 | docker volume rm wksp_superset 12 | docker volume rm wksp_postgres-airflow-data 13 | } 14 | 15 | function start { 16 | echo "Starting up" 17 | docker-compose --project-name wksp up -d 18 | } 19 | 20 | function update { 21 | echo "Updating code ..." 22 | git pull --all 23 | 24 | echo "Updating docker images ..." 25 | docker-compose --project-name wksp pull 26 | 27 | echo "You probably should restart" 28 | } 29 | 30 | function info { 31 | echo ' 32 | Everything is ready, access your host to learn more (ie: http://localhost/) 33 | ' 34 | } 35 | 36 | function token { 37 | echo 'Your TOKEN for Jupyter Notebook is:' 38 | SERVER=$(docker exec -it jupyter jupyter notebook list) 39 | echo "${SERVER}" | grep '/notebook' | sed -E 's/^.*=([a-z0-9]+).*$/\1/' 40 | } 41 | 42 | function superset-init { 43 | echo 'Initializing Superset database using sqlite' 44 | docker exec -it superset superset-init 45 | } 46 | 47 | function psql { 48 | docker exec -it postgres psql -U workshop workshop 49 | } 50 | 51 | case $1 in 52 | start ) 53 | start 54 | info 55 | ;; 56 | 57 | stop ) 58 | stop 59 | ;; 60 | 61 | cleanup ) 62 | stop 63 | cleanup 64 | ;; 65 | 66 | update ) 67 | update 68 | ;; 69 | 70 | logs ) 71 | docker-compose --project-name wksp logs -f 72 | ;; 73 | 74 | token ) 75 | token 76 | ;; 77 | 78 | superset-init ) 79 | superset-init 80 | ;; 81 | 82 | psql ) 83 | psql 84 | ;; 85 | 86 | * ) 87 | printf "ERROR: Missing command\n Usage: `basename $0` (start|stop|cleanup|token|logs|update)\n" 88 | exit 1 89 | ;; 90 | esac 91 | -------------------------------------------------------------------------------- /dataset/.gitignore: -------------------------------------------------------------------------------- 1 | out 2 | credit-risk.model 3 | -------------------------------------------------------------------------------- /dataset/credit-risk/germancredit-user-input.csv: -------------------------------------------------------------------------------- 1 | user389076,4,12,2,3,3059,4,4,2,1,1,4,1,61,3,2,1,2,1,1,1 2 | user123456,1,24,2,2,2996,5,3,2,4,1,4,3,20,3,2,1,3,1,1,1 3 | user789012,2,6,2,0,14555,5,1,1,3,1,2,2,23,3,2,1,1,1,2,1 4 | user234567,1,48,2,3,6758,1,3,3,2,1,2,3,31,3,2,1,3,1,2,1 5 | 
user345678,1,48,2,0,7763,1,5,4,3,1,4,4,42,1,3,1,4,1,1,1 6 | user456789,1,36,4,6,8065,1,3,3,2,1,2,4,25,3,2,2,4,1,2,1 7 | -------------------------------------------------------------------------------- /dataset/global-temperature-1880-2016.json: -------------------------------------------------------------------------------- 1 | {"description":{"title":"Global Land and Ocean Temperature Anomalies, January-December","units":"Degrees Celsius","base_period":"1901-2000","missing":-999},"data":{"1880":"-0.12","1881":"-0.08","1882":"-0.10","1883":"-0.18","1884":"-0.27","1885":"-0.25","1886":"-0.24","1887":"-0.29","1888":"-0.13","1889":"-0.09","1890":"-0.35","1891":"-0.25","1892":"-0.30","1893":"-0.33","1894":"-0.31","1895":"-0.24","1896":"-0.09","1897":"-0.10","1898":"-0.27","1899":"-0.15","1900":"-0.07","1901":"-0.15","1902":"-0.25","1903":"-0.37","1904":"-0.45","1905":"-0.28","1906":"-0.21","1907":"-0.38","1908":"-0.43","1909":"-0.44","1910":"-0.40","1911":"-0.44","1912":"-0.34","1913":"-0.32","1914":"-0.14","1915":"-0.09","1916":"-0.32","1917":"-0.40","1918":"-0.31","1919":"-0.25","1920":"-0.23","1921":"-0.16","1922":"-0.24","1923":"-0.25","1924":"-0.24","1925":"-0.18","1926":"-0.07","1927":"-0.17","1928":"-0.18","1929":"-0.33","1930":"-0.11","1931":"-0.06","1932":"-0.13","1933":"-0.26","1934":"-0.11","1935":"-0.16","1936":"-0.12","1937":"-0.01","1938":"-0.02","1939":"0.01","1940":"0.15","1941":"0.27","1942":"0.10","1943":"0.10","1944":"0.27","1945":"0.17","1946":"-0.01","1947":"-0.04","1948":"-0.06","1949":"-0.08","1950":"-0.16","1951":"0.00","1952":"0.04","1953":"0.13","1954":"-0.10","1955":"-0.13","1956":"-0.18","1957":"0.07","1958":"0.12","1959":"0.08","1960":"0.05","1961":"0.09","1962":"0.10","1963":"0.12","1964":"-0.14","1965":"-0.07","1966":"-0.01","1967":"0.00","1968":"-0.03","1969":"0.11","1970":"0.06","1971":"-0.07","1972":"0.04","1973":"0.19","1974":"-0.06","1975":"0.01","1976":"-0.07","1977":"0.21","1978":"0.12","1979":"0.23","1980":"0.28","1981":"0.32","1982":"0.19","1983":"0.36","1984":"0.17","1985":"0.16","1986":"0.24","1987":"0.38","1988":"0.39","1989":"0.30","1990":"0.45","1991":"0.39","1992":"0.24","1993":"0.28","1994":"0.35","1995":"0.47","1996":"0.33","1997":"0.52","1998":"0.65","1999":"0.44","2000":"0.43","2001":"0.57","2002":"0.62","2003":"0.64","2004":"0.59","2005":"0.67","2006":"0.64","2007":"0.62","2008":"0.55","2009":"0.65","2010":"0.73","2011":"0.58","2012":"0.64","2013":"0.68","2014":"0.74","2015":"0.93","2016":"0.99"}} -------------------------------------------------------------------------------- /dataset/news/huffingtonpost-news.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/dataset/news/huffingtonpost-news.json.gz -------------------------------------------------------------------------------- /dataset/pyspark-df-overview/README.md: -------------------------------------------------------------------------------- 1 | # Adult Census Income Datase 2 | 3 | https://www.kaggle.com/uciml/adult-census-income/home -------------------------------------------------------------------------------- /dataset/pyspark-df-overview/census_income.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/dataset/pyspark-df-overview/census_income.csv.gz 
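The `global-temperature-1880-2016.json` dataset above is self-describing: its `description` block records the units (degrees Celsius), the 1901-2000 base period, and `-999` as the missing-value sentinel, while the yearly anomalies are stored as strings under `data`. The following is a hypothetical sketch (not part of the repo) of loading that series with pandas, assuming the `/dataset` mount path used by the workshop containers:

```python
# Hypothetical sketch, not part of the repo: load the self-describing
# temperature-anomaly JSON with pandas. Assumes the /dataset mount path
# used by the workshop containers.
import json

import pandas as pd

with open("/dataset/global-temperature-1880-2016.json") as f:
    doc = json.load(f)

meta = doc["description"]  # title, units, base_period, missing sentinel (-999)

series = (
    pd.Series(doc["data"], name="anomaly_c")  # values arrive as strings
      .astype(float)
      .rename_axis("year")
)
# The description block declares -999 as its missing-value sentinel; mask it defensively.
series = series.mask(series == meta["missing"])

print(f'{meta["title"]} ({meta["units"]}, base period {meta["base_period"]})')
print(series.tail())
```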
-------------------------------------------------------------------------------- /dataset/stocks/README.md: -------------------------------------------------------------------------------- 1 | # Huge Stock Market Dataset 2 | ## Full Historical Daily Price + Volume Data For All U.S. Stocks & ETFs 3 | High-quality financial data is expensive to acquire and is therefore rarely shared for free. Here I provide the full historical daily price and volume data for all U.S.-based stocks and ETFs trading on the NYSE, NASDAQ, and AMEX (NYSE MKT). It's one of the best datasets of its kind you can obtain. 4 | 5 | [Download the complete Dataset from kaggle.com](https://www.kaggle.com/borismarjanovic/price-volume-data-for-all-us-stocks-etfs/) 6 | 7 | Acknowledge/Thanks for [Boris Marjanovic](https://www.kaggle.com/borismarjanovic) 8 | 9 | --- 10 | 11 | ## `stocks-small` folder Inspired on Fortune 500 Tech list 12 | http://fortune.com/2015/06/13/fortune-500-tech/ 13 | -------------------------------------------------------------------------------- /dataset/yahoo-symbols-201709.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/dataset/yahoo-symbols-201709.csv -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | docs: 4 | container_name: docs 5 | image: nginx 6 | ports: 7 | - "80:80" 8 | volumes: 9 | - "./nginx/html:/usr/share/nginx/html:ro" 10 | 11 | master: 12 | container_name: master 13 | image: arjones/pyspark:2.4.5 14 | restart: always 15 | command: ["/opt/spark/sbin/start-master.sh"] 16 | environment: 17 | MASTER: spark://master:7077 18 | SPARK_NO_DAEMONIZE: 1 19 | ports: 20 | - 4040:4040 21 | - 6066:6066 22 | - 7077:7077 23 | - 8080:8080 24 | volumes: 25 | - ./code:/app 26 | - ./dataset:/dataset 27 | 28 | worker1: 29 | container_name: worker1 30 | image: arjones/pyspark:2.4.5 31 | restart: always 32 | command: ["/opt/spark/sbin/start-slave.sh", "spark://master:7077"] 33 | environment: 34 | MASTER: spark://master:7077 35 | SPARK_NO_DAEMONIZE: 1 36 | depends_on: 37 | - master 38 | ports: 39 | - 4041:4040 40 | - "6066" 41 | - "7077" 42 | - 8081:8080 43 | volumes: 44 | - ./code:/app 45 | - ./dataset:/dataset 46 | 47 | worker2: 48 | container_name: worker2 49 | image: arjones/pyspark:2.4.5 50 | restart: always 51 | command: ["/opt/spark/sbin/start-slave.sh", "spark://master:7077"] 52 | environment: 53 | MASTER: spark://master:7077 54 | SPARK_NO_DAEMONIZE: 1 55 | depends_on: 56 | - master 57 | ports: 58 | - 4042:4040 59 | - "6066" 60 | - "7077" 61 | - 8082:8080 62 | volumes: 63 | - ./code:/app 64 | - ./dataset:/dataset 65 | 66 | jupyter: 67 | container_name: jupyter 68 | image: arjones/pyspark:2.4.5 69 | restart: always 70 | environment: 71 | MASTER: spark://master:7077 72 | depends_on: 73 | - master 74 | ports: 75 | - "8888:8888" 76 | volumes: 77 | - ./jupyter/notebook:/notebook 78 | - ./dataset:/dataset 79 | - ./code:/app 80 | 81 | kafka: 82 | container_name: kafka 83 | image: spotify/kafka 84 | restart: always 85 | ports: 86 | - "2181:2181" 87 | - "9092:9092" 88 | environment: 89 | ADVERTISED_HOST: kafka 90 | ADVERTISED_PORT: 9092 91 | 92 | postgres: 93 | container_name: postgres 94 | image: postgres:11 95 | restart: always 96 | volumes: 97 | - postgres-data:/var/lib/postgresql/data 98 | - 
./postgres/scripts:/docker-entrypoint-initdb.d 99 | environment: 100 | POSTGRES_DB: workshop 101 | POSTGRES_USER: workshop 102 | POSTGRES_PASSWORD: w0rkzh0p 103 | ports: 104 | - "5432:5432" 105 | 106 | redis: 107 | container_name: redis 108 | image: redis 109 | restart: always 110 | 111 | superset: 112 | container_name: superset 113 | image: amancevice/superset 114 | restart: always 115 | depends_on: 116 | - redis 117 | - postgres 118 | environment: 119 | MAPBOX_API_KEY: ${MAPBOX_API_KEY} 120 | SUPERSET_HOME: /etc/superset 121 | ports: 122 | - "8088:8088" 123 | volumes: 124 | - ./superset/conf/superset_config.py:/etc/superset/superset_config.py 125 | - superset:/var/lib/superset 126 | 127 | postgres-airflow: 128 | container_name: postgres-airflow 129 | image: postgres:11 130 | restart: always 131 | volumes: 132 | - postgres-airflow-data:/var/lib/postgresql/data 133 | environment: 134 | POSTGRES_DB: airflow 135 | POSTGRES_USER: airflow 136 | POSTGRES_PASSWORD: airflow 137 | ports: 138 | - "5434:5432" 139 | 140 | airflow: 141 | container_name: airflow 142 | image: puckel/docker-airflow 143 | restart: always 144 | depends_on: 145 | - postgres-airflow 146 | environment: 147 | EXECUTOR: Local 148 | LOAD_EX: n 149 | AIRFLOW__WEBSERVER__WEB_SERVER_PORT: 9090 150 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres-airflow:5432/airflow 151 | AIRFLOW__CORE__FERNET_KEY: "Eff80poJxv6LE4432pDC6OmD6N449KCSuhUAMLXiq4U=" 152 | ports: 153 | - "9090:9090" 154 | volumes: 155 | - ./airflow/dags:/usr/local/airflow/dags 156 | 157 | volumes: 158 | postgres-data: 159 | superset: 160 | postgres-airflow-data: 161 | -------------------------------------------------------------------------------- /images/docker-advanced-config.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/docker-advanced-config.jpg -------------------------------------------------------------------------------- /images/superset-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/superset-01.png -------------------------------------------------------------------------------- /images/superset-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/superset-02.png -------------------------------------------------------------------------------- /images/superset-03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/superset-03.png -------------------------------------------------------------------------------- /images/superset-04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/superset-04.png -------------------------------------------------------------------------------- /images/superset-05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/superset-05.png 
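The `docker-compose.yml` above publishes the workshop Postgres on the host's port 5432 (and the Airflow metadata database on 5434), with the `workshop`/`w0rkzh0p` credentials set in the `postgres` service. As a quick connectivity check after starting the stack, something like the following could be run from the host; this is a hypothetical sketch assuming `psycopg2-binary` is installed locally (it is not part of the repo):

```python
# Hypothetical connectivity check, not part of the repo.
# Assumes: the stack is up (./control-env.sh start) and psycopg2-binary is
# installed on the host. Host, port and credentials come from docker-compose.yml.
import psycopg2

conn = psycopg2.connect(
    host="localhost",
    port=5432,              # "postgres" service, published as 5432:5432
    dbname="workshop",
    user="workshop",
    password="w0rkzh0p",
)
try:
    with conn.cursor() as cur:
        cur.execute("SELECT current_database(), version()")
        print(cur.fetchone())
finally:
    conn.close()
```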
-------------------------------------------------------------------------------- /images/superset-06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/superset-06.png -------------------------------------------------------------------------------- /images/superset-07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/superset-07.png -------------------------------------------------------------------------------- /images/superset-08.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/superset-08.png -------------------------------------------------------------------------------- /images/superset-09.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/superset-09.png -------------------------------------------------------------------------------- /images/superset-10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/superset-10.png -------------------------------------------------------------------------------- /images/superset-11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/superset-11.png -------------------------------------------------------------------------------- /images/superset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/images/superset.png -------------------------------------------------------------------------------- /jupyter/notebook/README.md: -------------------------------------------------------------------------------- 1 | # Notebook 2 | 3 | ## Pandas (without Spark integration) 4 | 5 | * [pandas-json-sample](pandas-json-sample.ipynb) 6 | 7 | ## pySpark 8 | 9 | ### Check Installation 10 | 11 | * [pyspark-intro](pyspark-intro.ipynb): basic pySpark operations 12 | * [pyspark-check-install](pyspark-check-install.ipynb): checks the pySpark installation; this notebook must run without errors. 13 | * [pyspark-apache-arrow](pyspark-apache-arrow.ipynb): Apache Arrow to integrate Pandas/NumPy data with pySpark.
14 | * [pyspark-postgres](pyspark-postgres.ipynb): Reading/Writing data from Postgres 15 | 16 | ### Basic commands 17 | 18 | * [pyspark-dataframe-overview](pyspark-dataframe-overview.ipynb): Spark Dataframe operations 19 | 20 | 21 | ### Machine Learning 22 | 23 | * **Titanic** [Exercise](titanic/titanic_spark_exercises.ipynb) | [Solution](titanic/titanic_spark_solutions.ipynb): [Kaggle Competition](https://www.kaggle.com/c/titanic) solved using pySpark 24 | * [pyspark-nlp](pyspark-nlp.ipynb): Multi-Class Text Classification Using PySpark, MLlib & Doc2Vec 25 | -------------------------------------------------------------------------------- /jupyter/notebook/batch_etl_steps.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import findspark\n", 10 | "\n", 11 | "findspark.add_jars('/app/postgresql-42.1.4.jar')\n", 12 | "findspark.init()" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "from pyspark.sql import SparkSession\n", 22 | "\n", 23 | "spark = SparkSession.builder.appName(\"Stocks:ETL\").getOrCreate()" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/plain": [ 34 | "'2.4.5'" 35 | ] 36 | }, 37 | "execution_count": 3, 38 | "metadata": {}, 39 | "output_type": "execute_result" 40 | } 41 | ], 42 | "source": [ 43 | "spark.version" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 4, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "stocks_dir = '/dataset/stocks-small'" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 5, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "import sys\n", 62 | "\n", 63 | "from pyspark.sql import SparkSession\n", 64 | "\n", 65 | "# UDF\n", 66 | "from pyspark.sql.types import StringType\n", 67 | "#\n", 68 | "from pyspark.sql import functions as F\n", 69 | "from pyspark.sql.window import Window" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 6, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "df = spark.read \\\n", 79 | " .option(\"header\", True) \\\n", 80 | " .option(\"inferSchema\", True) \\\n", 81 | " .csv(stocks_dir)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 7, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "root\n", 94 | " |-- Date: timestamp (nullable = true)\n", 95 | " |-- Open: double (nullable = true)\n", 96 | " |-- High: double (nullable = true)\n", 97 | " |-- Low: double (nullable = true)\n", 98 | " |-- Close: double (nullable = true)\n", 99 | " |-- Volume: integer (nullable = true)\n", 100 | " |-- OpenInt: integer (nullable = true)\n", 101 | "\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "df.count()\n", 107 | "df.printSchema()" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 8, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "+-------------------+------+------+------+------+------+-------+\n", 120 | "| Date| Open| High| Low| Close|Volume|OpenInt|\n", 121 | "+-------------------+------+------+------+------+------+-------+\n", 122 | "|1962-01-02 00:00:00| 6.413| 
6.413|6.3378|6.3378|467056| 0|\n", 123 | "|1962-01-03 00:00:00|6.3378|6.3963|6.3378|6.3963|350294| 0|\n", 124 | "|1962-01-04 00:00:00|6.3963|6.3963|6.3295|6.3295|314365| 0|\n", 125 | "|1962-01-05 00:00:00|6.3211|6.3211|6.1958|6.2041|440112| 0|\n", 126 | "|1962-01-08 00:00:00|6.2041|6.2041|6.0373| 6.087|655676| 0|\n", 127 | "|1962-01-09 00:00:00|6.1208|6.2376|6.1208|6.1621|592806| 0|\n", 128 | "|1962-01-10 00:00:00|6.1707|6.2041|6.1707|6.1707|359274| 0|\n", 129 | "|1962-01-11 00:00:00|6.1875|6.2376|6.1875|6.2376|386220| 0|\n", 130 | "|1962-01-12 00:00:00|6.2543|6.2962|6.2543|6.2543|529933| 0|\n", 131 | "|1962-01-15 00:00:00|6.2708|6.2962|6.2708|6.2792|305383| 0|\n", 132 | "|1962-01-16 00:00:00|6.2708|6.2708|6.2128|6.2128|305383| 0|\n", 133 | "|1962-01-17 00:00:00|6.1875|6.1875|6.0956|6.1125|502984| 0|\n", 134 | "|1962-01-18 00:00:00|6.1291|6.1875|6.1291|6.1291|449093| 0|\n", 135 | "|1962-01-19 00:00:00|6.1291|6.1457|6.0624|6.1374|485021| 0|\n", 136 | "|1962-01-22 00:00:00|6.1374|6.1958|6.1208|6.1208|332329| 0|\n", 137 | "|1962-01-23 00:00:00|6.1208|6.1291|6.0538|6.0624|449093| 0|\n", 138 | "|1962-01-24 00:00:00|6.0624|6.0956|6.0287|6.0956|494001| 0|\n", 139 | "|1962-01-25 00:00:00|6.0956|6.1457|6.0208|6.0287|386220| 0|\n", 140 | "|1962-01-26 00:00:00|6.0287|6.0538|5.9951|5.9951|296401| 0|\n", 141 | "|1962-01-29 00:00:00|5.9951|6.0373|5.8952|5.8952|700585| 0|\n", 142 | "+-------------------+------+------+------+------+------+-------+\n", 143 | "only showing top 20 rows\n", 144 | "\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "df.show()" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 9, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "df = df.withColumn('filename', F.input_file_name())" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 10, 164 | "metadata": { 165 | "scrolled": false 166 | }, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "+-------------------+------+------+------+------+------+-------+---------------------------------------+\n", 173 | "|Date |Open |High |Low |Close |Volume|OpenInt|filename |\n", 174 | "+-------------------+------+------+------+------+------+-------+---------------------------------------+\n", 175 | "|1962-01-02 00:00:00|6.413 |6.413 |6.3378|6.3378|467056|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 176 | "|1962-01-03 00:00:00|6.3378|6.3963|6.3378|6.3963|350294|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 177 | "|1962-01-04 00:00:00|6.3963|6.3963|6.3295|6.3295|314365|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 178 | "|1962-01-05 00:00:00|6.3211|6.3211|6.1958|6.2041|440112|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 179 | "|1962-01-08 00:00:00|6.2041|6.2041|6.0373|6.087 |655676|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 180 | "|1962-01-09 00:00:00|6.1208|6.2376|6.1208|6.1621|592806|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 181 | "|1962-01-10 00:00:00|6.1707|6.2041|6.1707|6.1707|359274|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 182 | "|1962-01-11 00:00:00|6.1875|6.2376|6.1875|6.2376|386220|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 183 | "|1962-01-12 00:00:00|6.2543|6.2962|6.2543|6.2543|529933|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 184 | "|1962-01-15 00:00:00|6.2708|6.2962|6.2708|6.2792|305383|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 185 | "|1962-01-16 00:00:00|6.2708|6.2708|6.2128|6.2128|305383|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 186 | "|1962-01-17 
00:00:00|6.1875|6.1875|6.0956|6.1125|502984|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 187 | "|1962-01-18 00:00:00|6.1291|6.1875|6.1291|6.1291|449093|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 188 | "|1962-01-19 00:00:00|6.1291|6.1457|6.0624|6.1374|485021|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 189 | "|1962-01-22 00:00:00|6.1374|6.1958|6.1208|6.1208|332329|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 190 | "|1962-01-23 00:00:00|6.1208|6.1291|6.0538|6.0624|449093|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 191 | "|1962-01-24 00:00:00|6.0624|6.0956|6.0287|6.0956|494001|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 192 | "|1962-01-25 00:00:00|6.0956|6.1457|6.0208|6.0287|386220|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 193 | "|1962-01-26 00:00:00|6.0287|6.0538|5.9951|5.9951|296401|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 194 | "|1962-01-29 00:00:00|5.9951|6.0373|5.8952|5.8952|700585|0 |file:///dataset/stocks-small/ibm.us.txt|\n", 195 | "+-------------------+------+------+------+------+------+-------+---------------------------------------+\n", 196 | "only showing top 20 rows\n", 197 | "\n" 198 | ] 199 | } 200 | ], 201 | "source": [ 202 | "df.show(truncate=False)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 11, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "df_lookup = spark.read.csv('/dataset/yahoo-symbols-201709.csv')" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 12, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "name": "stdout", 221 | "output_type": "stream", 222 | "text": [ 223 | "+------+--------------------+--------+--------------------+-------+\n", 224 | "| _c0| _c1| _c2| _c3| _c4|\n", 225 | "+------+--------------------+--------+--------------------+-------+\n", 226 | "|Ticker| Name|Exchange| Category Name|Country|\n", 227 | "| OEDV|Osage Exploration...| PNK| null| USA|\n", 228 | "| AAPL| Apple Inc.| NMS|Electronic Equipment| USA|\n", 229 | "| BAC|Bank of America C...| NYQ| Money Center Banks| USA|\n", 230 | "| AMZN| Amazon.com, Inc.| NMS|Catalog & Mail Or...| USA|\n", 231 | "| T| AT&T Inc.| NYQ|Telecom Services ...| USA|\n", 232 | "| GOOG| Alphabet Inc.| NMS|Internet Informat...| USA|\n", 233 | "| MO| Altria Group, Inc.| NYQ| Cigarettes| USA|\n", 234 | "| DAL|Delta Air Lines, ...| NYQ| Major Airlines| USA|\n", 235 | "| AA| Alcoa Corporation| NYQ| Aluminum| USA|\n", 236 | "| AXP|American Express ...| NYQ| Credit Services| USA|\n", 237 | "| DD|E. I. 
du Pont de ...| NYQ|Agricultural Chem...| USA|\n", 238 | "| BABA|Alibaba Group Hol...| NYQ|Specialty Retail,...| USA|\n", 239 | "| ABT| Abbott Laboratories| NYQ|Medical Appliance...| USA|\n", 240 | "| UA| Under Armour, Inc.| NYQ|Textile - Apparel...| USA|\n", 241 | "| AMAT|Applied Materials...| NMS|Semiconductor Equ...| USA|\n", 242 | "| AMGN| Amgen Inc.| NMS| Biotechnology| USA|\n", 243 | "| AAL|American Airlines...| NMS| Major Airlines| USA|\n", 244 | "| AIG|American Internat...| NYQ|Property & Casual...| USA|\n", 245 | "| ALL|The Allstate Corp...| NYQ|Property & Casual...| USA|\n", 246 | "+------+--------------------+--------+--------------------+-------+\n", 247 | "only showing top 20 rows\n", 248 | "\n" 249 | ] 250 | } 251 | ], 252 | "source": [ 253 | "df_lookup.show()" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 13, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "def extract_symbol_from(filename):\n", 263 | " return filename.split('/')[-1].split('.')[0].upper()" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 14, 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/plain": [ 274 | "'IBM'" 275 | ] 276 | }, 277 | "execution_count": 14, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [ 283 | "# filename = 'file:///dataset/stocks-small/ibm.us.txt' # => IBM\n", 284 | "extract_symbol_from('file:///dataset/stocks-small/ibm.us.txt')" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 15, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "extract_symbol = F.udf(lambda filename: extract_symbol_from(filename), StringType())" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 16, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "stocks_folder = stocks_dir\n", 303 | "df = spark.read \\\n", 304 | " .option(\"header\", True) \\\n", 305 | " .option(\"inferSchema\", True) \\\n", 306 | " .csv(stocks_folder) \\\n", 307 | " .withColumn(\"name\", extract_symbol(F.input_file_name()))" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 17, 313 | "metadata": {}, 314 | "outputs": [ 315 | { 316 | "name": "stdout", 317 | "output_type": "stream", 318 | "text": [ 319 | "+-------------------+------+------+------+------+------+-------+----+\n", 320 | "| Date| Open| High| Low| Close|Volume|OpenInt|name|\n", 321 | "+-------------------+------+------+------+------+------+-------+----+\n", 322 | "|1962-01-02 00:00:00| 6.413| 6.413|6.3378|6.3378|467056| 0| IBM|\n", 323 | "|1962-01-03 00:00:00|6.3378|6.3963|6.3378|6.3963|350294| 0| IBM|\n", 324 | "|1962-01-04 00:00:00|6.3963|6.3963|6.3295|6.3295|314365| 0| IBM|\n", 325 | "|1962-01-05 00:00:00|6.3211|6.3211|6.1958|6.2041|440112| 0| IBM|\n", 326 | "|1962-01-08 00:00:00|6.2041|6.2041|6.0373| 6.087|655676| 0| IBM|\n", 327 | "+-------------------+------+------+------+------+------+-------+----+\n", 328 | "only showing top 5 rows\n", 329 | "\n" 330 | ] 331 | } 332 | ], 333 | "source": [ 334 | "df.show(5)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 18, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "df = spark.read \\\n", 344 | " .option(\"header\", True) \\\n", 345 | " .option(\"inferSchema\", True) \\\n", 346 | " .csv(stocks_folder) \\\n", 347 | " .withColumn(\"name\", extract_symbol(F.input_file_name())) \\\n", 348 | " .withColumnRenamed(\"Date\", \"dateTime\") 
\\\n", 349 | " .withColumnRenamed(\"Open\", \"open\") \\\n", 350 | " .withColumnRenamed(\"High\", \"high\") \\\n", 351 | " .withColumnRenamed(\"Low\", \"low\") \\\n", 352 | " .withColumnRenamed(\"Close\", \"close\") \\\n", 353 | " .drop(\"Volume\", \"OpenInt\")" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 19, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "df_stocks = df" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 20, 368 | "metadata": {}, 369 | "outputs": [ 370 | { 371 | "name": "stdout", 372 | "output_type": "stream", 373 | "text": [ 374 | "+-------------------+------+------+------+------+----+\n", 375 | "| dateTime| open| high| low| close|name|\n", 376 | "+-------------------+------+------+------+------+----+\n", 377 | "|1962-01-02 00:00:00| 6.413| 6.413|6.3378|6.3378| IBM|\n", 378 | "|1962-01-03 00:00:00|6.3378|6.3963|6.3378|6.3963| IBM|\n", 379 | "|1962-01-04 00:00:00|6.3963|6.3963|6.3295|6.3295| IBM|\n", 380 | "|1962-01-05 00:00:00|6.3211|6.3211|6.1958|6.2041| IBM|\n", 381 | "|1962-01-08 00:00:00|6.2041|6.2041|6.0373| 6.087| IBM|\n", 382 | "+-------------------+------+------+------+------+----+\n", 383 | "only showing top 5 rows\n", 384 | "\n" 385 | ] 386 | } 387 | ], 388 | "source": [ 389 | "df_stocks.show(5)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 21, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "lookup_file = '/dataset/yahoo-symbols-201709.csv'" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 22, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "symbols_lookup = spark.read. \\\n", 408 | " option(\"header\", True). \\\n", 409 | " option(\"inferSchema\", True). \\\n", 410 | " csv(lookup_file). \\\n", 411 | " select(\"Ticker\", \"Category Name\"). \\\n", 412 | " withColumnRenamed(\"Ticker\", \"symbol\"). 
\\\n", 413 | " withColumnRenamed(\"Category Name\", \"category\")" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 23, 419 | "metadata": {}, 420 | "outputs": [ 421 | { 422 | "name": "stdout", 423 | "output_type": "stream", 424 | "text": [ 425 | "+-------------------+------+------+------+------+----+\n", 426 | "| dateTime| open| high| low| close|name|\n", 427 | "+-------------------+------+------+------+------+----+\n", 428 | "|1962-01-02 00:00:00| 6.413| 6.413|6.3378|6.3378| IBM|\n", 429 | "|1962-01-03 00:00:00|6.3378|6.3963|6.3378|6.3963| IBM|\n", 430 | "|1962-01-04 00:00:00|6.3963|6.3963|6.3295|6.3295| IBM|\n", 431 | "+-------------------+------+------+------+------+----+\n", 432 | "only showing top 3 rows\n", 433 | "\n", 434 | "+------+--------------------+\n", 435 | "|symbol| category|\n", 436 | "+------+--------------------+\n", 437 | "| OEDV| null|\n", 438 | "| AAPL|Electronic Equipment|\n", 439 | "| BAC| Money Center Banks|\n", 440 | "+------+--------------------+\n", 441 | "only showing top 3 rows\n", 442 | "\n" 443 | ] 444 | } 445 | ], 446 | "source": [ 447 | "df_stocks.show(3)\n", 448 | "symbols_lookup.show(3)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 24, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "joined_df = df_stocks \\\n", 458 | " .withColumnRenamed('dateTime', \"full_date\") \\\n", 459 | " .filter(\"full_date >= \\\"2017-09-01\\\"\") \\\n", 460 | " .withColumn(\"year\", F.year(\"full_date\")) \\\n", 461 | " .withColumn(\"month\", F.month(\"full_date\")) \\\n", 462 | " .withColumn(\"day\", F.dayofmonth(\"full_date\")) \\\n", 463 | " .withColumnRenamed(\"name\", \"symbol\") \\\n", 464 | " .join(symbols_lookup, [\"symbol\"])" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 25, 470 | "metadata": {}, 471 | "outputs": [ 472 | { 473 | "name": "stdout", 474 | "output_type": "stream", 475 | "text": [ 476 | "+------+-------------------+------+------+------+------+----+-----+---+--------------------+\n", 477 | "|symbol| full_date| open| high| low| close|year|month|day| category|\n", 478 | "+------+-------------------+------+------+------+------+----+-----+---+--------------------+\n", 479 | "| IBM|2017-01-03 00:00:00|160.76| 161.6|159.81|160.95|2017| 1| 3|Information Techn...|\n", 480 | "| IBM|2017-01-04 00:00:00|161.51|163.53|161.11|162.94|2017| 1| 4|Information Techn...|\n", 481 | "| IBM|2017-01-05 00:00:00|162.93|163.06|161.01|162.41|2017| 1| 5|Information Techn...|\n", 482 | "+------+-------------------+------+------+------+------+----+-----+---+--------------------+\n", 483 | "only showing top 3 rows\n", 484 | "\n" 485 | ] 486 | } 487 | ], 488 | "source": [ 489 | "joined_df.show(3)" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 26, 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [ 498 | "window20 = (Window.partitionBy(F.col('symbol')).orderBy(F.col(\"full_date\")).rowsBetween(-20, 0))\n", 499 | "window50 = (Window.partitionBy(F.col('symbol')).orderBy(F.col(\"full_date\")).rowsBetween(-50, 0))\n", 500 | "window100 = (Window.partitionBy(F.col('symbol')).orderBy(F.col(\"full_date\")).rowsBetween(-100, 0))" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 27, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "stocks_moving_avg_df = joined_df \\\n", 510 | " .withColumn(\"ma20\", F.avg(\"close\").over(window20)) \\\n", 511 | " .withColumn(\"ma50\", F.avg(\"close\").over(window50)) 
\\\n", 512 | " .withColumn(\"ma100\", F.avg(\"close\").over(window100))" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 28, 518 | "metadata": {}, 519 | "outputs": [ 520 | { 521 | "name": "stdout", 522 | "output_type": "stream", 523 | "text": [ 524 | "+------+------+------------------+\n", 525 | "|symbol| close| ma20|\n", 526 | "+------+------+------------------+\n", 527 | "| AAPL|114.31| 114.31|\n", 528 | "| AAPL|114.19| 114.25|\n", 529 | "| AAPL|114.77|114.42333333333333|\n", 530 | "| AAPL|116.04| 114.8275|\n", 531 | "| AAPL|117.11|115.28399999999999|\n", 532 | "| AAPL|117.23|115.60833333333333|\n", 533 | "| AAPL|117.86|115.92999999999999|\n", 534 | "| AAPL|117.37| 116.11|\n", 535 | "| AAPL|117.16|116.22666666666666|\n", 536 | "| AAPL| 118.1|116.41399999999999|\n", 537 | "| AAPL|118.09|116.56636363636362|\n", 538 | "| AAPL|117.89|116.67666666666666|\n", 539 | "| AAPL| 118.1|116.78615384615384|\n", 540 | "| AAPL|118.19|116.88642857142857|\n", 541 | "| AAPL|118.07|116.96533333333332|\n", 542 | "| AAPL|119.95|117.15187499999999|\n", 543 | "| AAPL|120.01| 117.32|\n", 544 | "| AAPL|120.02| 117.47|\n", 545 | "| AAPL| 119.7|117.58736842105263|\n", 546 | "| AAPL|119.43|117.67949999999999|\n", 547 | "| AAPL| 126.7| 118.1090476190476|\n", 548 | "| AAPL| 126.5|118.68952380952379|\n", 549 | "| AAPL|127.03| 119.3009523809524|\n", 550 | "| AAPL|128.23|119.94190476190477|\n", 551 | "| AAPL|129.44|120.58000000000004|\n", 552 | "+------+------+------------------+\n", 553 | "only showing top 25 rows\n", 554 | "\n" 555 | ] 556 | } 557 | ], 558 | "source": [ 559 | "# Moving Average\n", 560 | "stocks_moving_avg_df.select('symbol', 'close', 'ma20').show(25)" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 29, 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "output_dir = '/dataset/output.parquet'" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": 30, 575 | "metadata": {}, 576 | "outputs": [], 577 | "source": [ 578 | "stocks_moving_avg_df \\\n", 579 | " .write \\\n", 580 | " .mode('overwrite') \\\n", 581 | " .partitionBy(\"year\", \"month\", \"day\") \\\n", 582 | " .parquet(output_dir)" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": 31, 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [ 591 | "df_parquet = spark.read.parquet(output_dir)" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 32, 597 | "metadata": {}, 598 | "outputs": [ 599 | { 600 | "data": { 601 | "text/plain": [ 602 | "4142" 603 | ] 604 | }, 605 | "execution_count": 32, 606 | "metadata": {}, 607 | "output_type": "execute_result" 608 | } 609 | ], 610 | "source": [ 611 | "df_parquet.count()" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": 33, 617 | "metadata": {}, 618 | "outputs": [], 619 | "source": [ 620 | "df_parquet.createOrReplaceTempView(\"stocks\")" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": 34, 626 | "metadata": {}, 627 | "outputs": [ 628 | { 629 | "name": "stdout", 630 | "output_type": "stream", 631 | "text": [ 632 | "== Physical Plan ==\n", 633 | "*(2) HashAggregate(keys=[symbol#559], functions=[max(close#564)])\n", 634 | "+- Exchange hashpartitioning(symbol#559, 200)\n", 635 | " +- *(1) HashAggregate(keys=[symbol#559], functions=[partial_max(close#564)])\n", 636 | " +- *(1) Project [symbol#559, close#564]\n", 637 | " +- *(1) Filter ((isnotnull(full_date#560) && (cast(full_date#560 as string) >= 2017-09-01)) 
&& (cast(full_date#560 as string) < 2017-10-01))\n", 638 | " +- *(1) FileScan parquet [symbol#559,full_date#560,close#564,year#569,month#570,day#571] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/dataset/output.parquet], PartitionCount: 218, PartitionFilters: [], PushedFilters: [IsNotNull(full_date)], ReadSchema: struct\n" 639 | ] 640 | } 641 | ], 642 | "source": [ 643 | "badHighestClosingPrice = spark.sql(\"SELECT symbol, MAX(close) AS price FROM stocks WHERE full_date >= '2017-09-01' AND full_date < '2017-10-01' GROUP BY symbol\")\n", 644 | "badHighestClosingPrice.explain()" 645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": 35, 650 | "metadata": {}, 651 | "outputs": [ 652 | { 653 | "name": "stdout", 654 | "output_type": "stream", 655 | "text": [ 656 | "== Physical Plan ==\n", 657 | "*(2) HashAggregate(keys=[symbol#559], functions=[max(close#564)])\n", 658 | "+- Exchange hashpartitioning(symbol#559, 200)\n", 659 | " +- *(1) HashAggregate(keys=[symbol#559], functions=[partial_max(close#564)])\n", 660 | " +- *(1) Project [symbol#559, close#564]\n", 661 | " +- *(1) FileScan parquet [symbol#559,close#564,year#569,month#570,day#571] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/dataset/output.parquet], PartitionCount: 20, PartitionFilters: [isnotnull(year#569), isnotnull(month#570), (year#569 = 2017), (month#570 = 9)], PushedFilters: [], ReadSchema: struct\n" 662 | ] 663 | } 664 | ], 665 | "source": [ 666 | "highestClosingPrice = spark.sql(\"SELECT symbol, MAX(close) AS price FROM stocks WHERE year=2017 AND month=9 GROUP BY symbol\")\n", 667 | "highestClosingPrice.explain()" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": 36, 673 | "metadata": {}, 674 | "outputs": [], 675 | "source": [ 676 | "# Write to Postgres\n", 677 | "stocks_moving_avg_df \\\n", 678 | " .drop(\"year\", \"month\", \"day\") \\\n", 679 | " .write \\\n", 680 | " .format(\"jdbc\") \\\n", 681 | " .option(\"url\", \"jdbc:postgresql://postgres/workshop\") \\\n", 682 | " .option(\"dbtable\", \"workshop.stocks\") \\\n", 683 | " .option(\"user\", \"workshop\") \\\n", 684 | " .option(\"password\", \"w0rkzh0p\") \\\n", 685 | " .option(\"driver\", \"org.postgresql.Driver\") \\\n", 686 | " .mode('append') \\\n", 687 | " .save()" 688 | ] 689 | }, 690 | { 691 | "cell_type": "code", 692 | "execution_count": null, 693 | "metadata": {}, 694 | "outputs": [], 695 | "source": [] 696 | } 697 | ], 698 | "metadata": { 699 | "kernelspec": { 700 | "display_name": "Python 3", 701 | "language": "python", 702 | "name": "python3" 703 | }, 704 | "language_info": { 705 | "codemirror_mode": { 706 | "name": "ipython", 707 | "version": 3 708 | }, 709 | "file_extension": ".py", 710 | "mimetype": "text/x-python", 711 | "name": "python", 712 | "nbconvert_exporter": "python", 713 | "pygments_lexer": "ipython3", 714 | "version": "3.7.3" 715 | } 716 | }, 717 | "nbformat": 4, 718 | "nbformat_minor": 4 719 | } 720 | -------------------------------------------------------------------------------- /jupyter/notebook/pyspark-apache-arrow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# PySpark: Pandas with Apache Arrow\n", 8 | "[Reference](https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html#apache-arrow-in-spark)" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | 
"outputs": [], 16 | "source": [ 17 | "import findspark\n", 18 | "findspark.init()" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from pyspark.sql import SparkSession\n", 28 | "\n", 29 | "spark = SparkSession.builder. \\\n", 30 | " appName(\"pyspark-arrow\"). \\\n", 31 | " getOrCreate()" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import numpy as np\n", 41 | "import pandas as pd\n", 42 | "\n", 43 | "# Enable Arrow-based columnar data transfers\n", 44 | "spark.conf.set(\"spark.sql.execution.arrow.enabled\", \"true\")" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 4, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# Generate a Pandas DataFrame\n", 54 | "pdf = pd.DataFrame(np.random.rand(100, 3))\n", 55 | "\n", 56 | "# Create a Spark DataFrame from a Pandas DataFrame using Arrow\n", 57 | "df = spark.createDataFrame(pdf)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 5, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stderr", 67 | "output_type": "stream", 68 | "text": [ 69 | "/usr/local/lib/python3.7/dist-packages/pyarrow/__init__.py:157: UserWarning: pyarrow.open_stream is deprecated, please use pyarrow.ipc.open_stream\n", 70 | " warnings.warn(\"pyarrow.open_stream is deprecated, please use \"\n" 71 | ] 72 | }, 73 | { 74 | "data": { 75 | "text/html": [ 76 | "
\n", 77 | "\n", 90 | "\n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | "
012
00.5817650.4217540.746082
10.4500960.8381850.650798
20.9060610.9025530.582332
30.0271340.3671070.342978
40.2714630.6580560.881614
............
950.8942910.6215590.434179
960.3363940.3824790.723049
970.9400940.6935280.695185
980.5712440.7932910.476467
990.5477590.5316970.638495
\n", 168 | "

100 rows × 3 columns

\n", 169 | "
" 170 | ], 171 | "text/plain": [ 172 | " 0 1 2\n", 173 | "0 0.581765 0.421754 0.746082\n", 174 | "1 0.450096 0.838185 0.650798\n", 175 | "2 0.906061 0.902553 0.582332\n", 176 | "3 0.027134 0.367107 0.342978\n", 177 | "4 0.271463 0.658056 0.881614\n", 178 | ".. ... ... ...\n", 179 | "95 0.894291 0.621559 0.434179\n", 180 | "96 0.336394 0.382479 0.723049\n", 181 | "97 0.940094 0.693528 0.695185\n", 182 | "98 0.571244 0.793291 0.476467\n", 183 | "99 0.547759 0.531697 0.638495\n", 184 | "\n", 185 | "[100 rows x 3 columns]" 186 | ] 187 | }, 188 | "execution_count": 5, 189 | "metadata": {}, 190 | "output_type": "execute_result" 191 | } 192 | ], 193 | "source": [ 194 | "# Convert the Spark DataFrame back to a Pandas DataFrame using Arrow\n", 195 | "result_pdf = df.select(\"*\").toPandas()\n", 196 | "\n", 197 | "result_pdf" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 6, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "spark.stop()" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [] 215 | } 216 | ], 217 | "metadata": { 218 | "kernelspec": { 219 | "display_name": "Python 3", 220 | "language": "python", 221 | "name": "python3" 222 | }, 223 | "language_info": { 224 | "codemirror_mode": { 225 | "name": "ipython", 226 | "version": 3 227 | }, 228 | "file_extension": ".py", 229 | "mimetype": "text/x-python", 230 | "name": "python", 231 | "nbconvert_exporter": "python", 232 | "pygments_lexer": "ipython3", 233 | "version": "3.7.3" 234 | } 235 | }, 236 | "nbformat": 4, 237 | "nbformat_minor": 2 238 | } 239 | -------------------------------------------------------------------------------- /jupyter/notebook/pyspark-check-install.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Test notebook\n", 8 | "It should run without errors when all Worker nodes contains python deps. 
From [OneHotEncoderEstimator](https://spark.apache.org/docs/latest/ml-features#onehotencoderestimator)" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import findspark\n", 18 | "findspark.init()" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from pyspark.sql import SparkSession\n", 28 | "spark = SparkSession.builder.appName(\"OneHotEncoderEstimator\").getOrCreate()" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "from pyspark.ml.feature import OneHotEncoderEstimator\n", 38 | "\n", 39 | "df = spark.createDataFrame([\n", 40 | " (0.0, 1.0),\n", 41 | " (1.0, 0.0),\n", 42 | " (2.0, 1.0),\n", 43 | " (0.0, 2.0),\n", 44 | " (0.0, 1.0),\n", 45 | " (2.0, 0.0)\n", 46 | "], [\"categoryIndex1\", \"categoryIndex2\"])\n", 47 | "\n", 48 | "encoder = OneHotEncoderEstimator(inputCols=[\"categoryIndex1\", \"categoryIndex2\"],\n", 49 | " outputCols=[\"categoryVec1\", \"categoryVec2\"])\n", 50 | "model = encoder.fit(df)\n", 51 | "encoded = model.transform(df)\n", 52 | "encoded.show()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "spark.stop()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [] 70 | } 71 | ], 72 | "metadata": { 73 | "kernelspec": { 74 | "display_name": "Python 3", 75 | "language": "python", 76 | "name": "python3" 77 | }, 78 | "language_info": { 79 | "codemirror_mode": { 80 | "name": "ipython", 81 | "version": 3 82 | }, 83 | "file_extension": ".py", 84 | "mimetype": "text/x-python", 85 | "name": "python", 86 | "nbconvert_exporter": "python", 87 | "pygments_lexer": "ipython3", 88 | "version": "3.7.3" 89 | } 90 | }, 91 | "nbformat": 4, 92 | "nbformat_minor": 2 93 | } -------------------------------------------------------------------------------- /jupyter/notebook/pyspark-dataframe-overview.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# pySpark Commands Reference\n", 8 | "https://spark.apache.org/docs/2.4.4/api/python/index.html\n", 9 | "\n", 10 | "\n", 11 | "## Connect to Spark Cluster\n", 12 | "https://github.com/minrk/findspark" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import findspark\n", 22 | "findspark.init()" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "from pyspark.sql import SparkSession\n", 32 | "spark = SparkSession.builder.appName(\"pyspark-df-overview\").getOrCreate()" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "spark.version" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "### Create Spark DataFrame\n", 49 | "Dataset from: https://www.kaggle.com/uciml/adult-census-income/home" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "df = 
spark.read.csv(\"/dataset/pyspark-df-overview/census_income.csv.gz\", header=True)\n", 59 | "df.printSchema()" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "### Define a schema" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "import pyspark.sql.types as t\n", 76 | "\n", 77 | "census_schema = t.StructType([\n", 78 | " t.StructField('age', t.IntegerType(), True)\n", 79 | " , t.StructField('workclass', t.StringType(), True)\n", 80 | " , t.StructField('fnlwgt', t.IntegerType(), True)\n", 81 | " , t.StructField('education', t.StringType(), True)\n", 82 | " , t.StructField('education-num', t.IntegerType(), True)\n", 83 | " , t.StructField('marital-status', t.StringType(), True)\n", 84 | " , t.StructField('occupation', t.StringType(), True)\n", 85 | " , t.StructField('relationship', t.StringType(), True)\n", 86 | " , t.StructField('race', t.StringType(), True)\n", 87 | " , t.StructField('sex', t.StringType(), True)\n", 88 | " , t.StructField('capital-gain', t.DoubleType(), True)\n", 89 | " , t.StructField('capital-loss', t.DoubleType(), True)\n", 90 | " , t.StructField('hours-per-week', t.IntegerType(), True)\n", 91 | " , t.StructField('native-country', t.StringType(), True)\n", 92 | " , t.StructField('label', t.StringType(), True)\n", 93 | "])" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "# Support for compressed (gziped) payload\n", 103 | "df = spark.read.csv(\"/dataset/pyspark-df-overview/census_income.csv.gz\", header=True, schema=census_schema)\n", 104 | "df.printSchema()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "df.count()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "### Drop unused column" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "df = df.drop('fnlwgt')\n", 130 | "df.printSchema()" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "### Few operations" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "from pyspark.sql.functions import count, avg, desc\n", 147 | "\n", 148 | "df.groupBy(['education']). \\\n", 149 | "agg(\n", 150 | " count('*').alias('qty'), \n", 151 | " avg('age').alias('avg_age')\n", 152 | ").orderBy(desc('qty')). 
\\\n", 153 | "show()" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "### Using SQL\n", 161 | "Same operation with SQL syntax" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "df.createOrReplaceTempView(\"census\")\n", 171 | "s = spark.sql(\"\"\"\n", 172 | "SELECT \n", 173 | " education, \n", 174 | " COUNT(*) AS qty, \n", 175 | " AVG(age) AS avg_age\n", 176 | "FROM census\n", 177 | "GROUP BY education\n", 178 | "\"\"\")\n", 179 | "s.show()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "# a transformation can be exposed as function\n", 189 | "def my_query(field):\n", 190 | " return df.groupBy([field]). \\\n", 191 | " agg(\n", 192 | " count('*').alias('qty'), \n", 193 | " avg('age').alias('avg_age')\n", 194 | " ).orderBy(desc('qty'))\n", 195 | " \n", 196 | "\n", 197 | " \n", 198 | "print(my_query('workclass').show())" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "df.select('age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week').describe().show()" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "df.select('workclass', 'education', 'marital-status').describe().show()" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "df.freqItems(['marital-status']).show(truncate=False)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "df.crosstab('age', 'label').sort(\"age_label\").show()\n" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "df.groupby('native-country').agg({'native-country': 'count'}).sort('count(native-country)').show()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "### Check if there is missing data" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "from pyspark.sql.functions import isnan, when, count, col\n", 260 | "\n", 261 | "# All columns\n", 262 | "# cols = df.columns\n", 263 | "# Selected columns\n", 264 | "cols = ['workclass', 'education-num', 'occupation', 'hours-per-week', 'native-country']\n", 265 | "\n", 266 | "# https://stackoverflow.com/a/44631639/570393\n", 267 | "df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in cols]).show()" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "### Remove rows with missing data" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "# Total rows\n", 284 | "print('total rows: %s' % df.count())\n", 285 | "\n", 286 | "# After droping NA records\n", 287 | "print('only complete rows: %s' % df.dropna().count())" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "### Fill rows that contains missing 
data" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "def show_df(df, field='occupation'):\n", 304 | " df.groupBy(field).count().show()" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "show_df(df)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "# Fill with a fixed value\n", 323 | "new_df = df.fillna({'occupation': 'Other-service'})\n", 324 | "\n", 325 | "# Count \n", 326 | "show_df(new_df)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "### Better way\n", 334 | "\n", 335 | "Calc the `mean()` value of a column and use it on missing values.\n", 336 | "Also use a static string for categorical data " 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "metadata": {}, 343 | "outputs": [], 344 | "source": [ 345 | "from pyspark.sql.functions import mean\n", 346 | "df.groupBy().agg(mean('hours-per-week').alias('hours-per-week')).show()" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "from pyspark.sql.functions import mean\n", 356 | "import pandas as pd\n", 357 | "\n", 358 | "data_to_fill = \\\n", 359 | " df.groupBy().agg(mean('hours-per-week').alias('hours-per-week')).toPandas().to_dict('records')[0]\n", 360 | "\n", 361 | "# Simple Python Dict Update\n", 362 | "data_to_fill.update({'occupation': 'Other-service'})\n", 363 | "\n", 364 | "data_to_fill" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "metadata": {}, 371 | "outputs": [], 372 | "source": [ 373 | "df.fillna(data_to_fill).select('hours-per-week', 'occupation').show(50)" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "### Creating charts with pandas & matplotlib\n", 381 | "https://pandas.pydata.org/pandas-docs/stable/api.html#api-dataframe-plotting\n", 382 | "\n", 383 | "**Important:** possible only when data become small enough to driver program" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "# This is distributed\n", 393 | "df_spark = df.groupBy('workclass').agg(count('*').alias('counts')).orderBy('counts')\n", 394 | "# df_spark.show()\n", 395 | "\n", 396 | "# This is running on driver\n", 397 | "df_wk = df_spark.toPandas()" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": {}, 404 | "outputs": [], 405 | "source": [ 406 | "# Check Pandas DF content\n", 407 | "df_wk" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": null, 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "import matplotlib.pyplot as plt\n", 417 | "%matplotlib inline\n", 418 | "\n", 419 | "df_wk.plot.bar(x='workclass', y='counts', figsize=(20,6));" 420 | ] 421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "metadata": {}, 425 | "source": [ 426 | "### Stop Drive Program\n", 427 | "Release resources from Spark Cluster" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "spark.stop()" 437 | 
] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "# Continue Learning\n", 444 | "\n", 445 | "* [Kaggle Learn](https://www.kaggle.com/learn/overview)\n", 446 | "* [PySpark Cookbook](https://www.safaribooksonline.com/library/view/pyspark-cookbook/9781788835367/)\n", 447 | "\n", 448 | "## Other references\n", 449 | "\n", 450 | "* [PySpark Tutorial for Beginners: Machine Learning Example](https://www.guru99.com/pyspark-tutorial.html)" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [] 459 | } 460 | ], 461 | "metadata": { 462 | "kernelspec": { 463 | "display_name": "Python 3", 464 | "language": "python", 465 | "name": "python3" 466 | }, 467 | "language_info": { 468 | "codemirror_mode": { 469 | "name": "ipython", 470 | "version": 3 471 | }, 472 | "file_extension": ".py", 473 | "mimetype": "text/x-python", 474 | "name": "python", 475 | "nbconvert_exporter": "python", 476 | "pygments_lexer": "ipython3", 477 | "version": "3.7.3" 478 | } 479 | }, 480 | "nbformat": 4, 481 | "nbformat_minor": 2 482 | } 483 | -------------------------------------------------------------------------------- /jupyter/notebook/pyspark-intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import findspark\n", 10 | "findspark.init()" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from pyspark.sql import SparkSession\n", 20 | "spark = SparkSession.builder.appName(\"pyspark-intro\").getOrCreate()" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "spark.version" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "### Create Spark DataFrame" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "df = spark.read.csv(\"/dataset/yahoo-symbols-201709.csv\", header=True)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "df.count()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "df.printSchema()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "df.show()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "### DataFrame operations\n", 80 | "Show the top 20 categories by number of stocks" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "cats = df.groupby(df['Category Name']).count()\n", 90 | "cats.orderBy(cats['count'].desc()).show(truncate=False)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "### Stop the Driver Program\n", 98 | "Release resources on the Spark cluster" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "spark.stop()" 108 | ] 109 | } 110 | ],
111 | "metadata": { 112 | "kernelspec": { 113 | "display_name": "Python 3", 114 | "language": "python", 115 | "name": "python3" 116 | }, 117 | "language_info": { 118 | "codemirror_mode": { 119 | "name": "ipython", 120 | "version": 3 121 | }, 122 | "file_extension": ".py", 123 | "mimetype": "text/x-python", 124 | "name": "python", 125 | "nbconvert_exporter": "python", 126 | "pygments_lexer": "ipython3", 127 | "version": "3.5.3" 128 | } 129 | }, 130 | "nbformat": 4, 131 | "nbformat_minor": 2 132 | } 133 | -------------------------------------------------------------------------------- /jupyter/notebook/pyspark-postgres.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Connecting to Postgres\n", 8 | "This notebook shows how to pass JDBC driver and connect to our Postgres" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import findspark\n", 18 | "\n", 19 | "findspark.add_jars('/app/postgresql-42.1.4.jar')\n", 20 | "findspark.init()" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "from pyspark.sql import SparkSession\n", 30 | "\n", 31 | "spark = SparkSession.builder.appName(\"pyspark-postgres\").getOrCreate()" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 13, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "df = spark.read \\\n", 41 | " .format(\"jdbc\") \\\n", 42 | " .option(\"url\", \"jdbc:postgresql://postgres/workshop\") \\\n", 43 | " .option(\"dbtable\", \"workshop.stocks\") \\\n", 44 | " .option(\"user\", \"workshop\") \\\n", 45 | " .option(\"password\", \"w0rkzh0p\") \\\n", 46 | " .option(\"driver\", \"org.postgresql.Driver\") \\\n", 47 | " .load()" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 15, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "root\n", 60 | " |-- full_date: timestamp (nullable = true)\n", 61 | " |-- symbol: string (nullable = true)\n", 62 | " |-- category: string (nullable = true)\n", 63 | " |-- open: double (nullable = true)\n", 64 | " |-- high: double (nullable = true)\n", 65 | " |-- low: double (nullable = true)\n", 66 | " |-- close: double (nullable = true)\n", 67 | " |-- ma20: double (nullable = true)\n", 68 | " |-- ma50: double (nullable = true)\n", 69 | " |-- ma100: double (nullable = true)\n", 70 | "\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "df.printSchema()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [] 84 | } 85 | ], 86 | "metadata": { 87 | "kernelspec": { 88 | "display_name": "Python 3", 89 | "language": "python", 90 | "name": "python3" 91 | }, 92 | "language_info": { 93 | "codemirror_mode": { 94 | "name": "ipython", 95 | "version": 3 96 | }, 97 | "file_extension": ".py", 98 | "mimetype": "text/x-python", 99 | "name": "python", 100 | "nbconvert_exporter": "python", 101 | "pygments_lexer": "ipython3", 102 | "version": "3.7.3" 103 | } 104 | }, 105 | "nbformat": 4, 106 | "nbformat_minor": 2 107 | } 108 | -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/clase_ml.bib: -------------------------------------------------------------------------------- 1 | 
@Book{mitchell97, 2 | author = {Tom M Mitchell}, 3 | title = {Machine Learning}, 4 | publisher = {McGraw-Hill}, 5 | address = {New York, NY}, 6 | year = {1997} 7 | } 8 | 9 | @Book{james13, 10 | author = {Gareth James and Daniela Witten and Trevor Hastie and Robert Tibshirani}, 11 | title = {Introduction to Statistical Learning : with Applications in R}, 12 | publisher = {Springer}, 13 | address = {New York, NY}, 14 | year = {2013} 15 | } 16 | 17 | @book{efron16, 18 | author = {Efron, Bradley and Hastie, Trevor}, 19 | title = {Computer Age Statistical Inference: Algorithms, Evidence, and Data Science}, 20 | publisher = {Cambridge University Press}, 21 | address = {New York, NY, USA}, 22 | year = {2016}, 23 | } 24 | 25 | @Book{tukey77, 26 | author = {John W Tukey}, 27 | title = {Exploratory Data Analysis}, 28 | edition = {7}, 29 | publisher = {Addison Wesley}, 30 | address = {Reading, Massachusetts}, 31 | year = {1977} 32 | } 33 | -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/clase_ml.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/clase_ml.pdf -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/bias_variance_tradeoff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/bias_variance_tradeoff.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/bvt2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/bvt2.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/complexity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/complexity.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/confusion_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/confusion_matrix.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/corr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/corr.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/facet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/facet.png 
-------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/frontier.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/frontier.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/holdout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/holdout.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/kde.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/kde.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/logistic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/logistic.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/logo_mutt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/logo_mutt.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/one_hot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/one_hot.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/overfitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/overfitting.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/roc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/roc.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/run.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/sample_size.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/sample_size.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/supervised.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/supervised.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/table_variables.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/table_variables.pdf -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/table_variables.tex: -------------------------------------------------------------------------------- 1 | \documentclass{standalone} 2 | 3 | %-----------------------+ 4 | % Clean auxiliary files | 5 | %-----------------------+ 6 | % arara: clean: {files: [table_variables.aux, table_variables.log, table_variables.synctex.gz]} 7 | 8 | %------------------------------------+ 9 | % Language, hyphenation and encoding | 10 | %------------------------------------+ 11 | \usepackage{lmodern} % Use Latin Modern fonts 12 | % \renewcommand{\rmdefault}{\sfdefault} % Use beamer sans-serif font family 13 | \usepackage[T1]{fontenc} % Better output when a diacritic/accent is used 14 | \usepackage[utf8]{inputenc} % Allows to input accented characters 15 | 16 | %----------------+ 17 | % Table packages | 18 | %----------------+ 19 | \usepackage{array} % Flexible column formatting 20 | % \usepackage{spreadtab} % Spreadsheet features 21 | \usepackage{multirow} % Allows table cells that span more than one row 22 | \usepackage{booktabs} % Enhance quality of tables 23 | \setlength{\heavyrulewidth}{1pt} 24 | 25 | \usepackage{siunitx} % Typeset units correctly and define new column (S) 26 | \sisetup{detect-all,table-auto-round,input-symbols = {()}} 27 | % \robustify{\bfseries} % Correct alignment of bold numbers in tables 28 | 29 | % Table colors 30 | \usepackage[table,x11names]{xcolor} 31 | 32 | \begin{document} 33 | \begin{tabular}{lll} 34 | \toprule 35 | Variable & Descripción & Notas \\ 36 | \midrule 37 | survived & Condición de supervivencia & 1 = Sí, 0 = No\\ 38 | pclass & Tipo de ticket & 1 = Alto, 2 = Medio, 3 = Bajo \\ 39 | name & Nombre del pasajero & \\ 40 | sex & Sexo & \\ 41 | age & Edad en años & Fracción si < 1 y xx.5 si estimada\\ 42 | sibsp & \# hermanos y cónyuges a bordo & \\ 43 | parch & \# padres e hijos a bordo & Hijos con niñera tienen parch=0\\ 44 | ticket & \# de boleto & \\ 45 | fare & Precio del boleto & \\ 46 | cabin & \# de cabina & \\ 47 | embarked & Puerto de embarque & C=Cherbourg, Q=Queenstown, S=Southampton\\ 48 | boat & \# bote de rescate & \\ 49 | home.dest & Ciudad de origen & \\ 50 | body & \# de identificación del cadáver & \\ 51 | \bottomrule 52 | \end{tabular} 53 | \end{document} 54 | -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/titanic.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/titanic.jpg -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/tree.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/tree_regions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/tree_regions.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/tvt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/tvt.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/unbalance_class.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/unbalance_class.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/underfitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/underfitting.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/unsupervised.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/unsupervised.png -------------------------------------------------------------------------------- /jupyter/notebook/titanic/docs/figures/whatido.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/jupyter/notebook/titanic/docs/figures/whatido.jpg -------------------------------------------------------------------------------- /nginx/html/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Workshop de Big Data con Apache Spark 4 | 5 | 6 | 7 | # Material del Workshop de Big Data 8 | 9 | ## Documentaci&#xF3;n 10 | Todo el material del curso est&#xE1; [disponible en Github](https://github.com/arjones/bigdata-workshop-es) 11 | 12 | ## Puertos y Servicios 13 | El listado abajo contiene los puertos para el acceso a las interfaces gr&#xE1;ficas de los servicios instalados.
14 | 15 | <table> 16 | <thead> 17 | <tr> 18 | <th>Servicio</th> 19 | <th>Acceso</th> 20 | <th>Notas</th> 21 | </tr> 22 | </thead> 23 | <tbody> 24 | <tr> 25 | <td>Spark</td> 26 | <td> 27 | <a href="/" onclick="javascript:event.target.port=8080" target="_blank">Spark Master</a> 28 | <br /><br /> 29 | <a href="/" onclick="javascript:event.target.port=4040" target="_blank">Job Progress</a> 30 | </td> 31 | <td><br /><br /><b>Job Progress</b> solo funciona cuando el job este activo</td> 32 | 33 | </tr> 34 | <tr> 35 | <td>pySpark</td> 36 | <td><a href="/" onclick="javascript:event.target.port=8888" target="_blank">Jupyter Notebook</a></td> 37 | <td>Token required, correr <code>./control-env.sh token</code></td> 38 | </tr> 39 | <tr> 40 | <td>Superset</td> 41 | <td><a href="/" onclick="javascript:event.target.port=8088" target="_blank">Dashboard</a></td> 42 | <td>Username/Password creado durante superset-init </td> 43 | </tr> 44 | </tbody> 45 | </table> 46 | 47 | --- 48 | ## Sobre 49 | Gustavo Arjones &copy; 2017-2020 50 | [arjon.es](https://arjon.es) | [LinkedIn](https://linkedin.com/in/arjones/) | [Twitter](https://twitter.com/arjones) 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /postgres/scripts/init.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA workshop; 2 | 3 | DROP TABLE IF EXISTS stocks; 4 | CREATE TABLE stocks ( 5 | full_date timestamptz NOT NULL, 6 | symbol varchar(10) NOT NULL, 7 | category varchar(64) NOT NULL, 8 | open double precision NOT NULL, 9 | high double precision NOT NULL, 10 | low double precision NOT NULL, 11 | close double precision NOT NULL, 12 | MA20 double precision NOT NULL, 13 | MA50 double precision NOT NULL, 14 | MA100 double precision NOT NULL, 15 | PRIMARY KEY(full_date, symbol) 16 | ); 17 | -------------------------------------------------------------------------------- /scala/README.md: -------------------------------------------------------------------------------- 1 | # Databricks Notebook 2 | 3 | 1. Crear una cuenta en [Databricks | COMMUNITY EDITION](https://databricks.com/try-databricks) 4 | 5 | 2. Importar el Notebook: 6 | 7 | ![](databricks-import-notebook-1.png) 8 | 9 | ![](databricks-import-notebook-2.png) 10 | 11 | 3. 
Agregar la URL: `https://raw.githubusercontent.com/arjones/bigdata-workshop-es/master/scala/Day%201%20-%20Scala%20Intro.html` 12 | 13 | # Siga leyendo 14 | * [Batch Processing](README-batch.md) 15 | 16 | ## Sobre 17 | Gustavo Arjones © 2017-2020 18 | [arjon.es](https://arjon.es) | [LinkedIn](http://linkedin.com/in/arjones/) | [Twitter](https://twitter.com/arjones) 19 | -------------------------------------------------------------------------------- /scala/databricks-import-notebook-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/scala/databricks-import-notebook-1.png -------------------------------------------------------------------------------- /scala/databricks-import-notebook-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/scala/databricks-import-notebook-2.png -------------------------------------------------------------------------------- /spark/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM openjdk:8-jre-slim 2 | 3 | WORKDIR /app 4 | 5 | ARG SPARK_VERSION=2.4.5 6 | 7 | ENV SPARK_HOME /opt/spark 8 | 9 | RUN apt-get update && \ 10 | apt-get install -y wget ca-certificates procps && \ 11 | wget http://apache.dattatec.com/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop2.7.tgz -O - | tar zx -C /opt && \ 12 | ln -s /opt/spark-${SPARK_VERSION}-bin-hadoop2.7 /opt/spark && \ 13 | sed 's/INFO/ERROR/g' /opt/spark/conf/log4j.properties.template > /opt/spark/conf/log4j.properties && \ 14 | echo "\nexport PATH=\${PATH}:/opt/spark/bin" >> /etc/bash.bashrc && \ 15 | echo "\nexport SPARK_NO_DAEMONIZE=1" >> /etc/bash.bashrc && \ 16 | rm -rf /var/lib/apt/lists/* 17 | 18 | CMD ["/opt/spark/bin/spark-shell"] 19 | 20 | EXPOSE 8080 8081 4040 4041 21 | -------------------------------------------------------------------------------- /spark/Dockerfile.pyspark: -------------------------------------------------------------------------------- 1 | ARG SPARK_VERSION=2.4.5 2 | FROM arjones/spark:${SPARK_VERSION} 3 | 4 | WORKDIR /notebook 5 | 6 | COPY requirements.txt /tmp/ 7 | 8 | RUN apt-get update && \ 9 | apt-get --no-install-recommends --no-install-suggests install -y \ 10 | python3 python3-pip python3-setuptools python3-distutils && \ 11 | update-alternatives --install /usr/bin/python python /usr/bin/python3.7 10 && \ 12 | pip3 install --no-cache-dir --default-timeout=120 -r /tmp/requirements.txt && \ 13 | apt-get autoremove -y && \ 14 | rm -rvf /tmp/requirements.txt /var/lib/apt/lists/* 15 | 16 | CMD [ "/usr/local/bin/jupyter", "notebook", "--allow-root", "--no-browser", "--ip=0.0.0.0"] 17 | 18 | EXPOSE 8888 19 | -------------------------------------------------------------------------------- /spark/INSTALL.md: -------------------------------------------------------------------------------- 1 | # Install notes 2 | 3 | ## Trying new libs 4 | A quick & dirty way to try new libs in pySpark would be: 5 | 6 | ``` 7 | cd jupyter/notebook 8 | docker run -it -p8888:8888 -w /notebook -v $PWD:/notebook arjones/pyspark:2.4.4 bash 9 | 10 | apt-get update && \ 11 | apt-get --no-install-recommends --no-install-suggests install -y \ 12 | python3-pip && \ 13 | pip3 install gensim 14 | 15 | ``` 16 | 17 | After detecting all dependencies, you can include it on `Dockerfile` 
definitions and rebuild the image. -------------------------------------------------------------------------------- /spark/README.md: -------------------------------------------------------------------------------- 1 | # Apache Spark Image 2 | 3 | How to build and push the Spark Image: 4 | 5 | ```bash 6 | export SPARK_VERSION=2.4.5 7 | 8 | docker build \ 9 | --build-arg SPARK_VERSION=${SPARK_VERSION} \ 10 | -t arjones/spark:${SPARK_VERSION} . 11 | 12 | docker build \ 13 | -f Dockerfile.pyspark \ 14 | --build-arg SPARK_VERSION=${SPARK_VERSION} \ 15 | -t arjones/pyspark:${SPARK_VERSION} . 16 | 17 | docker push arjones/spark:${SPARK_VERSION} 18 | docker push arjones/pyspark:${SPARK_VERSION} 19 | ``` 20 | 21 | ## Sobre 22 | Gustavo Arjones © 2017-2020 23 | [arjon.es](https://arjon.es) | [LinkedIn](http://linkedin.com/in/arjones/) | [Twitter](https://twitter.com/arjones) 24 | -------------------------------------------------------------------------------- /spark/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | findspark 3 | pandas 4 | matplotlib 5 | seaborn 6 | 7 | # gensim install 8 | wheel 9 | gensim 10 | 11 | # https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html#apache-arrow-in-spark 12 | pyarrow==0.14.1 13 | 14 | # Deps for fake_stock_price_gen 15 | kafka-python==2.0.1 -------------------------------------------------------------------------------- /superset/conf/superset_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | ROW_LIMIT = 5000 4 | WEBSERVER_THREADS = 8 5 | SUPERSET_WEBSERVER_PORT = 8088 6 | SUPERSET_WEBSERVER_TIMEOUT = 60 7 | SECRET_KEY = '1dBL8QOCromAwD0nEZijWL7vvgGJHm0WPNxpUlJFGQA6fSQaG4dhj5sPvCZ7KX' 8 | CACHE_CONFIG = { 9 | 'CACHE_TYPE': 'redis', 10 | 'CACHE_DEFAULT_TIMEOUT': 300, 11 | 'CACHE_KEY_PREFIX': 'superset_', 12 | 'CACHE_REDIS_HOST': 'redis', 13 | 'CACHE_REDIS_PORT': 6379, 14 | 'CACHE_REDIS_DB': 1, 15 | 'CACHE_REDIS_URL': 'redis://redis:6379/1'} 16 | SQLALCHEMY_DATABASE_URI = 'sqlite:////var/lib/superset/superset.db' 17 | SQLALCHEMY_TRACK_MODIFICATIONS = True 18 | WTF_CSRF_ENABLED = True 19 | WTF_CSRF_EXEMPT_LIST = [] 20 | MAPBOX_API_KEY = os.getenv('MAPBOX_API_KEY', '') 21 | -------------------------------------------------------------------------------- /vm/README.md: -------------------------------------------------------------------------------- 1 | # Creando VM de cero 2 | 3 | * Download una version de Ubuntu Desktop 4 | 5 | `wget -c http://releases.ubuntu.com/18.04/ubuntu-18.04.3-desktop-amd64.iso` 6 | 7 | ## Configurar VirtualBox 8 | 9 | * Instalar [VirtualBox](https://www.virtualbox.org) 10 | * Configurar disco de **>= 20Gb** y **8Gb RAM** 11 | * Configurar red: Settings > Network > **Port Forwarding** 12 | 13 | ![virtualbox-port-forwarding](virtualbox-port-forwarding.png) 14 | 15 | * Instalar la VM 16 | * Abrir la terminal y ejecutar los comandos de [install-script.sh](install-script.sh) 17 | 18 | ## Acceso por SSH 19 | 20 | * Despues de habilitar SSHD (corriendo script arriba) se puede acceder a la VM por SSH: `ssh analyst@localhost -p 2222` 21 | 22 | 23 | ![](vm-0.png) 24 | 25 | ![](vm-1.png) 26 | 27 | ![](vm-2.png) 28 | 29 | ![](vm-3.png) 30 | 31 | ![](vm-4.png) 32 | 33 | ![](vm-5.png) 34 | 35 | ## Sobre 36 | Gustavo Arjones © 2017-2020 37 | [arjon.es](https://arjon.es) | [LinkedIn](http://linkedin.com/in/arjones/) | [Twitter](https://twitter.com/arjones) 38 | 
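## Extra: port forwarding from the host CLI (optional)

The port-forwarding rule shown in the screenshot above can also be created with `VBoxManage` from the host machine. This is only a minimal sketch under assumptions: the VM is powered off and is registered under the placeholder name `bigdata-vm` (use the real name reported by `VBoxManage list vms`).

```bash
# List the registered VMs to find the exact name to use below
VBoxManage list vms

# NAT port-forwarding rule: host port 2222 -> guest port 22 (SSH)
# "bigdata-vm" is a placeholder name; run this while the VM is powered off
VBoxManage modifyvm "bigdata-vm" --natpf1 "guestssh,tcp,,2222,,22"

# Verify the rule, then connect as described above
VBoxManage showvminfo "bigdata-vm" | grep -i rule
ssh analyst@localhost -p 2222
```

If the VM is already running, `VBoxManage controlvm "bigdata-vm" natpf1 "guestssh,tcp,,2222,,22"` should add the same rule without a reboot (again, an assumption to verify against your VirtualBox version).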
-------------------------------------------------------------------------------- /vm/install-docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # SSHD 4 | sudo apt-get update 5 | 6 | # Docker 7 | sudo apt-get install -y \ 8 | apt-transport-https \ 9 | ca-certificates \ 10 | curl \ 11 | software-properties-common 12 | 13 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - 14 | 15 | sudo apt-key fingerprint 0EBFCD88 16 | 17 | sudo add-apt-repository \ 18 | "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ 19 | $(lsb_release -cs) \ 20 | stable" 21 | 22 | sudo apt-get update 23 | sudo apt-get install -y docker-ce 24 | 25 | sudo groupadd docker 26 | sudo usermod -aG docker "${USER}" 27 | 28 | sudo systemctl enable docker 29 | 30 | # Docker Compose 31 | sudo curl -L https://github.com/docker/compose/releases/download/1.24.1/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose 32 | sudo chmod +x /usr/local/bin/docker-compose 33 | 34 | # Cleanup 35 | sudo apt-get -y autoremove 36 | 37 | ############################################# 38 | # 39 | echo '====================================' 40 | echo 41 | echo 'You need to logout and restart again' 42 | echo 43 | echo '====================================' 44 | -------------------------------------------------------------------------------- /vm/install-script.sh: -------------------------------------------------------------------------------- 1 | # Enables sudo without passwd 2 | echo "%sudo ALL=(ALL) NOPASSWD: ALL" | sudo tee -a /etc/sudoers 3 | 4 | # SSHD 5 | sudo apt-get update 6 | sudo apt-get install -y openssh-server 7 | 8 | # Remove/Install JVM 9 | sudo apt-get -y remove openjdk* 10 | sudo apt-get -y remove --auto-remove openjdk* 11 | sudo apt-get -y purge openjdk* 12 | 13 | sudo apt-get install -y openjdk-8-jdk 14 | 15 | # Git 16 | sudo apt-get install -y git \ 17 | maven 18 | 19 | ## Scala 20 | # IMPORTANT: Make sure scala version is the same as Spark 21 | # have been compiled to. 
Run spark-shell 22 | # 23 | sudo apt-get -y remove --auto-remove scala-library scala 24 | sudo apt-get -y purge scala-library* scala* 25 | 26 | sudo wget https://downloads.lightbend.com/scala/2.11.12/scala-2.11.12.deb 27 | sudo dpkg -i scala-2.11.12.deb 28 | sudo apt-get update 29 | sudo apt-get -y install scala 30 | 31 | # SBT 32 | echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list 33 | sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823 34 | sudo apt-get update 35 | sudo apt-get install -y sbt 36 | 37 | # Docker 38 | sudo apt-get install -y \ 39 | apt-transport-https \ 40 | ca-certificates \ 41 | curl \ 42 | software-properties-common 43 | 44 | curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - 45 | 46 | sudo apt-key fingerprint 0EBFCD88 47 | 48 | sudo add-apt-repository \ 49 | "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ 50 | $(lsb_release -cs) \ 51 | stable" 52 | 53 | sudo apt-get update 54 | sudo apt-get install -y docker-ce 55 | 56 | sudo groupadd docker 57 | sudo usermod -aG docker "${USER}" 58 | 59 | # Docker Compose 60 | sudo curl -L https://github.com/docker/compose/releases/download/1.24.1/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose 61 | sudo chmod +x /usr/local/bin/docker-compose 62 | 63 | # Cleanup 64 | sudo apt-get -y autoremove 65 | 66 | 67 | ############################################# 68 | # 69 | # Course Material 70 | cd ~ 71 | git clone https://github.com/arjones/bigdata-workshop-es.git 72 | 73 | cd bigdata-workshop-es 74 | docker-compose pull 75 | -------------------------------------------------------------------------------- /vm/virtualbox-port-forwarding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/vm/virtualbox-port-forwarding.png -------------------------------------------------------------------------------- /vm/vm-0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/vm/vm-0.png -------------------------------------------------------------------------------- /vm/vm-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/vm/vm-1.png -------------------------------------------------------------------------------- /vm/vm-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/vm/vm-2.png -------------------------------------------------------------------------------- /vm/vm-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/vm/vm-3.png -------------------------------------------------------------------------------- /vm/vm-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/vm/vm-4.png -------------------------------------------------------------------------------- /vm/vm-5.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/arjones/bigdata-workshop-es/8e61043079a11c21340a8c3625f6aeeb1e5e0b8c/vm/vm-5.png --------------------------------------------------------------------------------