├── .envrc ├── .gitignore ├── .mailmap ├── .python-version ├── LICENSE ├── README.md ├── Vagrantfile ├── airflow ├── .env ├── .gitignore ├── Makefile ├── dags │ ├── gasolina_naive.py │ ├── gasolina_s3.py │ ├── hello_dags.py │ ├── hello_python_operator.py │ ├── hello_simple.py │ ├── s3_bucket_operations.py │ ├── s3_file_sensor.py │ ├── spark_ondemand.py │ └── spark_simple.py ├── docker-compose.minio.yaml └── minio_connection.json ├── assignments ├── .dockerignore ├── .gitignore ├── Dockerfile ├── Makefile ├── README.md ├── __init__.py ├── conftest.py ├── contenedores.py ├── data │ └── containers.csv ├── helpers.py ├── pytest.ini ├── requirements.txt ├── test.sh ├── test_ejercicio_0.py ├── test_ejercicio_1.py ├── test_ejercicio_2.py ├── test_ejercicio_3.py ├── test_ejercicio_4.py ├── test_ejercicio_5.py ├── test_ejercicio_6.py └── test_ejercicio_7.py ├── beam ├── .gitignore ├── basic.py ├── beam ├── compras.py ├── compras_ptransform.py ├── compras_ptransform_condensed.py └── compras_totales_por_pais.py ├── data ├── .gitignore ├── Gemfile ├── Gemfile.lock ├── compras_tiny.csv ├── containers_tiny.csv ├── containers_tiny.parquet │ ├── .part-r-00000-699887c3-5b1f-4afb-ae91-da7750b810c7.gz.parquet.crc │ ├── _SUCCESS │ ├── _common_metadata │ ├── _metadata │ └── part-r-00000-699887c3-5b1f-4afb-ae91-da7750b810c7.gz.parquet ├── country_codes.csv ├── exchange_rates_usd.json ├── iso-container-codes.csv ├── iso-container-codes.json ├── iso-container-groups.csv ├── nasdaq.csv ├── nasdaq.json ├── pelicula_ids.csv ├── pelicula_usuarios.csv ├── peliculas.csv ├── poors_man_routes.sh ├── random_data.rb ├── ratings.csv └── ship_routes.csv ├── infra ├── beam │ ├── Dockerfile │ ├── Makefile │ ├── README.md │ └── beam.md ├── dataproc.md ├── docker │ ├── .envrc │ ├── Dockerfile │ ├── Makefile │ ├── README.md │ ├── docker-compose.yml │ └── docker.md ├── kubernetes │ ├── README.md │ ├── kubernetes.md │ ├── master-controller.yaml │ ├── master-service.yaml │ ├── namespace.yaml │ └── slave-controller.yaml ├── minio │ ├── config │ │ ├── config.json │ │ ├── config.json.old │ │ └── share │ │ │ ├── downloads.json │ │ │ └── uploads.json │ ├── data │ ├── docker-compose.yml │ ├── mc │ └── mirror.sh ├── pyspark-jupyter │ ├── Dockerfile │ ├── Makefile │ └── README.md ├── single-node.md └── vagrant.md ├── local_setup.sh ├── playbook.yml └── spark ├── .gitignore ├── _template_rdd ├── _template_sql ├── compras_con_mas_de_un_descuento.py ├── compras_conversion_a_dolares.py ├── compras_importe_total_agrupado_por_tx_id.py ├── compras_sql.py ├── compras_top_ten_countries.py ├── container.py ├── container_caching.py ├── container_convertir_a_parquet.py ├── container_databricks_csv.py ├── container_partition.py ├── container_rdd_to_dataset.py ├── data ├── enable_history.properties ├── friends.py ├── graphframes.sh ├── hello1.py ├── hello2.py ├── helpers.py ├── hft.py ├── live.ipynb ├── live.py ├── live ├── Makefile ├── gcloud ├── gsutil ├── live.sh ├── live_jupyter.sh └── live_template.html ├── peliculas_0_ml.py ├── peliculas_1_mllib.py ├── peliculas_calculo_de_medias_por_key.py ├── reload.sh ├── run_all.sh ├── ship_routes.py ├── spark └── stock_server.rb /.envrc: -------------------------------------------------------------------------------- 1 | layout pyenv 3.9.2 2 | use java adopt@1.11.0-11 3 | export SPARK_HOME="$(pwd)/.spark" 4 | PATH_add "$SPARK_HOME/bin" 5 | export PYSPARK_PYTHON=python3 6 | 7 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | node_modules 2 | .DS_Store 3 | *.aux 4 | *.log 5 | *.pyc 6 | alt/ 7 | checkpoint 8 | metastore_db/ 9 | .spark*/ 10 | .vagrant/ 11 | out/ 12 | __pycache__/ 13 | spark-warehouse/ 14 | .direnv/ 15 | .ipynb_checkpoints/ 16 | 17 | -------------------------------------------------------------------------------- /.mailmap: -------------------------------------------------------------------------------- 1 | Luis Belloch 2 | 3 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.9.2 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Advanced Data Processing course materials. 2 | Copyright (C) 2016, Luis Belloch 3 | 4 | This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. http://creativecommons.org/licenses/by-nc-sa/4.0/ 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Processing with Spark 2 | 3 | Materials for the Advanced Data Processing course of the [Big Data Analytics](http://bigdata.inf.upv.es) Master at the Universitat Politècnica de València. 4 | 5 | This course gives a 30-hour overview of many concepts, techniques and tools in data processing using Spark, including some key concepts from Apache Beam. We assume you're familiar with Python, but all the exercises can be easily followed in Java and Scala. We've included a Vagrant definition and Docker images for both [Spark](infra/docker/docker.md) and [Beam](infra/beam/beam.md). 6 | 7 | If you find a bug or you want to contribute some comments, please [file an issue in this repository](https://github.com/luisbelloch/data_processing_course/issues/new) or simply [write us](mailto:bigdata@luisbelloch.es). You're free to reuse the course materials; please follow the details in the [license section](#license). 8 | 9 | ## Structure 10 | 11 | ### Part A - Spark 12 | 13 | 1. Brief intro to functional programming 14 | 2. Spark basics 15 | 3. PySpark: transformations, actions and basic IO 16 | 4. Spark SQL 17 | 5. MLlib 18 | 6. Graphs 19 | - GraphX (Scala) 20 | - GraphFrames (Python) 21 | 7. Spark cluster deployment 22 | - [Single node](infra/single-node.md) 23 | - [Vagrant box playground](infra/vagrant.md) 24 | - Clustering 25 | - [Docker](infra/docker/docker.md) 26 | - [Kubernetes](infra/kubernetes/kubernetes.md) 27 | - [Cloud Dataproc](infra/dataproc.md) - [Start Tutorial](https://ssh.cloud.google.com/cloudshell/open?cloudshell_git_repo=https://github.com/luisbelloch/data_processing_course.git&page=editor&cloudshell_tutorial=infra/dataproc.md) (in Spanish) 28 | 8. Apache Beam 29 | - [Rationale](http://www.vldb.org/pvldb/vol8/p1792-Akidau.pdf) 30 | - [Docker container using Python SDK](infra/beam/beam.md) 31 | - Slides (coming soon) 32 | 9. Minio 33 | 10. Apache Airflow: coordinating jobs 34 | - Basic setup 35 | - DAGs 36 | - Cloud Composer 37 | 38 | ### Part B - Architecture Workshop 39 | 40 | Teamwork using [Aronson's puzzle](https://en.wikipedia.org/wiki/Jigsaw_(teaching_technique)).
We present a set of real case studies to solve, and teams have to design and develop them using any technology available in the market today. 41 | 42 | In the first phase, the teams split with the goal of becoming experts in a particular area, digging into the specifics of the proposed tools and frameworks. In the second phase, they return to their peers to design a system that covers the use case requirements. There's a 15-minute presentation per team to share the results. 43 | 44 | ## Lecture Notes 45 | 46 | To be added soon, stay tuned! 47 | 48 | ## Source Samples 49 | 50 | - Functional programming (coming soon) 51 | - Why you don't need big data tools 52 | - [poors_man_routes.sh](data/poors_man_routes.sh) - bash superpowers 53 | - Basic data processing using PySpark 54 | - [compras_con_mas_de_un_descuento.py](spark/compras_con_mas_de_un_descuento.py) 55 | - [compras_importe_total_agrupado_por_tx_id.py](spark/compras_importe_total_agrupado_por_tx_id.py) 56 | - [compras_conversion_a_dolares.py](spark/compras_conversion_a_dolares.py) 57 | - [compras_top_ten_countries.py](spark/compras_top_ten_countries.py) 58 | - [helpers.py](spark/helpers.py) - basic parse functions to get started quickly 59 | - Spark SQL 60 | - [compras_sql.py](spark/compras_sql.py) 61 | - [container.py](spark/container.py) 62 | - [container_convertir_a_parquet.py](spark/container_convertir_a_parquet.py) 63 | - [container_rdd_to_dataset.py](spark/container_rdd_to_dataset.py) 64 | - [container_databricks_csv.py](spark/container_databricks_csv.py) 65 | - [container_caching.py](spark/container_caching.py) 66 | - [container_partition.py](spark/container_partition.py) 67 | - Spark Streaming 68 | - [hft.py](spark/hft.py) and [stock_server.rb](spark/stock_server.rb) 69 | - MLlib 70 | - [peliculas_0_ml.py](spark/peliculas_0_ml.py) - ALS intro 71 | - [peliculas_1_mllib.py](spark/peliculas_1_mllib.py) - Predictions 72 | - GraphFrames 73 | - [friends.py](spark/friends.py) - Classic graph sample 74 | - [ship_routes.py](spark/ship_routes.py) - Shortest paths for ship routes 75 | - Apache Beam 76 | - [basic.py](beam/basic.py) 77 | - [compras.py](beam/compras.py) 78 | - [compras_ptransform.py](beam/compras_ptransform.py) 79 | - [compras_ptransform_condensed.py](beam/compras_ptransform_condensed.py) 80 | - [compras_totales_por_pais.py](beam/compras_totales_por_pais.py) 81 | - Apache Airflow 82 | - [Standalone Docker Image](https://hub.docker.com/r/luisbelloch/airflow) 83 | - Tutorial for Composer in Cloud Shell [[English]() / [Spanish]()] 84 | - [hello_dags.py](airflow/dags/hello_dags.py) 85 | - [hello_python_operator.py](airflow/dags/hello_python_operator.py) 86 | - [hello_simple.py](airflow/dags/hello_simple.py) 87 | - [spark_ondemand.py](airflow/dags/spark_ondemand.py) 88 | - [spark_simple.py](airflow/dags/spark_simple.py) 89 | - Deployment 90 | - [Single Node](infra/single-node.md) 91 | - [Vagrant](infra/vagrant.md) 92 | - [Ansible](playbook.yml) 93 | - [Spark on Docker](infra/docker/docker.md) 94 | - [Beam on Docker](infra/beam/beam.md) 95 | - [Spark on Kubernetes](infra/kubernetes/kubernetes.md) 96 | - [Spark on Google Cloud Dataproc](infra/dataproc.md) 97 | - Tutorial for Dataproc in Cloud Shell English / [Spanish](https://ssh.cloud.google.com/cloudshell/open?cloudshell_git_repo=https://github.com/luisbelloch/data_processing_course.git&page=editor&cloudshell_tutorial=infra/dataproc.md) 98 | - [PySpark Jupyter Notebook](infra/pyspark-jupyter/README.md) 99 | 100 | ## Assignments 101 | 102 | Final course assignments can be found in
[this document](assignments/README.md). They are in Spanish; they will be translated to English at some point. 103 | 104 | I'm not publishing the solutions to avoid remaking the exercises every year. There's a test suite using [py.test](http://pytest.org) to help you validate the results. If you're really interested in them, please write to [bigdata@luisbelloch.es](mailto:bigdata@luisbelloch.es). 105 | 106 | ## Evaluation Criteria 107 | 108 | > Self-sufficiency is the state of not requiring any aid, support, or interaction, for survival; it is therefore a type of personal or collective autonomy - [Wikipedia](https://en.wikipedia.org/wiki/Self-sufficiency). 109 | 110 | We follow a self-sufficiency principle for students to drive the course goals. At the end of the course, students should have enough knowledge and tools to develop small data processing solutions on their own. 111 | 112 | 1. Student understands the underlying concepts behind Spark, and is able to write data processing scripts using PySpark, Spark SQL and MLlib. 113 | 2. Student is capable of identifying common data processing libraries and frameworks and their applications. 114 | 3. Student is capable of working in a team designing a system to cover a simple data processing scenario, understanding the basic implications of the choices they make on systems, languages, libraries and platforms. 115 | 116 | ## Readings and links 117 | 118 | We recommend the following papers to expand knowledge on Spark and other data processing techniques: 119 | 120 | - [Resilient Distributed Datasets: A Fault-Tolerant Abstraction for In-Memory Cluster Computing](https://www.usenix.org/system/files/conference/nsdi12/nsdi12-final138.pdf) 121 | - [Discretized Streams: An Efficient and Fault-Tolerant Model for Stream Processing on Large Clusters](http://people.csail.mit.edu/matei/papers/2012/hotcloud_spark_streaming.pdf) 122 | - [Spark SQL: Relational Data Processing in Spark](http://people.csail.mit.edu/matei/papers/2015/sigmod_spark_sql.pdf) 123 | - [MLlib: Machine Learning in Apache Spark](http://www.jmlr.org/papers/volume17/15-237/15-237.pdf) 124 | - [GraphX: Unifying Data-Parallel and Graph-Parallel Analytics](https://amplab.cs.berkeley.edu/wp-content/uploads/2014/02/graphx.pdf) 125 | - [Tachyon: Memory Throughput I/O for Cluster Computing Frameworks](http://people.eecs.berkeley.edu/~haoyuan/papers/2013_ladis_tachyon.pdf) 126 | - [The Dataflow Model: A Practical Approach to Balancing Correctness, Latency, and Cost in Massive-Scale, Unbounded, Out-of-Order Data Processing](http://www.vldb.org/pvldb/vol8/p1792-Akidau.pdf) 127 | - [Streaming 101: The world beyond batch](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-101) - [Part two](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-102) 128 | - [Apache Flink™: Stream and Batch Processing in a Single Engine](https://www.user.tu-berlin.de/asteriosk/assets/publications/flink-deb.pdf) 129 | - [MillWheel: Fault-Tolerant Stream Processing at Internet Scale](http://research.google.com/pubs/pub41378.html) 130 | - [Pig Latin: A Not-So-Foreign Language for Data Processing](http://infolab.stanford.edu/~olston/publications/sigmod08.pdf) 131 | - [Interpreting the Data: Parallel Analysis with Sawzall](http://research.google.com/archive/sawzall.html) 132 | - [Photon: Fault-tolerant and Scalable Joining of Continuous Data Streams](http://research.google.com/pubs/pub41318.html) 133 | - [Above the Clouds: A Berkeley View of Cloud
Computing](https://www2.eecs.berkeley.edu/Pubs/TechRpts/2009/EECS-2009-28.pdf) 134 | - [Cloud Programming Simplified: A Berkeley View on Serverless Computing](https://arxiv.org/abs/1902.03383) (particularly item 8.2 on MapReduce also applies to Spark) 135 | 136 | ## Roadmap 137 | 138 | Some ideas we might add in forthcoming course editions: 139 | 140 | - Code samples in python notebooks 141 | - ~~Apache Flink and Apache Beam~~ (2017) 142 | - Add Tachyon content and exercises 143 | - Add Kafka source to the streaming sample 144 | - ~~Introduce samples with Minio / InfiniSpan~~ (2018) 145 | - ~~Improve deployment scenarios and tools: Mesos, Chef, etc.~~ (2017) 146 | - Monitoring using Prometheus and Grafana, provide ready-to-use docker containers 147 | - Profiling of Spark applications (Scala only) 148 | - Translate all content to English and Spanish 149 | - ~~Cloud Dataproc~~ (2019) 150 | - ~~Apache Airflow~~ (2019) 151 | - Tensorflow training and model execution at scale 152 | 153 | ## License 154 | 155 | Advanced Data Processing course materials. 156 | Copyright (C) 2016, Luis Belloch 157 | 158 | Creative Commons License
This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 159 | 160 | ### Recommended citation 161 | 162 | > Luis Belloch, course materials for Advanced Data Processing, Spring 2016. Master on Big Data Analytics (http://bigdata.inf.upv.es), Universitat Politècnica de València. Downloaded on [DD Month YYYY]. 163 | 164 | 165 | -------------------------------------------------------------------------------- /Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | 4 | Vagrant.configure("2") do |config| 5 | config.vm.box = "debian/buster64" 6 | config.vbguest.auto_update = false 7 | # config.vm.network "forwarded_port", guest: 8080, host: 8080 8 | # config.vm.network "forwarded_port", guest: 8081, host: 8081 9 | # config.vm.network "forwarded_port", guest: 8082, host: 8082 10 | 11 | config.vm.provision "shell" do |s| 12 | s.inline = "apt-get update && apt-get install -y python" 13 | end 14 | 15 | config.vm.provision "ansible_local" do |ansible| 16 | ansible.verbose = "v" 17 | ansible.playbook = "playbook.yml" 18 | ansible.compatibility_mode = "2.0" 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /airflow/.env: -------------------------------------------------------------------------------- 1 | AIRFLOW_UID=50000 2 | -------------------------------------------------------------------------------- /airflow/.gitignore: -------------------------------------------------------------------------------- 1 | docker-compose.yaml 2 | logs/ 3 | plugins/ 4 | .minio/ 5 | airflow 6 | -------------------------------------------------------------------------------- /airflow/Makefile: -------------------------------------------------------------------------------- 1 | AIRFLOW_VERSION:=2.4.3 2 | 3 | .PHONY: all 4 | all: clean docker-compose.yaml airflow init up 5 | 6 | docker-compose.yaml: 7 | curl -LfO 'https://airflow.apache.org/docs/apache-airflow/${AIRFLOW_VERSION}/docker-compose.yaml' 8 | 9 | airflow: 10 | curl -Lf 'https://airflow.apache.org/docs/apache-airflow/${AIRFLOW_VERSION}/airflow.sh' > airflow 11 | chmod +x airflow 12 | 13 | .PHONY: init 14 | init: 15 | docker-compose up airflow-init 16 | 17 | .PHONY: up 18 | up: 19 | docker-compose up 20 | 21 | .PHONY: down 22 | down: 23 | docker-compose down --remove-orphans 24 | 25 | .PHONY: minio 26 | minio: 27 | docker-compose -f docker-compose.yaml -f docker-compose.minio.yaml up minio 28 | 29 | .PHONY: minio_connection 30 | minio_connection: airflow 31 | ./airflow connections import minio_connection.json 32 | 33 | .PHONY: clean 34 | clean: 35 | -docker-compose down --volumes --remove-orphans 36 | -rm -rf logs/ plugins/ .minio/ docker-compose.yaml airflow dags/__pycache__ 37 | -------------------------------------------------------------------------------- /airflow/dags/gasolina_naive.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | 4 | from airflow import AirflowException 5 | from airflow.decorators import dag, task 6 | 7 | import requests 8 | 9 | codigo_postal = "50197" 10 | endpoint = "https://sedeaplicaciones.minetur.gob.es/ServiciosRESTCarburantes/PreciosCarburantes/EstacionesTerrestres/" 11 | 12 | @dag(schedule_interval=None, start_date=datetime(2021, 1, 1), catchup=False, tags=['upv']) 13 | def extraer_precio_gasolina_naive(): 14 | 15 | @task 16 | def recogida(): 17 | 
print("Recogiendo datos...") 18 | response = requests.get(endpoint) 19 | if response.status_code != 200: 20 | raise AirflowException(f"Fallo de conexión {response.status_code}") 21 | 22 | datos = response.json() 23 | return datos['ListaEESSPrecio'] 24 | 25 | @task 26 | def filtrado(datos, codigo_postal): 27 | return list(filter(lambda x: x['C.P.'] == codigo_postal, datos)) 28 | 29 | @task 30 | def almacenamiento(datos): 31 | print("Almacenando datos...") 32 | print(json.dumps(datos, indent=2)) 33 | 34 | todos_los_datos = recogida() 35 | datos_del_codigo_postal_x = filtrado(todos_los_datos, codigo_postal) 36 | almacenamiento(datos_del_codigo_postal_x) 37 | 38 | dag_gasolina = extraer_precio_gasolina_naive() 39 | 40 | -------------------------------------------------------------------------------- /airflow/dags/gasolina_s3.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from datetime import datetime 4 | 5 | from airflow import AirflowException 6 | from airflow.decorators import dag, task 7 | from airflow.operators.bash_operator import BashOperator 8 | from airflow.contrib.sensors.file_sensor import FileSensor 9 | 10 | import boto3 11 | import botocore.client 12 | import requests 13 | 14 | codigo_postal = "50197" 15 | bucket_name = "gasolina" 16 | endpoint = "https://sedeaplicaciones.minetur.gob.es/ServiciosRESTCarburantes/PreciosCarburantes/EstacionesTerrestres/" 17 | 18 | def s3_resource(): 19 | return boto3.resource('s3', 20 | endpoint_url='http://minio:9000', 21 | aws_access_key_id='bigdataupv', 22 | aws_secret_access_key='bigdataupv', 23 | config=botocore.client.Config(signature_version='s3v4'), region_name='us-east-1') 24 | 25 | def read_json_from_s3(key): 26 | obj = s3_resource().Object(bucket_name, key) 27 | return json.loads(obj.get()['Body'].read().decode('utf-8')) 28 | 29 | def save_to_s3(key, data): 30 | obj = s3_resource().Object(bucket_name, key) 31 | obj.put(Body=data) 32 | 33 | @dag(schedule_interval=None, start_date=datetime(2021, 1, 1), catchup=False, tags=['upv']) 34 | def extraer_precio_gasolina_s3(): 35 | 36 | @task 37 | def recogida_s3(): 38 | print("Recogiendo datos...") 39 | response = requests.get(endpoint) 40 | if response.status_code != 200: 41 | raise AirflowException(f"Fallo de conexión {response.status_code}") 42 | 43 | filename = f'recogida-{datetime.now().strftime("%Y%m%d%H%M%S")}.json' 44 | save_to_s3(filename, response.text) 45 | return { "recogida": filename } 46 | 47 | @task 48 | def filtrado_s3(contexto, codigo_postal): 49 | print("Filtrando datos...") 50 | 51 | datos = read_json_from_s3(contexto['recogida']) 52 | filtrados = list(filter(lambda x: x['C.P.'] == codigo_postal, datos['ListaEESSPrecio'])) 53 | 54 | filename = f'filtrado-{datetime.now().strftime("%Y%m%d%H%M%S")}.json' 55 | save_to_s3(filename, json.dumps(filtrados)) 56 | 57 | return { **contexto, "filtrado": filename } 58 | 59 | @task 60 | def almacenamiento_s3(contexto): 61 | print("Almacenando datos...
Nothing to do!") 62 | return 42 63 | 64 | todos_los_datos = recogida_s3() 65 | datos_del_codigo_postal_x = filtrado_s3(todos_los_datos, codigo_postal) 66 | almacenamiento_s3(datos_del_codigo_postal_x) 67 | 68 | dag_gasolina = extraer_precio_gasolina_s3() 69 | 70 | # Additionally, use Amazon operator, particularly S3KeySensor 71 | # https://airflow.apache.org/docs/apache-airflow-providers-amazon/stable/operators/s3.html 72 | # https://github.com/apache/airflow/tree/main/airflow/providers/amazon/aws/example_dags 73 | -------------------------------------------------------------------------------- /airflow/dags/hello_dags.py: -------------------------------------------------------------------------------- 1 | from builtins import range 2 | from datetime import timedelta 3 | 4 | import airflow 5 | from airflow.models import DAG 6 | from airflow.operators.bash_operator import BashOperator 7 | from airflow.operators.dummy_operator import DummyOperator 8 | from airflow.operators.python_operator import PythonOperator 9 | 10 | dag = DAG('hello_dags', schedule_interval=None, start_date=airflow.utils.dates.days_ago(2), tags=['upv']) 11 | 12 | def print_hello(): 13 | return 'Hello world!' 14 | 15 | inicio = BashOperator(task_id='inicio', bash_command="echo inicio!", dag=dag) 16 | paso1 = BashOperator(task_id='paso1', bash_command="echo paso 1", dag=dag) 17 | paso2 = PythonOperator(task_id='paso2', python_callable=print_hello, dag=dag) 18 | paso3 = DummyOperator(task_id='paso3', dag=dag) 19 | ultima_tarea = DummyOperator(task_id='ultima_tarea', dag=dag) 20 | 21 | inicio >> [paso1, paso3] 22 | paso1 >> paso2 >> ultima_tarea 23 | paso3 >> ultima_tarea 24 | 25 | if __name__ == "__main__": 26 | dag.cli() 27 | -------------------------------------------------------------------------------- /airflow/dags/hello_python_operator.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from airflow import DAG 4 | from airflow.operators.dummy_operator import DummyOperator 5 | from airflow.operators.python_operator import PythonOperator 6 | 7 | def print_hello(): 8 | return 'Hello world!' 
9 | 10 | dag = DAG( 11 | 'hello_python_operator', 12 | description='Simple tutorial DAG', 13 | schedule_interval='20 * * * *', 14 | start_date=datetime(2017, 3, 20), 15 | tags=['upv'], 16 | catchup=False) 17 | 18 | dummy_operator = DummyOperator(task_id='dummy_task', retries=3, dag=dag) 19 | 20 | hello_operator = PythonOperator(task_id='hello_from_python', python_callable=print_hello, dag=dag) 21 | 22 | dummy_operator >> hello_operator 23 | -------------------------------------------------------------------------------- /airflow/dags/hello_simple.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from airflow import DAG 4 | from airflow.operators.dummy_operator import DummyOperator 5 | 6 | default_args = {"start_date": datetime(2019, 2, 5)} 7 | dag = DAG('hello', default_args=default_args, schedule_interval=None, tags=['upv'],) 8 | 9 | dummy_operator = DummyOperator(task_id='dummy_task', dag=dag) 10 | hello_operator = DummyOperator(task_id='hello_task', dag=dag) 11 | 12 | dummy_operator >> hello_operator 13 | -------------------------------------------------------------------------------- /airflow/dags/s3_bucket_operations.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | from airflow.decorators import task 5 | from airflow.models.dag import DAG 6 | from airflow.providers.amazon.aws.hooks.s3 import S3Hook 7 | from airflow.providers.amazon.aws.operators.s3 import S3CreateBucketOperator, S3DeleteBucketOperator 8 | 9 | # By default, it will use the 'aws_default' connection. You can create it by running `make minio_connection` 10 | # If you want to change it, use a variable and pass it as `aws_conn_id` to all AWS operators.
11 | AWS_CONN_ID = 'aws_default' 12 | 13 | BUCKET_NAME = os.environ.get('BUCKET_NAME', 'patatas') 14 | 15 | @task(task_id="s3_bucket_dag_add_keys_to_bucket") 16 | def upload_keys(): 17 | s3_hook = S3Hook() 18 | for i in range(0, 3): 19 | s3_hook.load_string(string_data="input", key=f"path/data{i}", bucket_name=BUCKET_NAME) 20 | 21 | with DAG( 22 | dag_id='s3_bucket_operations', 23 | schedule_interval=None, 24 | start_date=datetime(2021, 1, 1), 25 | catchup=False, 26 | default_args={"bucket_name": BUCKET_NAME}, 27 | max_active_runs=1, 28 | tags=['upv'], 29 | ) as dag: 30 | 31 | create_bucket = S3CreateBucketOperator(task_id='s3_bucket_dag_create', region_name='us-east-1') 32 | add_keys_to_bucket = upload_keys() 33 | delete_bucket = S3DeleteBucketOperator(task_id='s3_bucket_dag_delete', force_delete=True) 34 | create_bucket >> add_keys_to_bucket >> delete_bucket 35 | -------------------------------------------------------------------------------- /airflow/dags/s3_file_sensor.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | from airflow.decorators import task 5 | from airflow.models.dag import DAG 6 | from airflow.models.variable import Variable 7 | from airflow.providers.amazon.aws.sensors.s3 import S3KeySensor 8 | 9 | BUCKET_NAME = os.environ.get('BUCKET_NAME', 'patatas') 10 | 11 | @task(task_id="do_something") 12 | def do_something(): 13 | print("Something!") 14 | 15 | with DAG( 16 | dag_id='s3_file_sensor', 17 | schedule_interval=None, 18 | start_date=datetime(2021, 1, 1), 19 | catchup=False, 20 | default_args={"bucket_name": BUCKET_NAME}, 21 | max_active_runs=1, 22 | tags=['upv'], 23 | ) as dag: 24 | 25 | op = S3KeySensor(task_id="s3_key_sensor", bucket_key="s3://gasolina/some_file.json", bucket_name=None, dag=dag) 26 | end_task = do_something() 27 | op >> end_task 28 | -------------------------------------------------------------------------------- /airflow/dags/spark_ondemand.py: -------------------------------------------------------------------------------- 1 | # ./airflow variables set gcp_project bigdataupv2022 2 | # ./airflow variables set gcp_region europe-west1 3 | # ./airflow variables set gcp_zone europe-west1-b 4 | # ./airflow variables set gcp_bucket bigdataupv_data 5 | 6 | import datetime 7 | import os 8 | 9 | from airflow import models 10 | from airflow.contrib.operators import dataproc_operator 11 | from airflow.utils import trigger_rule 12 | 13 | yesterday = datetime.datetime.combine( 14 | datetime.datetime.today() - datetime.timedelta(1), 15 | datetime.datetime.min.time()) 16 | 17 | default_dag_args = { 18 | 'start_date': yesterday, 19 | 'email_on_failure': False, 20 | 'email_on_retry': False, 21 | 'retries': 1, 22 | 'retry_delay': datetime.timedelta(minutes=5), 23 | 'project_id': models.Variable.get('gcp_project') 24 | } 25 | 26 | with models.DAG( 27 | 'spark_ondemand', 28 | schedule_interval=datetime.timedelta(days=1), 29 | default_args=default_dag_args) as dag: 30 | 31 | create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator( 32 | task_id='create_dataproc_cluster', 33 | cluster_name='spark-cluster-{{ ds_nodash }}', 34 | num_workers=2, 35 | zone=models.Variable.get('gcp_zone'), 36 | region=models.Variable.get('gcp_region'), 37 | master_machine_type='n1-standard-1', 38 | worker_machine_type='n1-standard-1') 39 | 40 | run_dataproc_pyspark = dataproc_operator.DataProcPySparkOperator( 41 | task_id='run_spark', 42 | cluster_name='spark-cluster-{{ ds_nodash }}', 43 
| region=models.Variable.get('gcp_region'), 44 | main='gs://bigdataupv_code/compras_top_ten_countries.py', 45 | files=['gs://bigdataupv_code/helpers.py']) 46 | 47 | delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator( 48 | task_id='delete_dataproc_cluster', 49 | cluster_name='spark-cluster-{{ ds_nodash }}', 50 | trigger_rule=trigger_rule.TriggerRule.ALL_DONE) 51 | 52 | create_dataproc_cluster >> run_dataproc_pyspark >> delete_dataproc_cluster 53 | 54 | -------------------------------------------------------------------------------- /airflow/dags/spark_simple.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | 4 | from airflow import models 5 | from airflow.contrib.operators import dataproc_operator 6 | from airflow.utils import trigger_rule 7 | 8 | output_file = os.path.join( 9 | models.Variable.get('gcs_bucket'), 'dataproc_simple', 10 | datetime.datetime.now().strftime('%Y%m%d-%H%M%S')) + os.sep 11 | 12 | yesterday = datetime.datetime.combine( 13 | datetime.datetime.today() - datetime.timedelta(1), 14 | datetime.datetime.min.time()) 15 | 16 | args = { 17 | 'start_date': yesterday, 18 | 'email_on_failure': False, 19 | 'email_on_retry': False, 20 | 'retries': 1, 21 | 'retry_delay': datetime.timedelta(minutes=5), 22 | 'project_id': models.Variable.get('gcp_project') 23 | } 24 | 25 | with models.DAG('spark_simple', schedule_interval=datetime.timedelta(days=1), default_args=args) as dag: 26 | run_step = dataproc_operator.DataProcPySparkOperator( 27 | task_id='run_spark', 28 | cluster_name='cluster-9c11', 29 | region=models.Variable.get('gcp_region'), 30 | main='gs://bigdataupv_code/compras_top_ten_countries.py', 31 | files=['gs://bigdataupv_code/helpers.py']) 32 | 33 | -------------------------------------------------------------------------------- /airflow/docker-compose.minio.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | version: '3' 3 | services: 4 | minio: 5 | image: minio/minio 6 | environment: 7 | MINIO_ROOT_USER: bigdataupv 8 | MINIO_ROOT_PASSWORD: bigdataupv 9 | MINIO_REGION_NAME: us-east-1 10 | ports: 11 | - '9000:9000' 12 | - '9001:9001' 13 | init: true 14 | entrypoint: sh 15 | command: -c 'mkdir -p /data/gasolina && minio server /data --console-address ":9001"' 16 | volumes: 17 | - .minio:/data 18 | -------------------------------------------------------------------------------- /airflow/minio_connection.json: -------------------------------------------------------------------------------- 1 | { 2 | "aws_default": { 3 | "conn_type": "s3", 4 | "description": "", 5 | "host": "", 6 | "login": "bigdataupv", 7 | "password": "bigdataupv", 8 | "schema": "", 9 | "port": null, 10 | "extra": "{\"host\": \"http://minio:9000\"}" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /assignments/.dockerignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .cache/ 3 | .direnv/ 4 | .pytest_cache/ 5 | .venv/ 6 | resultados/ 7 | resultados/ 8 | soluciones/ 9 | spark-warehouse/ 10 | venv/ 11 | *.pyc 12 | -------------------------------------------------------------------------------- /assignments/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .cache/ 3 | .direnv/ 4 | .pytest_cache/ 5 | .venv/ 6 | resultados/ 7 | resultados/ 8 | soluciones/ 9 | spark-warehouse/ 10 | venv/ 11 | *.pyc 12 | 
-------------------------------------------------------------------------------- /assignments/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM luisbelloch/spark 2 | LABEL maintainer="Luis Belloch " 3 | 4 | WORKDIR /opt/tests/ 5 | 6 | ENV DEBIAN_FRONTEND=noninteractive 7 | RUN apt-get update && \ 8 | apt-get upgrade -y python3 && \ 9 | apt-get install -y --no-install-recommends python3-venv python3-pip && \ 10 | rm -rf /var/lib/apt/lists/* 11 | 12 | COPY requirements.txt . 13 | 14 | RUN pip3 install wheel 15 | RUN pip3 install -r requirements.txt 16 | 17 | -------------------------------------------------------------------------------- /assignments/Makefile: -------------------------------------------------------------------------------- 1 | IMAGE_NAME:=luisbelloch/spark-assignments 2 | 3 | .PHONY: all build tag push list test 4 | 5 | test: 6 | ./test.sh 7 | 8 | all: build tag 9 | 10 | build: 11 | docker build -t $(IMAGE_NAME) . 12 | 13 | tag: 14 | docker tag $(IMAGE_NAME) $(IMAGE_NAME):2.4.5 15 | docker tag $(IMAGE_NAME) $(IMAGE_NAME):2020.1 16 | 17 | push: 18 | docker push $(IMAGE_NAME):2.4.5 19 | docker push $(IMAGE_NAME):2020.1 20 | docker push $(IMAGE_NAME) 21 | 22 | list: 23 | docker images $(IMAGE_NAME) 24 | 25 | -------------------------------------------------------------------------------- /assignments/README.md: -------------------------------------------------------------------------------- 1 | # Prácticas SPARK 2 | 3 | Las prácticas consisten en desarrollar una serie de ejercicios de procesado de datos con `PySpark`. 4 | 5 | Para completar las prácticas debe completarse un archivo llamado `contenedores.py` con los ejercicios abajo descritos. No es necesaria explicación alguna, únicamente se pide que el código esté limpio, bien estructurado y ejecute correctamente. 6 | 7 | Los archivos de datos vienen incluidos en este repositorio en la carpeta `data`. Entre los archivos de la práctica se ha incluido [un contenedor de Docker](https://hub.docker.com/r/luisbelloch/spark-assignments/) con todo lo necesario instalado. También se ha incluido una [batería de pruebas](pruebas) para que puedas comprobar los resultados antes de entregar la práctica. 8 | 9 | Cada ejercicio produce un resultado distinto. Los resultados deben guardarse en una carpeta denominada `resultados`, teniendo un único archivo por ejercicio con la nomenclatura `resultado_1`, `resultado_2` etc. La función `path_resultados` devuelve la ruta completa que puedes usar para guardar los datos procesados en cada ejercicio. En la mayoría de los casos debes devolver un DataFrame: 10 | 11 | ``` 12 | def ejercicio_3(sc, path_resultados): 13 | df = sq.sql(...) 14 | # ... otras operaciones 15 | # ... save(path_resultados(3)) 16 | return df 17 | ``` 18 | 19 | Los ejercicios se realizarán sobre un fichero en formato CSV que contiene una lista de barcos, identificados por la columna `ship_imo`. A su vez, cada barco tiene una lista de contenedores identificados por la columna `container_id`. 20 | 21 | Para el procesado del archivo puedes utilizar cualquier función disponible en el API de Python de Spark 2.2.1 22 | 23 | ## Plazo de entrega 24 | 25 | Los ejercicios hay que enviarlos antes del 1 de febrero. 26 | 27 | ## Criterios de evaluación 28 | 29 | 1. El alumno entiende y es capaz de ejecutar programas en PySpark, haciendo uso de el core de Spark 2.2 y Spark SQL. 30 | 2. 
El archivo `contenedores.py` producido por el alumno se puede ejecutar con `spark-submit` y, opcionalmente, con `pytest`. 31 | 3. El código está estructurado correctamente, es legible y tiene una intencionalidad clara. 32 | 33 | ## Ejercicios 34 | 35 | **Ejercicio 0**. Ejecutar el archivo `contenedores.py` y comprobar que se crea un archivo dentro de la carpeta `resultados` con números del 0 al 9. 36 | 37 | ``` 38 | $ spark-submit contenedores.py 39 | $ cat resultados/resultado_0 40 | 0,1,2,3,4,5,6,7,8,9 41 | ``` 42 | 43 | **Ejercicio 1**. Leer el archivo `data/containers.csv` y contar el número de líneas. 44 | 45 | **Ejercicio 2**. Leer el archivo `data/containers.csv` y filtrar aquellos contenedores cuyo `ship_imo` es `DEJ1128330` y el grupo del contenedor es `22P1`. Guardar los resultados en un archivo de texto en `resultados/resutado_2`. 46 | 47 | **Ejercicio 3**. Leer el archivo `data/containers.csv` y convertir a formato Parquet. Recuerda que puedes hacer uso de la funcion `parse_container` en `helpers.py` tal y como vimos en clase. Guarda los resultados en `resultados/resultado_3`. 48 | 49 | **Ejercicio 4**. Lee el archivo de Parquet guardado en el ejercicio 3 y filtra los barcos que tienen al menos un contenedor donde la columna `customs_ok` es igual a `false`. Extrae una lista con los identificadores de barco, `ship_imo`, sin duplicados y ordenados alfabéticamente, en formato `json`. 50 | 51 | **Ejercicio 5**. Crea una UDF para validar el [código de identificación](https://en.wikipedia.org/wiki/ISO_6346) del contenedor `container_id`. Para simplificar la validación, daremos como válidos aquellos códigos compuestos de 3 letras para el propietario, 1 letra para la categoría, 6 números y 1 dígito de control. Devuelve un `DataFrame` con los campos: `ship_imo`, `container_id`, `propietario`, `categoria`, `numero_serie` y `digito_control`. 52 | 53 | **Ejercicio 6**. Extrae una lista con peso total de cada barco, `net_weight`, sumando cada contenedor y agrupado por los campos `ship_imo` y `container_group`. Devuelve un DataFrame con la siguiente estructura: `ship_imo`, `ship_name`, `container`, `total_net_weight`. 54 | 55 | **Ejercicio 7**. Guarda los resultados del ejercicio anterior en formato Parquet. 56 | 57 | **Ejercicio 8**. ¿En qué casos crees que es más eficiente utilizar formatos como Parquet? ¿Existe alguna desventaja frente a formatos de texto como CSV? 58 | 59 | **Ejercicio 9**. ¿Es posible procesar XML mediante Spark? ¿Existe alguna restricción por la cual no sea eficiente procesar un único archivo en multiples nodos? ¿Se te ocurre alguna posible solución para _trocear_ archivos suficientemente grandes? ¿Existe la misma problemática con otros formatos de texto como JSON? 60 | 61 | **Ejercicio 10**. Spark SQL tiene [una función](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.avg) denominada `avg` que se utiliza para calcular el promedio de un conjunto de valores ¿Por qué los autores han creado esta función en lugar de usar el API estándar de Python o Scala? 62 | 63 | ## Pruebas 64 | 65 | Existe una batería de pruebas para comprobar los resultados de cada ejercicio, desarrollada sobre [pytest](http://pytest.org). Las pruebas no son exhaustivas y únicamente están orientadas a verificar los resultados de cada ejercicio. No es necesario que las pruebas pasen para entregar la práctica, aunque se valorará de forma positiva. Se deja como ejercicio optativo adaptar o ampliar la batería de pruebas. 
66 | 67 | ### Ejecución de pruebas en Docker 68 | 69 | De forma alternativa, hemos incluido una imágen de Docker con todas las dependencias necesarias. El directorio actual se montará como volumen dentro del contenedor, concretamente en `/opt/tests/assigments`. 70 | 71 | ``` 72 | $ ./test.sh 73 | ``` 74 | 75 | También es posible lanzar `bash` o `pyspark` para hacer comprobaciones manualmente: 76 | 77 | ``` 78 | $ docker run -v $(pwd):/opt/tests/assigments -ti luisbelloch/spark-assignments /bin/bash 79 | ``` 80 | 81 | Para simplificar el uso, hemos incluido un script llamado `spark` dentro de [la carpeta de ejemplos](../spark) que vimos en clase. El script es capaz de ejecutar cualquier script contenido dentro de esa carpeta, teniendo acceso también a los archivos de datos en `data`: 82 | 83 | ``` 84 | $ cd data_processing_course/spark 85 | $ ./spark compras_conversion_a_dolares.py 86 | ``` 87 | 88 | ### Ejecución local de pruebas 89 | 90 | Teniendo Spark instalado mediante `local_setup.sh`, puedes instalar `pytest` en local mediante `venv`: 91 | 92 | ``` 93 | $ python3 -m venv .venv 94 | $ source .venv/bin/activate 95 | $ pip install -r requirements.txt 96 | $ export SPARK_HOME=$(pwd)/../.spark 97 | ``` 98 | 99 | Y a partir de aquí puede ejecutarse la suite de pruebas: 100 | 101 | ``` 102 | $ pytest -v 103 | ``` 104 | 105 | Para ejecutar un único test añade el nombre al final, lo único que hay que tener en cuenta es que algunos ejercicios dependen de los datos de los anteriores: 106 | 107 | ``` 108 | $ pytest -v test_ejercicio_2.py 109 | ``` 110 | 111 | Happy hacking! 112 | -------------------------------------------------------------------------------- /assignments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luisbelloch/data_processing_course/0ab2e59f1e3bf3b16fa5440002b4b772b06ee80b/assignments/__init__.py -------------------------------------------------------------------------------- /assignments/conftest.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pytest 4 | import shutil 5 | import sys 6 | 7 | from glob import glob 8 | from .helpers import definir_path_resultados, comprobar_resultados_en_hdfs 9 | 10 | spark_home = os.environ.get('SPARK_HOME', None) 11 | if not spark_home: 12 | raise ValueError("Unable to find Spark, make sure SPARK_HOME environment variable is set") 13 | 14 | if not os.path.exists(spark_home): 15 | raise ValueError("Cannot find path set in SPARK_HOME: " + spark_home) 16 | 17 | spark_python = os.path.join(spark_home, 'python') 18 | py4j = glob(os.path.join(spark_python, 'lib', 'py4j-*.zip'))[0] 19 | sys.path[:0] = [spark_python, py4j] 20 | 21 | from pyspark.context import SparkContext 22 | 23 | @pytest.fixture(scope='session') 24 | def spark_context(request): 25 | sc = SparkContext('local', 'tests_practicas_spark') 26 | request.addfinalizer(lambda: sc.stop()) 27 | logger = logging.getLogger('py4j') 28 | logger.setLevel(logging.WARN) 29 | return sc 30 | 31 | @pytest.fixture(scope='session') 32 | def path_resultados(request): 33 | return definir_path_resultados('./resultados') 34 | 35 | @pytest.fixture(scope='session') 36 | def resultados_ejercicio_3(spark_context, path_resultados): 37 | from contenedores import ejercicio_3 38 | return ejercicio_3(spark_context, path_resultados) 39 | 40 | @pytest.fixture(scope='session') 41 | def comprobar_hdfs(path_resultados): 42 | def check(ejercicio_n): 43 | path = 
path_resultados(ejercicio_n) 44 | return comprobar_resultados_en_hdfs(path) 45 | return check 46 | 47 | @pytest.fixture(scope='session') 48 | def tiene_columnas(): 49 | def check(df, expected): 50 | assert df is not None, 'El DataFrame no existe ¿Olvidaste un "return df" al final del ejercicio?' 51 | assert sorted(expected) == sorted([column.lower() for column in df.columns]) 52 | return check 53 | 54 | -------------------------------------------------------------------------------- /assignments/contenedores.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from pyspark.sql import SQLContext, Row 3 | 4 | from .helpers import * 5 | 6 | path_containers = 'data/containers.csv' 7 | 8 | def ejercicio_0(sc, path_resultados): 9 | lineas = sc.parallelize(range(10)).collect() 10 | with open(path_resultados(0), 'w') as f: 11 | f.write("{}\n".format(",".join([str(s) for s in lineas]))) 12 | return lineas 13 | 14 | # Ejercicio 1. Leer el archivo data/containers.csv y contar el número de líneas. 15 | def ejercicio_1(sc, path_resultados): 16 | # COMPLETAR CÓDIGO AQUÍ 17 | # Devolver número de líneas 18 | return 0 19 | 20 | # Ejercicio 2. Leer el archivo data/containers.csv y filtrar aquellos 21 | # contenedores cuyo ship_imo es DEJ1128330 y el grupo del contenedor es 22P1. 22 | # Guardar los resultados en un archivo de texto en resultados/resutado_2. 23 | def ejercicio_2(sc, path_resultados): 24 | # COMPLETAR CÓDIGO AQUÍ 25 | # Guardar en resultados/resultado_2. La función path_resultados devuelve 26 | # la ruta donde se van a guardar los resultados, para que los tests puedan 27 | # ejecutar de forma correcta. Por ejemplo, path_resultados(2) devuelve la 28 | # ruta para el ejercicio 2, path_resultados(3) para el 3, etc. 29 | # Devolver rdd contenedores filtrados: 30 | # return rdd.collect() 31 | pass 32 | 33 | # Ejercicio 3. Leer el archivo data/containers.csv y convertir a formato 34 | # Parquet. Recuerda que puedes hacer uso de la funcion parse_container en 35 | # helpers.py tal y como vimos en clase. Guarda los resultados en 36 | # resultados/resultado_3. 37 | def ejercicio_3(sc, path_resultados): 38 | # COMPLETAR CÓDIGO AQUÍ 39 | # Guardar resultados y devolver DataFrame (return df) 40 | pass 41 | 42 | # Ejercicio 4. Lee el archivo de Parquet guardado en el ejercicio 3 y filtra 43 | # los barcos que tienen al menos un contenedor donde la columna customs_ok es 44 | # igual a false. Extrae un fichero de texto una lista con los identificadores 45 | # de barco, ship_imo, sin duplicados y ordenados alfabéticamente. 46 | def ejercicio_4(sc, path_resultados): 47 | # COMPLETAR CÓDIGO AQUÍ 48 | # Guardar resultados y devolver DataFrame (return df) 49 | pass 50 | 51 | # Ejercicio 5. Crea una UDF para validar el código de identificación del 52 | # contenedor container_id. Para simplificar la validación, daremos como 53 | # válidos aquellos códigos compuestos de 3 letras para el propietario, 1 54 | # letra para la categoría, 6 números y 1 dígito de control. Devuelve un 55 | # DataFrame con los campos: ship_imo, container_id, propietario, categoria, 56 | # numero_serie y digito_control. 57 | def ejercicio_5(sc, path_resultados): 58 | # COMPLETAR CÓDIGO AQUÍ 59 | # Guardar resultados y devolver DataFrame (return df) 60 | pass 61 | 62 | # Ejercicio 6. Extrae una lista con peso total de cada barco, `net_weight`, 63 | # sumando cada contenedor y agrupado por los campos `ship_imo` y `container_group`. 
64 | # Devuelve un DataFrame con la siguiente estructura: 65 | # `ship_imo`, `ship_name`, `container_group`, `total_net_weight`. 66 | def ejercicio_6(sc, path_resultados): 67 | # COMPLETAR CÓDIGO AQUÍ 68 | # Guardar resultados y devolver DataFrame (return df) 69 | pass 70 | 71 | # Ejercicio 7. Guarda los resultados del ejercicio anterior en formato Parquet. 72 | def ejercicio_7(sc, path_resultados): 73 | # COMPLETAR CÓDIGO AQUÍ 74 | # Guardar resultados y devolver DataFrame (return df) 75 | pass 76 | 77 | def main(): 78 | sc = SparkContext('local', 'practicas_spark') 79 | pr = definir_path_resultados('./resultados') 80 | ejercicio_0(sc, pr) 81 | ejercicio_1(sc, pr) 82 | ejercicio_2(sc, pr) 83 | ejercicio_3(sc, pr) 84 | ejercicio_4(sc, pr) 85 | ejercicio_5(sc, pr) 86 | ejercicio_6(sc, pr) 87 | ejercicio_7(sc, pr) 88 | 89 | if __name__ == '__main__': 90 | main() 91 | 92 | -------------------------------------------------------------------------------- /assignments/helpers.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | 5 | from collections import namedtuple 6 | from functools import partial 7 | from glob import glob 8 | 9 | item_fields = ['tx_id', 'tx_time', 'buyer', 'currency_code', 'payment_type', 'credit_card_number', 'country', 'department', 'product', 'item_price', 'coupon_code', 'was_returned'] 10 | Item = namedtuple('Item', item_fields) 11 | 12 | def parse_item(raw_string): 13 | f = raw_string.split('|') 14 | f += [None] * (len(item_fields) - len(f)) 15 | return Item(*f) 16 | 17 | # Uso básico de namedtuples: 18 | # item = parse_item(['one', 'two']) 19 | # new_item = item._replace(tx_id=1, buyer=5) 20 | 21 | # API http://fixer.io/ 22 | def get_usd_exchange_rates(): 23 | with open('./data/exchange_rates_usd.csv') as f: 24 | data = json.load(f) 25 | return data['rates'] 26 | 27 | container_fields = ['ship_imo', 'ship_name', 'country', 'departure', 'container_id', 'container_type', 'container_group', 'net_weight', 'gross_weight', 'owner', 'declared', 'contact', 'customs_ok'] 28 | Container = namedtuple('Container', container_fields) 29 | 30 | def parse_container(raw_string): 31 | f = raw_string.split(';') 32 | f += [None] * (len(container_fields) - len(f)) 33 | return Container(*f) 34 | 35 | stock_fields = ['simbolo', 'numero', 'precio_compra', 'ultimo_precio', 'returns'] 36 | Stock = namedtuple('Stock', stock_fields) 37 | def parse_stock(raw_string): 38 | f = raw_string.split(',') 39 | return Stock(simbolo=f[0], numero=None, precio_compra=None, ultimo_precio=float(f[1]), returns=0.0) 40 | 41 | def setup_checkpoint(streamingContext): 42 | checkpoint = './checkpoint' 43 | if (os.path.exists(checkpoint)): 44 | shutil.rmtree(checkpoint) 45 | os.mkdir(checkpoint) 46 | streamingContext.checkpoint(checkpoint) 47 | 48 | def definir_path_resultados(path): 49 | if os.path.exists(path): 50 | shutil.rmtree(path) 51 | os.makedirs(path) 52 | return partial(path_resultados_fn, path) 53 | 54 | def path_resultados_fn(basePath, testId, extra = None): 55 | if not extra: 56 | return os.path.join(basePath, 'resultado_' + str(testId)) 57 | return os.path.join(basePath, 'resultado_' + str(testId), extra) 58 | 59 | def comprobar_resultados_en_hdfs(path): 60 | if not os.path.exists(path): 61 | return 'No existe el directorio "{}", asegurate de guardar los datos al finalizar el ejercicio'.format(path) 62 | if not os.path.exists(os.path.join(path, '_SUCCESS')): 63 | return 'El trabajo no terminó correctamente' 64 | parts = 
glob(os.path.join(path, 'part*')) 65 | at_least_one = any(map(lambda p: os.stat(p).st_size > 0, parts)) 66 | if not parts or not at_least_one: 67 | return 'El trabajo terminó correctamente, pero no existen datos en la carpeta "{}"'.format(path) 68 | return True 69 | 70 | -------------------------------------------------------------------------------- /assignments/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | norecursedirs = .git data __pycache__ _build tmp* venv 3 | usefixtures = spark_context path_resultados 4 | -------------------------------------------------------------------------------- /assignments/requirements.txt: -------------------------------------------------------------------------------- 1 | attrs==17.4.0 2 | pluggy==0.6.0 3 | py==1.5.2 4 | pytest==3.4.0 5 | six==1.11.0 6 | -------------------------------------------------------------------------------- /assignments/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | readonly WORKDIR=/opt/tests/assignments 3 | docker run -v $(pwd):${WORKDIR} -w ${WORKDIR} -ti luisbelloch/spark-assignments pytest -v 4 | 5 | -------------------------------------------------------------------------------- /assignments/test_ejercicio_0.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from .contenedores import * 4 | 5 | def test_ejercicio_0_crea_secuencia_de_10_elementos(spark_context, path_resultados): 6 | resultado = ejercicio_0(spark_context, path_resultados) 7 | esperado = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] 8 | assert resultado == esperado 9 | 10 | -------------------------------------------------------------------------------- /assignments/test_ejercicio_1.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from .contenedores import * 4 | 5 | def test_ejercicio_1_cuenta_correctamente_el_numero_de_lineas(spark_context, path_resultados): 6 | resultado = ejercicio_1(spark_context, path_resultados) 7 | assert 614 == resultado 8 | 9 | -------------------------------------------------------------------------------- /assignments/test_ejercicio_2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from .contenedores import * 5 | 6 | @pytest.fixture(scope="session") 7 | def resultados(spark_context, path_resultados): 8 | return ejercicio_2(spark_context, path_resultados) 9 | 10 | def test_ejercicio_2_solo_quedan_dos_contenedores_despues_de_filtrar(resultados): 11 | assert 2 == len(resultados) 12 | 13 | def test_ejercicio_2_comprobar_que_las_matriculas_son_las_correctas(resultados): 14 | assert all([e[0] == 'DEJ1128330' for e in resultados]) 15 | assert 'GYFD1228113' in [e[4] for e in resultados] 16 | assert 'MBPF1909627' in [e[4] for e in resultados] 17 | 18 | def test_ejercicio_2_resultados_guardados(resultados, comprobar_hdfs): 19 | assert comprobar_hdfs(2) == True 20 | 21 | -------------------------------------------------------------------------------- /assignments/test_ejercicio_3.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from .contenedores import * 5 | from pyspark.sql import SQLContext 6 | 7 | def test_ejercicio_3_data_frame_tiene_613_filas(resultados_ejercicio_3): 8 | assert 613 == resultados_ejercicio_3.rdd.count() 9 | 10 | def 
test_ejercicio_3_data_frame_tiene_al_menos_una_fila_correcta(resultados_ejercicio_3): 11 | df = resultados_ejercicio_3 12 | assert 1 == df.filter(df.ship_imo == "JMP1637582").filter(df.container_id == "XPOG1294738").rdd.count() 13 | 14 | def test_ejercicio_3_resultados_guardados(resultados_ejercicio_3, comprobar_hdfs): 15 | assert comprobar_hdfs(3) == True 16 | 17 | def test_ejercicio_3_estructura_dataframe_correcta(resultados_ejercicio_3, tiene_columnas): 18 | tiene_columnas(resultados_ejercicio_3, ['contact', 'container_group', 'container_id', 'container_type', 'country', 'customs_ok', 'declared', 'departure', 'gross_weight', 'net_weight', 'owner', 'ship_imo', 'ship_name']) 19 | 20 | -------------------------------------------------------------------------------- /assignments/test_ejercicio_4.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from .contenedores import * 5 | from pyspark.sql import SQLContext 6 | 7 | @pytest.fixture(scope="session") 8 | def resultados(spark_context, path_resultados): 9 | return ejercicio_4(spark_context, path_resultados) 10 | 11 | def test_ejercicio_4_puede_filtrar_la_lista_de_contenedores(resultados): 12 | assert [row.ship_imo for row in resultados.rdd.collect()] == [ 13 | u'AEY1108363', 14 | u'AMC1861710', 15 | u'DEJ1128330', 16 | u'FUS1202266', 17 | u'GEU1548633', 18 | u'GLV1922612', 19 | u'GYR1192020', 20 | u'IWE1254579', 21 | u'JCI1797526', 22 | u'JET1053895', 23 | u'JMP1637582', 24 | u'KSP1096387', 25 | u'MBV1836745', 26 | u'NCZ1777367', 27 | u'NLH1771681', 28 | u'POG1615575', 29 | u'RYP1117603', 30 | u'SQH1155999', 31 | u'TCU1641123', 32 | u'YZX1455509'] 33 | 34 | def test_ejercicio_4_resultados_guardados(resultados, comprobar_hdfs): 35 | assert comprobar_hdfs(4) == True 36 | 37 | -------------------------------------------------------------------------------- /assignments/test_ejercicio_5.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from .contenedores import * 5 | 6 | @pytest.fixture(scope="session") 7 | def resultados(spark_context, path_resultados): 8 | return ejercicio_5(spark_context, path_resultados) 9 | 10 | def test_ejercicio_5_existen_605_contenedores(resultados): 11 | assert 605 == resultados.rdd.count() 12 | 13 | def test_ejercicio_5_al_menos_uno_de_los_contenedores_validos_existe_en_la_lista(resultados): 14 | assert any([e["propietario"] == "UFC" and e["numero_serie"] == "118653" for e in resultados.rdd.collect()]) 15 | 16 | def test_ejercicio_5_todos_los_contendores_invalidos_estan_excluidos(resultados): 17 | existentes = resultados.select(resultados["container_id"]).rdd.collect() 18 | excluidos = [u'GJFL14A2798', u'CTVU1506A832', u'IJWDR1216916', u'OKANR1240284', u'JMYG190Z978', u'DUKF166276', u''] 19 | assert all([(e not in excluidos) for e in existentes]) 20 | 21 | def test_ejercicio_5_resultados_guardados(resultados, comprobar_hdfs): 22 | assert comprobar_hdfs(5) == True 23 | 24 | def test_ejercicio_5_estructura_dataframe_correcta(resultados, tiene_columnas): 25 | tiene_columnas(resultados, ['categoria', 'container_id', 'digito_control', 'numero_serie', 'propietario', 'ship_imo']) 26 | 27 | -------------------------------------------------------------------------------- /assignments/test_ejercicio_6.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from .contenedores import * 5 | 6 | 
@pytest.fixture(scope="session") 7 | def resultados(spark_context, path_resultados): 8 | return ejercicio_6(spark_context, path_resultados) 9 | 10 | def test_ejercicio_6_existen_261_contenedores_agrupados(resultados): 11 | assert 261 == resultados.rdd.count() 12 | 13 | def test_ejercicio_6_al_menos_uno_de_los_contenedores_validos_existe_en_la_lista(resultados): 14 | esperados = [109383187.34, 14038620.92, 213307524.22, 26936712.06, 29567214.06, 36127305.83, 38100695.63, 57417325.75, 60934192.91, 723432237.28] 15 | assert sorted(esperados) == sorted([r["total_net_weight"] for r in resultados.rdd.collect() if r["ship_imo"] == u'GLV1922612'], key=float) 16 | 17 | def test_ejercicio_6_estructura_dataframe_correcta(resultados, tiene_columnas): 18 | tiene_columnas(resultados, ['container_group', 'ship_imo', 'ship_name', 'total_net_weight']) 19 | 20 | -------------------------------------------------------------------------------- /assignments/test_ejercicio_7.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from .contenedores import * 5 | 6 | @pytest.fixture(scope="session") 7 | def resultados(spark_context, path_resultados): 8 | return ejercicio_7(spark_context, path_resultados) 9 | 10 | def test_ejercicio_7_resultados_guardados(resultados, comprobar_hdfs): 11 | assert comprobar_hdfs(7) == True 12 | 13 | def test_ejercicio_7_estructura_dataframe_correcta(resultados, tiene_columnas): 14 | tiene_columnas(resultados, ['container_group', 'ship_imo', 'ship_name', 'total_net_weight']) 15 | 16 | -------------------------------------------------------------------------------- /beam/.gitignore: -------------------------------------------------------------------------------- 1 | out 2 | -------------------------------------------------------------------------------- /beam/basic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import absolute_import 4 | 5 | import argparse 6 | import logging 7 | 8 | import apache_beam as beam 9 | 10 | from apache_beam.options.pipeline_options import PipelineOptions 11 | from apache_beam.options.pipeline_options import SetupOptions 12 | 13 | def run(argv=None): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--input', dest='input') 16 | parser.add_argument('--output', dest='output') 17 | 18 | known_args, pipeline_args = parser.parse_known_args(argv) 19 | pipeline_args.extend(['--runner=DirectRunner']) 20 | pipeline_options = PipelineOptions(pipeline_args) 21 | pipeline_options.view_as(SetupOptions).save_main_session = True 22 | p = beam.Pipeline(options=pipeline_options) 23 | 24 | print("Input:", known_args.input) 25 | print("Output:", known_args.output) 26 | 27 | if __name__ == '__main__': 28 | logging.getLogger().setLevel(logging.DEBUG) 29 | run() 30 | 31 | -------------------------------------------------------------------------------- /beam/beam: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | if [[ $# -lt 1 ]]; then 5 | >&2 echo "USAGE: ./beam [SCRIPT_NAME]" 6 | >&2 echo "Sample: ./beam basic.py" 7 | exit 1 8 | fi 9 | 10 | readonly script=/opt/beam/$1 11 | readonly local_data=`cd "../data"; pwd` 12 | docker run --rm -v "${PWD}":/opt/beam -v "${local_data}":/data -ti luisbelloch/beam python ${script} ${@:2} 13 | 14 | -------------------------------------------------------------------------------- /beam/compras.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import 3 | 4 | import datetime 5 | import logging 6 | 7 | import apache_beam as beam 8 | 9 | logging.getLogger().setLevel(logging.INFO) 10 | 11 | def dump(line): 12 | logging.info(line) 13 | return line 14 | 15 | def isoDate(date): 16 | return datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ") 17 | 18 | p1 = beam.Pipeline() 19 | lines_collection = (p1 20 | | 'LecturaCompras' >> beam.io.ReadFromText("/data/compras_tiny.csv") 21 | | 'Split' >> beam.Map(lambda l: l.split("|")) 22 | | 'SkipHeader' >> beam.Filter(lambda l: l[0] != 'tx_id') 23 | | 'DosCampos' >> beam.Map(lambda f: { "tx_id": f[0], "tx_time": isoDate(f[1]) }) 24 | | 'DebugPrint' >> beam.Map(lambda x: dump(x))) 25 | 26 | p1.run().wait_until_finish() 27 | 28 | -------------------------------------------------------------------------------- /beam/compras_ptransform.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import 3 | 4 | import datetime 5 | import logging 6 | 7 | import apache_beam as beam 8 | 9 | # ./beam compras_ptransform.py 10 | # head ../data/compras_tiny.json/compras_tiny.json-00000-of-00018 11 | # {'tx_id': u'RHMLNJB157', 'tx_time': datetime.datetime(2010, 2, 3, 4, 12, 3)} 12 | # {'tx_id': u'VFJDQNX118', 'tx_time': datetime.datetime(2010, 10, 24, 3, 1, 9)} 13 | # {'tx_id': u'MYOIBZV163', 'tx_time': datetime.datetime(2010, 7, 26, 5, 23, 35)} 14 | 15 | logging.getLogger().setLevel(logging.INFO) 16 | 17 | def dump(line): 18 | logging.info(line) 19 | return line 20 | 21 | def isoDate(date): 22 | return datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ") 23 | 24 | class ParseCompras(beam.PTransform): 25 | def init(self): 26 | super(ParseCompras, self).__init__() 27 | 28 | def expand(self, pcol): 29 | return (pcol 30 | | 'SplitFields' >> beam.Map(lambda l: l.split("|")) 31 | | 'SkipHeader' >> beam.Filter(lambda l: l[0] != 'tx_id') 32 | | 'DosCampos' >> beam.Map(lambda f: { "tx_id": f[0], "tx_time": isoDate(f[1]) })) 33 | 34 | p1 = beam.Pipeline() 35 | lines_collection = (p1 36 | | 'LecturaCompras' >> beam.io.ReadFromText("/data/compras_tiny.csv") 37 | | ParseCompras() 38 | # | 'DebugPrint' >> beam.Map(lambda x: dump(x)) 39 | | 'Write' >> beam.io.WriteToText('/data/compras_tiny.json/compras_tiny.json')) 40 | 41 | p1.run().wait_until_finish() 42 | 43 | -------------------------------------------------------------------------------- /beam/compras_ptransform_condensed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import 3 | 4 | import datetime 5 | import json 6 | import logging 7 | 8 | import apache_beam as beam 9 | 10 | # ./beam compras_ptransform_condensed.py 11 | # head ../data/compras_tiny.json/compras_tiny.json-00000-of-00018 12 | # {'tx_id': u'RHMLNJB157', 'tx_time': datetime.datetime(2010, 2, 3, 4, 12, 3)} 13 | # {'tx_id': u'VFJDQNX118', 'tx_time': datetime.datetime(2010, 10, 24, 3, 1, 9)} 14 | # {'tx_id': u'MYOIBZV163', 'tx_time': datetime.datetime(2010, 7, 26, 5, 23, 35)} 15 | 16 | logging.getLogger().setLevel(logging.INFO) 17 | 18 | class DateTimeEncoder(json.JSONEncoder): 19 | def default(self, target): 20 | if isinstance(target, datetime.datetime): 21 | return target.isoformat() 22 | return json.JSONEncoder.default(self, target) 23 | 24 | class JsonCoder(object): 25 | def 
encode(self, x): 26 | return json.dumps(x, cls=DateTimeEncoder) 27 | 28 | def decode(self, x): 29 | return json.loads(x) 30 | 31 | def dump(line): 32 | logging.info(line) 33 | return line 34 | 35 | def isoDate(date): 36 | return datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ") 37 | 38 | @beam.ptransform_fn 39 | def ParseCompras(pcol): 40 | return (pcol 41 | | 'SplitFields' >> beam.Map(lambda l: l.split("|")) 42 | | 'SkipHeader' >> beam.Filter(lambda l: l[0] != 'tx_id') 43 | | 'Struct' >> beam.Map(lambda f: { "tx_id": f[0], "tx_time": isoDate(f[1]), "amount": float(f[9]) })) 44 | 45 | p1 = beam.Pipeline() 46 | lines_collection = (p1 47 | | 'LecturaCompras' >> beam.io.ReadFromText("/data/compras_tiny.csv") 48 | | ParseCompras() 49 | # | 'DebugPrint' >> beam.Map(lambda x: dump(x)) 50 | | 'Write' >> beam.io.WriteToText('/data/compras_tiny.json/compras_tiny.json', coder=JsonCoder())) 51 | 52 | p1.run().wait_until_finish() 53 | 54 | -------------------------------------------------------------------------------- /beam/compras_totales_por_pais.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import 3 | 4 | import datetime 5 | import logging 6 | 7 | import apache_beam as beam 8 | 9 | logging.getLogger().setLevel(logging.INFO) 10 | 11 | def dump(line): 12 | logging.info(line) 13 | return line 14 | 15 | def isoDate(date): 16 | return datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ") 17 | 18 | @beam.ptransform_fn 19 | def ParseCompras(pcol): 20 | return (pcol 21 | | 'SplitFields' >> beam.Map(lambda l: l.split("|")) 22 | | 'SkipHeader' >> beam.Filter(lambda l: l[0] != 'tx_id') 23 | | 'Struct' >> beam.Map(lambda f: (f[3], float(f[9])))) 24 | 25 | p = beam.Pipeline() 26 | compras = (p 27 | | beam.io.ReadFromText("/data/compras_tiny.csv") 28 | | ParseCompras()) 29 | 30 | totales = (compras | beam.CombinePerKey(sum)) 31 | cuentas = (compras | beam.combiners.Count.PerKey()) 32 | 33 | ({ "total": totales, "cuenta": cuentas} 34 | | 'Join' >> beam.CoGroupByKey() 35 | | 'Flatten' >> beam.Map(lambda e: "%s|%f|%d" % (e[0], e[1]["total"][0], e[1]["cuenta"][0])) 36 | | 'Dump' >> beam.Map(lambda x: dump(x)) 37 | | 'Write' >> beam.io.WriteToText('/data/compras_totales_por_pais')) 38 | 39 | p.run().wait_until_finish() 40 | 41 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | containers_partitioned 2 | peliculas0_trained_model 3 | compras_tiny.json 4 | compras_totales_por_pais-00000-of-00001 5 | 6 | -------------------------------------------------------------------------------- /data/Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | gem "faker" 3 | -------------------------------------------------------------------------------- /data/Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | concurrent-ruby (1.0.5) 5 | faker (1.8.7) 6 | i18n (>= 0.7) 7 | i18n (0.9.3) 8 | concurrent-ruby (~> 1.0) 9 | 10 | PLATFORMS 11 | ruby 12 | 13 | DEPENDENCIES 14 | faker 15 | 16 | BUNDLED WITH 17 | 1.16.1 18 | -------------------------------------------------------------------------------- /data/containers_tiny.parquet/.part-r-00000-699887c3-5b1f-4afb-ae91-da7750b810c7.gz.parquet.crc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/luisbelloch/data_processing_course/0ab2e59f1e3bf3b16fa5440002b4b772b06ee80b/data/containers_tiny.parquet/.part-r-00000-699887c3-5b1f-4afb-ae91-da7750b810c7.gz.parquet.crc -------------------------------------------------------------------------------- /data/containers_tiny.parquet/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luisbelloch/data_processing_course/0ab2e59f1e3bf3b16fa5440002b4b772b06ee80b/data/containers_tiny.parquet/_SUCCESS -------------------------------------------------------------------------------- /data/containers_tiny.parquet/_common_metadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luisbelloch/data_processing_course/0ab2e59f1e3bf3b16fa5440002b4b772b06ee80b/data/containers_tiny.parquet/_common_metadata -------------------------------------------------------------------------------- /data/containers_tiny.parquet/_metadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luisbelloch/data_processing_course/0ab2e59f1e3bf3b16fa5440002b4b772b06ee80b/data/containers_tiny.parquet/_metadata -------------------------------------------------------------------------------- /data/containers_tiny.parquet/part-r-00000-699887c3-5b1f-4afb-ae91-da7750b810c7.gz.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luisbelloch/data_processing_course/0ab2e59f1e3bf3b16fa5440002b4b772b06ee80b/data/containers_tiny.parquet/part-r-00000-699887c3-5b1f-4afb-ae91-da7750b810c7.gz.parquet -------------------------------------------------------------------------------- /data/country_codes.csv: -------------------------------------------------------------------------------- 1 | Afghanistan,AF 2 | Åland Islands,AX 3 | Albania,AL 4 | Algeria,DZ 5 | American Samoa,AS 6 | Andorra,AD 7 | Angola,AO 8 | Anguilla,AI 9 | Antarctica,AQ 10 | Antigua and Barbuda,AG 11 | Argentina,AR 12 | Armenia,AM 13 | Aruba,AW 14 | Australia,AU 15 | Austria,AT 16 | Azerbaijan,AZ 17 | Bahamas,BS 18 | Bahrain,BH 19 | Bangladesh,BD 20 | Barbados,BB 21 | Belarus,BY 22 | Belgium,BE 23 | Belize,BZ 24 | Benin,BJ 25 | Bermuda,BM 26 | Bhutan,BT 27 | Bolivia,BO 28 | Bonaire,BQ 29 | Bosnia and Herzegovina,BA 30 | Botswana,BW 31 | Bouvet Island,BV 32 | Brazil,BR 33 | British Indian Ocean Territory,IO 34 | Brunei Darussalam,BN 35 | Bulgaria,BG 36 | Burkina Faso,BF 37 | Burundi,BI 38 | Cambodia,KH 39 | Cameroon,CM 40 | Canada,CA 41 | Cape Verde,CV 42 | Cayman Islands,KY 43 | Central African Republic,CF 44 | Chad,TD 45 | Chile,CL 46 | China,CN 47 | Christmas Island,CX 48 | Cocos (Keeling) Islands,CC 49 | Colombia,CO 50 | Comoros,KM 51 | Congo,CG 52 | Congo,CD 53 | Cook Islands,CK 54 | Costa Rica,CR 55 | Côte d'Ivoire,CI 56 | Croatia,HR 57 | Cuba,CU 58 | Curaçao,CW 59 | Cyprus,CY 60 | Czech Republic,CZ 61 | Denmark,DK 62 | Djibouti,DJ 63 | Dominica,DM 64 | Dominican Republic,DO 65 | Ecuador,EC 66 | Egypt,EG 67 | El Salvador,SV 68 | Equatorial Guinea,GQ 69 | Eritrea,ER 70 | Estonia,EE 71 | Ethiopia,ET 72 | Falkland Islands (Malvinas),FK 73 | Faroe Islands,FO 74 | Fiji,FJ 75 | Finland,FI 76 | France,FR 77 | French Guiana,GF 78 | French Polynesia,PF 79 | French Southern Territories,TF 80 | Gabon,GA 81 | Gambia,GM 82 | Georgia,GE 83 | Germany,DE 84 
| Ghana,GH 85 | Gibraltar,GI 86 | Greece,GR 87 | Greenland,GL 88 | Grenada,GD 89 | Guadeloupe,GP 90 | Guam,GU 91 | Guatemala,GT 92 | Guernsey,GG 93 | Guinea,GN 94 | Guinea-Bissau,GW 95 | Guyana,GY 96 | Haiti,HT 97 | Heard Island and McDonald Islands,HM 98 | Holy See (Vatican City State),VA 99 | Honduras,HN 100 | Hong Kong,HK 101 | Hungary,HU 102 | Iceland,IS 103 | India,IN 104 | Indonesia,ID 105 | Iran,IR 106 | Iraq,IQ 107 | Ireland,IE 108 | Isle of Man,IM 109 | Israel,IL 110 | Italy,IT 111 | Jamaica,JM 112 | Japan,JP 113 | Jersey,JE 114 | Jordan,JO 115 | Kazakhstan,KZ 116 | Kenya,KE 117 | Kiribati,KI 118 | Korea,KP 119 | Korea,KR 120 | Kuwait,KW 121 | Kyrgyzstan,KG 122 | Lao People's Democratic Republic,LA 123 | Latvia,LV 124 | Lebanon,LB 125 | Lesotho,LS 126 | Liberia,LR 127 | Libya,LY 128 | Liechtenstein,LI 129 | Lithuania,LT 130 | Luxembourg,LU 131 | Macao,MO 132 | Macedonia,MK 133 | Madagascar,MG 134 | Malawi,MW 135 | Malaysia,MY 136 | Maldives,MV 137 | Mali,ML 138 | Malta,MT 139 | Marshall Islands,MH 140 | Martinique,MQ 141 | Mauritania,MR 142 | Mauritius,MU 143 | Mayotte,YT 144 | Mexico,MX 145 | Micronesia,FM 146 | Moldova,MD 147 | Monaco,MC 148 | Mongolia,MN 149 | Montenegro,ME 150 | Montserrat,MS 151 | Morocco,MA 152 | Mozambique,MZ 153 | Myanmar,MM 154 | Namibia,NA 155 | Nauru,NR 156 | Nepal,NP 157 | Netherlands,NL 158 | New Caledonia,NC 159 | New Zealand,NZ 160 | Nicaragua,NI 161 | Niger,NE 162 | Nigeria,NG 163 | Niue,NU 164 | Norfolk Island,NF 165 | Northern Mariana Islands,MP 166 | Norway,NO 167 | Oman,OM 168 | Pakistan,PK 169 | Palau,PW 170 | Palestine,PS 171 | Panama,PA 172 | Papua New Guinea,PG 173 | Paraguay,PY 174 | Peru,PE 175 | Philippines,PH 176 | Pitcairn,PN 177 | Poland,PL 178 | Portugal,PT 179 | Puerto Rico,PR 180 | Qatar,QA 181 | Réunion,RE 182 | Romania,RO 183 | Russian Federation,RU 184 | Rwanda,RW 185 | Saint Barthélemy,BL 186 | Saint Helena,SH 187 | Saint Kitts and Nevis,KN 188 | Saint Lucia,LC 189 | Saint Martin (French part),MF 190 | Saint Pierre and Miquelon,PM 191 | Saint Vincent and the Grenadines,VC 192 | Samoa,WS 193 | San Marino,SM 194 | Sao Tome and Principe,ST 195 | Saudi Arabia,SA 196 | Senegal,SN 197 | Serbia,RS 198 | Seychelles,SC 199 | Sierra Leone,SL 200 | Singapore,SG 201 | Sint Maarten (Dutch part),SX 202 | Slovakia,SK 203 | Slovenia,SI 204 | Solomon Islands,SB 205 | Somalia,SO 206 | South Africa,ZA 207 | South Georgia and the South Sandwich Islands,GS 208 | South Sudan,SS 209 | Spain,ES 210 | Sri Lanka,LK 211 | Sudan,SD 212 | Suriname,SR 213 | Svalbard and Jan Mayen,SJ 214 | Swaziland,SZ 215 | Sweden,SE 216 | Switzerland,CH 217 | Syrian Arab Republic,SY 218 | Taiwan,TW 219 | Tajikistan,TJ 220 | Tanzania,TZ 221 | Thailand,TH 222 | Timor-Leste,TL 223 | Togo,TG 224 | Tokelau,TK 225 | Tonga,TO 226 | Trinidad and Tobago,TT 227 | Tunisia,TN 228 | Turkey,TR 229 | Turkmenistan,TM 230 | Turks and Caicos Islands,TC 231 | Tuvalu,TV 232 | Uganda,UG 233 | Ukraine,UA 234 | United Arab Emirates,AE 235 | United Kingdom,GB 236 | United States,US 237 | United States Minor Outlying Islands,UM 238 | Uruguay,UY 239 | Uzbekistan,UZ 240 | Vanuatu,VU 241 | Venezuela,VE 242 | Viet Nam,VN 243 | Virgin Islands,VG 244 | Wallis and Futuna,WF 245 | Western Sahara,EH 246 | Yemen,YE 247 | Zambia,ZM 248 | Zimbabwe,ZW 249 | -------------------------------------------------------------------------------- /data/exchange_rates_usd.json: -------------------------------------------------------------------------------- 1 | 
{"base":"USD","date":"2016-02-05","rates":{"AUD":1.3911,"BGN":1.7459,"BRL":3.8791,"CAD":1.3751,"CHF":0.99098,"CNY":6.5724,"CZK":24.136,"DKK":6.6621,"GBP":0.68715,"HKD":7.7871,"HRK":6.8327,"HUF":276.69,"IDR":13549.0,"ILS":3.8756,"INR":67.654,"JPY":116.68,"KRW":1190.8,"MXN":18.21,"MYR":4.1535,"NOK":8.54,"NZD":1.4868,"PHP":47.555,"PLN":3.9398,"RON":4.0196,"RUB":76.776,"SEK":8.4204,"SGD":1.3958,"THB":35.51,"TRY":2.9011,"ZAR":15.853,"EUR":0.8927}} -------------------------------------------------------------------------------- /data/iso-container-codes.csv: -------------------------------------------------------------------------------- 1 | code,description,length,height,group 2 | 22B0,Bulk,20,8.5,22B0 3 | 22B1,Dry Bulk,20,8.5,22B0 4 | 22B3,Dry Bulk,20,8.5,22B0 5 | 22B4,Dry Bulk,20,8.5,22B0 6 | 22B5,Dry Bulk,20,8.5,22B0 7 | 22B6,Dry Bulk,20,8.5,22B0 8 | 22BK,Dry Bulk,20,8.5,22B0 9 | 2080,Dry Bulk,20,8,22B0 10 | 20B0,Dry Bulk,20,8,22B0 11 | 20B1,Dry Bulk,20,8,22B0 12 | 20B3,Dry Bulk,20,8,22B0 13 | 20B4,Dry Bulk,20,8,22B0 14 | 20B5,Dry Bulk,20,8,22B0 15 | 20B6,Dry Bulk,20,8,22B0 16 | 20BK,Dry Bulk,20,8,22B0 17 | 20BU,Dry Bulk,20,8,22B0 18 | 2280,Dry Bulk,20,8.5,22B0 19 | 2281,Dry Bulk,20,8.5,22B0 20 | 2299,Air/Surface,20,8.5,22B0 21 | 22BU,Dry Bulk,20,8.5,22B0 22 | 22G0,Standard Dry,20,8.5,22G0 23 | 22G1,Standard Dry,20,8.5,22G0 24 | 22G2,Standard Dry,20,8.5,22G0 25 | 22G3,Standard Dry,20,8.5,22G0 26 | 22V3,Standard Dry,20,8.5,22G0 27 | 2300,Standard Dry,20,8.5,22G0 28 | 2301,Standard Dry,20,8.5,22G0 29 | 2302,Standard Dry,20,8.5,22G0 30 | 2303,Standard Dry,20,8.5,22G0 31 | 2304,Standard Dry,20,8.5,22G0 32 | 2410,HIGH CUBE,20,9.5,22G0 33 | 24G0,Standard Dry,20,9,22G0 34 | 24G1,Standard Dry,20,9,22G0 35 | 24G2,Standard Dry,20,9,22G0 36 | 24G3,Standard Dry,20,9,22G0 37 | 24GP,Standard Dry,20,9,22G0 38 | 2500,Standard Dry,20,8.5,22G0 39 | 25G0,Standard Dry High Cube,20,9,22G0 40 | 2600,Standard Dry,20,4.25,22G0 41 | 26G0,Standard Dry,20,9.5,22G0 42 | 26G1,Standard Dry,20,9.5,22G0 43 | 26G2,Standard Dry,20,9.5,22G0 44 | 26G3,Standard Dry,20,9.5,22G0 45 | 26GP,Standard Dry,20,9.5,22G0 46 | 28G0,Standard Dry,20,4.25,22G0 47 | 28GP,Standard Dry,20,4.25,22G0 48 | 28U1,BIN HALF HEIGHT (OPEN TOP),20,4.25,22G0 49 | 28U2,"OPENING(S) AT ONE OR BOTH ENDS, PLUS REMV TOP MEMB",20,8.5,22G0 50 | 28UT,"OPENING(S) AT ONE OR BOTH ENDS, PLUS REMV TOP MEMB",20,8.5,22G0 51 | 2994,Air/Surface,20,4,22G0 52 | 2999,SLIDER CHASSIS,20,0,22G0 53 | 3000,Standard Dry,30,8,22G0 54 | 30G0,DRY CARGO/GENERAL PURPOSE,30,8,22G0 55 | 3200,Standard Dry,30,8.5,22G0 56 | 32G0,DRY CARGO/GENERAL PURPOSE,30,8.5,22G0 57 | 3399,TRIAXLE CHASSIS,23,0,22G0 58 | 7999,SLIDER CHASSIS,20,0,22G0 59 | B2G1,PASSIVE VENTS AT UPPER PART OF CARGO SPACE,24,8.5,22G0 60 | 1000,Standard Dry,10,8,22G0 61 | 10G0,DRY CARGO/GENERAL PURPOSE,10,8,22G0 62 | 1200,Standard Dry,10,8.5,22G0 63 | 12G0,DRY CARGO/GENERAL PURPOSE,10,8.5,22G0 64 | 2000,Standard Dry,20,8,22G0 65 | 2001,Standard Dry,20,8,22G0 66 | 2002,Standard Dry,20,8,22G0 67 | 2003,Standard Dry,20,8,22G0 68 | 2004,Standard Dry,20,8,22G0 69 | 2025,Livestock Carrier,20,8,22G0 70 | 20G0,Standard Dry,20,8,22G0 71 | 20G1,Standard Dry,20,8,22G0 72 | 20G2,Standard Dry,20,8,22G0 73 | 20G3,Standard Dry,20,8,22G0 74 | 20GP,Standard Dry,20,8,22G0 75 | 2101,Standard Dry,20,8,22G0 76 | 2102,Standard Dry,20,8,22G0 77 | 2103,Standard Dry,20,8,22G0 78 | 2104,Standard Dry,20,8,22G0 79 | 2125,Livestock Carrier,20,8,22G0 80 | 2200,Standard Dry,20,8.5,22G0 81 | 2201,Standard Dry,20,8.5,22G0 82 | 2202,Standard Dry,20,8.5,22G0 83 
| 2204,Standard Dry,20,8.5,22G0 84 | 2205,Standard Dry,20,8.5,22G0 85 | 2210,Standard Dry,20,8.5,22G0 86 | 2213,Standard Dry,20,8.5,22G0 87 | 2225,Livestock Carrier,20,8.5,22G0 88 | 22GP,Standard Dry,20,8.5,22G0 89 | 2212,General Purpose (Hanging Garments),20,8.5,22G0 90 | 25GP,High Cube,20,9.6,22G0 91 | 22H0,Insulated (Conair),20,8.5,22H0 92 | 22H1,Thermal Refrigerated/Heated,20,8.5,22H0 93 | 22H2,Thermal Refrigerated/Heated,20,8.5,22H0 94 | 22H5,Thermal Insulated,20,8.5,22H0 95 | 22H6,Thermal Insulated,20,8.5,22H0 96 | 22HI,Thermal Refrigerated/Heated,20,8.5,22H0 97 | 24H5,Thermal Insulated,20,9,22H0 98 | 24H6,Thermal Insulated,20,9,22H0 99 | 2020,Thermal Insulated,20,8,22H0 100 | 20H0,Thermal Refrigerated/Heated,20,8,22H0 101 | 20H1,Thermal Refrigerated/Heated,20,8,22H0 102 | 20H2,Thermal Refrigerated/Heated,20,8,22H0 103 | 20H5,Thermal Insulated,20,8,22H0 104 | 20H6,Thermal Insulated,20,8,22H0 105 | 20HI,Thermal Refrigerated/Heated,20,8,22H0 106 | 20HR,Thermal Refrigerated/Heated,20,8,22H0 107 | 2220,Thermal Insulated,20,8.5,22H0 108 | 2224,Insulated,20,8.5,22H0 109 | 22HR,Thermal Refrigerated/Heated,20,8.5,22H0 110 | 22P1,Flat Rack,20,8.5,22P1 111 | 22P2,Platform,20,8.5,22P1 112 | 22P4,Platform,20,8.5,22P1 113 | 22P5,Platform,20,8.5,22P1 114 | 22P7,PLATFORM FIXED,20,8.5,22P1 115 | 22P8,Platform,20,8.5,22P1 116 | 22P9,Platform,20,8.5,22P1 117 | 22PL,Platform,20,8.5,22P1 118 | 22PS,Platform,20,8.5,22P1 119 | 2361,Platform,20,8.5,22P1 120 | 2362,Platform,20,8.5,22P1 121 | 2363,Platform,20,8.5,22P1 122 | 2364,Platform,20,8.5,22P1 123 | 2365,Platform,20,8.5,22P1 124 | 2366,Platform,20,8.5,22P1 125 | 2367,Platform,20,8.5,22P1 126 | 2651,Open Top,20,4.25,22P1 127 | 2661,Platform,20,4.25,22P1 128 | 2761,Platform,20,4.25,22P1 129 | 2063,Flat,20,8,22P1 130 | 2066,Platform,20,8,22P1 131 | 2067,Platform,20,8,22P1 132 | 20P2,Platform,20,8,22P1 133 | 20P4,Platform,20,8,22P1 134 | 20P5,Platform,20,8,22P1 135 | 20PC,Platform,20,8,22P1 136 | 20PF,Platform,20,8,22P1 137 | 20PL,Platform,20,8,22P1 138 | 20PS,Platform,20,8,22P1 139 | 2160,Flat,20,8,22P1 140 | 2161,Platform,20,8,22P1 141 | 2162,Platform,20,8,22P1 142 | 2163,Platform,20,8,22P1 143 | 2164,Platform,20,8,22P1 144 | 2165,Platform,20,8,22P1 145 | 2166,Platform,20,8,22P1 146 | 2167,Platform,20,8,22P1 147 | 2260,Flat,20,8.5,22P1 148 | 2261,Flat,20,8.5,22P1 149 | 2262,Platform,20,8.5,22P1 150 | 2265,Platform,20,8.5,22P1 151 | 2266,Platform,20,8.5,22P1 152 | 2267,Platform,20,8.5,22P1 153 | 22PF,Platform,20,8.5,22P1 154 | 22PC,Platform,20,8.5,22P1 155 | 8888,Uncontainerised,0,0,8888 156 | 22P3,Collapsible Flat Rack,20,8.5,22P3 157 | 20P3,Platform,20,8,22P3 158 | 2263,Flat,20,8.5,22P3 159 | 2264,Platform,20,8.5,22P3 160 | 22R1,Reefer,20,8.5,22R1 161 | 22R2,Thermal Refrigerated/Heated,20,8.5,22R1 162 | 22R3,Thermal Refrigerated/Heated,20,8.5,22R1 163 | 22R9,Thermal Refrigerated/Heated,20,8.5,22R1 164 | 22RC,Thermal Refrigerated/Heated,20,8.5,22R1 165 | 22RE,Thermal Refrigerated,20,8.5,22R1 166 | 22RS,Thermal Refrigerated/Heated,20,8.5,22R1 167 | 2330,Thermal Refrigerated,20,8.5,22R1 168 | 2331,Thermal Refrigerated,20,8.5,22R1 169 | 2332,Thermal Refrigerated/Heated,20,8.5,22R1 170 | 2432,Thermal Refrigerated/Heated,20,9,22R1 171 | 24H0,Thermal Refrigerated/Heated,20,9,22R1 172 | 24H1,Thermal Refrigerated/Heated,20,9,22R1 173 | 24H2,Thermal Refrigerated/Heated,20,9,22R1 174 | 24HI,Thermal Refrigerated/Heated,20,9,22R1 175 | 24HR,Thermal Refrigerated/Heated,20,9,22R1 176 | 24R0,Thermal Refrigerated/Heated,20,9,22R1 177 | 24R1,Thermal 
Refrigerated/Heated,20,9,22R1 178 | 24R2,Thermal Refrigerated/Heated,20,9,22R1 179 | 24R3,Thermal Refrigerated/Heated,20,9,22R1 180 | 24RE,Thermal Refrigerated,20,9,22R1 181 | 24RS,Thermal Refrigerated/Heated,20,9,22R1 182 | 24RT,Thermal Refrigerated/Heated,20,9,22R1 183 | 2030,Thermal Refrigerated,20,8,22R1 184 | 2031,Thermal Refrigerated,20,8,22R1 185 | 2032,Thermal Refrigerated/Heated,20,8,22R1 186 | 2040,Thermal Refrigerated,20,8,22R1 187 | 2041,Thermal Refrigerated,20,8,22R1 188 | 2042,Thermal Refrigerated,20,8,22R1 189 | 2043,Thermal Refrigerated,20,8,22R1 190 | 20R0,Thermal Refrigerated,20,8,22R1 191 | 20R1,Thermal Refrigerated/Heated,20,8,22R1 192 | 20R2,Thermal Refrigerated/Heated,20,8,22R1 193 | 20R3,Thermal Refrigerated/Heated,20,8,22R1 194 | 20RE,Thermal Refrigerated,20,8,22R1 195 | 20RS,Thermal Refrigerated/Heated,20,8,22R1 196 | 20RT,Thermal Refrigerated/Heated,20,8,22R1 197 | 2130,Thermal Refrigerated,20,8,22R1 198 | 2131,Thermal Refrigerated,20,8,22R1 199 | 2132,Thermal Refrigerated,20,8,22R1 200 | 2230,Thermal Refrigerated,20,8.5,22R1 201 | 2231,Thermal Refrigerated,20,8.5,22R1 202 | 2232,Thermal Refrigerated/Heated,20,8.5,22R1 203 | 2240,Thermal Refrigerated,20,8.5,22R1 204 | 2242,Thermal Refrigerated,20,8.5,22R1 205 | 22R0,Thermal Refrigerated/Heated,20,8.5,22R1 206 | 22RT,Thermal Refrigerated/Heated,20,8.5,22R1 207 | 2234,"Thermal containers, Heated",20,8.5,22R1 208 | 22T0,Tank,20,8.5,22T0 209 | 22T1,Tank,20,8.5,22T0 210 | 22T2,Tank,20,8.5,22T0 211 | 22T3,Tank,20,8.5,22T0 212 | 22T4,Tank,20,8.5,22T0 213 | 22T5,Tank,20,8.5,22T0 214 | 22T6,Tank,20,8.5,22T0 215 | 22T7,Tank,20,8.5,22T0 216 | 22T8,Tank,20,8.5,22T0 217 | 22T9,Tank,20,8.5,22T0 218 | 22TD,Tank,20,8.5,22T0 219 | 22TG,Tank,20,8.5,22T0 220 | 2670,Tank,20,4.25,22T0 221 | 2671,Tank,20,4.25,22T0 222 | 2870,HALF HEIGHT THERMAL TANK,20,0,22T0 223 | 2070,Tank,20,8,22T0 224 | 2071,Tank,20,8,22T0 225 | 2072,Tank,20,8,22T0 226 | 2073,Tank,20,8,22T0 227 | 2074,Tank,20,8,22T0 228 | 2075,Tank,20,8,22T0 229 | 2076,Tank,20,8,22T0 230 | 2077,Tank,20,8,22T0 231 | 2078,Tank,20,8,22T0 232 | 2079,Tank,20,8,22T0 233 | 20T0,Tank,20,8,22T0 234 | 20T1,Tank,20,8,22T0 235 | 20T2,Tank,20,8,22T0 236 | 20T3,Tank,20,8,22T0 237 | 20T4,Tank,20,8,22T0 238 | 20T5,Tank,20,8,22T0 239 | 20T6,Tank,20,8,22T0 240 | 20T7,Tank,20,8,22T0 241 | 20T8,Tank,20,8,22T0 242 | 20T9,Tank,20,8,22T0 243 | 20TD,Tank,20,8,22T0 244 | 20TG,Tank,20,8,22T0 245 | 20TN,Tank,20,8,22T0 246 | 2270,Tank,20,8.5,22T0 247 | 2271,Tank,20,8.5,22T0 248 | 2272,Tank,20,8.5,22T0 249 | 2273,Tank,20,8.5,22T0 250 | 2274,Tank,20,8.5,22T0 251 | 2275,Tank,20,8.5,22T0 252 | 2276,Tank,20,8.5,22T0 253 | 2277,Tank,20,8.5,22T0 254 | 2278,Tank,20,8.5,22T0 255 | 2279,Tank,20,8.5,22T0 256 | 22TN,Tank,20,8.5,22T0 257 | 22U1,Open Top,20,8.5,22U1 258 | 22U2,Open Top,20,8.5,22U1 259 | 22U3,Open Top,20,8.5,22U1 260 | 22U4,Open Top,20,8.5,22U1 261 | 22U5,Open Top,20,8.5,22U1 262 | 22U6,Standard Dry,20,8.5,22U1 263 | 2650,Open Top,0,4.25,22U1 264 | 2750,Open Top,20,4.25,22U1 265 | 2770,Tank,20,4.25,22U1 266 | 2771,Tank,20,4.25,22U1 267 | 2851,HALF OPEN TOP,20,0,22U1 268 | 2050,Open Top,20,8,22U1 269 | 2051,Open Top,20,8,22U1 270 | 2052,Open Top,20,8,22U1 271 | 2053,Open Top,20,8,22U1 272 | 20U0,Open Top,20,8,22U1 273 | 20U1,Open Top,20,8,22U1 274 | 20U2,Open Top,20,8,22U1 275 | 20U3,Open Top,20,8,22U1 276 | 20U4,Open Top,20,8,22U1 277 | 20U5,Open Top,20,8,22U1 278 | 20UT,Open Top,20,8,22U1 279 | 2150,Open Top,20,8,22U1 280 | 2203,Standard Dry,20,8.5,22U1 281 | 2250,Open Top,20,8.5,22U1 282 | 2251,Open 
Top,20,8.5,22U1 283 | 2252,Open Top,20,8.5,22U1 284 | 2253,Open Top,20,8.5,22U1 285 | 2259,Open Top,20,8.5,22U1 286 | 22U0,Open Top,20,8.5,22U1 287 | 22UT,Open Top,20,8.5,22U1 288 | 22UP,Hard Top,20,8.5,22UP 289 | 22V0,Closed Vented,20,8.5,22VH 290 | 22V2,Closed Vented,20,8.5,22VH 291 | 22V4,Closed Vented,20,8.5,22VH 292 | 22VH,Ventilated,20,8.5,22VH 293 | 28VH,Vented,20,4.75,22VH 294 | 28VO,Vented,20,4.75,22VH 295 | 2010,Closed Vented,20,8,22VH 296 | 2011,Closed Vented,20,8,22VH 297 | 2013,Closed Ventilated,20,8,22VH 298 | 2015,Closed Ventilated,20,8,22VH 299 | 2017,Closed Ventilated,20,8,22VH 300 | 20V0,Closed Vented,20,8,22VH 301 | 20V2,Closed Vented,20,8,22VH 302 | 20V4,Closed Vented,20,8,22VH 303 | 20VH,Closed Vented,20,8,22VH 304 | 2113,Closed Ventilated,20,8,22VH 305 | 2117,Closed Ventilated,20,8,22VH 306 | 2211,Closed Vented,20,8.5,22VH 307 | 2215,Closed Ventilated,20,8.5,22VH 308 | 2216,Closed Ventilated,20,8.5,22VH 309 | 2217,Closed Ventilated,20,8.5,22VH 310 | 29P0,Platform,20,1,29P0 311 | 29P1,PLATFORM (CONTAINER),20,4,29P0 312 | 2060,Platform,20,8,29P0 313 | 2061,Platform,20,8,29P0 314 | 2062,Platform,20,8,29P0 315 | 2064,Platform,20,8,29P0 316 | 2065,Platform,20,8,29P0 317 | 20P0,Platform,20,8,29P0 318 | 20P1,Platform,20,8,29P0 319 | 22P0,Platform,20,8.5,29P0 320 | 2760,Platform,20,4.25,29P0 321 | 2960,Platform,20,4,29P0 322 | 2969,Platform,20,4,29P0 323 | 29PL,PLATFORM (CONTAINER),20,1,29P0 324 | 42G0,Standard Dry,40,8.5,42G0 325 | 42G1,Standard Dry,40,8.5,42G0 326 | 42G2,Standard Dry,40,8.5,42G0 327 | 42G3,Standard Dry,40,8.5,42G0 328 | 4300,Standard Dry,40,8.5,42G0 329 | 4301,Standard Dry,40,8.5,42G0 330 | 4302,Standard Dry,40,8.5,42G0 331 | 4303,Standard Dry,40,8.5,42G0 332 | 4304,Standard Dry,40,8.5,42G0 333 | 4305,Standard Dry,40,8.5,42G0 334 | 4310,Standard Dry,40,8.5,42G0 335 | 4311,Closed Vented,40,8.5,42G0 336 | 4312,General Purpose (Hanging Garments),40,8.5,42G0 337 | 4313,VENTILATED,40,0,42G0 338 | 4315,Closed Ventilated,40,8.5,42G0 339 | 4325,Livestock Carrier,40,8.5,42G0 340 | 4326,Automobile Carrier,40,8.5,42G0 341 | 4380,Dry Bulk,40,8.5,42G0 342 | 44G0,Standard Dry,40,9,42G0 343 | 44G1,Standard Dry,40,9,42G0 344 | 44G2,Standard Dry,40,9,42G0 345 | 44G3,Standard Dry,40,9,42G0 346 | 44GP,Standard Dry,40,9,42G0 347 | 4595,Air/Surface,40,8.5,42G0 348 | 4599,Air/Surface,40,9,42G0 349 | 4651,HALF HIGH,40,0,42G0 350 | 4699,Air/Surface,40,4.25,42G0 351 | 4886,Dry Bulk,40,4.25,42G0 352 | 48UI,HALF HEIGHT (OPEN TOP),40,4.25,42G0 353 | 4994,Air/Surface,40,8.5,42G0 354 | 4999,GOOSENECK CHASSIS,40,0,42G0 355 | 4CG0,OPENING(S) AT ONE OR BOTH ENDS,40,8.5,42G0 356 | 4CGP,OPENING(S) AT ONE OR BOTH ENDS,40,8.5,42G0 357 | 8500,Standard Dry,35,8.5,42G0 358 | 8599,Air/Surface,35,8.5,42G0 359 | 9995,Air/Surface,40,4,42G0 360 | 9998,Air/Surface,40,4,42G0 361 | 9999,Air/Surface,40,4,42G0 362 | M2G0,OPENING(S) AT ONE END OR BOTH ENDS,48,8.5,42G0 363 | P2G0,OPENING(S) AT ONE END OR BOTH ENDS,53,8.5,42G0 364 | 4000,Standard Dry,40,8,42G0 365 | 4001,Standard Dry,40,8,42G0 366 | 4002,Standard Dry,40,8,42G0 367 | 4003,Standard Dry,40,8,42G0 368 | 4004,Standard Dry,40,8,42G0 369 | 4020,Thermal Insulated,40,8,42G0 370 | 4025,Livestock Carrier,40,8,42G0 371 | 4026,Automobile Carrier,40,8,42G0 372 | 4080,Dry Bulk,40,8,42G0 373 | 4096,Air/Surface,40,8,42G0 374 | 40B0,Dry Bulk,40,8,42G0 375 | 40B1,Dry Bulk,40,8,42G0 376 | 40B3,Dry Bulk,40,8,42G0 377 | 40B4,Dry Bulk,40,8,42G0 378 | 40B5,Dry Bulk,40,8,42G0 379 | 40B6,Dry Bulk,40,8,42G0 380 | 40BK,Dry Bulk,40,8,42G0 381 | 40BU,Dry Bulk,40,8,42G0 
382 | 40G0,Standard Dry,40,8,42G0 383 | 40G1,Standard Dry,40,8,42G0 384 | 40G2,Standard Dry,40,8,42G0 385 | 40G3,Standard Dry,40,8,42G0 386 | 40GP,Standard Dry,40,8,42G0 387 | 4101,Standard Dry,40,8,42G0 388 | 4102,Standard Dry,40,8,42G0 389 | 4103,Standard Dry,40,8,42G0 390 | 4104,Standard Dry,40,8,42G0 391 | 4126,Automobile Carrier,40,8,42G0 392 | 4200,Standard Dry,40,8.5,42G0 393 | 4201,Standard Dry,40,8.5,42G0 394 | 4202,Standard Dry,40,8.5,42G0 395 | 4203,Standard Dry,40,8.5,42G0 396 | 4204,Standard Dry,40,8.5,42G0 397 | 4225,Livestock Carrier,40,8.5,42G0 398 | 4226,Automobile Carrier,40,8.5,42G0 399 | 4280,Dry Bulk,40,8.5,42G0 400 | 42B0,Dry Bulk,40,8.5,42G0 401 | 42B1,Dry Bulk,40,8.5,42G0 402 | 42B3,Dry Bulk,40,8.5,42G0 403 | 42B4,Dry Bulk,40,8.5,42G0 404 | 42B5,Dry Bulk,40,8.5,42G0 405 | 42B6,Dry Bulk,40,8.5,42G0 406 | 42BK,Dry Bulk,40,8.5,42G0 407 | 42BU,Dry Bulk,40,8.5,42G0 408 | 42GP,Standard Dry,40,8.5,42G0 409 | 42G4,General Purose (Hanging Garments),40,8.5,42G0 410 | 4CG1,PASSIVE VENTS AT UPPER PART OF CARGO SPACE,40,8.5,42G0 411 | 42H5,Thermal Insulated,40,8.5,42H0 412 | 42H6,Thermal Insulated,40,8.5,42H0 413 | 44H5,Thermal Insulated,40,9,42H0 414 | 44H6,Thermal Insulated,40,9,42H0 415 | 45H5,Thermal Insulated,45,9.5,42H0 416 | 45H6,Thermal Insulated,45,9.5,42H0 417 | L2H5,Thermal Insulated,45,8.5,42H0 418 | L2H6,Thermal Insulated,45,8.5,42H0 419 | L5H5,Thermal Insulated,45,9.5,42H0 420 | L5H6,Thermal Insulated,45,9.5,42H0 421 | 42H0,Insulated (Conair),40,8.5,42H0 422 | 42HI,Thermal Refrigerated/Heated,40,8.5,42H0 423 | 42P1,Flat Rack,40,8.5,42P1 424 | 42P2,Platform,40,8.5,42P1 425 | 42P4,Platform,40,8.5,42P1 426 | 42P5,Platform,40,8.5,42P1 427 | 42P8,Platform,40,8.5,42P1 428 | 42P9,Platform,40,8.5,42P1 429 | 42PL,Platform,40,8.5,42P1 430 | 42PS,Platform,40,8.5,42P1 431 | 4361,Flat,40,8.5,42P1 432 | 4362,Platform,40,8.5,42P1 433 | 4364,Platform,40,8.5,42P1 434 | 4365,Platform,40,8.5,42P1 435 | 4366,Platform,40,8.5,42P1 436 | 4367,Platform,40,8.5,42P1 437 | 4560,Platform,40,8.5,42P1 438 | 4561,Platform,40,8.5,42P1 439 | 4661,Platform,40,4.25,42P1 440 | 4761,Platform,40,4.25,42P1 441 | 48P1,Platform,40,4.25,42P1 442 | 48P5,Platform,40,4.25,42P1 443 | 48PC,Platform,40,4.25,42P1 444 | 48PF,Platform,40,4.25,42P1 445 | 48PL,Platform,40,4.25,42P1 446 | 4061,Platform,40,8,42P1 447 | 4062,Platform,40,8,42P1 448 | 4063,Platform,40,8,42P1 449 | 4064,Platform,40,8,42P1 450 | 4065,Platform,40,8,42P1 451 | 4066,Platform,40,8,42P1 452 | 4067,Platform,40,8,42P1 453 | 40P1,Platform,40,8,42P1 454 | 40P2,Platform,40,8,42P1 455 | 40P4,Platform,40,8,42P1 456 | 40P5,Platform,40,8,42P1 457 | 40PC,Platform,40,8,42P1 458 | 40PF,Platform,40,8,42P1 459 | 40PL,Platform,40,8,42P1 460 | 40PS,Platform,40,8,42P1 461 | 4161,Platform,40,8,42P1 462 | 4162,Platform,40,8,42P1 463 | 4163,Platform,40,8,42P1 464 | 4164,Platform,40,8,42P1 465 | 4165,Platform,40,8,42P1 466 | 4166,Platform,40,8,42P1 467 | 4167,Platform,40,8,42P1 468 | 4261,Platform,40,8.5,42P1 469 | 4262,Platform,40,8.5,42P1 470 | 4263,Flat,40,8.5,42P1 471 | 4264,Platform,40,8.5,42P1 472 | 4265,Platform,40,8.5,42P1 473 | 4266,Platform,40,8.5,42P1 474 | 4267,Platform,40,8.5,42P1 475 | 42PC,Platform,40,8.5,42P1 476 | 42PF,Platform,40,8.5,42P1 477 | 42P3,Collapsible Flat Rack,40,8.5,42P3 478 | 4363,Flat,40,8.5,42P3 479 | 48P3,Platform,40,4.25,42P3 480 | 40P3,Platform,40,8,42P3 481 | 42R1,Reefer,40,8.5,42R1 482 | 42R2,Thermal Refrigerated/Heated,40,8.5,42R1 483 | 42R3,Thermal Refrigerated/Heated,40,8.5,42R1 484 | 42R9,Thermal 
Refrigerated/Heated,40,8.5,42R1 485 | 42RC,Thermal Refrigerated/Heated,40,8.5,42R1 486 | 42RE,Thermal Refrigerated,40,8.5,42R1 487 | 42RS,Thermal Refrigerated/Heated,40,8.5,42R1 488 | 4320,Thermal Insulated,40,8.5,42R1 489 | 4330,Thermal Refrigerated,40,8.5,42R1 490 | 4332,Thermal Refrigerated/Heated,40,8.5,42R1 491 | 4333,Thermal Refrigerated/Heated,40,8.5,42R1 492 | 4340,Thermal Refrigerated,40,8.5,42R1 493 | 44H0,Thermal Refrigerated/Heated,40,9,42R1 494 | 44H1,Thermal Refrigerated/Heated,40,9,42R1 495 | 44H2,Thermal Refrigerated/Heated,40,9,42R1 496 | 44HI,Thermal Refrigerated/Heated,40,9,42R1 497 | 44HR,Thermal Refrigerated/Heated,40,9,42R1 498 | 44R0,Thermal Refrigerated/Heated,40,9,42R1 499 | 44R1,Thermal Refrigerated/Heated,40,9,42R1 500 | 44R2,Thermal Refrigerated/Heated,40,9,42R1 501 | 44R3,Thermal Refrigerated/Heated,40,9,42R1 502 | 44RE,Thermal Refrigerated,40,9,42R1 503 | 44RS,Thermal Refrigerated/Heated,40,9,42R1 504 | 44RT,Thermal Refrigerated/Heated,40,9,42R1 505 | 8520,Thermal Insulated,35,8.5,42R1 506 | 8532,Thermal Refrigerated/Heated,35,8.5,42R1 507 | 4030,Thermal Refrigerated,40,8,42R1 508 | 4031,Thermal Refrigerated,40,8,42R1 509 | 4032,Thermal Refrigerated/Heated,40,8,42R1 510 | 4040,Thermal Refrigerated,40,8,42R1 511 | 40H0,Thermal Refrigerated/Heated,40,8,42R1 512 | 40H1,Thermal Refrigerated/Heated,40,8,42R1 513 | 40H2,Thermal Refrigerated/Heated,40,8,42R1 514 | 40H5,Thermal Insulated,40,8,42R1 515 | 40H6,Thermal Insulated,40,8,42R1 516 | 40HI,Thermal Refrigerated/Heated,40,8,42R1 517 | 40HR,Thermal Refrigerated/Heated,40,8,42R1 518 | 40R0,Thermal Refrigerated/Heated,40,8,42R1 519 | 40R1,Thermal Refrigerated/Heated,40,8,42R1 520 | 40R2,Thermal Refrigerated/Heated,40,8,42R1 521 | 40R3,Thermal Refrigerated/Heated,40,8,42R1 522 | 40RE,Thermal Refrigerated,40,8,42R1 523 | 40RS,Thermal Refrigerated/Heated,40,8,42R1 524 | 40RT,Thermal Refrigerated/Heated,40,8,42R1 525 | 4130,Thermal Refrigerated,40,8,42R1 526 | 4131,Thermal Refrigerated,40,8,42R1 527 | 4132,Thermal Refrigerated/Heated,40,8,42R1 528 | 4224,Insulated,40,8.5,42R1 529 | 4230,Thermal Refrigerated,40,8.5,42R1 530 | 4231,Thermal Refrigerated,40,8.5,42R1 531 | 4232,Thermal Refrigerated/Heated,40,8.5,42R1 532 | 4240,Thermal Refrigerated,40,8.5,42R1 533 | 4243,Thermal Refrigerated,40,8.5,42R1 534 | 42H1,Thermal Refrigerated/Heated,40,8.5,42R1 535 | 42H2,Thermal Refrigerated/Heated,40,8.5,42R1 536 | 42HR,Thermal Refrigerated/Heated,40,8.5,42R1 537 | 42R0,Thermal Refrigerated/Heated,40,8.5,42R1 538 | 4432,Thermal Refrigerated/Heated,40,9,42R1 539 | 42RT,Thermal Refrigerated/Heated,40,8.5,42R1 540 | 42T0,Tank,40,8.5,42T0 541 | 42T1,Tank,40,8.5,42T0 542 | 42T2,Tank,40,8.5,42T0 543 | 42T3,Tank,40,8.5,42T0 544 | 42T4,Tank,40,8.5,42T0 545 | 42T5,Tank,40,8.5,42T0 546 | 42T6,Tank,40,8.5,42T0 547 | 42T7,Tank,40,8.5,42T0 548 | 42T8,Tank,40,8.5,42T0 549 | 42T9,Tank,40,8.5,42T0 550 | 42TD,Tank,40,8.5,42T0 551 | 42TG,Tank,40,8.5,42T0 552 | 4370,Tank,40,8.5,42T0 553 | 8770,Tank,35,4.25,42T0 554 | 4070,Tank,40,8,42T0 555 | 4071,Tank,40,8,42T0 556 | 40T0,Tank,40,8,42T0 557 | 40T1,Tank,40,8,42T0 558 | 40T2,Tank,40,8,42T0 559 | 40T3,Tank,40,8,42T0 560 | 40T4,Tank,40,8,42T0 561 | 40T5,Tank,40,8,42T0 562 | 40T6,Tank,40,8,42T0 563 | 40T7,Tank,40,8,42T0 564 | 40T8,Tank,40,8,42T0 565 | 40T9,Tank,40,8,42T0 566 | 40TD,Tank,40,8,42T0 567 | 40TG,Tank,40,8,42T0 568 | 40TN,Tank,40,8,42T0 569 | 4170,Tank,40,8,42T0 570 | 4270,Tank,40,8.5,42T0 571 | 4271,Tank,40,8.5,42T0 572 | 42TN,Tank,40,8.5,42T0 573 | 42U1,Open Top,40,8.5,42U1 574 | 42U2,Open 
Top,40,8.5,42U1 575 | 42U3,Open Top,40,8.5,42U1 576 | 42U4,Open Top,40,8.5,42U1 577 | 42U5,Open Top,40,8.5,42U1 578 | 42U6,Standard Dry,40,8.5,42U1 579 | 4350,Open Top,40,8.5,42U1 580 | 4351,Open Top,40,8.5,42U1 581 | 4650,Open Top,40,4.25,42U1 582 | 4750,Open Top,40,4.25,42U1 583 | 4751,Open Top,40,4.25,42U1 584 | 48U0,Open top,40,4.25,42U1 585 | 48UT,Open top,40,4.25,42U1 586 | 8550,Open top,35,8.5,42U1 587 | 4050,Open Top,40,8,42U1 588 | 4051,Open Top,40,8,42U1 589 | 4052,Open Top,40,8,42U1 590 | 4053,Open Top,40,8,42U1 591 | 40U0,Open Top,40,8,42U1 592 | 40U1,Open Top,40,8,42U1 593 | 40U2,Open Top,40,8,42U1 594 | 40U3,Open Top,40,8,42U1 595 | 40U4,Open Top,40,8,42U1 596 | 40U5,Open Top,40,8,42U1 597 | 40UT,Open Top,40,8,42U1 598 | 4250,Open Top,40,8.5,42U1 599 | 4251,Open Top,40,8.5,42U1 600 | 4252,Open Top,40,8.5,42U1 601 | 4253,Open Top,40,8.5,42U1 602 | 42P6,Open Top,40,8.5,42U1 603 | 42U0,Open Top,40,8.5,42U1 604 | 42UT,Open Top,40,8.5,42U1 605 | 4551,OPEN TOP HIGHCUBE,40,9.5,42U1 606 | 42UP,Hard Top,40,8.5,42UP 607 | 42V0,Closed Vented,40,8.5,42VH 608 | 42V2,Closed Vented,40,8.5,42VH 609 | 42V4,Closed Vented,40,8.5,42VH 610 | 42VH,Ventilated,40,8.5,42VH 611 | 4010,Closed Vented,40,8,42VH 612 | 4011,Closed Vented,40,8,42VH 613 | 4015,Closed Ventilated,40,8,42VH 614 | 40V0,Closed Vented,40,8,42VH 615 | 40V2,Closed Vented,40,8,42VH 616 | 40V4,Closed Vented,40,8,42VH 617 | 40VH,Closed Vented,40,8,42VH 618 | 4210,Closed Vented,40,8.5,42VH 619 | 4211,Closed Vented,40,8.5,42VH 620 | 4215,Closed Ventilated,40,8.5,42VH 621 | 45G0,High Cube,40,9.5,45G0 622 | 45G1,High Cube,40,9.5,45G0 623 | 45G2,Standard Dry,40,9.5,45G0 624 | 45G3,Standard Dry,40,9.5,45G0 625 | 45G4,Unrecognized container type,0,0,45G0 626 | 9200,Standard Dry,45,8.5,45G0 627 | 9400,Standard Dry,45,9.5,45G0 628 | 4400,Standard Dry,40,9,45G0 629 | 4410,HIGH CUBE,40,9.5,45G0 630 | 4420,Thermal Insulated,40,9,45G0 631 | 4426,Automobile Carrier,40,9,45G0 632 | 4500,Standard Dry,40,8.5,45G0 633 | 4505,Standard Dry,40,8.5,45G0 634 | 4510,Standard Dry,40,9.5,45G0 635 | 4511,Standard Dry,40,9.5,45G0 636 | 4514,HIGH CUBE,40,9.5,45G0 637 | 45GP,Standard Dry,40,9.5,45G0 638 | 45R0,Thermal Refrigerated/Heated,45,9.5,45R1 639 | 45R1,Reefer High Cube,40,9.5,45R1 640 | 45R2,Thermal Refrigerated/Heated,45,9.5,45R1 641 | 45R3,Thermal Refrigerated/Heated,45,9.5,45R1 642 | 45R9,Thermal Refrigerated,40,9.5,45R1 643 | 45RC,Thermal Refrigerated/Heated,40,9.5,45R1 644 | 45RE,Thermal Refrigerated,45,9.5,45R1 645 | 45RS,Thermal Refrigerated/Heated,45,9.5,45R1 646 | 4530,Thermal Refrigerated,40,8.5,45R1 647 | 4531,Thermal Refrigerated,40,8.5,45R1 648 | 4532,Thermal Refrigerated/Heated,40,8.5,45R1 649 | 4533,Thermal Refrigerated/Heated,40,8.5,45R1 650 | 45H2,Thermal Refrigerated/Heated,45,9.5,45R1 651 | 45RT,Thermal Refrigerated/Heated,45,9.5,45R1 652 | 4534,HIGHCUBE INTEGRATED REEFER,40,9.5,45R1 653 | 45U6,High Cube Hard Top,40,9.5,45UP 654 | 45UP,High Cube Hard Top,40,9.5,45UP 655 | 49P0,Platform,40,4,49P0 656 | 49P1,Platform,40,4,49P0 657 | 49P3,Platform,40,4,49P0 658 | 49P5,Platform,40,4,49P0 659 | 49PC,Platform,40,4,49P0 660 | 49PF,Platform,40,4,49P0 661 | 4060,Flat,40,8,49P0 662 | 40P0,Platform,40,8,49P0 663 | 4260,Flat,40,8.5,49P0 664 | 42P0,Platform,40,8.5,49P0 665 | 4360,Flat,40,8.5,49P0 666 | 45P3,FOLDING COMPLETE END STRUCTURE (PLATFORM),40,9.5,49P0 667 | 45P8,Platform,40,9.5,49P0 668 | 45PC,FOLDING COMPLETE END STRUCTURE (PLATFORM),40,9.5,49P0 669 | 48P0,Platform,40,4.25,49P0 670 | 4960,Platform,40,4,49P0 671 | 49PL,Platform,40,4,49P0 672 
| 9500,Standard Dry,45,9.5,L5G1 673 | 9510,Standard Dry,45,9.5,L5G1 674 | L5G1,45 High Cube,45,9,L5G1 675 | L5G2,Standard Dry,45,9.5,L5G1 676 | L5G3,Standard Dry,45,9.5,L5G1 677 | L5G9,Standard Dry,45,9.5,L5G1 678 | L0G9,Standard Dry,45,8,L5G1 679 | L0GP,HL: OPENING(S) AT ONE END OR BOTH ENDS,45,8,L5G1 680 | L2G0,Standard Dry,45,8.5,L5G1 681 | L2G1,Standard Dry,45,8.5,L5G1 682 | L2G2,Standard Dry,45,8.5,L5G1 683 | L2G3,Standard Dry,45,8.5,L5G1 684 | L2G9,Standard Dry,45,8.5,L5G1 685 | L2GP,Standard Dry,45,8.5,L5G1 686 | L5G0,Standard Dry,45,9,L5G1 687 | L5GP,Standard Dry,45,9.5,L5G1 688 | L5R1,45 Reefer High Cube,45,9.5,L5R1 689 | L5R2,Thermal Refrigerated/Heated,45,9.5,L5R1 690 | L5R3,Thermal Refrigerated/Heated,45,9.5,L5R1 691 | L5RE,Thermal Refrigerated,45,9.5,L5R1 692 | L5RS,Thermal Refrigerated/Heated,45,9.5,L5R1 693 | 45H0,Thermal Refrigerated/Heated,45,9.5,L5R1 694 | 45H1,Thermal Refrigerated/Heated,45,9.5,L5R1 695 | 45HI,Thermal Refrigerated/Heated,45,9.5,L5R1 696 | 45HR,Thermal Refrigerated/Heated,45,9.5,L5R1 697 | 9532,Thermal Refrigerated/Heated,45,9.5,L5R1 698 | L2H0,Thermal Refrigerated/Heated,45,8.5,L5R1 699 | L2H1,Thermal Refrigerated/Heated,45,8.5,L5R1 700 | L2H2,Thermal Refrigerated/Heated,45,8.5,L5R1 701 | L2HI,Thermal Refrigerated/Heated,45,8.5,L5R1 702 | L2HR,Thermal Refrigerated/Heated,45,8.5,L5R1 703 | L2R0,Thermal Refrigerated,45,8.5,L5R1 704 | L2R1,Thermal Refrigerated/Heated,45,8.5,L5R1 705 | L2R2,Thermal Refrigerated/Heated,45,8.5,L5R1 706 | L2R3,Thermal Refrigerated/Heated,45,8.5,L5R1 707 | L2RE,Thermal Refrigerated,45,8.5,L5R1 708 | L2RS,Thermal Refrigerated/Heated,45,8.5,L5R1 709 | L2RT,Thermal Refrigerated/Heated,45,8.5,L5R1 710 | L5H0,Thermal Refrigerated/Heated,45,9.5,L5R1 711 | L5H1,Thermal Refrigerated/Heated,45,9.5,L5R1 712 | L5H2,Thermal Refrigerated/Heated,45,9.5,L5R1 713 | L5HI,Thermal Refrigerated/Heated,45,9.5,L5R1 714 | L5HR,Thermal Refrigerated/Heated,45,9.5,L5R1 715 | L5R0,Thermal Refrigerated,45,9.5,L5R1 716 | L5RT,Thermal Refrigerated/Heated,45,9.5,L5R1 717 | 12TR,Flatbed,42,8,12TR 718 | -------------------------------------------------------------------------------- /data/iso-container-groups.csv: -------------------------------------------------------------------------------- 1 | code,description 2 | 22B0,20 Bulk 3 | 22G0,20 Standard Dry 4 | 22H0,20 Insulated (Conair) 5 | 22P1,20 Flat Rack 6 | 8888,Uncontainerised 7 | 22P3,20 Collapsible Flat Rack 8 | 22R1,20 Reefer 9 | 22T0,20 Tank 10 | 22U1,20 Open Top 11 | 22UP,20 Hard Top 12 | 22VH,20 Ventilated 13 | 29P0,20 Platform 14 | 42G0,40 Standard Dry 15 | 42H0,40 Insulated (Conair) 16 | 42P1,40 Flat Rack 17 | 42P3,40 Collapsible Flat Rack 18 | 42R1,40 Reefer 19 | 42T0,40 Tank 20 | 42U1,40 Open Top 21 | 42UP,40 Hard Top 22 | 42VH,40 Ventilated 23 | 45G0,40 High Cube 24 | 45R1,40 Reefer High Cube 25 | 45UP,40 High Cube Hard Top 26 | 49P0,40 Platform 27 | L5G1,45 High Cube 28 | L5R1,45 Reefer High Cube 29 | 12TR,Flatbed 30 | -------------------------------------------------------------------------------- /data/pelicula_ids.csv: -------------------------------------------------------------------------------- 1 | 01 2 | 02 3 | 03 4 | 04 5 | 05 6 | 06 7 | 07 8 | 08 9 | 09 10 | 10 11 | 11 12 | 12 13 | 13 14 | 14 15 | 15 16 | 18 17 | 19 18 | 20 19 | 21 20 | 22 21 | 23 22 | 24 23 | 25 24 | 26 25 | 28 26 | 29 27 | 31 28 | 33 29 | 34 30 | 35 31 | 36 32 | 37 33 | 38 34 | 39 35 | 41 36 | 42 37 | 43 38 | 44 39 | 45 40 | 46 41 | 47 42 | 48 43 | 49 44 | 61 45 | 62 46 | 63 47 | 64 48 | 65 49 | 66 50 | 68 51 | 69 
52 | 71 53 | 72 54 | 73 55 | 74 56 | 75 57 | 76 58 | 77 59 | 78 60 | 79 61 | 210 62 | 211 63 | 213 64 | 214 65 | 215 66 | 216 67 | 217 68 | 218 69 | 219 70 | 220 71 | 223 72 | 224 73 | 225 74 | 240 75 | 270 76 | 310 77 | 311 78 | 312 79 | 315 80 | 316 81 | 317 82 | 318 83 | 320 84 | 321 85 | 322 86 | 323 87 | 324 88 | 325 89 | 328 90 | 342 91 | 362 92 | 410 93 | 411 94 | 412 95 | 414 96 | 415 97 | 416 98 | 417 99 | 418 100 | 422 101 | 423 102 | 425 103 | 610 104 | 612 105 | 613 106 | 614 107 | 615 108 | 616 109 | 617 110 | 618 111 | 619 112 | 620 113 | 621 114 | 622 115 | 623 116 | 624 117 | 625 118 | 710 119 | 711 120 | 712 121 | 713 122 | 714 123 | 715 124 | 716 125 | 717 126 | 718 127 | 719 128 | 720 129 | 722 130 | 723 131 | 724 132 | 725 133 | 765 134 | 780 135 | 815 136 | 915 137 | -------------------------------------------------------------------------------- /data/peliculas.csv: -------------------------------------------------------------------------------- 1 | # Source http://www.theguardian.com/news/datablog/2010/oct/16/greatest-films-of-all-time 2 | Entry|Film|Director|Leading actors|Year of cinema release|No of Oscars won|IMDB link|Guardian film page|Country 3 | 01|Brief Encounter|David Lean |Celia Johnson, Cyril Raymond, Stanley Holloway, Trevor Howard|1945||http://www.imdb.com/title/tt0037558/|http://www.guardian.co.uk/film/movie/35664/brief.encounter|UK 4 | 02|Casablanca|Michael Curtiz|Claude Rains, Humphrey Bogart, Ingrid Bergman, Paul Henreid|1942|3|http://www.imdb.com/title/tt0034583/|http://www.guardian.co.uk/film/movie/36156/casablanca|USA 5 | 03|Before Sunset|Richard Linklater|Ethan Hawke and Julie Delpy|2004||http://www.imdb.com/title/tt0381681/awards|http://www.guardian.co.uk/film/movie/101181/before.sunset|USA 6 | 78|Let the Right One In|Tomas Alfredson| Henrik Dahl, Kare Hedebrant, Karin Bergquist, Lina Leandersson, Per Ragnar, Peter Carlberg|2008||http://www.imdb.com/title/tt1139797/|http://www.guardian.co.uk/film/movie/125671/let-the-right-one-in|Sweden 7 | 79|Vampyr|Carl Theodor Dreyer|Henriette Gérard, Henriette Gerard, Julian West, Sybille Schmitz|1932||http://www.imdb.com/title/tt0023649/|http://www.guardian.co.uk/film/movie/80562/vampyr|Germany 8 | 815|Roman Holiday|William Wyler|Audrey Hepburn, Gregory Peck|1953|3|http://www.imdb.com/title/tt0046250/|http://www.guardian.co.uk/film/movie/96156/roman-holiday|USA 9 | 29|Hidden| Michael Haneke|Annie Girardot, Daniel Auteuil, Juliette Binoche, Maurice Benichou|2005||http://www.imdb.com/title/tt0387898/|http://www.guardian.co.uk/film/movie/108597/hidden|France|Robinson, Fred MacMurray and Barbara Stanwyck|1944||http://www.imdb.com/title/tt0036775/|http://www.guardian.co.uk/film/movie/36162/double.indemnity|USA 10 | 310|Comedy|Monty Python’s Life of Brian|Terry Jones|Eric Idle, Graham Chapman, John Cleese, Michael Palin, Terry Gilliam, Terry Jones|1979||http://www.imdb.com/title/tt0079470/|http://www.guardian.co.uk/film/movie/78168/monty-python-s-life-of-brian|UK 11 | 04|Breathless|Jean-Luc Godard|Jean Seberg, Jean-Paul Belmondo|1960||http://www.imdb.com/title/tt0053472/|http://www.guardian.co.uk/film/movie/36219/a-bout-de-souffle|France 12 | 05|In the Mood for Love|Kar Wai Wong|Maggie Cheung Man-Yuk, Rebecca Pan, Tony Leung Chiu-Wai|2000||http://www.imdb.com/title/tt0118694/|http://www.guardian.co.uk/film/movie/85442/in.the.mood.for.love|Hong Kong 13 | 28|Pulp Fiction|Quentin Tarantino|Amanda Plummer, Bruce Willis, Eric Stoltz, Harvey Keitel, John Travolta, Rosanna Arquette, Samuel L Jackson, Steve Buscemi, Tim 
Roth, Uma Thurman|1994|1|http://www.imdb.com/title/tt0110912/|http://www.guardian.co.uk/film/movie/56612/pulp.fiction|USA 14 | 73|Don’t Look Now|Nicholas Roeg|Donald Sutherland, Hilary Mason, Julie Christie|1973||http://www.imdb.com/title/tt0069995/|http://www.guardian.co.uk/film/movie/35097/don.t.look.now|UK 15 | 74|The Wicker Man|Robin Hardy|Britt Ekland, Christopher Lee, Edward Woodward|1973||http://www.imdb.com/title/tt0070917/|http://www.guardian.co.uk/film/movie/36301/wicker.man|UK 16 | 75|The Shining|Stanley Kubrick|Danny Lloyd, Jack Nicholson, Shelley Duval|1980||http://www.imdb.com/title/tt0081505/|http://www.guardian.co.uk/film/movie/76626/shining|USA 17 | 765|Harold & Maude|Hal Ashby|Bud Cort, Cyril Cusack, Ruth Gordon, Vivian Pickles|1971||http://www.imdb.com/title/tt0067185/|http://www.guardian.co.uk/film/movie/78471/harold-and-maude|USA 18 | 76|The Exorcist|William Friedkin|Ellen Burstyn, Linda Blair, Max von Sydow|1973|2|http://www.imdb.com/title/tt0070047/|http://www.guardian.co.uk/film/movie/86477/exorcist|USA 19 | 77|Nosferatu (1922)|FW Mernau|Alexander Granach, Greta Schroder, Gustav von Wangenheim, Max Schreck|1922||http://www.imdb.com/title/tt0013442/|http://www.guardian.co.uk/film/movie/75839/nosferatu|Germany 20 | 780|Dracula (1958)|Terence Fisher|Christopher Lee, Melissa Stribling, Michael Gough, Peter Cushing|1958||http://www.imdb.com/title/tt0051554/|http://www.guardian.co.uk/film/movie/36215/dracula|UK 21 | 31|Comedy|Annie Hall|Woody Allen|Carol Kane, Diane Keaton, Paul Simon, Tony Roberts, Woody Allen|1977|4|http://www.imdb.com/title/tt0075686/|http://www.guardian.co.uk/film/movie/36314/annie.hall|USA 22 | 06|The Apartment|Billy Wilder|Fred MacMurray, Jack Lemmon, Ray Walston, Shirley MacLaine|1960||http://www.imdb.com/title/tt0053604/|http://www.guardian.co.uk/film/movie/36225/apartment|USA 23 | 07|Hannah & Her Sisters|Woody Allen |Barbara Hershey, Carrie Fisher, Dianne Wiest, Julie Kavner, Mia Farrow, Michael Caine, Woody Allen|1986|3|http://www.imdb.com/title/tt0091167/|http://www.guardian.co.uk/film/movie/89162/hannah-and-her-sisters|USA 24 | 08|Eternal Sunshine of the Spotless Mind|Michel Gondry|Elijah Wood, Jim Carrey, Kate Winslet, Kirsten Dunst, Mark Ruffalo, Tom Wilkinson|2004|1|http://www.imdb.com/title/tt0338013/|http://www.guardian.co.uk/film/movie/100140/eternal.sunshine.of.the.spotless.mind|USA 25 | 09|Room With a View|James Ivory|Helena Bonham Carter, Julian Sands, Maggie Smith|1985|3|http://www.imdb.com/title/tt0091867/|http://www.guardian.co.uk/film/movie/77615/room-with-a-view|UK 26 | 10|Jules et Jim|François Truffaut|Henri Serre, Jeanne Moreau, Oscar Werner, Oskar Werner|1962||http://www.imdb.com/title/tt0055032/|http://www.guardian.co.uk/film/movie/76699/jules.et.jim|France 27 | 11|All That Heaven Allows|Douglas Sirk|Jane Wyman, Rock Hudson|1955||http://www.imdb.com/title/tt0047811/|http://www.guardian.co.uk/film/movie/94875/all-that-heaven-allows|USA 28 | 12|Gone with the Wind|Victor Fleming|Anne Rutherford, Clark Gable, Hattie McDaniel, Leslie Howard, Olivia De Havilland, Vivien Leigh|1939|8|http://www.imdb.com/title/tt0031381/|http://www.guardian.co.uk/film/movie/36144/gone.with.the.wind|USA 29 | 13|An Affair to Remember|Leo McCarey|Cary Grant, Deborah Kerr, Richard Denning|1957||http://www.imdb.com/title/tt0050105/|http://www.guardian.co.uk/film/movie/82271/affair.to.remember|USA 30 | 14|Umbrellas of Cherbourg|Jaques Demy |Anne Vernon, Catherine Deneuve, Nino 
Castelnuovo|1964||http://www.imdb.com/title/tt0058450/|http://www.guardian.co.uk/film/movie/77848/umbrellas.of.cherbourg|France 31 | 15|Lost in Translation|Sofia Coppola|Bill Murray, Giovanni Ribisi, Scarlett Johansson|2003|1|http://www.imdb.com/title/tt0335266/|http://www.guardian.co.uk/film/movie/96936/lost.in.translation|USA 32 | 18|My Night With Maud|Eric Rohmer| Francoise Fabian, Jean-Louis Trintignant|1969||http://www.imdb.com/title/tt0064612/|http://www.guardian.co.uk/film/movie/77331/my-night-with-maud|France 33 | 19|Voyage to Italy|Roberto Rossellini|Ingrid Bergman|1954||http://www.imdb.com/title/tt0046511/|http://www.guardian.co.uk/film/movie/88522/voyage-to-italy|Italy 34 | 20|Dr Zhivago|David Lean|Geraldine Chaplin, Julie Christie, Omar Sharif|1965|5|http://www.imdb.com/title/tt0059113/|http://www.guardian.co.uk/film/movie/78519/dr-zhivago|USA 35 | 210|Goodfellas|Martin Scorsese| Frank Vincent, Joe Pesci, Lorraine Bracco, Ray Liotta, Robert De Niro|1990|1|http://www.imdb.com/title/tt0099685/|http://www.guardian.co.uk/film/movie/37702/goodfellas|USA 36 | 218|Hard Boiled|John Woo|Chow Yun Fat, Tony Leung|1992||http://www.imdb.com/title/tt0104684/|http://www.guardian.co.uk/film/movie/82687/hard-boiled|Hong Kong 37 | 219|Long Good Friday|John McKenzie|Bob Hoskins, Bryan Marshall, Dave King, Helen Mirren|1980||http://www.imdb.com/title/tt0081070/|http://www.guardian.co.uk/film/movie/36322/long.good.friday|UK 38 | 21|Chinatown|Roman Polanski|Faye Dunaway, Jack Nicholson, John Huston|1974|1|http://www.imdb.com/title/tt0071315/|http://www.guardian.co.uk/film/movie/36302/chinatown|USA 39 | 220|A Prophet|Jacques Audiard |Adel Bencherif, Niels Arestrup, Tahar Rahim, Tahar Ramin|2009||http://www.imdb.com/title/tt1235166/|http://www.guardian.co.uk/film/movie/129970/prophet|France 40 | 220|Scarface (1983)|Brian De Palma|Al Pacino, Mary Elizabeth Mastrantonio, Michelle Pfeiffer, Robert Loggia, Steven Bauer|1983||http://www.imdb.com/title/tt0086250/|http://www.guardian.co.uk/film/movie/78370/scarface|USA 41 | 223|Miller’s Crossing|Joel Coen|Albert Finney, Gabriel Byrne, Marcia| Gay Harden|1990||http://www.imdb.com/title/tt0100150/|http://www.guardian.co.uk/film/movie/78569/miller.s.crossing|USA 42 | 224|Postman Always Rings Twice (1942)|Tay Garnett|Cecil Kellaway, John Garfield, Lana Turner|1946||http://www.imdb.com/title/tt0038854/|http://www.guardian.co.uk/film/movie/90190/postman-always-rings-twice|USA 43 | 225|Jour Se Leve|Marcel Carne|Annabella, Arletty, Jean Gabin|1939||http://www.imdb.com/title/tt0031514/|http://www.guardian.co.uk/film/movie/76684/jour-se-leve|France 44 | 22|Touch of Evil|Orson Welles|Charlton Heston, Janet Leigh, Marlene Dietrich, Orson Welles, Zsa Zsa Gabor|1958||http://www.imdb.com/title/tt0052311/|http://www.guardian.co.uk/film/movie/36217/touch.of.evil|USA 45 | 23|Say Anything....|Cameron crowe|John Cusack, Ione Skye, John Mahoney|1989||http://www.imdb.com/title/tt0098258/||USA 46 | 214|French Connection|William Friedkin|Fernando Rey, Gene Hackman, Roy Schieder, Tony Lo Bianco|1971|5|http://www.imdb.com/title/tt0067116/|http://www.guardian.co.uk/film/movie/36293/french-connection|USA 47 | 215|The Big Sleep|Howard Hawkes|Bob Steele, Elisha Cook Jr, Elisha Cook Jr., Humphrey Bogart, Lauren Bacall|1946||http://www.imdb.com/title/tt0038355/|http://www.guardian.co.uk/film/movie/34621/big-sleep|USA 48 | 216|La Ceremonie|Claude Chabrol|Isabelle Huppert, Jacqueline Bisset, Sandrine 
Bonnaire|1995||http://www.imdb.com/title/tt0112769/|http://www.guardian.co.uk/film/movie/80763/ceremonie|France 49 | 217|Point Blank|John Boorman|Angie Dickinson, Keenan Wynn, Lee Marvin|1967||http://www.imdb.com/title/tt0062138/|http://www.guardian.co.uk/film/movie/36266/point-blank|USA 50 | 23|Vertigo|Alfred Hitchcock|Barbara Bel Geddes, James Stewart, Kim Novak|1958||http://www.imdb.com/title/tt0052357/|http://www.guardian.co.uk/film/movie/34909/vertigo|USA 51 | 240|When Harry Met Sally|Rob Reiner|Billy Crystal, Bruno Kirby, Carrie Fisher, Meg Ryan|1989||http://www.imdb.com/title/tt0098635/|http://www.guardian.co.uk/film/movie/75869/when-harry-met-sally.|USA 52 | 24|Badlands|Terrence Malik|Alan Vint, Martin Sheen, Ramon Bieri, Sissy Spacek, Warren Oates|1973||http://www.imdb.com/title/tt0069762/|http://www.guardian.co.uk/film/movie/76181/badlands|USA 53 | 24|Fabulous Baker Boys|Steve Kloves|Beau Bridges, Jeff Bridges, Michelle Pfeiffer|1989||http://www.imdb.com/title/tt0097322/|http://www.guardian.co.uk/film/movie/134648/fabulous-baker-boys|USA 54 | 25|A Matter of Life & Death| Emeric Pressburger, Michael Powell| David Niven, Kim Hunter, Raymond Massey, Richard Attenborough, Roger Livesey|1946||http://www.imdb.com/title/tt0038733/|http://www.guardian.co.uk/film/movie/36173/matter.of.life.and.death|UK 55 | 26|Rashomon|Akira Kurosawa|Machiko Kyo, Masayuki Mori, Toshiro Mifune|1950||http://www.imdb.com/title/tt0042876/|http://www.guardian.co.uk/film/movie/83179/rashomon|Japan 56 | 270|Heat|Michael Mann|Al Pacino, Ashley Judd, Jon Voight, Robert De Niro, Tom Sizemore, Val Kilmer|1995||http://www.imdb.com/title/tt0113277/|http://www.guardian.co.uk/film/movie/60365/heat|USA 57 | 320|Comedy|Groundhog Day|Harold Ramis|Andie MacDowell, Bill Murray, Chris Elliott, Stephen Tobolowsky|1993||http://www.imdb.com/title/tt0107048/|http://www.guardian.co.uk/film/movie/79383/groundhog-day|USA 58 | 321|Comedy|Clueless|Amy Heckerling|Alicia Silverstone, Dan Hedaya, Stacey Dash|1995||http://www.imdb.com/title/tt0112697/|http://www.guardian.co.uk/film/movie/59257/clueless|USA 59 | 322|Comedy|The Great Dictator|Charlie Chaplin|Charlie Chaplin, Jack Oakie, Paulette Goddard|1940||http://www.imdb.com/title/tt0032553/|http://www.guardian.co.uk/film/movie/96585/great.dictator|USA 60 | 323|Comedy|Clerks|Kevin Smith|Brian O'Halloran, Jeff Anderson, Marilyn Ghigliotti|1994||http://www.imdb.com/title/tt0109445/|http://www.guardian.co.uk/film/movie/53831/clerks|USA 61 | 211|Bonnie & Clyde|Arthur Penn|Faye Dunaway, Gene Hackman, Michael J Pollard, Warren Beatty|1967|2|http://www.imdb.com/title/tt0061418/|http://www.guardian.co.uk/film/movie/76253/bonnie-and-clyde|USA 62 | 211|The Conversation|Francis Coppola, Francis Ford Coppola|Allen Garfield, Gene Hackman, John Cazale|1974||http://www.imdb.com/title/tt0071360/|http://www.guardian.co.uk/film/movie/77114/conversation|USA 63 | 213|The Killing|Stanley Kubrick| Coleen Gray, Elisha Cook Junior, Jay C Flippen, Sterling Hayden, Vince Edwards|1956||http://www.imdb.com/title/tt0049406/|http://www.guardian.co.uk/film/movie/87920/killing|USA 64 | 324|Comedy|The Jerk|Carl Reiner|Steve Martin|1979||http://www.imdb.com/title/tt0079367/|http://www.guardian.co.uk/film/movie/88834/jerk|USA 65 | 311|Comedy|Airplane!|Jim Abrahams, David Zucker and Jerry Zucker|Julie Hagerty, Leslie Nielsen, Robert Hays|1980||http://www.imdb.com/title/tt0080339/|http://www.guardian.co.uk/film/movie/83228|USA 66 | 312|Comedy|Election|Alexander Payne|Chris Klein, Matthew Broderick, Reese 
Witherspoon|1999||http://www.imdb.com/title/tt0126886/|http://www.guardian.co.uk/film/movie/79657/election|USA 67 | 315|Comedy|This Is Spinal Tap|Rob Reiner| Christopher Guest, Harry Shearer, Michael McKean, Rob Reiner|1984||http://www.imdb.com/title/tt0088258/|http://www.guardian.co.uk/film/movie/81384/this.is.spinal.tap|USA 68 | 316|Comedy|Bringing Up Baby|Howard Hawkes|Cary Grant, Katharine Hepburn, Katherine Hepburn|1938||http://www.imdb.com/title/tt0029947/|http://www.guardian.co.uk/film/movie/36143/bringing-up-baby|USA 69 | 418|Last of the Mohicans|Michael Mann|Daniel Day-Lewis, Jodhi May, Madeleine Stowe|1992|1|http://www.imdb.com/title/tt0104691/|http://www.guardian.co.uk/film/movie/79330/last-of-the-mohicans| 70 | 41|Apocalypse Now|Francis Coppola|Dennis Hopper, Frederic Forrest, Laurence Fishburne, Marlon Brando, Martin Sheen, Robert Duvall, Rpobert Duvall|1979|2|http://www.imdb.com/title/tt0078788/|http://www.guardian.co.uk/film/movie/36320/apocalypse.now|USA 71 | 422|Deer Hunter|Michael Cimino|Christopher Walken, Meryl Streep, Robert De Niro|1978|5|http://www.imdb.com/title/tt0077416/|http://www.guardian.co.uk/film/movie/36318/deer-hunter|USA 72 | 422|Gladiator|Ridley Scott|Connie Nielsen, Joaquin Phoenix, Oliver Reed, Russell Crowe|2000|5|http://www.imdb.com/title/tt0172495/|http://www.guardian.co.uk/film/movie/83550/gladiator|USA 73 | 422|Rome Open City|Roberto Rossellini|Aldo Fabrizi, Anna Magnani, Marcello Pagliero|1945||http://www.imdb.com/title/tt0038890/|http://www.guardian.co.uk/film/movie/78859/rome-open-city|Italy 74 | 423|Butch Cassidy|George Roy Hill|Katharine Ross, Paul Newman, Robert Redford|1969|4|http://www.imdb.com/title/tt0064115/|http://www.guardian.co.uk/film/movie/36276/butch-cassidy-and-the-sundance-kid|USA 75 | 423|Where Eagles Dare|Brian G. 
Hutton|Clint Eastwood, Mary Ure, Richard Burton|1968||http://www.imdb.com/title/tt0065207/|http://www.guardian.co.uk/film/movie/83199/where-eagles-dare|USA 76 | 425|The Incredibles|Brad Bird|Craig T Nelson, Holly Hunter, Jason Lee, Samuel L Jackson|2004|2|http://www.imdb.com/title/tt0317705/|http://www.guardian.co.uk/film/movie/102423/incredibles|USA 77 | 42|North by Northwest|Alfred Hitchcock| Cary Grant, Eva Marie Saint, Eva Marie Saint, James Mason, Jessie Royce Landis, Leo G Carroll, Martin Landau|1959||http://www.imdb.com/title/tt0053125/|http://www.guardian.co.uk/film/movie/35095/north-by-northwest|USA 78 | 43|Once Upon a Time in the West|Sergio Leone|Charles Bronson, Claudia Cardinale, Henry Fonda, Jason Robards|1968||http://www.imdb.com/title/tt0064116/|http://www.guardian.co.uk/film/movie/36274/once.upon.a.time.in.the.west|Italy 79 | 44|The Wild Bunch|Sam Pekinpah|Ernest Borgnine, Robert Ryan, William Holden|1969||http://www.imdb.com/title/tt0065214/|http://www.guardian.co.uk/film/movie/36285/wild.bunch|USA 80 | 45|Deliverance|John Boorman |Burt Reynolds, Jon Voight, Ned Beatty|1972||http://www.imdb.com/title/tt0068473/|http://www.guardian.co.uk/film/movie/76560/deliverance|USA 81 | 317|Comedy|There’s Something About Mary|Peter & Bob Farrelly|Ben Stiller, Cameron Diaz, Lee Evans, Matt Dillon|1998||http://www.imdb.com/title/tt0129387/|http://www.guardian.co.uk/film/movie/34359/there.s.something.about.mary|USA 82 | 318|Comedy|Dazed and Confused|Richard Linklater|Adam Goldberg, Jason London, Joey Lauren Adams, Joey Lauren Adams, Milla Jovovich, Rory Cochrane, Shawn Andrew|1993||http://www.imdb.com/title/tt0106677/|http://www.guardian.co.uk/film/movie/49047/dazed-and-confused|USA 83 | 325|Comedy|Shaun of the Dead|Edgar Wright|Dylan Moran, Kate Ashfield, Nick Frost, Simon Pegg|2004||http://www.imdb.com/title/tt0365748/|http://www.guardian.co.uk/film/movie/99960/shaun.of.the.dead|UK 84 | 328|Comedy|MASH|Robert Altman|Donald Sutherland, Elliott Gould, Sally Kellerman|1970|1|http://www.imdb.com/title/tt0066026/|http://www.guardian.co.uk/film/movie/84547|USA 85 | 33|Comedy|Borat|Larry Charles|Ken Davitian, Pamela Anderson , Sacha Baron Cohen|2006||http://www.imdb.com/title/tt0443453/|http://www.guardian.co.uk/film/movie/114557/borat|USA 86 | 33|Comedy|Some Like it Hot|Billy Wilder|George Raft, Jack Lemmon, Joe E Brown, Marilyn Monroe, Tony Curtis|1959|1|http://www.imdb.com/title/tt0053291/|http://www.guardian.co.uk/film/movie/36223/some.like.it.hot|USA 87 | 342|Comedy|The Big Lebowski|Joel Coen|Jeff Bridges, John Goodman, Julianne Moore, Steve Buscemi|1998||http://www.imdb.com/title/tt0118715/|http://www.guardian.co.uk/film/movie/77069/big.lebowski|USA 88 | 34|Comedy|Team America|Trey Parker|Kristen Miller, Matt Stone, Trey Parker|2004||http://www.imdb.com/title/tt0372588/|http://www.guardian.co.uk/film/movie/103000/team.america|USA 89 | 35|Comedy|Dr Strangelove|Stanley Kubrick|George C Scott, Peter Sellers, Sterling Hayden|1964||http://www.imdb.com/title/tt0057012/|http://www.guardian.co.uk/film/movie/76390/dr-strangelove|UK 90 | 362|Comedy|His Girl Friday|Howard Hawkes|Cary Grant, Gene Lockhart, Ralph Bellamy, Rosalind Russell|1940||http://www.imdb.com/title/tt0032599/|http://www.guardian.co.uk/film/movie/76369/his-girl-friday|USA 91 | 36|Comedy|The Ladykillers|Alexander Mackendrick|Alec Guinness, Cecil Parker, Herbert Lom, Peter Sellers|1955||http://www.imdb.com/title/tt0048281/|http://www.guardian.co.uk/film/movie/36206/ladykillers|UK 92 | 61|2001|Stanley Kubrick|Daniel Richter, Gary 
Lockwood, Keir Dullea, William Sylvester|1968|1|http://www.imdb.com/title/tt0062622/|http://www.guardian.co.uk/film/movie/36269/2001|USA 93 | 620|Day the Earth Stood Still|Robert Wise|Hugh Marlowe, Lock Martin, Michael Rennie, Patricia Neal|1951||http://www.imdb.com/title/tt0043456/|http://www.guardian.co.uk/film/movie/82253/day-the-earth-stood-still|USA 94 | 621|Edward Scissorhands|Tim Burton|Dianne Wiest, Johnny Depp, Winona Ryder|1990||http://www.imdb.com/title/tt0099487/|http://www.guardian.co.uk/film/movie/82335/edward.scissorhands|USA 95 | 622|Akira|Katsuhiro Otomo|Mitsuo Iwata, Nozomu Sasaki, Mami Koyama, Tessho Genda|1988||http://www.imdb.com/title/tt0094625/|http://www.guardian.co.uk/film/movie/76882/akira|Japan 96 | 623|Princess Bride|Rob reiner|Billy Crystal, Carty Elwes, Cary Elwes, Mandy Patinkin, Peter Falk, Robin Wright|1987||http://www.imdb.com/title/tt0093779/|http://www.guardian.co.uk/film/movie/77070/princess-bride|USA 97 | 624|Pan’s Labyrinth|Guillermo del Toro| Ariadna Gil, Doug Jones, Ivana Baquero, Maribel Verdu, Sergi Lopez|2006|3|http://www.imdb.com/title/tt0457430/|http://www.guardian.co.uk/film/movie/112345/pan.s.labyrinth|Spain 98 | 625|Starship Troopers|Paul Verhoeven|Casper Van Dien, Clancy Brown, Dina Meyer, Jake Busey, Michael Ironside|1997||http://www.imdb.com/title/tt0120201/|http://www.guardian.co.uk/film/movie/71806/starship-troopers|USA 99 | 62|Metropolis|Fritz Lang|Alfred Abel, Brigitte Helm, Gustav Frohlich, Gustav Fruhlich|1927||http://www.imdb.com/title/tt0017136/|http://www.guardian.co.uk/film/movie/75782/metropolis|Germany 100 | 63|Blade Runner|Ridley Scott|Harrison Ford, Rutger Hauer, Sean Young|1982||http://www.imdb.com/title/tt0083658/|http://www.guardian.co.uk/film/movie/76627/blade-runner|USA 101 | 64|Alien|Ridley Scott|Ian Holm, John Hurt, Sigourney Weaver, Tom Skerritt|1979|1|http://www.imdb.com/title/tt0078748/|http://www.guardian.co.uk/film/movie/75860/alien|USA 102 | 65|The Wizard of Oz|Victor Fleming|Bert Lahr, Frank Morgan, Jack Haley, Judy Garland, Ray Bolger|1939|2|http://www.imdb.com/title/tt0032138/|http://www.guardian.co.uk/film/movie/36148/wizard.of.oz|USA 103 | 66|ET|Steven Spielberg|Dee Wallace, Drew Barrymore, Henry Thomas, Peter Coyote|1982|4|http://www.imdb.com/title/tt0083866/|http://www.guardian.co.uk/film/movie/92910/e.t.the.extra-terrestrial|USA 104 | 66|Solaris| Andrei Tarkovsky|Donatas Banionis, Juri Jarvet, Nataly Bondarchuk, Natalya Bondarchuk|1972||http://www.imdb.com/title/tt0069293/|http://www.guardian.co.uk/film/movie/76558/solaris|USA 105 | 68|Spirited Away|Hayao Miyazaki|Daveigh Chase, Jason Marsden, Jason Marsdon, Mari Natsuki, Miyu Irino, Rumi Hiragi, Suzanne Pleshette|2001|1|http://www.imdb.com/title/tt0245429/|http://www.guardian.co.uk/film/movie/96263/spirited.away|Japan 106 | 37|Comedy|Duck Soup|Leo McCarey|Chico Marx, Groucho Marx, Harpo Marx, Margaret Dumont, The Marx Brothers, Zeppo Marx|1933||http://www.imdb.com/title/tt0023969/|http://www.guardian.co.uk/film/movie/36133/duck.soup|USA 107 | 38|Comedy|Rushmore|Wes Anderson| Bill Murray, Brian Cox, Jason Schwartzman, Olivia Williams|1998||http://www.imdb.com/title/tt0128445/|http://www.guardian.co.uk/film/movie/79577/rushmore|USA 108 | 39|Comedy|Kind Hearts & Coronets|Robert Hamer|Alec Guinness, Dennis Price, Joan Greenwood|1949||http://www.imdb.com/title/tt0041546/|http://www.guardian.co.uk/film/movie/36180/kind-hearts-and-coronets|UK 109 | 410| The Thin Red Line|Terrence Malik|Adrien Brody, Ben Chaplin, Nick Nolte, Sean 
Penn|1998||http://www.imdb.com/title/tt0120863/|http://www.guardian.co.uk/film/movie/74795/thin.red.line|USA 110 | 411|Raiders of the Lost Ark|Steven Spielberg|Harrison Ford, Karen Allen, Paul Freeman, Ronald Lacey|1981|4|http://www.imdb.com/title/tt0082971/|http://www.guardian.co.uk/film/movie/36332/raiders-of-the-lost-ark|USA 111 | 712|Ringu|Hideo Nakata|Nanako Matsushima, Hiroyuki Sanada, Rikiya Otaka|1998||http://www.imdb.com/title/tt0178868/|http://www.guardian.co.uk/film/movie/121191/ringu|Japan 112 | 713|The Haunting|Robert Wise|Claire Bloom, Julie Harris, Richard Johnson|1963||http://www.imdb.com/title/tt0057129/|http://www.guardian.co.uk/film/movie/99697/haunting|USA 113 | 714|Texas Chainsaw Massacre|Tobe Hooper| Edwin Neal, Jim Siedow, Marilyn Burns, Paul A Partain|1974||http://www.imdb.com/title/tt0072271/|http://www.guardian.co.uk/film/movie/82763/texas-chainsaw-massacre|USA 114 | 715|Dead of Night|Alberto Cavalcanti, Charles Crichton|Googie Withers, Mervyn Johns, Michael Redgrave|1945||http://www.imdb.com/title/tt0037635/|http://www.guardian.co.uk/film/movie/79561/dead.of.night|UK 115 | 716|The Cabinet of Dr Caligari|Robert Wiene|Conrad Veidt, Lil Dagover, Werner Krauss|1920||http://www.imdb.com/title/tt0010323/|http://www.guardian.co.uk/film/movie/77300/cabinet-of-dr-caligari|Germany 116 | 717|Halloween|John Carpenter|Donald Pleasance, Donald Pleasence, Jamie Lee Curtis, Nancy Loomis, Tony Moran|1978||http://www.imdb.com/title/tt0077651/|http://www.guardian.co.uk/film/movie/104810/halloween|USA 117 | 718|Bride of Frankenstein|James Whale|Boris Karloff, Colin Clive, Elsa Lanchester|1935||http://www.imdb.com/title/tt0026138/|http://www.guardian.co.uk/film/movie/34577/bride-of-frankenstein|USA 118 | 719|Les Diaboliques|Henri-Georges Clouzot|Paul Meurisse, Simone Signoret, Vera Clouzot|1955||http://www.imdb.com/title/tt0046911/|http://www.guardian.co.uk/film/movie/75862/diaboliques|France 119 | 71|Psycho|Alfred Hitchcock|Anthony Perkins, Janet Leigh, Vera Miles|1960||http://www.imdb.com/title/tt0054215/|http://www.guardian.co.uk/film/movie/34630/psycho|USA 120 | 720|Audition|Miike Takashi|Eihi Shiina, Ishibashi Renji, Ishibashi Ryo, Matsuda Miyuki, Renji Ishibashi, Ryo Ishibashi, Shiina Eihi|1999||http://www.imdb.com/title/tt0235198/|http://www.guardian.co.uk/film/movie/84815/audition|Korea 121 | 412| Bullitt|Peter Yates|Jacqueline Bisset, Robert Vaughn, Steve McQueen|1968|1|http://www.imdb.com/title/tt0062765/|http://www.guardian.co.uk/film/movie/76966/bullitt|USA 122 | 412|Ran|Akira Kurosawa|Akira Terao, Daisuke Ryu, Mieko Harada, Tatsuya Nakadai|1985|1|http://www.imdb.com/title/tt0089881/|http://www.guardian.co.uk/film/movie/76633/ran|Japan 123 | 414|Die Hard|John McTeirnan|Alan Rickman, Bonnie Bedelia, Bruce Willis|1988||http://www.imdb.com/title/tt0095016/|http://www.guardian.co.uk/film/movie/80851/die-hard|Japan 124 | 415|The Adventures of Robin Hood|Michael Curtiz, William Keighley|Basil Rathbone, Claude Rains, Errol Flynn, Olivia De Havilland, Olivia de Havilland, William Keighley|1938|3|http://www.imdb.com/title/tt0029843/|http://www.guardian.co.uk/film/movie/34500/adventures-of-robin-hood|USA 125 | 416| The Searchers|John Ford|Jeffrey Hunter, John Wayne, Natalie Wood, Vera Miles, Ward Bond|1956||http://www.imdb.com/title/tt0049730/|http://www.guardian.co.uk/film/movie/115097/searchers|USA 126 | 417|Goldfinger|Guy Hamilton| Bernard Lee, Gert Frobe, Harold Sakata, Honor Blackman, Lois Maxwell, Sean Connery, Shirley Eaton, Tania 
Mallet|1964|1|http://www.imdb.com/title/tt0058150/|http://www.guardian.co.uk/film/movie/79341/goldfinger|UK 127 | 418|Full Metal Jacket|Stanley Kubrick|Adam Baldwin, Lee Ermey, Matthew Modine, Vincent D'Onofrio|1987||http://www.imdb.com/title/tt0093058/|http://www.guardian.co.uk/film/movie/76429/full-metal-jacket|USA 128 | 46|City of God|Fernando Meirelles|Alexandre Rodrigues, Leandro Firmino da Hora, Matheus Nachtergaele, Phelipe Haagensen|2002||http://www.imdb.com/title/tt0317248/|http://www.guardian.co.uk/film/movie/94028/city.of.god|Brazil 129 | 47|Paths of Glory|Stanley Kubrick|Adolphe Menjou, Kirk Douglas, Ralph Meeker|1957||http://www.imdb.com/title/tt0050825/|http://www.guardian.co.uk/film/movie/76931/paths.of.glory|USA 130 | 48|The Wages of Fear|Henri-Georges Clouzot|Charles Vanel, Folco Lulli, Yves Montand|1953||http://www.imdb.com/title/tt0046268/|http://www.guardian.co.uk/film/movie/78592/wages-of-fear|France 131 | 49|Crouching Tiger Hidden Dragon|Ang Lee|Chang Chen, Chow Yun-Fat, Michelle Yeoh, Zhang Ziyi, Ziyi Zhang|2000|4|http://www.imdb.com/title/tt0190332/|http://www.guardian.co.uk/film/movie/86383/crouching.tiger.hidden.dragon|Taiwan 132 | 610|Close Encounters|Steven Spielberg|Melinda Dillon, Richard Dreyfuss|1977|1|http://www.imdb.com/title/tt0075860/|http://www.guardian.co.uk/film/movie/36315/close-encounters-of-the-third-kind|USA 133 | 610|King Kong|Ernest B Schoedsack, Merian C Cooper|Bruce Cabot, Ernest B Schoedsack, Fay Wray, Frank Reicher, James Flavin, John Armstrong, Noble Jhonson, Robert Armstrong|1933||http://www.imdb.com/title/tt0024216/|http://www.guardian.co.uk/film/movie/36134/king.kong|USA 134 | 612|Terminator/Terminator 2|James Cameron|Arnold Schwarzenegger, Linda Hamilton, Michael Biehn|1984/1991|4 altogether|http://www.imdb.com/title/tt0088247/|http://www.guardian.co.uk/film/movie/88018/terminator|USA 135 | 613|The Matrix|Andy & Larry Wachowski|Carrie-Anne Moss, Keanu Reeves, Laurence Fishburne|1999|4|http://www.imdb.com/title/tt0133093/|http://www.guardian.co.uk/film/movie/77528/matrix|USA 136 | 614|Alphaville|Jean Luc-Godard|Anna Karina, Eddie Constantine|1965||http://www.imdb.com/title/tt0058898/|http://www.guardian.co.uk/film/movie/75764/alphaville|France 137 | 615|Back to the Future|Robert Zemeckis|Christopher Lloyd, Crispin Glover, Lea Thompson, Michael J Fox, Michael J. 
Fox|1985|1|http://www.imdb.com/title/tt0088763/|http://www.guardian.co.uk/film/movie/78042/back-to-the-future|USA 138 | 616|Planet of the Apes|Franklin J Schaffner |Charlton Heston, Kim Hunter, Roddy McDowell|1968|1|http://www.imdb.com/title/tt0063442/|http://www.guardian.co.uk/film/movie/95819/planet-of-the-apes|USA 139 | 617|Brazil|Terry Gilliam|Jonathan Pryce, Michael Palin, Robert De Niro|1985||http://www.imdb.com/title/tt0088846/|http://www.guardian.co.uk/film/movie/79920/brazil|UK 140 | 618|The Lord of the Rings trilogy|Peter Jackson|Cate Blanchett, Dominic Monaghan, Elijah Wood, Hugo Weaving, John Rhys-Davies, Liv Tyler, Miranda Otto, Orlando Bloom, Sean Astin, Sir Ian McKellen, Viggo Mortensen, William Boyd|2001-2003|17 altogether|http://www.imdb.com/title/tt0167260/|http://www.guardian.co.uk/film/movie/92716/lord.of.the.rings|New Zealand 141 | 619|Dark Star|John Carpenter|Brian Narelle, Dan O'Bannon, Dre Pahich|1974||http://www.imdb.com/title/tt0069945/|http://www.guardian.co.uk/film/movie/77501/dark-star|USA 142 | 69|Star Wars (1977)|George Lucas|Alec Guinness, Carrie Fisher, David Prowse, Harrison Ford, Mark Hamill, Peter Cushing, Peter Mayhew|1977|6|http://www.imdb.com/title/tt0076759/|http://www.guardian.co.uk/film/movie/36316/star.wars|USA 143 | 710|Peeping Tom|Michael Powell|Anna Massey, Carl Boehm, Esmond Knight, Karl Bohm, Maxine Audley, Moira Shearer|1960||http://www.imdb.com/title/tt0054167/|http://www.guardian.co.uk/film/movie/36228/peeping-tom|UK 144 | 711|The Innocents|Jack Clayton|Clytie Jessop, Deborah Kerr, Michael Redgrave, Peter Wyngarde|1961||http://www.imdb.com/title/tt0055018/|http://www.guardian.co.uk/film/movie/77279/innocents|USA 145 | 722|The Blair Witch Project|Daniel Myrick, E Sanchez|Heather Donahue, Joshua Leonard, Michael C. 
Williams|1999||http://www.imdb.com/title/tt0185937/|http://www.guardian.co.uk/film/movie/79459/blair.witch.project|USA 146 | 723|Evil Dead/Evil Dead II|Sam Raimi|Betsy Baker, Bruce Campbell, Ellen Sandweiss|1981/ 1987||http://www.imdb.com/title/tt0083907/|http://www.guardian.co.uk/film/movie/34582/evil-dead|USA 147 | 724|Carrie|Brian De Palma|John Travolta, Piper Laurie, Sissy Spacek|1976||http://www.imdb.com/title/tt0074285/|http://www.guardian.co.uk/film/movie/81489/carrie|USA 148 | 725|Les Vampires (1915)|Louis Feuillade|Edouard Mathe, Marcel Levesque|1915||http://www.imdb.com/title/tt0006206/|http://www.guardian.co.uk/film/movie/117077/vampires|France 149 | 72|Rosemary’s Baby|Roman Polanski|John Cassavetes, Mia Farrow, Ruth Gordon|1968|1|http://www.imdb.com/title/tt0063522/|http://www.guardian.co.uk/film/movie/80947/rosemary-s-baby|USA 150 | 915|Wall-E|Andrew Stanton|Ben Burtt, Fred Willard, Jeff Garlin, Kathy Najimy, Sigourney Weaver|2008|1|http://www.imdb.com/title/tt0910970/|http://www.guardian.co.uk/film/movie/125194/wall-e|USA 151 | -------------------------------------------------------------------------------- /data/poors_man_routes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | # Poor's man big data - bash edition 5 | 6 | function clean_up { 7 | rm -rf random_ships* 8 | } 9 | trap clean_up EXIT 10 | 11 | # Read ship_imo and ship_name 12 | mkfifo random_ships 13 | cut -d ";" -f1,2 containers_tiny.csv | uniq | tail -n +2 | sed "s/;/|/g" > random_ships & 14 | 15 | # Place random numbers 16 | while read f; do 17 | shuf country_codes.csv | head -n $(((RANDOM % 10) + 1)) | \ 18 | sed "s/,/|/g" | awk '{ printf "%d|'"$f"'|%s\n", i++, $0 }' 19 | done < random_ships 20 | 21 | -------------------------------------------------------------------------------- /data/random_data.rb: -------------------------------------------------------------------------------- 1 | require 'csv' 2 | require 'faker' 3 | require 'time' 4 | 5 | def gen_id alpha = 3, num = 10000 6 | ('A'..'Z').to_a.shuffle[0,alpha].join + (num + Random.rand(num - 1)).to_s 7 | end 8 | 9 | def time_rand from = 0.0, to = Time.now 10 | Time.at(from + rand * (to.to_f - from.to_f)) 11 | end 12 | 13 | def credit_card 14 | (1..4).map { |i| 1000 + Random.rand(999) }.join('-') 15 | end 16 | 17 | def funds(m) 18 | (1..m).each do |n| 19 | amount = Random.rand(1000.0) 20 | parent = gen_id 21 | parent_name = Faker::Company.name 22 | 23 | divisions = Random.rand(5) 24 | positions = [0] + (0..divisions-1).map { |d| Random.rand(1.0) }.sort + [1] 25 | currency = ['USD', 'EUR', 'JPY', 'AUD', 'CAD', 'GBP'].sample 26 | positions.each_cons(2) do |pos| 27 | percent = pos[1] - pos[0] 28 | identifier = gen_id 29 | tx_time = time_rand Time.local(2010, 1, 1), Time.local(2010, 12, 31) 30 | puts [tx_time, parent_name, parent, identifier, currency, percent.round(3), amount*percent].join('|') 31 | end 32 | end 33 | end 34 | 35 | def ratings(m) 36 | peliculas = ["01", "02", "03", "78", "79", "815", "29", "310", "04", "05", "28", "73", "74", "75", "765", "76", "77", "780", "31", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "18", "19", "20", "210", "218", "219", "21", "220", "220", "223", "224", "225", "22", "23", "214", "215", "216", "217", "23", "240", "24", "24", "25", "26", "270", "320", "321", "322", "323", "211", "211", "213", "324", "311", "312", "315", "316", "418", "41", "422", "422", "422", "423", "423", "425", "42", "43", "44", "45", "317", "318", "325", 
"328", "33", "33", "342", "34", "35", "362", "36", "61", "620", "621", "622", "623", "624", "625", "62", "63", "64", "65", "66", "66", "68", "37", "38", "39", "410", "411", "712", "713", "714", "715", "716", "717", "718", "719", "71", "720", "412", "412", "414", "415", "416", "417", "418", "46", "47", "48", "49", "610", "610", "612", "613", "614", "615", "616", "617", "618", "619", "69", "710", "711", "722", "723", "724", "725", "72", "915"] 37 | (1..m).each do |n| 38 | fecha = (time_rand Time.local(2016, 2, 15), Time.local(2016, 2, 21)).utc.iso8601 39 | puts "#{peliculas.sample},#{Random.rand(100000)},#{Random.rand(5)},#{fecha}" 40 | end 41 | end 42 | 43 | def ships_and_containers(m, p) 44 | puts ['ship_imo', 'ship_name', 'country', 'departure', 'container_id', 'container_type', 'container_group', 'net_weight', 'gross_weight', 'owner', 'declared', 'contact', 'customs_ok'].join(";") 45 | container_codes = CSV.read('./iso-container-codes.csv').map { |m| m[0] }.drop(1) 46 | container_groups = CSV.read('./iso-container-groups.csv').map { |m| m[0] }.drop(1) 47 | (1..m).each do |n| 48 | ship_imo = gen_id(3, 1000000) 49 | ship_name = [Faker::Name.first_name, Faker::Address.city].sample 50 | divisions = (p*10) + Random.rand((p*10)-1) 51 | positions = [0] + (0..divisions-1).map { |d| Random.rand(1.0) }.sort + [1] 52 | total_weight = (1000*1000*1000) + Random.rand(999999999) 53 | country = Faker::Address.country_code 54 | departure = (time_rand Time.local(2016, 2, 15), Time.local(2016, 2, 21)).strftime("%Y%m%d#{n}") 55 | positions.each_cons(2) do |pos| 56 | container_id = gen_id(4, 1000000) # ISO 6346 57 | container_type = container_codes.sample 58 | container_group = container_groups.sample 59 | owner = Faker::Company.name 60 | percent = pos[1] - pos[0] 61 | net_weight = (total_weight*percent).round(2) 62 | gross_weight = ([0.05, 0.1, 0.03].sample * net_weight).round(2) 63 | declared = Faker::Commerce.department(5) 64 | contact = Faker::Internet.email 65 | customs_ok = ((1..10).to_a.map { |n| true } + [false]).sample 66 | puts [ship_imo, ship_name, country, departure, container_id, container_type, container_group, net_weight, gross_weight, owner, declared, contact, customs_ok].join(";") 67 | end 68 | end 69 | end 70 | 71 | def shop(m) 72 | puts ['tx_id', 'tx_time', 'buyer', 'currency_code', 'payment_type', 'credit_card_number', 'country', 'department', 'product', 'item_price', 'coupon_code', 'was_returned'].join('|') 73 | (1..m).each do |n| 74 | buyer = Faker::Name.name 75 | tx_id = gen_id(7, 100) 76 | tx_time = time_rand Time.local(2010, 1, 1), Time.local(2010, 12, 31) 77 | cc = credit_card() 78 | price = Faker::Commerce.price 79 | currency = ['USD', 'EUR', 'JPY', 'AUD', 'CAD', 'GBP'].sample 80 | payment = ['VISA', 'MASTERCARD', 'AMERICAN_EXPRESS', 'DANKORT', 'JCB', 'FORBRUGSFORENINGEN'].sample 81 | country = Faker::Address.country_code 82 | divisions = [0, 0, Random.rand(5)].sample 83 | positions = [0] + (0..divisions-1).map { |d| Random.rand(1.0) }.sort + [1] 84 | positions.each_cons(2) do |pos| 85 | percent = pos[1] - pos[0] 86 | item_price = (price*percent).round(2) 87 | department = Faker::Commerce.department(1, true) 88 | product = Faker::Commerce.product_name 89 | coupon = [false, false, true, false].sample 90 | coupon_code = '' 91 | if (coupon) 92 | coupon_code = gen_id(3,2) 93 | end 94 | returned = [false, false, false, 'defect', 'bounce', false, false, false, false].sample 95 | puts [tx_id, tx_time.utc.iso8601, buyer, currency, payment, cc, country, department, product, item_price, 
coupon_code, returned].join('|') 96 | end 97 | 98 | end 99 | end 100 | 101 | # shop(1000) 102 | # ships_and_containers(20, 2) 103 | # ratings(10000) 104 | -------------------------------------------------------------------------------- /data/ship_routes.csv: -------------------------------------------------------------------------------- 1 | 0|GUI1871167|Kirlinland|Estonia|EE 2 | 1|GUI1871167|Kirlinland|Mauritania|MR 3 | 2|GUI1871167|Kirlinland|Dominica|DM 4 | 3|GUI1871167|Kirlinland|Puerto Rico|PR 5 | 4|GUI1871167|Kirlinland|Gabon|GA 6 | 5|GUI1871167|Kirlinland|Lao People's Democratic Republic|LA 7 | 6|GUI1871167|Kirlinland|Kazakhstan|KZ 8 | 7|GUI1871167|Kirlinland|Grenada|GD 9 | 8|GUI1871167|Kirlinland|Bonaire Sint Eustatius and Saba|BQ 10 | 9|GUI1871167|Kirlinland|Greece|GR 11 | 0|COB1191390|St. Elena|Mauritius|MU 12 | 1|COB1191390|St. Elena|Gabon|GA 13 | 2|COB1191390|St. Elena|United Kingdom|GB 14 | 3|COB1191390|St. Elena|Jersey|JE 15 | 4|COB1191390|St. Elena|Mongolia|MN 16 | 5|COB1191390|St. Elena|Guatemala|GT 17 | 6|COB1191390|St. Elena|Korea|KR 18 | 0|KRO1091605|Ike|Kuwait|KW 19 | 1|KRO1091605|Ike|Pitcairn|PN 20 | 2|KRO1091605|Ike|Uruguay|UY 21 | 3|KRO1091605|Ike|Zimbabwe|ZW 22 | 0|JMP1211539|John Navy|Egypt|EG 23 | 0|QEF1881275|Simone|Austria|AT 24 | 1|QEF1881275|Simone|Tokelau|TK 25 | 2|QEF1881275|Simone|Cayman Islands|KY 26 | 3|QEF1881275|Simone|South Georgia and the South Sandwich Islands|GS 27 | 4|QEF1881275|Simone|United Arab Emirates|AE 28 | 5|QEF1881275|Simone|Lao People's Democratic Republic|LA 29 | 6|QEF1881275|Simone|Honduras|HN 30 | 7|QEF1881275|Simone|Svalbard and Jan Mayen|SJ 31 | 0|QPU1694193|Prestige|Guadeloupe|GP 32 | 1|QPU1694193|Prestige|Cayman Islands|KY 33 | 2|QPU1694193|Prestige|Niger|NE 34 | 3|QPU1694193|Prestige|Trinidad and Tobago|TT 35 | 4|QPU1694193|Prestige|Cameroon|CM 36 | 5|QPU1694193|Prestige|Uganda|UG 37 | 0|YIL1516412|Abner|Saint Martin (French part)|MF 38 | 1|YIL1516412|Abner|Bangladesh|BD 39 | 2|YIL1516412|Abner|Bosnia and Herzegovina|BA 40 | 0|XJM1059834|Margaretteview|Marshall Islands|MH 41 | 1|XJM1059834|Margaretteview|Afghanistan|AF 42 | 0|YKX1212832|Danyka|Virgin Islands British|VG 43 | 1|YKX1212832|Danyka|Burundi|BI 44 | 2|YKX1212832|Danyka|Qatar|QA 45 | 3|YKX1212832|Danyka|South Africa|ZA 46 | 4|YKX1212832|Danyka|Belarus|BY 47 | 5|YKX1212832|Danyka|Réunion|RE 48 | 6|YKX1212832|Danyka|United Arab Emirates|AE 49 | 7|YKX1212832|Danyka|Grenada|GD 50 | 8|YKX1212832|Danyka|Niue|NU 51 | 0|AKO1391643|Keara|Uganda|UG 52 | 1|AKO1391643|Keara|New Caledonia|NC 53 | 2|AKO1391643|Keara|Pakistan|PK 54 | 3|AKO1391643|Keara|Côte d'Ivoire|CI 55 | 4|AKO1391643|Keara|Macedonia|MK 56 | 5|AKO1391643|Keara|Bhutan|BT 57 | 6|AKO1391643|Keara|Bahamas|BS 58 | 0|PKJ1313228|Nelson|Afghanistan|AF 59 | 1|PKJ1313228|Nelson|Suriname|SR 60 | 2|PKJ1313228|Nelson|Tonga|TO 61 | 3|PKJ1313228|Nelson|Guyana|GY 62 | 0|MHE1939455|Magdalenstad|Ethiopia|ET 63 | 0|CAT1031760|Calistaborough|Trinidad and Tobago|TT 64 | 1|CAT1031760|Calistaborough|Congo|CG 65 | 2|CAT1031760|Calistaborough|French Southern Territories|TF 66 | 3|CAT1031760|Calistaborough|Jersey|JE 67 | 4|CAT1031760|Calistaborough|Gambia|GM 68 | 5|CAT1031760|Calistaborough|Azerbaijan|AZ 69 | 6|CAT1031760|Calistaborough|Heard Island and McDonald Islands|HM 70 | 7|CAT1031760|Calistaborough|Myanmar|MM 71 | 0|ZEW1505964|East Pierre|Afghanistan|AF 72 | 1|ZEW1505964|East Pierre|Romania|RO 73 | 2|ZEW1505964|East Pierre|Somalia|SO 74 | 3|ZEW1505964|East Pierre|Netherlands|NL 75 | 4|ZEW1505964|East Pierre|Saint Pierre and Miquelon|PM 
76 | 0|RWK1014975|Princess|Mali|ML 77 | 1|RWK1014975|Princess|Peru|PE 78 | 2|RWK1014975|Princess|Aruba|AW 79 | 0|BXE1370077|Shaun|Pakistan|PK 80 | 1|BXE1370077|Shaun|Eritrea|ER 81 | 2|BXE1370077|Shaun|Austria|AT 82 | 3|BXE1370077|Shaun|Tajikistan|TJ 83 | 4|BXE1370077|Shaun|Serbia|RS 84 | 5|BXE1370077|Shaun|Chile|CL 85 | 6|BXE1370077|Shaun|Indonesia|ID 86 | 7|BXE1370077|Shaun|Equatorial Guinea|GQ 87 | 8|BXE1370077|Shaun|Nicaragua|NI 88 | 9|BXE1370077|Shaun|Bulgaria|BG 89 | 0|QCJ1879622|Urca da Lima|Micronesia|FM 90 | 1|QCJ1879622|Urca da Lima|Canada|CA 91 | 2|QCJ1879622|Urca da Lima|South Georgia and the South Sandwich Islands|GS 92 | 3|QCJ1879622|Urca da Lima|Saint Vincent and the Grenadines|VC 93 | 4|QCJ1879622|Urca da Lima|Guinea-Bissau|GW 94 | 5|QCJ1879622|Urca da Lima|Samoa|WS 95 | 6|QCJ1879622|Urca da Lima|Viet Nam|VN 96 | 0|EJQ1935333|Prestige|Cameroon|CM 97 | 1|EJQ1935333|Prestige|Montserrat|MS 98 | 2|EJQ1935333|Prestige|Lao People's Democratic Republic|LA 99 | 3|EJQ1935333|Prestige|Niger|NE 100 | 0|IFD1255823|North Cobyville|Mozambique|MZ 101 | 1|IFD1255823|North Cobyville|Spain|ES 102 | 0|FNV1248771|Lauryn|El Salvador|SV 103 | 1|FNV1248771|Lauryn|Estonia|EE 104 | 2|FNV1248771|Lauryn|Pitcairn|PN 105 | 3|FNV1248771|Lauryn|Luxembourg|LU 106 | 4|FNV1248771|Lauryn|Turks and Caicos Islands|TC 107 | 5|FNV1248771|Lauryn|Timor-Leste|TL 108 | 6|FNV1248771|Lauryn|Viet Nam|VN 109 | 7|FNV1248771|Lauryn|Belgium|BE 110 | 8|FNV1248771|Lauryn|Spain|ES 111 | -------------------------------------------------------------------------------- /infra/beam/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:jessie-slim 2 | LABEL maintainer="Luis Belloch " 3 | 4 | ENV DEBIAN_FRONTEND=noninteractive 5 | RUN apt-get update && \ 6 | apt-get install -y build-essential python-software-properties python-pip python-dev && \ 7 | pip install --upgrade setuptools && \ 8 | rm -rf /var/lib/apt/lists/* ~/.cache/* 9 | 10 | RUN pip install --upgrade apache-beam && \ 11 | rm -rf ~/.cache/* 12 | 13 | RUN mkdir -p /data /opt/beam 14 | WORKDIR /opt/beam 15 | 16 | -------------------------------------------------------------------------------- /infra/beam/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all build tag push list 2 | 3 | all: build tag 4 | 5 | build: 6 | docker build -t luisbelloch/beam . 7 | 8 | tag: 9 | docker tag luisbelloch/beam luisbelloch/beam:2018.1 10 | 11 | push: 12 | docker push luisbelloch/beam:2018.1 13 | docker push luisbelloch/beam 14 | 15 | list: 16 | docker images luisbelloch/beam 17 | 18 | -------------------------------------------------------------------------------- /infra/beam/README.md: -------------------------------------------------------------------------------- 1 | beam.md -------------------------------------------------------------------------------- /infra/beam/beam.md: -------------------------------------------------------------------------------- 1 | # Apache Beam Docker Image 2 | 3 | ## Basic usage 4 | 5 | This folder contains a simple docker container to execute Apache Beam using python SDK, under direct runner. 
The image has been published in Docker Hub as [luisbelloch/beam:python2](https://hub.docker.com/r/luisbelloch/beam/): 6 | 7 | ``` 8 | $ docker pull luisbelloch/beam:python2 9 | ``` 10 | 11 | A simple word count sample can be run as: 12 | 13 | ``` 14 | $ docker run luisbelloch/beam:python2 python -m apache_beam.examples.wordcount \ 15 | --input /etc/hosts --output /tmp/output.txt 16 | ``` 17 | 18 | We've included a script that will mount the current folder as a volume in `/data`: 19 | 20 | ``` 21 | $ ./beam -m apache_beam.examples.wordcount --input /etc/hosts --output /data/wordcount.txt 22 | ``` 23 | 24 | To run any script from the [samples](../../beam/) folder: 25 | 26 | ``` 27 | $ ./beam basic.py --input /data/compras_tiny.csv --output /data/purchases_summary.json 28 | ``` 29 | 30 | ## Building the container 31 | 32 | ``` 33 | docker build -t luisbelloch/beam:python2 . 34 | ``` 35 | 36 | -------------------------------------------------------------------------------- /infra/dataproc.md: -------------------------------------------------------------------------------- 1 | # Tutorial Dataproc (Spark) 2 | 3 | Dataproc es la versión gestionada de Spark en Google Cloud. En este tutorial vamos a cubrir cómo subir archivos a Cloud Storage (S3) y lanzar un trabajo de Spark para procesarlo. 4 | 5 | Duración estimada: 6 | 7 | ## Selecciona un proyecto 8 | 9 | 10 | 11 | ## Preparación 12 | 13 | ### 1. Habilita las APIs necesarias 14 | 15 | Antes de continuar es necesario habilitar las APIs de Cloud Storage y Dataproc. 16 | 17 | Habilitar APIs 18 | 19 | ### 2. Abre una terminal 20 | 21 | La mayoría de los comandos pueden ejecutarse desde la interfaz de usuario, pero en el tutorial utilizaremos la consola de cloudshell. 22 | 23 | Si no está abierta ya en la parte inferior puedes abrirla mediante el icono 24 | arriba a la derecha, o utilizando el siguiente enlace: 25 | 26 | 27 | 28 | ### 3. Materiales de clase 29 | 30 | Asegúrate de que la carpeta `cloudshell_open/data_processing_course` se ha creado y la terminal apunta a esa carpeta. 31 | 32 | ```sh 33 | cd ~/cloudshell_open/data_processing_course 34 | ``` 35 | 36 | Si no, puedes abrir de nuevo el proyecto desde [bigdata.luisbelloch.es](http://bigdata.luisbelloch.es) y seleccionando [Open in Cloud Shell](https://console.cloud.google.com/cloudshell/editor?cloudshell_git_repo=https://github.com/luisbelloch/data_processing_course.git). 37 | 38 | Alternativamente puedes clonar el repositorio mediante `git`: 39 | 40 | ```sh 41 | git clone https://github.com/luisbelloch/data_processing_course.git && cd data_processing_course 42 | ``` 43 | 44 | ## Paso 1: Crear un bucket en Cloud Storage 45 | 46 | El bucket se puede también crear desde [la UI de Google Cloud Storage](https://cloud.google.com/storage/docs/creating-buckets). 47 | 48 | En nuestro caso podemos usar la terminal para crearlo: 49 | 50 | ```sh 51 | gsutil mb -c regional -l europe-west1 gs://NOMBRE_BUCKET 52 | ``` 53 | 54 | Recuerda que el nombre del bucket `NOMBRE_BUCKET` debe ser único en internet. 55 | 56 | Para copiar datos puede utilizarse también `gsutil` con `cp`: 57 | 58 | ```sh 59 | gsutil cp data/compras_tiny.csv gs://NOMBRE_BUCKET 60 | ``` 61 | 62 | En el caso de que queramos sincronizar un directorio entero, podemos utilizar `rsync`: 63 | 64 | ```sh 65 | gsutil -m rsync data/ gs://NOMBRE_BUCKET 66 | ``` 67 | 68 | ## Paso 2: Crear un cluster en Dataproc 69 | 70 | Lo primero que debemos hacer es crear un cluster de Spark. 
Para las pruebas usaremos un único nodo, pero es posible crear varios también. En nuestro caso, vamos a crear un cluster llamado `dataproc1`. 71 | 72 | ```sh 73 | gcloud dataproc clusters create dataproc1 --region europe-west1 --single-node --enable-component-gateway 74 | ``` 75 | 76 | Una vez esté creado, podemos ver el estado del cluster en la [interfaz de usuario de Dataproc](https://console.cloud.google.com/dataproc/clusters). 77 | 78 | Es interesante ver que Dataproc ha creado distintas máquinas virtuales [en Compute Engine](https://console.cloud.google.com/compute/instances). 79 | 80 | Recuerda eliminar el cluster al finalizar el tutorial. 81 | 82 | ## Paso 3: Crear un trabajo de ejemplo de Spark 83 | 84 | Como ejemplo, vamos a crear un script que cuente las líneas del archivo `compras_tiny.csv`, llamado `prueba_dataproc.py`. 85 | 86 | ```python 87 | from os import path 88 | from pyspark import SparkContext 89 | 90 | sc = SparkContext('local', 'hello') 91 | rdd = sc.textFile('gs://bigdataupv_data/compras_tiny.csv') 92 | 93 | print("Count:", rdd.count()) 94 | ``` 95 | 96 | Puedes crear el script en cualquier carpeta, pero asegúrate de especificar la ruta al ejecutar el trabajo en el paso siguiente. 97 | 98 | ## Paso 4: Ejecutar el trabajo de Spark 99 | 100 | Para ejecutar el script `prueba_dataproc.py` que acabamos de crear es necesario enviarlo al cluster: 101 | 102 | ```sh 103 | gcloud dataproc jobs submit pyspark prueba_dataproc.py --cluster dataproc1 --region europe-west1 104 | ``` 105 | 106 | Esto creará un `job` (trabajo) en el cluster, ejecutado por Spark. 107 | 108 | Verás el progreso en la propia consola; en algún punto debería aparecer el número de filas del trabajo cuando termine: 109 | 110 | ```terminal 111 | Count: 1723 112 | ``` 113 | 114 | ### Adjuntar archivos adicionales 115 | 116 | En clase hemos trabajado haciendo uso de un archivo llamado `helpers.py`. Si se referencia el código de ese archivo desde cualquier script, es necesario adjuntarlo al trabajo mediante la opción `--files`: 117 | 118 | ```sh 119 | gcloud dataproc jobs submit pyspark prueba_dataproc.py --cluster dataproc1 --region europe-west1 --files=helpers.py 120 | ``` 121 | 122 | Los scripts pueden también residir en un bucket de Cloud Storage; simplemente reemplaza los nombres por la ruta completa de los archivos: 123 | 124 | ```terminal 125 | gs://bigdataupv_code/prueba_dataproc.py 126 | gs://bigdataupv_code/helpers.py 127 | ``` 128 | 129 | ## Paso 5: Determinar el estado de los trabajos lanzados 130 | 131 | Los trabajos ejecutados también son accesibles desde [la interfaz de usuario de Dataproc](https://console.cloud.google.com/dataproc/clusters/dataproc1/jobs), desde donde pueden consultarse los resultados. 
132 | 133 | Alternativamente se pueden listar todos los trabajos de una región, en nuestro caso `europe-west1`: 134 | 135 | ```sh 136 | gcloud dataproc jobs list --region=europe-west1 137 | ``` 138 | 139 | Tras ejecutarlo debería mostrar una lista de trabajos: 140 | 141 | ```terminal 142 | JOB_ID: 2c5c402a995e424ca24087498d559731 143 | TYPE: pyspark 144 | STATUS: DONE 145 | ``` 146 | 147 | ### Consultar un determinado trabajo 148 | 149 | Utilizando ese `JOB_ID` podemos también consultar el estado y los logs del trabajo, incluso antes de que finalize: 150 | 151 | ```sh 152 | gcloud dataproc jobs wait 2c5c402a995e424ca24087498d559731 --project bigdataupv2022 --region europe-west1 153 | ``` 154 | 155 | ## Paso 6: Eliminar el cluster 156 | 157 | Para finalizar el ejercicio eliminaremos el cluster creado, de forma que se detendrá la facturación por uso de los recursos involucrados: 158 | 159 | ```sh 160 | gcloud dataproc clusters delete dataproc1 --region=europe-west1 161 | ``` 162 | 163 | También es posible eliminarlo desde la consola de Google Cloud. 164 | 165 | ![](https://cloud.google.com/dataproc/images/dataproc-1-delete.png) 166 | 167 | ## Completado! 168 | 169 | Recuerda eliminar el cluster de Dataproc al completar el ejercicio. 170 | 171 | 172 | -------------------------------------------------------------------------------- /infra/docker/.envrc: -------------------------------------------------------------------------------- 1 | export DOCKER_BUILDKIT=1 2 | export COMPOSE_DOCKER_CLI_BUILD=1 3 | -------------------------------------------------------------------------------- /infra/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM amazoncorretto:11 as corretto-jdk 2 | RUN $JAVA_HOME/bin/jlink \ 3 | --verbose \ 4 | --add-modules ALL-MODULE-PATH \ 5 | --strip-debug \ 6 | --no-man-pages \ 7 | --no-header-files \ 8 | --compress=2 \ 9 | --output /opt/jre 10 | 11 | FROM debian:stable-slim 12 | LABEL maintainer="Luis Belloch " 13 | ENV JAVA_HOME=/opt/jre 14 | ENV PATH="${JAVA_HOME}/bin:${PATH}" 15 | COPY --from=corretto-jdk /opt/jre $JAVA_HOME 16 | 17 | ENV DEBIAN_FRONTEND=noninteractive 18 | RUN apt-get update && \ 19 | apt-get install -y --no-install-recommends ca-certificates procps python3-software-properties python3-numpy curl && \ 20 | rm -rf /var/lib/apt/lists/* 21 | 22 | ARG SPARK_VERSION=3.3.1 23 | ENV SPARK_HOME=/opt/spark 24 | RUN mkdir -p /opt/spark && curl -s https://downloads.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz | tar -xz -C "${SPARK_HOME}" --strip-components=1 25 | ENV PATH="${SPARK_HOME}/bin:${SPARK_HOME}/sbin:${PATH}" 26 | 27 | RUN cp "${SPARK_HOME}/conf/log4j2.properties.template" "${SPARK_HOME}/conf/log4j2.properties" && \ 28 | sed -ibak 's/rootLogger.level = info/rootLogger.level = error/g' "${SPARK_HOME}/conf/log4j2.properties" 29 | 30 | ENV SPARK_NO_DAEMONIZE=true 31 | ENV PYSPARK_PYTHON=/usr/bin/python3 32 | ENV PYSPARK_DRIVER_PYTHON=/usr/bin/python3 33 | EXPOSE 4040 7077 8080 34 | 35 | CMD ["pyspark"] 36 | -------------------------------------------------------------------------------- /infra/docker/Makefile: -------------------------------------------------------------------------------- 1 | SPARK_VERSION:=3.3.1 2 | COURSE_VERSION:=2022.12 3 | IMAGE_NAME:=luisbelloch/spark 4 | 5 | .PHONY: help 6 | help: 7 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(lastword $(MAKEFILE_LIST)) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' 8 | 9 | .PHONY: 
all 10 | all: build tag ## Builds and tags an image 11 | 12 | .PHONY: build 13 | build: ## Assembles image from Spark binaries 14 | docker build --build-arg SPARK_VERSION=${SPARK_VERSION} -t $(IMAGE_NAME) . 15 | 16 | .PHONY: tag 17 | tag: ## Adds tags to current latest image 18 | docker tag $(IMAGE_NAME) $(IMAGE_NAME):$(SPARK_VERSION) 19 | docker tag $(IMAGE_NAME) $(IMAGE_NAME):$(COURSE_VERSION) 20 | 21 | .PHONY: push 22 | push: ## Uploads images to registry 23 | docker push $(IMAGE_NAME):$(SPARK_VERSION) 24 | docker push $(IMAGE_NAME):$(COURSE_VERSION) 25 | docker push $(IMAGE_NAME) 26 | 27 | .PHONY: list 28 | list: ## Lists local generated images 29 | docker images $(IMAGE_NAME) 30 | 31 | -------------------------------------------------------------------------------- /infra/docker/README.md: -------------------------------------------------------------------------------- 1 | docker.md -------------------------------------------------------------------------------- /infra/docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | master: 4 | image: luisbelloch/spark 5 | ports: 6 | - 8080:8080 7 | - 7077:7077 8 | command: /opt/spark/sbin/start-master.sh 9 | environment: 10 | MASTER: spark://master:7077 11 | SPARK_PUBLIC_DNS: localhost 12 | SPARK_NO_DAEMONIZE: 1 13 | worker: 14 | image: luisbelloch/spark 15 | command: /opt/spark/sbin/start-slave.sh spark://master:7077 16 | environment: 17 | SPARK_PUBLIC_DNS: localhost 18 | SPARK_NO_DAEMONIZE: 1 19 | ports: 20 | - 8081:8081 21 | -------------------------------------------------------------------------------- /infra/docker/docker.md: -------------------------------------------------------------------------------- 1 | # Spark-on-Docker Samples 2 | 3 | This folder will be used to see how we could provision a Spark cluster using Docker. While this is an interesting exercise to reason about some of the implications, ask yourself first if this makes sense at all before going to production. 4 | 5 | ## Bare-bones Docker Image 6 | 7 | By default, the image's command is set to `pyspark`, so running it without parameters will drop you directly into the Python REPL: 8 | 9 | ``` 10 | $ docker run -ti luisbelloch/spark 11 | ``` 12 | 13 | ### Running PySpark samples 14 | 15 | We've included a script to easily run the scripts in the [spark](../../spark) folder. To run any of the scripts, simply do: 16 | 17 | ``` 18 | $ cd data_processing_course/spark 19 | $ ./spark compras_conversion_a_dolares.py 20 | ``` 21 | 22 | Please pay attention to the `./` prefix before the name of the script, `./spark`. The docker container has access to all the scripts in that folder, including the `data` folder inside it: 23 | 24 | ```python 25 | txt = sc.textFile('./data/compras_tiny.csv') 26 | ``` 27 | 28 | ### Using the image without the "spark" helper script 29 | 30 | Remember that inside the container you won't have access to the samples or data files we'll use in the classroom. You'll have to mount a volume with them, [using the -v option](https://docs.docker.com/engine/tutorials/dockervolumes). The local folder path cannot be relative; use the `readlink` command to convert it to an absolute one. 31 | 32 | ``` 33 | $ docker run \ 34 | -v $(readlink -f ../../spark):/opt/samples \ 35 | -w /opt/samples \ 36 | -ti luisbelloch/spark spark-submit /opt/samples/compras_con_mas_de_un_descuento.py 37 | ``` 38 | 39 | That should spawn a new container and run the job inside it. 
We've also mounted the samples folder in `/opt/samples` inside the container. All the executables from the Spark distribution are available in the container's path. 40 | 41 | ### How to build the images 42 | 43 | Images are available in [Docker Hub](https://hub.docker.com/r/luisbelloch/spark/); you can easily modify and rebuild them: 44 | 45 | ``` 46 | $ docker build -t luisbelloch/spark . 47 | $ docker tag luisbelloch/spark:2.10 luisbelloch/spark:latest 48 | ``` 49 | 50 | ### Running Spark Master \ Workers 51 | 52 | The `SPARK_NO_DAEMONIZE` variable is already set in the `Dockerfile`; it makes the start scripts run in the foreground instead of leaving the process in the background. 53 | 54 | The first step is to start the master node. We've exposed ports 8080 (UI) and 7077 (Spark). 55 | 56 | ``` 57 | $ docker run -p 8080:8080 -p 7077:7077 -d luisbelloch/spark start-master.sh 58 | ``` 59 | 60 | Note that workers connect to the master node through port 7077, which is exposed to the actual physical machine. Remember to configure port forwarding if you run docker inside a virtual machine. 61 | 62 | After it starts, go to [localhost:8080](http://localhost:8080) and get the master URL. In our case it is `spark://11168790f9c1:7077`. You will also need the container alias, `nervous_noyce`, to enable a link between master and worker containers. List containers with `docker ps` to retrieve it. 63 | 64 | ``` 65 | $ docker ps 66 | CONTAINER ID IMAGE NAMES 67 | 11168790f9c1 luisbelloch/spark nervous_noyce 68 | 69 | $ docker run -p 8081:8081 \ 70 | --link nervous_noyce \ 71 | -d luisbelloch/spark start-worker.sh spark://11168790f9c1:7077 72 | ``` 73 | 74 | The worker node should be displayed in the master UI. 75 | 76 | Remember that if you want to run jobs against those containers you need to point `spark-submit` or `pyspark` to the master node. To do it, add the `--master` option and set the URL that you copied from the master node web page: 77 | 78 | ``` 79 | $ docker run -p 8081:8081 \ 80 | --link nervous_noyce \ 81 | -ti luisbelloch/spark pyspark \ 82 | --master spark://11168790f9c1:7077 83 | ``` 84 | 85 | ## Using Docker Compose 86 | 87 | To bring up a mini-cluster with a master node and one worker: 88 | 89 | ``` 90 | $ docker compose up 91 | ``` 92 | 93 | The master UI should be available at [localhost:8080](http://localhost:8080). 94 | 95 | Then you can also connect to it via `pyspark`: 96 | 97 | ``` 98 | $ docker compose run -p 4040:4040 master pyspark --master spark://master:7077 99 | ``` 100 | 101 | Running `docker ps` will show the containers and their mapped ports. Workers can connect to the master using internal DNS resolution, since we've exposed the master node as `master`. Note that exposing the worker nodes' ports is not straightforward; we'll discuss that in class. 102 | 103 | To scale up/down the cluster: 104 | 105 | ``` 106 | $ docker compose scale worker=3 107 | ``` 108 | 109 | Beware that the desired state persists between runs. 
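As a quick sanity check that the compose cluster is really doing the work, you can reuse the `pyspark` session shown above and run a trivial job from it. This is only a sketch; it assumes the master and at least one worker are up, and the count should come back from the cluster:

```
$ docker compose run master pyspark --master spark://master:7077
>>> sc.parallelize(range(1000)).count()
1000
```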
110 | -------------------------------------------------------------------------------- /infra/kubernetes/README.md: -------------------------------------------------------------------------------- 1 | kubernetes.md -------------------------------------------------------------------------------- /infra/kubernetes/kubernetes.md: -------------------------------------------------------------------------------- 1 | # Spark-on-Kubernetes 2 | 3 | The code is based on [official Kubernetes examples](https://github.com/kubernetes/kubernetes/tree/master/examples/spark) and the [Spark 2.1 docker image](../docker/docker.md) used during the course. 4 | 5 | ## Prerequisites 6 | 7 | We recommend working locally using `minikube`. Install [kubectl](https://kubernetes.io/docs/user-guide/prereqs/) and [minikube](https://kubernetes.io/docs/getting-started-guides/minikube/) from the official sources. 8 | 9 | After installing `minikube` we will push our Spark docker image to the internal Kubernetes registry. Use `minikube docker-env` to point the current docker client to our cluster. 10 | 11 | ``` 12 | $ eval $(minikube docker-env) 13 | $ docker build -t luisbelloch/spark ../infra/docker 14 | $ docker push luisbelloch/spark 15 | ``` 16 | 17 | Alternatively you could use GCR images; just point the container images to `gcr.io/google_containers/spark:1.5.2_v1`. 18 | 19 | ## Cluster provisioning 20 | 21 | First of all, we'll create a new namespace for our cluster and configure a context for `kubectl`. From this point, all the `kubectl` commands will be confined to that namespace. 22 | 23 | ``` 24 | $ kubectl create -f namespace.yaml 25 | $ kubectl config set-context spark --namespace=bigdataupv-spark --user=minikube --cluster=minikube 26 | $ kubectl config use-context spark 27 | ``` 28 | 29 | ### Master node 30 | 31 | The first thing we'll deploy is the Spark master. We've defined a replication controller that will create just one container to host it. Note that if the master goes down, Kubernetes will automatically respawn the container. 32 | 33 | ``` 34 | $ kubectl create -f master-controller.yaml 35 | $ kubectl get pods 36 | NAME READY STATUS RESTARTS AGE 37 | spark-master-controller-5pzdb 0/1 ContainerCreating 0 1s 38 | ``` 39 | 40 | If you want to submit the configuration again after some changes, use the `apply` command and Kubernetes will reconfigure the controller. Although you can use the UI for this, note that the best practice is to reapply configurations. 41 | 42 | ``` 43 | $ kubectl apply -f master-controller.yaml 44 | ``` 45 | 46 | Now we can check that the pod is up and running and that the master has been elected as leader: 47 | 48 | ``` 49 | $ kubectl get pods 50 | NAME READY STATUS RESTARTS AGE 51 | spark-master-controller-78dqq 1/1 Running 0 2m 52 | 53 | $ kubectl logs spark-master-controller-78dqq 54 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 55 | 17/02/09 20:42:57 INFO Master: Started daemon with process name: 12@spark-master-controller-78dqq 56 | ... 57 | 17/02/09 20:42:58 INFO Master: I have been elected leader! New state: ALIVE 58 | ``` 59 | 60 | Note that the replication controller has `replicas: 1`, so only one pod will be created to act as the master. 
The master node should declare two services: one on port 7077 to communicate with workers, and another on port 8080 serving the web UI: 61 | 62 | ``` 63 | $ kubectl apply -f master-service.yaml 64 | ``` 65 | 66 | The Spark UI can be accessed by starting `kubectl proxy` and navigating directly to this URL: 67 | 68 | ``` 69 | http://127.0.0.1:8001/api/v1/proxy/namespaces/bigdataupv-spark/pods/spark-master-controller-78dqq:8080/ 70 | ``` 71 | 72 | ### Slaves 73 | 74 | Starting slaves is pretty straightforward. Remember we've exposed the master node under the name `spark-master`, and therefore it will be accessible from other pods using simple DNS calls. The following command will create a replication controller for the slaves, starting with one pod: 76 | 77 | ``` 77 | $ kubectl apply -f slave-controller.yaml 78 | 79 | $ kubectl get rc -o wide 80 | NAME DESIRED CURRENT READY AGE CONTAINER(S) IMAGE(S) SELECTOR 81 | spark-master-controller 1 1 1 36m spark-master luisbelloch/spark component=spark-master 82 | spark-worker-controller 1 1 1 3m spark-worker luisbelloch/spark component=spark-worker 83 | ``` 84 | 85 | ### Accessing PySpark 86 | 87 | We can open a `PySpark` session directly in the master node, using the `exec` command: 88 | 89 | ``` 90 | $ kubectl exec spark-master-controller-78dqq -ti -- pyspark --master=spark://spark-master-controller-78dqq:7077 91 | ``` 92 | 93 | If you ever need an interactive login, simply replace `pyspark` with `/bin/bash`. 94 | 95 | ### Scaling the cluster 96 | 97 | ``` 98 | $ kubectl scale --replicas=4 rc/spark-worker-controller 99 | replicationcontroller "spark-worker-controller" scaled 100 | 101 | $ kubectl get pods 102 | NAME READY STATUS RESTARTS AGE 103 | spark-master-controller-78dqq 1/1 Running 0 40m 104 | spark-worker-controller-9r9vd 1/1 Running 0 8s 105 | spark-worker-controller-sp3tt 1/1 Running 0 1m 106 | spark-worker-controller-srvdm 0/1 ContainerCreating 0 8s 107 | ``` 108 | 109 | ## Problems not addressed 110 | 111 | As we've seen in class, this has been an exercise to play with Spark deployment options, and much deeper thought is needed before going to production. Generally speaking, Spark needs a bit more work to make it aware of the environment it executes in, particularly the UIs. In the Kubernetes repository there are a few issues that you may follow closely to get more information: 112 | 113 | - [#16517](kubernetes/kubernetes#16517) Has a good compendium of problems and things that don't work out of the box. 114 | - [#34377](kubernetes/kubernetes#34377) Describes some ideas to support other Spark deployment modes than the "standalone" one. 115 | - [#16949](kubernetes/kubernetes#16949) Talks about the problem with slave UIs ports and how it may be resolved. 
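Finally, once you are done experimenting, everything can be torn down by deleting the controllers, the service and the namespace created earlier. This is a minimal cleanup sketch, assuming the `bigdataupv-spark` namespace from `namespace.yaml` and the default `minikube` context:

```
$ kubectl delete -f slave-controller.yaml -f master-service.yaml -f master-controller.yaml
$ kubectl delete -f namespace.yaml
$ kubectl config use-context minikube
```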
116 | -------------------------------------------------------------------------------- /infra/kubernetes/master-controller.yaml: -------------------------------------------------------------------------------- 1 | kind: ReplicationController 2 | apiVersion: v1 3 | metadata: 4 | name: spark-master-controller 5 | spec: 6 | replicas: 1 7 | selector: 8 | component: spark-master 9 | template: 10 | metadata: 11 | labels: 12 | component: spark-master 13 | spec: 14 | containers: 15 | - name: spark-master 16 | image: luisbelloch/spark 17 | imagePullPolicy: Never 18 | command: ["/opt/spark/sbin/start-master.sh"] 19 | env: 20 | - name: SPARK_NO_DAEMONIZE 21 | value: "true" 22 | ports: 23 | - containerPort: 7077 24 | - containerPort: 8080 25 | resources: 26 | requests: 27 | cpu: 100m 28 | -------------------------------------------------------------------------------- /infra/kubernetes/master-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: spark-master 5 | spec: 6 | type: NodePort 7 | ports: 8 | - port: 7077 9 | targetPort: 7077 10 | name: spark 11 | - port: 8080 12 | targetPort: 8080 13 | name: http 14 | selector: 15 | component: spark-master 16 | 17 | -------------------------------------------------------------------------------- /infra/kubernetes/namespace.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: "bigdataupv-spark" 5 | labels: 6 | name: "bigdataupv-spark" 7 | -------------------------------------------------------------------------------- /infra/kubernetes/slave-controller.yaml: -------------------------------------------------------------------------------- 1 | kind: ReplicationController 2 | apiVersion: v1 3 | metadata: 4 | name: spark-worker-controller 5 | spec: 6 | replicas: 1 7 | selector: 8 | component: spark-worker 9 | template: 10 | metadata: 11 | labels: 12 | component: spark-worker 13 | spec: 14 | containers: 15 | - name: spark-worker 16 | image: luisbelloch/spark 17 | command: ["/opt/spark/sbin/start-slave.sh", "spark://spark-master:7077"] 18 | env: 19 | - name: SPARK_NO_DAEMONIZE 20 | value: "true" 21 | resources: 22 | requests: 23 | cpu: 100m 24 | -------------------------------------------------------------------------------- /infra/minio/config/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "8", 3 | "hosts": { 4 | "local": { 5 | "url": "http://localhost:9000", 6 | "accessKey": "JX6SNZEW2CLYM66UDHT7", 7 | "secretKey": "NHtuFRcy8XnRuqbASsHTK65oxYMQ7sNvwTnA1oX0", 8 | "api": "S3v4" 9 | }, 10 | "minio": { 11 | "url": "http://minio:9000", 12 | "accessKey": "JX6SNZEW2CLYM66UDHT7", 13 | "secretKey": "NHtuFRcy8XnRuqbASsHTK65oxYMQ7sNvwTnA1oX0", 14 | "api": "s3v4" 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /infra/minio/config/config.json.old: -------------------------------------------------------------------------------- 1 | { 2 | "version": "8", 3 | "hosts": { 4 | "gcs": { 5 | "url": "https://storage.googleapis.com", 6 | "accessKey": "YOUR-ACCESS-KEY-HERE", 7 | "secretKey": "YOUR-SECRET-KEY-HERE", 8 | "api": "S3v2" 9 | }, 10 | "local": { 11 | "url": "http://localhost:9000", 12 | "accessKey": "", 13 | "secretKey": "", 14 | "api": "S3v4" 15 | }, 16 | "play": { 17 | "url": "https://play.minio.io:9000", 18 | "accessKey": "Q3AM3UQ867SPQQA43P2F", 19 | 
"secretKey": "zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG", 20 | "api": "S3v4" 21 | }, 22 | "s3": { 23 | "url": "https://s3.amazonaws.com", 24 | "accessKey": "YOUR-ACCESS-KEY-HERE", 25 | "secretKey": "YOUR-SECRET-KEY-HERE", 26 | "api": "S3v4" 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /infra/minio/config/share/downloads.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "1", 3 | "shares": {} 4 | } -------------------------------------------------------------------------------- /infra/minio/config/share/uploads.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "1", 3 | "shares": {} 4 | } -------------------------------------------------------------------------------- /infra/minio/data: -------------------------------------------------------------------------------- 1 | ../../data/ -------------------------------------------------------------------------------- /infra/minio/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | minio: 4 | image: minio/minio 5 | command: server /data 6 | volumes: 7 | - minio1:/data 8 | networks: [block] 9 | ports: 10 | - 9000:9000 11 | environment: 12 | MINIO_ACCESS_KEY: JX6SNZEW2CLYM66UDHT7 13 | MINIO_SECRET_KEY: NHtuFRcy8XnRuqbASsHTK65oxYMQ7sNvwTnA1oX0 14 | 15 | networks: 16 | block: 17 | 18 | volumes: 19 | minio1: 20 | 21 | -------------------------------------------------------------------------------- /infra/minio/mc: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eou pipefail 3 | 4 | readonly MC=(docker run -v "$(PWD)/data":/data -v "$(PWD)/config":/root/.mc --network="host" -ti minio/mc) 5 | 6 | if [[ $# -lt 1 ]]; then 7 | >&2 ${MC[@]} 8 | exit 1 9 | fi 10 | 11 | ${MC[@]} $@ 12 | 13 | -------------------------------------------------------------------------------- /infra/minio/mirror.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eou pipefail 3 | readonly BUCKET=local/data 4 | ./mc mb -p "${BUCKET}" 5 | ./mc mirror --remove data "${BUCKET}" 6 | 7 | -------------------------------------------------------------------------------- /infra/pyspark-jupyter/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM jupyter/pyspark-notebook 2 | LABEL maintainer="Luis Belloch " 3 | ENV JUPYTER_ENABLE_LAB=yes 4 | RUN git clone https://github.com/luisbelloch/data_processing_course.git && \ 5 | mv data_processing_course/data . && \ 6 | mv data_processing_course/spark ./ejemplos && \ 7 | rm -rf data_processing_course 8 | -------------------------------------------------------------------------------- /infra/pyspark-jupyter/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build tag push list run 2 | 3 | all: build tag 4 | 5 | build: 6 | docker build -t luisbelloch/pyspark-jupyter . 
7 | 8 | tag: 9 | docker tag luisbelloch/pyspark-jupyter luisbelloch/pyspark-jupyter:2021.10 10 | 11 | push: 12 | docker push luisbelloch/pyspark-jupyter:2021.10 13 | docker push luisbelloch/pyspark-jupyter 14 | 15 | run: 16 | docker run -p 8888:8888 -p 4040:4040 luisbelloch/pyspark-jupyter 17 | 18 | list: 19 | docker images luisbelloch/pyspark-jupyter 20 | 21 | -------------------------------------------------------------------------------- /infra/pyspark-jupyter/README.md: -------------------------------------------------------------------------------- 1 | # PySpark + Jupyter 2 | 3 | This folder contains a Docker image with PySpark ready to be run from a Jupyter Notebook, specifically customized for the course. 4 | 5 | For more general uses, we recommend using the official [Jupyter Docker Stacks](https://jupyter-docker-stacks.readthedocs.io/en/latest/index.html). This image itself is derived from the `jupyter/pyspark-notebook` image. 6 | 7 | To run it, simply do: 8 | 9 | ```bash 10 | docker run -p 8888:8888 -ti luisbelloch/pyspark-jupyter 11 | ``` 12 | 13 | Then navigate to [http://localhost:8888](http://localhost:8888). The access token will be displayed in the terminal. 14 | 15 | This image contains the `data` folder used in the examples. You can easily access it from the notebook: 16 | 17 | ```python 18 | rdd = sc.textFile('./data/compras_tiny.csv') 19 | rdd.take(2) 20 | ``` 21 | 22 | -------------------------------------------------------------------------------- /infra/single-node.md: -------------------------------------------------------------------------------- 1 | # Setting up single-node Spark 2 | 3 | This document describes how to download and set up Spark on your machine _without_ requiring a cluster setup. 4 | 5 | > :warning: This is only intended for demo and learning purposes, please refer to the [official deployment guide](https://spark.apache.org/docs/latest/cluster-overview.html) for further information on how to properly deploy a Spark cluster. 6 | 7 | In this repository you will also find other options to run Spark locally: 8 | 9 | - [Spark on Docker](docker/docker.md) 10 | - [Spark on Kubernetes](kubernetes/kubernetes.md) 11 | - [Spark on Vagrant](vagrant.md) 12 | - [Spark on Google Cloud Dataproc](dataproc.md) 13 | - [PySpark Jupyter Notebook](pyspark-jupyter/README.md) 14 | 15 | ## Requirements 16 | 17 | This setup assumes you have a Linux machine with Java 8 and Python 3 installed. Assuming a Debian _stretch_ distribution, you can install the required dependencies with the following commands: 18 | 19 | ```bash 20 | sudo apt-get update 21 | sudo apt-get install -y openjdk-8-jdk-headless python3-software-properties python3-numpy curl 22 | ``` 23 | 24 | ## Downloading and unpacking Spark 25 | 26 | We recommend installing Spark in `/opt/spark`. To download the Spark package, you can use the following commands: 27 | 28 | ```bash 29 | mkdir /opt/spark 30 | curl http://apache.rediris.es/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz | tar -xz -C /opt/spark --strip-components=1 31 | ``` 32 | 33 | To make the Spark binaries accessible, add `/opt/spark/bin` to the `PATH` by appending the following lines to your `.bashrc` file: 34 | 35 | ```bash 36 | export PYSPARK_PYTHON=python3 37 | export PATH=$PATH:/opt/spark/bin 38 | ``` 39 | 40 | After that, restart the current shell to make sure the `PATH` changes are applied.
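For instance, to apply the change in the current session and confirm that the binaries resolve, something along these lines should work (a minimal check, assuming the default bash profile; adjust the file name if your shell uses a different one):

```bash
source ~/.bashrc
which spark-submit      # should print /opt/spark/bin/spark-submit
spark-submit --version  # prints the Spark and Scala versions and exits
```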
41 | 42 | ## Testing the installation 43 | 44 | Simply run the following command; you should get a value like `res0: Long = 100` in the console: 45 | 46 | ```bash 47 | echo 'sc.parallelize(1 to 100).count()' | spark-shell 48 | ``` 49 | 50 | ## Reducing log level 51 | 52 | By default Spark is quite verbose and will output a lot of information in the terminal. Optionally, you can reduce the log level as follows: 53 | 54 | 1. Rename the file `/opt/spark/conf/log4j.properties.template` to `log4j.properties`, in the same directory. 55 | 2. Edit the file and set the `rootCategory` property to `ERROR` instead of `INFO`. 56 | 57 | Use this command to do both steps automatically: 58 | 59 | ```bash 60 | sed 's/rootCategory=INFO/rootCategory=ERROR/g' < /opt/spark/conf/log4j.properties.template > /opt/spark/conf/log4j.properties 61 | ``` 62 | 63 | ## TL;DR Using helper script 64 | 65 | All of this can be accomplished with a simple script included in the [classroom repository](https://github.com/luisbelloch/data_processing_course). Just clone the repository and run [`local_setup.sh`](../local_setup.sh): 66 | 67 | ```bash 68 | git clone https://github.com/luisbelloch/data_processing_course.git 69 | cd data_processing_course 70 | ./local_setup.sh 71 | ``` 72 | 73 | Spark will be installed in `data_processing_course/.spark`. Do not forget to add the `bin` folder to the `$PATH`. 74 | -------------------------------------------------------------------------------- /infra/vagrant.md: -------------------------------------------------------------------------------- 1 | # Using PySpark inside a Vagrant machine 2 | 3 | We have created a Vagrant setup using Ansible that will download and unpack Spark inside the generated machine. 4 | 5 | > :warning: This is only intended for demo and learning purposes, please refer to the [official deployment guide](https://spark.apache.org/docs/latest/cluster-overview.html) for further information on how to properly deploy a Spark cluster. 6 | 7 | To bootstrap the machine, do: 8 | 9 | ```bash 10 | git clone https://github.com/luisbelloch/data_processing_course.git 11 | cd data_processing_course 12 | vagrant up 13 | ``` 14 | 15 | Once the process completes, you can access the machine using: 16 | 17 | ```bash 18 | vagrant ssh 19 | ``` 20 | 21 | Remember that you can access the _host_ machine files using the `/vagrant` folder from inside the VM. 22 | 23 | ## Testing the installation 24 | 25 | Make sure the machine is up and running with `vagrant up`, and that you can access the virtual machine with `vagrant ssh`.
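If you are not sure whether the box is already running, `vagrant status` reports its current state (mentioned here only as a quick check; it is not part of the original setup steps):

```bash
vagrant status   # the default machine should be reported as "running"
```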
26 | 27 | To test the setup, run the following command; you should get a value like `res0: Long = 100` in the console: 28 | 29 | ```bash 30 | echo 'sc.parallelize(1 to 100).count()' | spark-shell 31 | ``` 32 | 33 | ## Running samples 34 | 35 | The samples we discussed in class are available in the folder `/vagrant/spark` inside the virtual machine: 36 | 37 | ```bash 38 | vagrant@buster:~$ cd /vagrant/spark/ 39 | vagrant@buster:/vagrant/spark$ spark-submit compras_con_mas_de_un_descuento.py 40 | ``` 41 | 42 | You may want to start the `pyspark` REPL as well: 43 | 44 | ```bash 45 | vagrant@buster:~$ cd /vagrant/spark/ 46 | vagrant@buster:/vagrant/spark$ pyspark 47 | ``` 48 | -------------------------------------------------------------------------------- /local_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | SPARK_URL=${SPARK_URL:-https://downloads.apache.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz} 4 | SPARK_PKG=${SPARK_URL##*/} 5 | SPARK_HOME=${SPARK_HOME:-$(pwd)/.spark} 6 | 7 | if [ -t 1 ]; then 8 | readonly colors=$(tput colors) 9 | if [ -n "$colors" ]; then 10 | readonly c_step="$(tput setaf 6)" 11 | readonly c_error="$(tput setaf 1)" 12 | readonly c_norm="$(tput sgr0)" 13 | fi 14 | fi 15 | 16 | stderr() { >&2 echo "$@"; } 17 | 18 | if [[ -d "${SPARK_HOME}" ]]; then 19 | stderr "${c_error}ERROR${c_norm}: Folder already exists '$SPARK_HOME'" 20 | stderr "Set SPARK_HOME to an empty folder before running this script or make sure there's no 'spark' folder in the current directory." 21 | exit 1 22 | fi 23 | 24 | stderr "${c_step}[0] Destination: ${SPARK_HOME}${c_norm}" 25 | stderr "${c_step}[1] Downloading and unpacking $SPARK_PKG${c_norm}" 26 | mkdir -p "${SPARK_HOME}" 27 | curl -s "${SPARK_URL}" | tar -xz -C "${SPARK_HOME}" --strip-components=1 28 | 29 | stderr "${c_step}[2] Reducing log level${c_norm}" 30 | cp "${SPARK_HOME}"/conf/log4j2.properties.template "${SPARK_HOME}"/conf/log4j2.properties 31 | sed -ibak 's/rootLogger.level = info/rootLogger.level = error/g' "${SPARK_HOME}/conf/log4j2.properties" 32 | 33 | stderr "${c_step}[3] Testing setup${c_norm}" 34 | echo 'sc.parallelize(1 to 100).count()' | "${SPARK_HOME}"/bin/spark-shell 35 | rm -rf derby.log metastore_db 36 | 37 | stderr 38 | stderr "${c_step}DONE! Local setup completed${c_norm}" 39 | stderr "Spark unpacked properly. 
You can now modify your path:" 40 | echo "export PATH=${SPARK_HOME// /\\ /}/bin:\$PATH" 41 | 42 | -------------------------------------------------------------------------------- /playbook.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | vars: 4 | spark_home: /opt/spark 5 | spark_pkg_name: spark-3.3.1-bin-hadoop3 6 | spark_pkg_url: https://downloads.apache.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz 7 | 8 | tasks: 9 | - name: Update all packages to the latest version 10 | become: true 11 | apt: 12 | upgrade: dist 13 | update_cache: yes 14 | 15 | - name: Basic dependencies 16 | become: true 17 | apt: 18 | name: ['software-properties-common', 'python3-software-properties', 'curl', 'git', 'vim'] 19 | state: latest 20 | update_cache: yes 21 | force_apt_get: true 22 | 23 | - name: Install AdoptOpenJDK 11 24 | become: true 25 | block: 26 | - name: Import keys 27 | apt_key: 28 | url: https://adoptopenjdk.jfrog.io/adoptopenjdk/api/gpg/key/public 29 | state: present 30 | - name: Add repository 31 | apt_repository: 32 | repo: deb https://adoptopenjdk.jfrog.io/adoptopenjdk/deb xenial main 33 | state: present 34 | - name: Install package 35 | apt: 36 | name: ['adoptopenjdk-11-hotspot', 'ca-certificates'] 37 | state: latest 38 | update_cache: yes 39 | force_apt_get: true 40 | 41 | - name: Clone classrom repository 42 | git: 43 | repo: 'https://github.com/luisbelloch/data_processing_course.git' 44 | dest: '{{ ansible_env.HOME }}/data_processing_course' 45 | 46 | - stat: 47 | path: '/opt/{{ spark_pkg_name }}' 48 | register: spark_dest 49 | 50 | - name: Install SPARK 51 | when: spark_dest.stat.islnk is not defined 52 | block: 53 | - name: Download Spark 54 | become: true 55 | unarchive: 56 | src: '{{ spark_pkg_url }}' 57 | dest: /opt 58 | remote_src: yes 59 | 60 | - name: Link to latest version 61 | become: true 62 | file: 63 | state: link 64 | src: '/opt/{{ spark_pkg_name }}' 65 | dest: '{{ spark_home }}' 66 | 67 | - name: Add Spark to PATH 68 | lineinfile: 69 | path: '{{ ansible_env.HOME }}/.bashrc' 70 | line: 'export PATH=$PATH:/opt/{{ spark_pkg_name }}/bin' 71 | 72 | - name: Set PySpark Python version to 3 73 | lineinfile: 74 | path: '{{ ansible_env.HOME }}/.bashrc' 75 | line: 'export PYSPARK_PYTHON=python3' 76 | 77 | # https://bugs.python.org/issue19846 78 | - name: Update locale 79 | become: true 80 | command: update-locale LC_ALL=en_US.UTF-8 81 | 82 | -------------------------------------------------------------------------------- /spark/.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | live/live.html 3 | live/live_mod.py 4 | _work.py 5 | -------------------------------------------------------------------------------- /spark/_template_rdd: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from helpers import item_fields, parse_item 3 | 4 | sc = SparkContext('local', 'playground') 5 | txt = sc.textFile('./data/compras_tiny.csv') 6 | no_header = txt.filter(lambda s: not s.startswith(item_fields[0])) 7 | parsed = no_header.map(lambda s: parse_item(s)).cache() 8 | 9 | print(parsed.take(1)) 10 | print(parsed.toDebugString().decode('utf-8')) 11 | 12 | -------------------------------------------------------------------------------- /spark/_template_sql: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | spark = 
SparkSession.builder.master('local').appName('SQL').getOrCreate() 4 | df = spark.read.load('./data/containers_tiny.parquet') 5 | 6 | -------------------------------------------------------------------------------- /spark/compras_con_mas_de_un_descuento.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from helpers import parse_item, item_fields 3 | 4 | sc = SparkContext('local', 'compras') 5 | txt = sc.textFile('data/compras_tiny.csv') 6 | no_header = txt.filter(lambda s: not s.startswith(item_fields[0])) 7 | parsed = no_header.map(lambda s: parse_item(s)).cache() 8 | 9 | # Brief talk on why color in terminal should not be abused, logs get destroyed 10 | 11 | # Primera aproximación 12 | mas_de_un_cupon = parsed \ 13 | .map(lambda i: (i.tx_id, i.coupon_code)) \ 14 | .filter(lambda t: t[1]) \ 15 | .map(lambda t: (t[0], 1)) \ 16 | .reduceByKey(lambda a, b: a + b) \ 17 | .filter(lambda t: t[1] > 1) 18 | print("\033[35mPlan de ejecución (v1):\033[0m") 19 | print(mas_de_un_cupon.toDebugString().decode('utf-8')) 20 | print("\033[35mCon más de un descuento (v1):\033[0m", mas_de_un_cupon.count()) 21 | 22 | # Segunda aproximación, código equivalente 23 | mas_de_un_cupon2 = parsed \ 24 | .map(lambda i: (i.tx_id, 1 if i.coupon_code else 0)) \ 25 | .filter(lambda t: t[1] == 1) \ 26 | .reduceByKey(lambda a, b: a + b) \ 27 | .filter(lambda t: t[1] > 1) 28 | print("\n\033[36mPlan de ejecución (v2):\033[0m") 29 | print(mas_de_un_cupon2.toDebugString().decode('utf-8')) 30 | print("\033[36mCon más de un descuento (v2):\033[0m", mas_de_un_cupon2.count()) 31 | 32 | total = parsed.count() 33 | p_descuentos = mas_de_un_cupon2.count() / float(total) 34 | print("\n\x1b[38;5;214mPorcentaje:\x1b[0m", p_descuentos, "\n") 35 | 36 | -------------------------------------------------------------------------------- /spark/compras_conversion_a_dolares.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from helpers import get_usd_exchange_rates, item_fields, parse_item 3 | 4 | sc = SparkContext('local', 'compras') 5 | txt = sc.textFile('data/compras_tiny.csv') 6 | no_header = txt.filter(lambda s: not s.startswith(item_fields[0])) 7 | parsed = no_header.map(lambda s: parse_item(s)).cache() 8 | 9 | rates = get_usd_exchange_rates() 10 | 11 | # El archivo puede tener múltiples problemas, incluso con algo 12 | # sencillo como una simple conversion a dólares: 13 | # - el precio ya está en dólares 14 | # - item_price no viene como float 15 | # - no existe tasa de cambio para ese item 16 | # - ¿Cómo descartamos la linea? -> None 17 | # - ¿Cómo recogemos las filas que han fallado? ¿debemos? 18 | def convert_to_usd(item): 19 | if (item.currency_code == 'USD'): 20 | return item 21 | if (not item.currency_code in rates): 22 | return None # error? 
23 | new_price = rates[item.currency_code] * float(item.item_price) 24 | new_item = item._replace(currency_code='USD', item_price = new_price) 25 | return new_item 26 | 27 | in_usd = parsed.map(convert_to_usd) 28 | print(in_usd.take(2)) 29 | 30 | -------------------------------------------------------------------------------- /spark/compras_importe_total_agrupado_por_tx_id.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from helpers import item_fields, parse_item 3 | 4 | sc = SparkContext('local', 'compras') 5 | txt = sc.textFile('data/compras_tiny.csv') 6 | no_header = txt.filter(lambda s: not s.startswith(item_fields[0])) 7 | parsed = no_header.map(lambda s: parse_item(s)).cache() 8 | 9 | importes = parsed \ 10 | .map(lambda i: (i.tx_id, float(i.item_price))) \ 11 | .reduceByKey(lambda elemento, acumulado: elemento + acumulado) 12 | 13 | print(importes.take(10)) 14 | 15 | -------------------------------------------------------------------------------- /spark/compras_sql.py: -------------------------------------------------------------------------------- 1 | from os import walk 2 | 3 | from pyspark.sql import SparkSession 4 | 5 | spark = SparkSession.builder.master("local").appName("SQL").getOrCreate() 6 | 7 | df = spark.read.option("delimiter", "|").option("header", "true").csv('./data/compras_tiny.csv') 8 | df.printSchema() 9 | df.show() 10 | 11 | df.createOrReplaceTempView("compras") 12 | spark.sql("SELECT tx_id, SUM(item_price) as tx_total FROM compras GROUP BY tx_id").show() 13 | -------------------------------------------------------------------------------- /spark/compras_top_ten_countries.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark import SparkContext 3 | from helpers import dataUrl, item_fields, parse_item 4 | 5 | sc = SparkContext('local', 'compras') 6 | txt = sc.textFile(dataUrl('compras_tiny.csv')) 7 | no_header = txt.filter(lambda s: not s.startswith(item_fields[0])) 8 | parsed = no_header.map(lambda s: parse_item(s)).cache() 9 | 10 | countries_rdd = sc \ 11 | .textFile(dataUrl('country_codes.csv')) \ 12 | .map(lambda c: tuple(reversed(c.split(',')))) 13 | 14 | join_rdd = parsed \ 15 | .filter(lambda i: i.currency_code == 'USD') \ 16 | .map(lambda i: (i.country, float(i.item_price))) \ 17 | .reduceByKey(lambda a, b: a + b) \ 18 | .leftOuterJoin(countries_rdd) \ 19 | .sortBy(lambda i: i[1][0], ascending=False) 20 | 21 | print(join_rdd.take(10)) 22 | 23 | # print map(lambda i: (i[0], i[1][1], i[1][0]), join_rdd.take(10)) 24 | # join_rdd.saveAsTextFile(dataUrl('out/top10countries'), 'org.apache.hadoop.io.compress.GzipCodec') 25 | 26 | -------------------------------------------------------------------------------- /spark/container.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | spark = SparkSession.builder.master("local").appName("container").getOrCreate() 4 | 5 | df = spark.read.load('data/containers_tiny.parquet') 6 | df.printSchema() 7 | 8 | # Using API 9 | df.select("ship_imo", "ship_name", "country").filter(df['country'] == 'DK').show() 10 | 11 | # Register table alias to allow SQL use 12 | df.createOrReplaceTempView("container") 13 | spark.sql("SELECT ship_imo, ship_name FROM container WHERE country = 'DK'").show() 14 | 15 | # ship_imo, num of containers, total ship weight 16 | total_weight_rdd = spark.sql("SELECT ship_imo, 
count(container_id) number, sum(net_weight) total_weight FROM container GROUP BY ship_imo") 17 | total_weight_rdd.printSchema() 18 | total_weight_rdd.show() 19 | # print total_weight_rdd.map(lambda r: r['number']).collect() 20 | 21 | # UDFs 22 | spark.udf.register('en_toneladas', lambda c: float(c) / 1000.0) 23 | spark.sql("SELECT en_toneladas(net_weight) toneladas, net_weight FROM container WHERE container_id = 'FMBV1684747'").show() 24 | 25 | # JOINs: Extract description of container codes 26 | codes = spark.read.json('data/iso-container-codes.json') 27 | codes.createOrReplaceTempView('codes') 28 | codes.printSchema() 29 | codes.show() 30 | 31 | w_desc = spark.sql("SELECT c.container_id, s.code, s.description FROM container c JOIN codes s on c.container_type = s.code") 32 | w_desc.show() 33 | print(w_desc.groupBy("code").count().take(3)) 34 | 35 | -------------------------------------------------------------------------------- /spark/container_caching.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from pyspark.sql import SQLContext, Row 3 | 4 | sc = SparkContext("local", "barcos") 5 | sq = SQLContext(sc) 6 | 7 | df = sq.read.load("data/containers_tiny.parquet") 8 | df.registerTempTable("container") 9 | sq.cacheTable("container") 10 | 11 | df.select("ship_imo", "ship_name", "country").filter(df['country'] == 'DK').show() 12 | sq.sql("SELECT ship_imo, ship_name FROM container WHERE country = 'DK'").show() 13 | sq.sql("SELECT ship_imo, count(container_id) number, sum(net_weight) total_weight FROM container GROUP BY ship_imo").show() 14 | 15 | input("Press Enter to continue... http://localhost:4040/storage") 16 | 17 | -------------------------------------------------------------------------------- /spark/container_convertir_a_parquet.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from pyspark.sql import SQLContext, Row 3 | 4 | from helpers import container_fields, parse_container 5 | 6 | sc = SparkContext('local', 'barcos') 7 | sq = SQLContext(sc) 8 | 9 | csv_source = sc \ 10 | .textFile('data/containers_tiny.csv') \ 11 | .filter(lambda s: not s.startswith(container_fields[0])) \ 12 | .map(parse_container) \ 13 | .map(lambda c: Row(**dict(c._asdict()))) 14 | 15 | print(csv_source.count()) 16 | 17 | # Python 2.7.6 to 3.5 18 | # http://stackoverflow.com/a/26180604 19 | # .map(lambda c: Row(**dict(c.__dict__))) 20 | 21 | # Convert RDD to a DataFrame (in scala, DataSet[Row]) 22 | # It will preserve types from the RDD ones. Note it 23 | # won't do anything fancy, since the namedtuple types 24 | # are just strings. 
25 | containerSchema = sq.createDataFrame(csv_source) 26 | containerSchema.createOrReplaceTempView('container') 27 | containerSchema.printSchema() 28 | 29 | denmark_only = sq.sql("SELECT ship_name FROM container WHERE country = 'DK'") 30 | print(denmark_only.first()) 31 | 32 | todo_df = sq.sql("SELECT * FROM container") 33 | todo_df.printSchema() 34 | 35 | outpath = 'data/containers_tiny.parquet' 36 | todo_df.write.mode('overwrite').parquet(outpath) 37 | print("\nDatos guardados en", outpath) 38 | 39 | -------------------------------------------------------------------------------- /spark/container_databricks_csv.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | spark = SparkSession.builder.master("local").appName("SQL").getOrCreate() 4 | 5 | # https://github.com/databricks/spark-csv#python-api 6 | df = spark.read \ 7 | .format("com.databricks.spark.csv") \ 8 | .options(header='true', inferschema='true', delimiter=";") \ 9 | .load('data/containers_tiny.csv') 10 | 11 | df.printSchema() 12 | df.show() 13 | 14 | df.select("container_id", "container_type", "gross_weight") \ 15 | .filter(df["country"] == "DK") \ 16 | .show() 17 | 18 | df.groupBy("country").count().show() 19 | 20 | df.createOrReplaceTempView("container") 21 | spark.sql("SELECT ship_name FROM container WHERE country = 'DK'").show() 22 | 23 | -------------------------------------------------------------------------------- /spark/container_partition.py: -------------------------------------------------------------------------------- 1 | from os import walk 2 | 3 | from pyspark.sql import SparkSession 4 | 5 | spark = SparkSession.builder.master("local").appName("SQL").getOrCreate() 6 | 7 | def segment(df, field, value, num = 5): 8 | df.filter(df[field] == value).limit(num) \ 9 | .write.mode('overwrite') \ 10 | .parquet('data/containers_partitioned/{}={}'.format(field, value)) 11 | 12 | def main(): 13 | df = spark.read.load('data/containers_tiny.parquet') 14 | segment(df, "country", "DK") 15 | segment(df, "country", "SB") 16 | 17 | for path, dirs, files in walk('data/containers_partitioned/'): 18 | print("\x1b[38;5;214m"+path+"\033[0m") 19 | for f in files: 20 | print(" |-- ", f) 21 | 22 | partitioned = spark.read.load("data/containers_partitioned") 23 | partitioned.select("container_id", "country").show() 24 | 25 | if __name__ == '__main__': 26 | main() 27 | 28 | -------------------------------------------------------------------------------- /spark/container_rdd_to_dataset.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.types import * 3 | 4 | spark = SparkSession.builder.master("local").appName("SQL").getOrCreate() 5 | 6 | csv_source = spark.sparkContext \ 7 | .textFile('data/containers_tiny.csv') \ 8 | .filter(lambda s: not s.startswith("ship_imo")) \ 9 | .map(lambda i: i.split(";")) \ 10 | .map(lambda i: (i[4], i[5], float(i[7]))) \ 11 | .cache() 12 | 13 | print(csv_source.take(1)) 14 | 15 | # Set schema 16 | container_id_field = StructField("container_id", StringType(), True) 17 | container_type_field = StructField("container_type", StringType(), True) 18 | net_weight_field = StructField("net_weight", FloatType(), True) 19 | schemaDef = StructType([container_id_field, container_type_field, net_weight_field]) 20 | 21 | schema = spark.createDataFrame(csv_source, schemaDef) 22 | schema.printSchema() 23 | 24 | 
-------------------------------------------------------------------------------- /spark/data: -------------------------------------------------------------------------------- 1 | ../data -------------------------------------------------------------------------------- /spark/enable_history.properties: -------------------------------------------------------------------------------- 1 | # spark-submit --verbose --master spark://MASTER:7077 --properties-file enable_history.properties --py-files helpers.py SCRIPT 2 | spark.eventLog.enabled=true 3 | spark.history.fs.logDirectory=/tmp/spark-events 4 | 5 | -------------------------------------------------------------------------------- /spark/friends.py: -------------------------------------------------------------------------------- 1 | # RUN: ./graphframes.sh ship_routes.py 2 | 3 | from pyspark import SparkContext 4 | from pyspark.sql import SQLContext 5 | 6 | from graphframes import * 7 | from graphframes.examples import Graphs 8 | 9 | sc = SparkContext('local', 'friends') 10 | sq = SQLContext(sc) 11 | friends = Graphs(sq).friends() 12 | 13 | friends.vertices.show() 14 | friends.edges.show() 15 | 16 | over30 = friends.vertices.filter("age > 30") 17 | only_friends = friends.edges.filter("relationship = 'friend'") 18 | friends_over_30 = GraphFrame(over30, only_friends) 19 | friends_over_30.triplets.show() 20 | 21 | -------------------------------------------------------------------------------- /spark/graphframes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | readonly PKG="graphframes:graphframes:0.8.2-spark3.2-s_2.12" 5 | if [ $# -eq 0 ]; then 6 | pyspark --packages $PKG 7 | else 8 | spark-submit --packages $PKG "$*" 9 | fi 10 | 11 | exit $? 
12 | 13 | -------------------------------------------------------------------------------- /spark/hello1.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | 3 | sc = SparkContext('local', 'hello') 4 | rdd = sc.textFile('./data/compras_tiny.csv') 5 | 6 | print(rdd.count()) 7 | 8 | # Also spark-submit hello1.py --conf spark.logLineage=true 9 | print(rdd.toDebugString().decode('utf-8')) 10 | 11 | -------------------------------------------------------------------------------- /spark/hello2.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | 3 | sc = SparkContext('local', 'hello') 4 | rdd = sc.textFile('./data/compras_tiny.csv') 5 | 6 | solo_en_euros = rdd.filter(lambda fila: 'EUR' in fila) 7 | 8 | print(solo_en_euros.toDebugString().decode('utf-8')) 9 | print(solo_en_euros.count()) 10 | print(solo_en_euros.take(10)) 11 | 12 | -------------------------------------------------------------------------------- /spark/helpers.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | 5 | from collections import namedtuple 6 | from datetime import datetime 7 | 8 | item_fields = ['tx_id', 'tx_time', 'buyer', 'currency_code', 'payment_type', 'credit_card_number', 'country', 'department', 'product', 'item_price', 'coupon_code', 'was_returned'] 9 | Item = namedtuple('Item', item_fields) 10 | 11 | def parse_item(raw_string): 12 | f = raw_string.split('|') 13 | f += [None] * (len(item_fields) - len(f)) 14 | return Item(*f) 15 | 16 | # Thing = namedtuple('Item', ['foo', 'bar']) 17 | # some = Thing(foo=42, bar='hello') 18 | # some.foo 19 | # item = parse_item(["one", "two"]) 20 | # new_item = item._replace(tx_id=1, buyer=5) 21 | 22 | # API http://fixer.io/ 23 | def get_usd_exchange_rates(): 24 | with open('./data/exchange_rates_usd.json') as f: 25 | data = json.load(f) 26 | return data['rates'] 27 | 28 | container_fields = ['ship_imo', 'ship_name', 'country', 'departure', 'container_id', 'container_type', 'container_group', 'net_weight', 'gross_weight', 'owner', 'declared', 'contact', 'customs_ok'] 29 | Container = namedtuple('Container', container_fields) 30 | 31 | def parse_container(raw_string): 32 | f = raw_string.split(';') 33 | f += [None] * (len(container_fields) - len(f)) 34 | return Container(*f) 35 | 36 | stock_fields = ['simbolo', 'numero', 'precio_compra', 'ultimo_precio', 'returns'] 37 | Stock = namedtuple('Stock', stock_fields) 38 | def parse_stock(raw_string): 39 | f = raw_string.split(',') 40 | return Stock(simbolo=f[0], numero=None, precio_compra=None, ultimo_precio=float(f[1]), returns=0.0) 41 | 42 | def setup_checkpoint(streamingContext): 43 | checkpoint = './checkpoint' 44 | if (os.path.exists(checkpoint)): 45 | shutil.rmtree(checkpoint) 46 | os.mkdir(checkpoint) 47 | streamingContext.checkpoint(checkpoint) 48 | 49 | def isoDate(raw_string): 50 | try: 51 | return datetime.strptime(raw_string, "%Y-%m-%dT%H:%M:%SZ") 52 | except Exception: 53 | return None 54 | 55 | def dataUrl(fileName): 56 | base = "./data" 57 | # base = "gs://bigdataupv_data" 58 | return os.path.join(base, fileName) 59 | 60 | -------------------------------------------------------------------------------- /spark/hft.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from pyspark import SparkContext 4 | from pyspark.streaming import 
StreamingContext 5 | 6 | from helpers import * 7 | 8 | sc = SparkContext("local[2]", "NetworkWordCount") 9 | st = StreamingContext(sc, 1) 10 | setup_checkpoint(st) 11 | 12 | portfolio = { u'MSFT': Stock('MSFT', 1, 150.06, None, 0.0), u'APPL': Stock('APPL', 4, 70.23, None, 0.0), u'GOOG': Stock('GOOG', 2, 104.55, None, 0.0) } 13 | 14 | def actualizar_portfolio(stocks): 15 | actualizaciones = stocks.filter(lambda s: s.simbolo in portfolio).collect() 16 | al_menos_una_actualizacion = False 17 | for a in actualizaciones: 18 | al_menos_una_actualizacion = True 19 | actual = portfolio[a.simbolo] 20 | nuevo = actual._replace( \ 21 | ultimo_precio = a.ultimo_precio, \ 22 | returns = (a.ultimo_precio - actual.precio_compra) / actual.precio_compra) 23 | portfolio[a.simbolo] = nuevo 24 | if al_menos_una_actualizacion: 25 | print(list(map(lambda s: list(s), portfolio.values()))) 26 | 27 | stocks = st.socketTextStream("localhost", 9999) \ 28 | .map(parse_stock) \ 29 | .foreachRDD(actualizar_portfolio) 30 | 31 | # stocks.pprint() 32 | # stocks.reduceByKey(lambda a,b: a + b) 33 | 34 | st.start() 35 | st.awaitTermination() 36 | 37 | -------------------------------------------------------------------------------- /spark/live.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "hello world!\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "print(\"hello world!\")" 18 | ] 19 | } 20 | ], 21 | "metadata": { 22 | "kernelspec": { 23 | "display_name": "Python 3", 24 | "language": "python", 25 | "name": "python3" 26 | }, 27 | "language_info": { 28 | "codemirror_mode": { 29 | "name": "ipython", 30 | "version": 3 31 | }, 32 | "file_extension": ".py", 33 | "mimetype": "text/x-python", 34 | "name": "python", 35 | "nbconvert_exporter": "python", 36 | "pygments_lexer": "ipython3", 37 | "version": "3.7.2" 38 | } 39 | }, 40 | "nbformat": 4, 41 | "nbformat_minor": 2 42 | } 43 | -------------------------------------------------------------------------------- /spark/live.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | spark = SparkSession.builder.master("local").appName("SQL").getOrCreate() 4 | df = spark.read.load('data/containers_tiny.parquet') 5 | df.select("ship_imo", "container_id", "net_weight").show() 6 | 7 | -------------------------------------------------------------------------------- /spark/live/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: watch 2 | watch: 3 | ./live.sh 4 | 5 | .PHONY: auth 6 | auth: 7 | docker run -ti --name gcloud-config google/cloud-sdk gcloud auth login 8 | 9 | .PHONY: set-project 10 | set-project: 11 | ./gcloud config set project bigdataupv2021 12 | 13 | -------------------------------------------------------------------------------- /spark/live/gcloud: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | docker run --rm -ti --volumes-from gcloud-config google/cloud-sdk gcloud "$@" 3 | 4 | -------------------------------------------------------------------------------- /spark/live/gsutil: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | docker run --rm --volumes-from gcloud-config -w /tmp/current -v $(pwd):/tmp/current google/cloud-sdk 
gsutil "$@" 3 | 4 | -------------------------------------------------------------------------------- /spark/live/live.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | fswatch ../live.py | while read -r fpath; do \ 5 | echo -e "\033[0;36mRELOAD\033[0m $fpath $(date +"%H%M%S")" 6 | echo -e "# $(date +"%H:%M:%S")\n" | cat - ../live.py > live_mod.py 7 | sed -e '/-python">/r./live_mod.py' live_template.html > live.html 8 | ./gsutil -h "Cache-Control:no-cache,max-age=0" \ 9 | cp /tmp/current/live.html gs://bigdata.luisbelloch.es/en_directo.html 10 | 11 | # echo -e "# $(date +"%H:%M:%S")\n" | cat - live.py | pygmentize -f html -O full,linenos=1 -o live.html 12 | # scp live.html root@live.luisbelloch.es:/var/www/html/index.html 13 | done 14 | 15 | -------------------------------------------------------------------------------- /spark/live/live_jupyter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eou pipefail 3 | 4 | fswatch ../live.ipynb | while read -r fpath; do \ 5 | echo -e "\033[0;36mRELOAD\033[0m $fpath $(date +"%H%m%S")" 6 | jupyter nbconvert ../live.ipynb --to html --output-dir="$(pwd)" 7 | gsutil -h "Cache-Control:no-cache,max-age=0" \ 8 | cp live.html gs://bigdata.luisbelloch.es/en_directo.html 9 | # scp live/live.html root@live.luisbelloch.es:/var/www/html/index.html 10 | done 11 | 12 | -------------------------------------------------------------------------------- /spark/live/live_template.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | #bidataupv - live 9 | 10 | 11 | 24 | 25 | 26 |
27 |

28 |       
29 |
30 | 31 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /spark/peliculas_0_ml.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.ml.recommendation import ALS 3 | 4 | # ALTERNATIVE 5 | # from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics 6 | # from pyspark.mllib.recommendation import ALS, Rating 7 | 8 | spark = SparkSession.builder.master("local").appName("SQL").getOrCreate() 9 | 10 | print("\033[36mInitial data\033[0m") 11 | columns = ["user", "item", "rating"] 12 | data = [(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 1, 1.0), (2, 2, 5.0)] 13 | df = spark.createDataFrame(data, columns) 14 | df.show() 15 | 16 | print("\033[36mTraining model...\033[0m") 17 | als = ALS() 18 | model = als.fit(df) 19 | 20 | output_model_path = "data/peliculas0_trained_model" 21 | print("\033[36mSaving model to '{}'...\033[0m".format(output_model_path)) 22 | model.write().overwrite().save(output_model_path) 23 | 24 | print("\033[36mTesting some user/item pairs...:\033[0m") 25 | test = spark.createDataFrame([(0, 2), (1, 0), (2, 0), (3, 0)], ["user", "item"]) 26 | model.transform(test).show() 27 | 28 | -------------------------------------------------------------------------------- /spark/peliculas_1_mllib.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics 3 | from pyspark.mllib.recommendation import ALS, Rating 4 | 5 | sc = SparkContext() 6 | 7 | # Generar recomendaciones para todos los usuarios 8 | # - Clasificación: suma(votos) / numero_votos 9 | # - Clasificación con tiempo: inventar, algoritmo de Reddit p.e. 10 | # - Descartar votos duplicados 11 | # - Report para la web, necesario orden por pelicula: usuario_id, pelicula_id, titulo, rating_medio 12 | # - Guardar en parquet 13 | 14 | peliculas = sc.textFile("data/peliculas.csv") \ 15 | .filter(lambda l: not l.startswith(u'#') and not l.startswith(u'Entry|')) \ 16 | .map(lambda l: l.split("|")) 17 | # print peliculas.take(2) 18 | 19 | def parseLine(line): 20 | fields = line.split("|") 21 | return Rating(int(fields[0]), int(fields[1]), float(fields[2]) - 2.5) 22 | 23 | ratings = sc.textFile("data/ratings.csv") \ 24 | .filter(lambda l: not l.startswith('pelicula_id')) \ 25 | .map(lambda l: l.split(",")) \ 26 | .map(lambda l: Rating(int(l[1]), int(l[0]), float(l[2]))) 27 | # print ratings.take(2) 28 | 29 | media_ratings = ratings \ 30 | .map(lambda r: (r.product, (r.rating, 1))) \ 31 | .reduceByKey(lambda a,b: (a[0]+b[0], a[1]+b[1])) \ 32 | .map(lambda p: (p[0], p[1][0] / float(p[1][1]))) 33 | # print media_ratings.collect() 34 | 35 | # Entrenar modelo 36 | model = ALS.train(ratings, 1) 37 | 38 | # generar posibles pares de usuario / pelicula 39 | # VER bash en shell_trans.sh 40 | ids_pelicula = sc.textFile('data/pelicula_ids.csv') 41 | ids_usuario = sc.textFile('data/pelicula_usuarios.csv') 42 | publico_objetivo = ids_usuario.cartesian(ids_pelicula) #ids_pelicula.cartesian(ids_usuario) 43 | # print posibles_pares.take(10) 44 | 45 | # Crear predicciones 46 | predicciones = model.predictAll(publico_objetivo) 47 | # print predicciones.take(4) 48 | 49 | # Convertir a DF para manipular 50 | # POR QUÉ no hemos de hacer el sort/join aquí, mejor en una BBDD relacional 51 | # Cálculo número de filas + espacio 52 | # Cómo se realizaría la inserción? 
53 | from pyspark.sql import SQLContext, Row 54 | sq = SQLContext(sc) 55 | df = sq.createDataFrame(predicciones) 56 | df.registerTempTable('predicciones') 57 | df.show() 58 | 59 | # ¿Tenemos un modelo correcto? 60 | # R-Squared 0, indicates that the model explains none of the variability of the response data around its mean. 61 | # R-Squared 1, indicates that the model explains all the variability of the response data around its mean. 62 | ratingsTuple = ratings.map(lambda r: ((r.user, r.product), r.rating)) 63 | scoreAndLabels = predicciones \ 64 | .map(lambda r: ((r.user, r.product), r.rating)) \ 65 | .join(ratingsTuple) \ 66 | .map(lambda tup: tup[1]) 67 | 68 | metrics = RegressionMetrics(scoreAndLabels) 69 | print("RMSE = %s" % metrics.rootMeanSquaredError) 70 | print("R-squared = %s" % metrics.r2) 71 | 72 | -------------------------------------------------------------------------------- /spark/peliculas_calculo_de_medias_por_key.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from helpers import * 3 | 4 | sc = SparkContext('local', 'compras') 5 | 6 | ratings = sc.textFile("data/ratings.csv") \ 7 | .filter(lambda l: not l.startswith('pelicula_id')) \ 8 | .map(lambda l: l.split(",")) 9 | 10 | media_ratings = ratings \ 11 | .map(lambda r: (r[0], (float(r[2]), 1))) \ 12 | .reduceByKey(lambda a,b: (a[0]+b[0], a[1]+b[1])) \ 13 | .map(lambda p: (int(p[0]), p[1][0] / float(p[1][1]))) 14 | 15 | print(media_ratings.take(5)) 16 | 17 | -------------------------------------------------------------------------------- /spark/reload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | fswatch live.py | while read -r fpath; do \ 5 | clear 6 | echo -e "\033[0;36mRELOAD\033[0m $fpath $(date +"%H%M%S")" 7 | spark-submit live.py 8 | done 9 | 10 | -------------------------------------------------------------------------------- /spark/run_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | readonly c_step="$(tput setaf 6)" 4 | readonly c_norm="$(tput sgr0)" 5 | readonly excluded=(helpers.py hft.py container_caching.py ship_routes.py) 6 | 7 | for file in *.py; do 8 | if [[ ! 
" ${excluded[*]} " =~ " ${file} " ]]; then 9 | echo -e "${c_step}Running${c_norm} $file" 10 | spark-submit $file 2>/dev/null 11 | fi 12 | done 13 | 14 | ./graphframes.sh ship_routes.py 15 | -------------------------------------------------------------------------------- /spark/ship_routes.py: -------------------------------------------------------------------------------- 1 | # RUN: ./graphframes.sh ship_routes.py 2 | 3 | from pyspark import SparkContext 4 | from pyspark.sql import SQLContext 5 | from pyspark.sql.types import * 6 | from pyspark.sql.functions import lead, col, explode 7 | from pyspark.sql.window import Window 8 | 9 | from graphframes import * 10 | from graphframes.examples import Graphs 11 | 12 | sc = SparkContext('local', 'barcos') 13 | sq = SQLContext(sc) 14 | 15 | csv = sc.textFile("data/ship_routes.csv") \ 16 | .map(lambda c: c.split("|")) \ 17 | .map(lambda c: (c[0], c[1], c[4])) 18 | sequential_route = sq.createDataFrame(csv, ["order", "ship_imo", "country_code"]) 19 | sequential_route.orderBy("ship_imo", "order").show() 20 | 21 | w = Window().partitionBy("ship_imo").orderBy(col("order")) 22 | routes = sequential_route.select("*", lead("country_code").over(w).alias("dst")).na.drop() 23 | routes.orderBy("ship_imo", "order").show() 24 | 25 | edges = routes.select(col("country_code").alias("src"), col("dst"), col("ship_imo")) 26 | # edges.show(100) 27 | 28 | countries_rdd = sc \ 29 | .textFile('./data/country_codes.csv') \ 30 | .map(lambda c: tuple(reversed(c.split(',')))) 31 | vertices = sq.createDataFrame(countries_rdd, ["id", "country_label"]) 32 | # vertices.show(100) 33 | 34 | g = GraphFrame(vertices, edges) 35 | results = g.shortestPaths(landmarks=["AT", "GS"]) \ 36 | .select("id", "country_label", explode("distances")) 37 | results.show(200) 38 | 39 | -------------------------------------------------------------------------------- /spark/spark: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | # Alternative: SPARK_DOCKER_IMAGE=apache/spark-py ./spark-submit script.py 5 | readonly SPARK_DOCKER_IMAGE=${SPARK_DOCKER_IMAGE:-luisbelloch/spark} 6 | readonly SPARK_SUBMIT=/opt/spark/bin/spark-submit 7 | readonly DATA_DIR=/tmp/bigdataupv/data 8 | readonly WORK_DIR=/tmp/bigdataupv/scripts 9 | 10 | if [[ $# -lt 1 ]]; then 11 | >&2 echo "USAGE: ./spark [SCRIPT_NAME]" 12 | >&2 echo "Sample: ./spark hello1.py" 13 | exit 1 14 | fi 15 | 16 | abs_path() { 17 | echo "$(cd "$(dirname "$1")" && pwd)/$(basename "$1")" 18 | } 19 | 20 | get_data_volume() { 21 | # Probe for source folder first, if it doesn't 22 | # exists then it'll try with current folder 23 | if [[ -d "${0}" ]]; then 24 | echo "-v $(abs_path $0):"${DATA_DIR}"" 25 | elif [[ -d "./data" ]]; then 26 | echo "-v $(abs_path "./data"):"${DATA_DIR}"" 27 | elif [[ -d "../data" ]]; then 28 | echo "-v $(abs_path "../data"):"${DATA_DIR}"" 29 | else 30 | >&2 echo "WARN: ./data directoy not found!" 
31 | echo "" 32 | fi 33 | } 34 | 35 | readonly source_folder="$(cd "$(dirname "$1")" && pwd)" 36 | readonly data_volume=$(get_data_volume "${source_folder}") 37 | 38 | docker run --rm -ti \ 39 | -w "${WORK_DIR}" \ 40 | -v "${source_folder}":"${WORK_DIR}" \ 41 | $data_volume \ 42 | ${SPARK_DOCKER_IMAGE} "${SPARK_SUBMIT}" "${WORK_DIR}"/$1 ${@:2} 43 | 44 | -------------------------------------------------------------------------------- /spark/stock_server.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'socket' 4 | require 'csv' 5 | 6 | @simbolos = ["MSFT", "IBM", "GOOG", "YHOO", "APPL", "SIFI", "NWBO", "CRTO", "LAMR", "EYES", "ONTX", "FWP", "XXIA", "ASBB", "FTHI", "LSCC", "MRTN", "MBII", "EARS", "FTLB", "PBSK", "PRPH", "VRTU", "QUIK", "RYAAY", "WPRT", "HNNA", "CBSHP", "ADHD", "SGEN", "EZCH", "ADXS", "SNMX", "AXAS", "ASEI", "PME", "AGII", "HABT", "SCAI", "WMAR", "BKSC", "ORBK", "FTSL", "JRVR", "PMTS", "PRTO", "BLVDU", "XCRA", "LIND", "DTLK", "CERS", "TSC", "SONA", "CFGE", "CMFN", "PHIIK", "ASCMA", "HCAP", "HBANP", "WOWO", "KWEB", "CRDS", "EMIF", "MAUI", "LIVE", "ADRD", "AMAT", "EXLS", "FEIC", "QUNR", "LABL", "CDOR", "FRSH", "MTSI", "PCYO", "GOODN", "PRGX", "VXUS", "PCRX", "MAGS", "ALOG", "CYTR", "WHLR", "XBKS", "JRJC", "MDM", "HFBC", "CHY", "WSBF", "WOOD", "GULF", "FNWB", "GMLP", "NATR", "RDI", "RPRX", "EMMS", "ZFGN", "ADI", "BBH"] 7 | # @simbolos = CSV.read('data/nasdaq.csv', {:col_sep => "|"}).drop(2).map { |s| s[0] } 8 | @emitted = {} 9 | 10 | def generar_stock 11 | name = @simbolos.sample 12 | price = 20 + Random.rand(200.0) 13 | if @emitted.has_key? name 14 | current = @emitted[name] 15 | price = current + (current * ([1,-1].sample * Random.rand(0.01))) 16 | end 17 | @emitted[name] = price 18 | "#{name},#{price.round(2)}" 19 | end 20 | 21 | def envio_continuo(cliente) 22 | loop do 23 | [1,3,5,7].sample.times do 24 | stock = generar_stock 25 | puts stock 26 | cliente.puts stock 27 | end 28 | sleep Random.rand(2.0) 29 | end 30 | end 31 | 32 | def main 33 | server = TCPServer.new 9999 34 | puts "Escuchando en tcp://localhost:9999..." 35 | 36 | loop do 37 | Thread.start(server.accept) do |cliente| 38 | envio_continuo cliente 39 | end 40 | end 41 | end 42 | 43 | if __FILE__ == $0 44 | main() 45 | 50.times { |n| puts generar_stock } 46 | end 47 | --------------------------------------------------------------------------------