├── .envrc ├── .gitignore ├── .mailmap ├── .python-version ├── LICENSE ├── README.md ├── Vagrantfile ├── airflow ├── .env ├── .gitignore ├── Makefile ├── dags │ ├── gasolina_naive.py │ ├── gasolina_s3.py │ ├── hello_dags.py │ ├── hello_python_operator.py │ ├── hello_simple.py │ ├── s3_bucket_operations.py │ ├── s3_file_sensor.py │ ├── spark_ondemand.py │ └── spark_simple.py ├── docker-compose.minio.yaml └── minio_connection.json ├── assignments ├── .dockerignore ├── .gitignore ├── Dockerfile ├── Makefile ├── README.md ├── __init__.py ├── conftest.py ├── contenedores.py ├── data │ └── containers.csv ├── helpers.py ├── pytest.ini ├── requirements.txt ├── test.sh ├── test_ejercicio_0.py ├── test_ejercicio_1.py ├── test_ejercicio_2.py ├── test_ejercicio_3.py ├── test_ejercicio_4.py ├── test_ejercicio_5.py ├── test_ejercicio_6.py └── test_ejercicio_7.py ├── beam ├── .gitignore ├── basic.py ├── beam ├── compras.py ├── compras_ptransform.py ├── compras_ptransform_condensed.py └── compras_totales_por_pais.py ├── data ├── .gitignore ├── Gemfile ├── Gemfile.lock ├── compras_tiny.csv ├── containers_tiny.csv ├── containers_tiny.parquet │ ├── .part-r-00000-699887c3-5b1f-4afb-ae91-da7750b810c7.gz.parquet.crc │ ├── _SUCCESS │ ├── _common_metadata │ ├── _metadata │ └── part-r-00000-699887c3-5b1f-4afb-ae91-da7750b810c7.gz.parquet ├── country_codes.csv ├── exchange_rates_usd.json ├── iso-container-codes.csv ├── iso-container-codes.json ├── iso-container-groups.csv ├── nasdaq.csv ├── nasdaq.json ├── pelicula_ids.csv ├── pelicula_usuarios.csv ├── peliculas.csv ├── poors_man_routes.sh ├── random_data.rb ├── ratings.csv └── ship_routes.csv ├── infra ├── beam │ ├── Dockerfile │ ├── Makefile │ ├── README.md │ └── beam.md ├── dataproc.md ├── docker │ ├── .envrc │ ├── Dockerfile │ ├── Makefile │ ├── README.md │ ├── docker-compose.yml │ └── docker.md ├── kubernetes │ ├── README.md │ ├── kubernetes.md │ ├── master-controller.yaml │ ├── master-service.yaml │ ├── namespace.yaml │ └── slave-controller.yaml ├── minio │ ├── config │ │ ├── config.json │ │ ├── config.json.old │ │ └── share │ │ │ ├── downloads.json │ │ │ └── uploads.json │ ├── data │ ├── docker-compose.yml │ ├── mc │ └── mirror.sh ├── pyspark-jupyter │ ├── Dockerfile │ ├── Makefile │ └── README.md ├── single-node.md └── vagrant.md ├── local_setup.sh ├── playbook.yml └── spark ├── .gitignore ├── _template_rdd ├── _template_sql ├── compras_con_mas_de_un_descuento.py ├── compras_conversion_a_dolares.py ├── compras_importe_total_agrupado_por_tx_id.py ├── compras_sql.py ├── compras_top_ten_countries.py ├── container.py ├── container_caching.py ├── container_convertir_a_parquet.py ├── container_databricks_csv.py ├── container_partition.py ├── container_rdd_to_dataset.py ├── data ├── enable_history.properties ├── friends.py ├── graphframes.sh ├── hello1.py ├── hello2.py ├── helpers.py ├── hft.py ├── live.ipynb ├── live.py ├── live ├── Makefile ├── gcloud ├── gsutil ├── live.sh ├── live_jupyter.sh └── live_template.html ├── peliculas_0_ml.py ├── peliculas_1_mllib.py ├── peliculas_calculo_de_medias_por_key.py ├── reload.sh ├── run_all.sh ├── ship_routes.py ├── spark └── stock_server.rb /.envrc: -------------------------------------------------------------------------------- 1 | layout pyenv 3.9.2 2 | use java adopt@1.11.0-11 3 | export SPARK_HOME="$(pwd)/.spark" 4 | PATH_add "$SPARK_HOME/bin" 5 | export PYSPARK_PYTHON=python3 6 | 7 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | node_modules 2 | .DS_Store 3 | *.aux 4 | *.log 5 | *.pyc 6 | alt/ 7 | checkpoint 8 | metastore_db/ 9 | .spark*/ 10 | .vagrant/ 11 | out/ 12 | __pycache__/ 13 | spark-warehouse/ 14 | .direnv/ 15 | .ipynb_checkpoints/ 16 | 17 | -------------------------------------------------------------------------------- /.mailmap: -------------------------------------------------------------------------------- 1 | Luis Belloch 2 | 3 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.9.2 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Advanced Data Processing course materials. 2 | Copyright (C) 2016, Luis Belloch 3 | 4 | This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. http://creativecommons.org/licenses/by-nc-sa/4.0/ 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Processing with Spark 2 | 3 | Materials for the Advanced Data Processing course of the [Big Data Analytics](http://bigdata.inf.upv.es) Master at the Universitat Politècnica de València. 4 | 5 | This course gives a 30-hour overview of many concepts, techniques and tools in data processing using Spark, including some key concepts from Apache Beam. We assume you're familiar with Python, but all the exercises can be easily followed in Java and Scala. We've included a Vagrant definition and Docker images for both [Spark](infra/docker/docker.md) and [Beam](infra/beam/beam.md). 6 | 7 | If you find a bug or you want to contribute some comments, please [file an issue in this repository](https://github.com/luisbelloch/data_processing_course/issues/new) or simply [write us](mailto:bigdata@luisbelloch.es). You're free to reuse the course materials; please follow the details in the [license section](#license). 8 | 9 | ## Structure 10 | 11 | ### Part A - Spark 12 | 13 | 1. Brief intro to functional programming 14 | 2. Spark basics 15 | 3. PySpark: transformations, actions and basic IO 16 | 4. Spark SQL 17 | 5. MLlib 18 | 6. Graphs 19 | - GraphX (Scala) 20 | - GraphFrames (Python) 21 | 7. Spark cluster deployment 22 | - [Single node](infra/single-node.md) 23 | - [Vagrant box playground](infra/vagrant.md) 24 | - Clustering 25 | - [Docker](infra/docker/docker.md) 26 | - [Kubernetes](infra/kubernetes/kubernetes.md) 27 | - [Cloud Dataproc](infra/dataproc.md) - [Start Tutorial](https://ssh.cloud.google.com/cloudshell/open?cloudshell_git_repo=https://github.com/luisbelloch/data_processing_course.git&page=editor&cloudshell_tutorial=infra/dataproc.md) (in Spanish) 28 | 8. Apache Beam 29 | - [Rationale](http://www.vldb.org/pvldb/vol8/p1792-Akidau.pdf) 30 | - [Docker container using Python SDK](infra/beam/beam.md) 31 | - Slides (coming soon) 32 | 9. Minio 33 | 10. Apache Airflow: coordinating jobs 34 | - Basic setup 35 | - DAGs 36 | - Cloud Composer 37 | 38 | ### Part B - Architecture Workshop 39 | 40 | Teamwork using [Aronson's puzzle](https://en.wikipedia.org/wiki/Jigsaw_(teaching_technique)).
We present a set of real case studies to solve, and teams have to design and develop them using any technology available in the market today. 41 | 42 | In the first phase, the teams split with the goal of becoming experts in a particular area, digging into the specifics of the proposed tools and frameworks. In the second phase, they return to their peers to design a system that covers the use case requirements. There's a 15-minute presentation per team to share the results. 43 | 44 | ## Lecture Notes 45 | 46 | To be added soon, stay tuned! 47 | 48 | ## Source Samples 49 | 50 | - Functional programming (coming soon) 51 | - Why you don't need big data tools 52 | - [poors_man_routes.sh](data/poors_man_routes.sh) - bash superpowers 53 | - Basic data processing using PySpark 54 | - [compras_con_mas_de_un_descuento.py](spark/compras_con_mas_de_un_descuento.py) 55 | - [compras_importe_total_agrupado_por_tx_id.py](spark/compras_importe_total_agrupado_por_tx_id.py) 56 | - [compras_conversion_a_dolares.py](spark/compras_conversion_a_dolares.py) 57 | - [compras_top_ten_countries.py](spark/compras_top_ten_countries.py) 58 | - [helpers.py](spark/helpers.py) - basic parse functions to get started quickly 59 | - Spark SQL 60 | - [compras_sql.py](spark/compras_sql.py) 61 | - [container.py](spark/container.py) 62 | - [container_convertir_a_parquet.py](spark/container_convertir_a_parquet.py) 63 | - [container_rdd_to_dataset.py](spark/container_rdd_to_dataset.py) 64 | - [container_databricks_csv.py](spark/container_databricks_csv.py) 65 | - [container_caching.py](spark/container_caching.py) 66 | - [container_partition.py](spark/container_partition.py) 67 | - Spark Streaming 68 | - [hft.py](spark/hft.py) and [stock_server.rb](spark/stock_server.rb) 69 | - MLlib 70 | - [peliculas_0_ml.py](spark/peliculas_0_ml.py) - ALS intro 71 | - [peliculas_1_mllib.py](spark/peliculas_1_mllib.py) - Predictions 72 | - GraphFrames 73 | - [friends.py](spark/friends.py) - Classic graph sample 74 | - [ship_routes.py](spark/ship_routes.py) - Shortest paths for ship routes 75 | - Apache Beam 76 | - [basic.py](beam/basic.py) 77 | - [compras.py](beam/compras.py) 78 | - [compras_ptransform.py](beam/compras_ptransform.py) 79 | - [compras_ptransform_condensed.py](beam/compras_ptransform_condensed.py) 80 | - [compras_totales_por_pais.py](beam/compras_totales_por_pais.py) 81 | - Apache Airflow 82 | - [Standalone Docker Image](https://hub.docker.com/r/luisbelloch/airflow) 83 | - Tutorial for Composer in Cloud Shell [[English]() / [Spanish]()] 84 | - [hello_dags.py](airflow/dags/hello_dags.py) 85 | - [hello_python_operator.py](airflow/dags/hello_python_operator.py) 86 | - [hello_simple.py](airflow/dags/hello_simple.py) 87 | - [spark_ondemand.py](airflow/dags/spark_ondemand.py) 88 | - [spark_simple.py](airflow/dags/spark_simple.py) 89 | - Deployment 90 | - [Single Node](infra/single-node.md) 91 | - [Vagrant](infra/vagrant.md) 92 | - [Ansible](playbook.yml) 93 | - [Spark on Docker](infra/docker/docker.md) 94 | - [Beam on Docker](infra/beam/beam.md) 95 | - [Spark on Kubernetes](infra/kubernetes/kubernetes.md) 96 | - [Spark on Google Cloud Dataproc](infra/dataproc.md) 97 | - Tutorial for Dataproc in Cloud Shell English / [Spanish](https://ssh.cloud.google.com/cloudshell/open?cloudshell_git_repo=https://github.com/luisbelloch/data_processing_course.git&page=editor&cloudshell_tutorial=infra/dataproc.md) 98 | - [PySpark Jupyter Notebook](infra/pyspark-jupyter/README.md) 99 | 100 | ## Assignments 101 | 102 | Final course assignments can be found in
[this document](assignments/README.md). They are in Spanish; they will be translated to English at some point. 103 | 104 | I'm not publishing the solutions to avoid remaking the exercises every year. There's a test suite using [py.test](http://pytest.org) to help you validate the results. If you're really interested in them, please write to [bigdata@luisbelloch.es](mailto:bigdata@luisbelloch.es). 105 | 106 | ## Evaluation Criteria 107 | 108 | > Self-sufficiency is the state of not requiring any aid, support, or interaction, for survival; it is therefore a type of personal or collective autonomy - [Wikipedia](https://en.wikipedia.org/wiki/Self-sufficiency). 109 | 110 | We follow a self-sufficiency principle for students to drive the course goals. At the end of the course, students should have enough knowledge and tools to develop small data processing solutions on their own. 111 | 112 | 1. Student understands the underlying concepts behind Spark, and is able to write data processing scripts using PySpark, Spark SQL and MLlib. 113 | 2. Student is capable of identifying common data processing libraries and frameworks and their applications. 114 | 3. Student is capable of working in a team designing a system to cover a simple data processing scenario, understanding the basic implications of the choices they make on systems, languages, libraries and platforms. 115 | 116 | ## Readings and links 117 | 118 | We recommend the following papers to expand knowledge on Spark and other data processing techniques: 119 | 120 | - [Resilient Distributed Datasets: A Fault-Tolerant Abstraction for In-Memory Cluster Computing](https://www.usenix.org/system/files/conference/nsdi12/nsdi12-final138.pdf) 121 | - [Discretized Streams: An Efficient and Fault-Tolerant Model for Stream Processing on Large Clusters](http://people.csail.mit.edu/matei/papers/2012/hotcloud_spark_streaming.pdf) 122 | - [Spark SQL: Relational Data Processing in Spark](http://people.csail.mit.edu/matei/papers/2015/sigmod_spark_sql.pdf) 123 | - [MLlib: Machine Learning in Apache Spark](http://www.jmlr.org/papers/volume17/15-237/15-237.pdf) 124 | - [GraphX: Unifying Data-Parallel and Graph-Parallel Analytics](https://amplab.cs.berkeley.edu/wp-content/uploads/2014/02/graphx.pdf) 125 | - [Tachyon: Memory Throughput I/O for Cluster Computing Frameworks](http://people.eecs.berkeley.edu/~haoyuan/papers/2013_ladis_tachyon.pdf) 126 | - [The Dataflow Model: A Practical Approach to Balancing Correctness, Latency, and Cost in Massive-Scale, Unbounded, Out-of-Order Data Processing](http://www.vldb.org/pvldb/vol8/p1792-Akidau.pdf) 127 | - [Streaming 101: The world beyond batch](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-101) - [Part two](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-102) 128 | - [Apache Flink™: Stream and Batch Processing in a Single Engine](https://www.user.tu-berlin.de/asteriosk/assets/publications/flink-deb.pdf) 129 | - [MillWheel: Fault-Tolerant Stream Processing at Internet Scale](http://research.google.com/pubs/pub41378.html) 130 | - [Pig Latin: A Not-So-Foreign Language for Data Processing](http://infolab.stanford.edu/~olston/publications/sigmod08.pdf) 131 | - [Interpreting the Data: Parallel Analysis with Sawzall](http://research.google.com/archive/sawzall.html) 132 | - [Photon: Fault-tolerant and Scalable Joining of Continuous Data Streams](http://research.google.com/pubs/pub41318.html) 133 | - [Above the Clouds: A Berkeley View of Cloud
Computing](https://www2.eecs.berkeley.edu/Pubs/TechRpts/2009/EECS-2009-28.pdf) 134 | - [Cloud Programming Simplified: A Berkeley View on Serverless Computing](https://arxiv.org/abs/1902.03383) (particularly item 8.2 on MapReduce also applies to Spark) 135 | 136 | ## Roadmap 137 | 138 | Some ideas we might add in forthcoming course editions: 139 | 140 | - Code samples in python notebooks 141 | - ~~Apache Flink and Apache Beam~~ (2017) 142 | - Add Tachyon content and exercises 143 | - Add Kafka source to the streaming sample 144 | - ~~Introduce samples with Minio / InfiniSpan~~ (2018) 145 | - ~~Improve deployment scenarios and tools: Mesos, Chef, etc.~~ (2017) 146 | - Monitoring using Prometheus and Grafana, provide ready-to-use docker containers 147 | - Profiling of Spark applications (Scala only) 148 | - Translate all content to English and Spanish 149 | - ~~Cloud Dataproc~~ (2019) 150 | - ~~Apache Airflow~~ (2019) 151 | - Tensorflow training and model execution at scale 152 | 153 | ## License 154 | 155 | Advanced Data Processing course materials. 156 | Copyright (C) 2016, Luis Belloch 157 | 158 | Creative Commons License
This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 159 | 160 | ### Recommended citation 161 | 162 | > Luis Belloch, course materials for Advanced Data Processing, Spring 2016. Master on Big Data Analytics (http://bigdata.inf.upv.es), Universitat Politècnica de València. Downloaded on [DD Month YYYY]. 163 | 164 | 165 | -------------------------------------------------------------------------------- /Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | 4 | Vagrant.configure("2") do |config| 5 | config.vm.box = "debian/buster64" 6 | config.vbguest.auto_update = false 7 | # config.vm.network "forwarded_port", guest: 8080, host: 8080 8 | # config.vm.network "forwarded_port", guest: 8081, host: 8081 9 | # config.vm.network "forwarded_port", guest: 8082, host: 8082 10 | 11 | config.vm.provision "shell" do |s| 12 | s.inline = "apt-get update && apt-get install -y python" 13 | end 14 | 15 | config.vm.provision "ansible_local" do |ansible| 16 | ansible.verbose = "v" 17 | ansible.playbook = "playbook.yml" 18 | ansible.compatibility_mode = "2.0" 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /airflow/.env: -------------------------------------------------------------------------------- 1 | AIRFLOW_UID=50000 2 | -------------------------------------------------------------------------------- /airflow/.gitignore: -------------------------------------------------------------------------------- 1 | docker-compose.yaml 2 | logs/ 3 | plugins/ 4 | .minio/ 5 | airflow 6 | -------------------------------------------------------------------------------- /airflow/Makefile: -------------------------------------------------------------------------------- 1 | AIRFLOW_VERSION:=2.4.3 2 | 3 | .PHONY: all 4 | all: clean docker-compose.yaml airflow init up 5 | 6 | docker-compose.yaml: 7 | curl -LfO 'https://airflow.apache.org/docs/apache-airflow/${AIRFLOW_VERSION}/docker-compose.yaml' 8 | 9 | airflow: 10 | curl -Lf 'https://airflow.apache.org/docs/apache-airflow/${AIRFLOW_VERSION}/airflow.sh' > airflow 11 | chmod +x airflow 12 | 13 | .PHONY: init 14 | init: 15 | docker-compose up airflow-init 16 | 17 | .PHONY: up 18 | up: 19 | docker-compose up 20 | 21 | .PHONY: down 22 | down: 23 | docker-compose down --remove-orphans 24 | 25 | .PHONY: minio 26 | minio: 27 | docker-compose -f docker-compose.yaml -f docker-compose.minio.yaml up minio 28 | 29 | .PHONY: minio_connection 30 | minio_connection: airflow 31 | ./airflow connections import minio_connection.json 32 | 33 | .PHONY: clean 34 | clean: 35 | -docker-compose down --volumes --remove-orphans 36 | -rm -rf logs/ plugins/ .minio/ docker-compose.yaml airflow dags/__pycache__ 37 | -------------------------------------------------------------------------------- /airflow/dags/gasolina_naive.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | 4 | from airflow import AirflowException 5 | from airflow.decorators import dag, task 6 | 7 | import requests 8 | 9 | codigo_postal = "50197" 10 | endpoint = "https://sedeaplicaciones.minetur.gob.es/ServiciosRESTCarburantes/PreciosCarburantes/EstacionesTerrestres/" 11 | 12 | @dag(schedule_interval=None, start_date=datetime(2021, 1, 1), catchup=False, tags=['upv']) 13 | def extraer_precio_gasolina_naive(): 14 | 15 | @task 16 | def recogida(): 17 | 
print("Recogiendo datos...") 18 | response = requests.get(endpoint) 19 | if response.status_code != 200: 20 | raise AirflowException(f"Fallo de conexión {response.status_code}") 21 | 22 | datos = response.json() 23 | return datos['ListaEESSPrecio'] 24 | 25 | @task 26 | def filtrado(datos, codigo_postal): 27 | return list(filter(lambda x: x['C.P.'] == codigo_postal, datos)) 28 | 29 | @task 30 | def almacenamiento(datos): 31 | print("Almacenando datos...") 32 | print(json.dumps(datos, indent=2)) 33 | 34 | todos_los_datos = recogida() 35 | datos_del_codigo_postal_x = filtrado(todos_los_datos, codigo_postal) 36 | almacenamiento(datos_del_codigo_postal_x) 37 | 38 | dag_gasolina = extraer_precio_gasolina_naive() 39 | 40 | -------------------------------------------------------------------------------- /airflow/dags/gasolina_s3.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from datetime import datetime 4 | 5 | from airflow import AirflowException 6 | from airflow.decorators import dag, task 7 | from airflow.operators.bash_operator import BashOperator 8 | from airflow.contrib.sensors.file_sensor import FileSensor 9 | 10 | import boto3 11 | import botocore.client 12 | import requests 13 | 14 | codigo_postal = "50197" 15 | bucket_name = "gasolina" 16 | endpoint = "https://sedeaplicaciones.minetur.gob.es/ServiciosRESTCarburantes/PreciosCarburantes/EstacionesTerrestres/" 17 | 18 | def s3_resource(): 19 | return boto3.resource('s3', 20 | endpoint_url='http://minio:9000', 21 | aws_access_key_id='bigdataupv', 22 | aws_secret_access_key='bigdataupv', 23 | config=botocore.client.Config(signature_version='s3v4'), region_name='us-east-1') 24 | 25 | def read_json_from_s3(key): 26 | obj = s3_resource().Object(bucket_name, key) 27 | return json.loads(obj.get()['Body'].read().decode('utf-8')) 28 | 29 | def save_to_s3(key, data): 30 | obj = s3_resource().Object(bucket_name, key) 31 | obj.put(Body=data) 32 | 33 | @dag(schedule_interval=None, start_date=datetime(2021, 1, 1), catchup=False, tags=['upv']) 34 | def extraer_precio_gasolina_s3(): 35 | 36 | @task 37 | def recogida_s3(): 38 | print("Recogiendo datos...") 39 | response = requests.get(endpoint) 40 | if response.status_code != 200: 41 | raise AirflowException(f"Fallo de conexión {response.status_code}") 42 | 43 | filename = f'recogida-{datetime.now().strftime("%Y%m%d%H%M%S")}.json' 44 | save_to_s3(filename, response.text) 45 | return { "recogida": filename } 46 | 47 | @task 48 | def filtrado_s3(contexto, codigo_postal): 49 | print("Filtrando datos...") 50 | 51 | datos = read_json_from_s3(contexto['recogida']) 52 | filtrados = list(filter(lambda x: x['C.P.'] == codigo_postal, datos['ListaEESSPrecio'])) 53 | 54 | filename = f'filtrado-{datetime.now().strftime("%Y%m%d%H%M%S")}.json' 55 | save_to_s3(filename, json.dumps(filtrados)) 56 | 57 | return { **contexto, "filtrado": filename } 58 | 59 | @task 60 | def almacenamiento_s3(contexto): 61 | print("Almacenando datos...
Nothing to do!") 62 | return 42 63 | 64 | todos_los_datos = recogida_s3() 65 | datos_del_codigo_postal_x = filtrado_s3(todos_los_datos, codigo_postal) 66 | almacenamiento_s3(datos_del_codigo_postal_x) 67 | 68 | dag_gasolina = extraer_precio_gasolina_s3() 69 | 70 | # Additionally, use Amazon operator, particularly S3KeySensor 71 | # https://airflow.apache.org/docs/apache-airflow-providers-amazon/stable/operators/s3.html 72 | # https://github.com/apache/airflow/tree/main/airflow/providers/amazon/aws/example_dags 73 | -------------------------------------------------------------------------------- /airflow/dags/hello_dags.py: -------------------------------------------------------------------------------- 1 | from builtins import range 2 | from datetime import timedelta 3 | 4 | import airflow 5 | from airflow.models import DAG 6 | from airflow.operators.bash_operator import BashOperator 7 | from airflow.operators.dummy_operator import DummyOperator 8 | from airflow.operators.python_operator import PythonOperator 9 | 10 | dag = DAG('hello_dags', schedule_interval=None, start_date=airflow.utils.dates.days_ago(2), tags=['upv']) 11 | 12 | def print_hello(): 13 | return 'Hello world!' 14 | 15 | inicio = BashOperator(task_id='inicio', bash_command="echo inicio!", dag=dag) 16 | paso1 = BashOperator(task_id='paso1', bash_command="echo paso 1", dag=dag) 17 | paso2 = PythonOperator(task_id='paso2', python_callable=print_hello, dag=dag) 18 | paso3 = DummyOperator(task_id='paso3', dag=dag) 19 | ultima_tarea = DummyOperator(task_id='ultima_tarea', dag=dag) 20 | 21 | inicio >> [paso1, paso3] 22 | paso1 >> paso2 >> ultima_tarea 23 | paso3 >> ultima_tarea 24 | 25 | if __name__ == "__main__": 26 | dag.cli() 27 | -------------------------------------------------------------------------------- /airflow/dags/hello_python_operator.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from airflow import DAG 4 | from airflow.operators.dummy_operator import DummyOperator 5 | from airflow.operators.python_operator import PythonOperator 6 | 7 | def print_hello(): 8 | return 'Hello world!' 
9 | 10 | dag = DAG( 11 | 'hello_python_operator', 12 | description='Simple tutorial DAG', 13 | schedule_interval='20 * * * *', 14 | start_date=datetime(2017, 3, 20), 15 | tags=['upv'], 16 | catchup=False) 17 | 18 | dummy_operator = DummyOperator(task_id='dummy_task', retries=3, dag=dag) 19 | 20 | hello_operator = PythonOperator(task_id='hello_from_python', python_callable=print_hello, dag=dag) 21 | 22 | dummy_operator >> hello_operator 23 | -------------------------------------------------------------------------------- /airflow/dags/hello_simple.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from airflow import DAG 4 | from airflow.operators.dummy_operator import DummyOperator 5 | 6 | default_args = {"start_date": datetime(2019, 2, 5)} 7 | dag = DAG('hello', default_args=default_args, schedule_interval=None, tags=['upv'],) 8 | 9 | dummy_operator = DummyOperator(task_id='dummy_task', dag=dag) 10 | hello_operator = DummyOperator(task_id='hello_task', dag=dag) 11 | 12 | dummy_operator >> hello_operator 13 | -------------------------------------------------------------------------------- /airflow/dags/s3_bucket_operations.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | from airflow.decorators import task 5 | from airflow.models.dag import DAG 6 | from airflow.providers.amazon.aws.hooks.s3 import S3Hook 7 | from airflow.providers.amazon.aws.operators.s3 import S3CreateBucketOperator, S3DeleteBucketOperator 8 | 9 | # By default, it will use the 'aws_default' connection. You can create it by running `make minio_connection` 10 | # If you want to change it, use a variable and pass it as `aws_conn_id` to all AWS operators.
11 | AWS_CONN_ID = 'aws_default' 12 | 13 | BUCKET_NAME = os.environ.get('BUCKET_NAME', 'patatas') 14 | 15 | @task(task_id="s3_bucket_dag_add_keys_to_bucket") 16 | def upload_keys(): 17 | s3_hook = S3Hook() 18 | for i in range(0, 3): 19 | s3_hook.load_string(string_data="input", key=f"path/data{i}", bucket_name=BUCKET_NAME) 20 | 21 | with DAG( 22 | dag_id='s3_bucket_operations', 23 | schedule_interval=None, 24 | start_date=datetime(2021, 1, 1), 25 | catchup=False, 26 | default_args={"bucket_name": BUCKET_NAME}, 27 | max_active_runs=1, 28 | tags=['upv'], 29 | ) as dag: 30 | 31 | create_bucket = S3CreateBucketOperator(task_id='s3_bucket_dag_create', region_name='us-east-1') 32 | add_keys_to_bucket = upload_keys() 33 | delete_bucket = S3DeleteBucketOperator(task_id='s3_bucket_dag_delete', force_delete=True) 34 | create_bucket >> add_keys_to_bucket >> delete_bucket 35 | -------------------------------------------------------------------------------- /airflow/dags/s3_file_sensor.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | from airflow.decorators import task 5 | from airflow.models.dag import DAG 6 | from airflow.models.variable import Variable 7 | from airflow.providers.amazon.aws.sensors.s3 import S3KeySensor 8 | 9 | BUCKET_NAME = os.environ.get('BUCKET_NAME', 'patatas') 10 | 11 | @task(task_id="do_something") 12 | def do_something(): 13 | print("Something!") 14 | 15 | with DAG( 16 | dag_id='s3_file_sensor', 17 | schedule_interval=None, 18 | start_date=datetime(2021, 1, 1), 19 | catchup=False, 20 | default_args={"bucket_name": BUCKET_NAME}, 21 | max_active_runs=1, 22 | tags=['upv'], 23 | ) as dag: 24 | 25 | op = S3KeySensor(task_id="s3_key_sensor", bucket_key="s3://gasolina/some_file.json", bucket_name=None, dag=dag) 26 | end_task = do_something() 27 | op >> end_task 28 | -------------------------------------------------------------------------------- /airflow/dags/spark_ondemand.py: -------------------------------------------------------------------------------- 1 | # ./airflow variables set gcp_project bigdataupv2022 2 | # ./airflow variables set gcp_region europe-west1 3 | # ./airflow variables set gcp_zone europe-west1-b 4 | # ./airflow variables set gcp_bucket bigdataupv_data 5 | 6 | import datetime 7 | import os 8 | 9 | from airflow import models 10 | from airflow.contrib.operators import dataproc_operator 11 | from airflow.utils import trigger_rule 12 | 13 | yesterday = datetime.datetime.combine( 14 | datetime.datetime.today() - datetime.timedelta(1), 15 | datetime.datetime.min.time()) 16 | 17 | default_dag_args = { 18 | 'start_date': yesterday, 19 | 'email_on_failure': False, 20 | 'email_on_retry': False, 21 | 'retries': 1, 22 | 'retry_delay': datetime.timedelta(minutes=5), 23 | 'project_id': models.Variable.get('gcp_project') 24 | } 25 | 26 | with models.DAG( 27 | 'spark_ondemand', 28 | schedule_interval=datetime.timedelta(days=1), 29 | default_args=default_dag_args) as dag: 30 | 31 | create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator( 32 | task_id='create_dataproc_cluster', 33 | cluster_name='spark-cluster-{{ ds_nodash }}', 34 | num_workers=2, 35 | zone=models.Variable.get('gcp_zone'), 36 | region=models.Variable.get('gcp_region'), 37 | master_machine_type='n1-standard-1', 38 | worker_machine_type='n1-standard-1') 39 | 40 | run_dataproc_pyspark = dataproc_operator.DataProcPySparkOperator( 41 | task_id='run_spark', 42 | cluster_name='spark-cluster-{{ ds_nodash }}', 43 
| region=models.Variable.get('gcp_region'), 44 | main='gs://bigdataupv_code/compras_top_ten_countries.py', 45 | files=['gs://bigdataupv_code/helpers.py']) 46 | 47 | delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator( 48 | task_id='delete_dataproc_cluster', 49 | cluster_name='spark-cluster-{{ ds_nodash }}', 50 | trigger_rule=trigger_rule.TriggerRule.ALL_DONE) 51 | 52 | create_dataproc_cluster >> run_dataproc_pyspark >> delete_dataproc_cluster 53 | 54 | -------------------------------------------------------------------------------- /airflow/dags/spark_simple.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | 4 | from airflow import models 5 | from airflow.contrib.operators import dataproc_operator 6 | from airflow.utils import trigger_rule 7 | 8 | output_file = os.path.join( 9 | models.Variable.get('gcs_bucket'), 'dataproc_simple', 10 | datetime.datetime.now().strftime('%Y%m%d-%H%M%S')) + os.sep 11 | 12 | yesterday = datetime.datetime.combine( 13 | datetime.datetime.today() - datetime.timedelta(1), 14 | datetime.datetime.min.time()) 15 | 16 | args = { 17 | 'start_date': yesterday, 18 | 'email_on_failure': False, 19 | 'email_on_retry': False, 20 | 'retries': 1, 21 | 'retry_delay': datetime.timedelta(minutes=5), 22 | 'project_id': models.Variable.get('gcp_project') 23 | } 24 | 25 | with models.DAG('spark_simple', schedule_interval=datetime.timedelta(days=1), default_args=args) as dag: 26 | run_step = dataproc_operator.DataProcPySparkOperator( 27 | task_id='run_spark', 28 | cluster_name='cluster-9c11', 29 | region=models.Variable.get('gcp_region'), 30 | main='gs://bigdataupv_code/compras_top_ten_countries.py', 31 | files=['gs://bigdataupv_code/helpers.py']) 32 | 33 | -------------------------------------------------------------------------------- /airflow/docker-compose.minio.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | version: '3' 3 | services: 4 | minio: 5 | image: minio/minio 6 | environment: 7 | MINIO_ROOT_USER: bigdataupv 8 | MINIO_ROOT_PASSWORD: bigdataupv 9 | MINIO_REGION_NAME: us-east-1 10 | ports: 11 | - '9000:9000' 12 | - '9001:9001' 13 | init: true 14 | entrypoint: sh 15 | command: -c 'mkdir -p /data/gasolina && minio server /data --console-address ":9001"' 16 | volumes: 17 | - .minio:/data 18 | -------------------------------------------------------------------------------- /airflow/minio_connection.json: -------------------------------------------------------------------------------- 1 | { 2 | "aws_default": { 3 | "conn_type": "s3", 4 | "description": "", 5 | "host": "", 6 | "login": "bigdataupv", 7 | "password": "bigdataupv", 8 | "schema": "", 9 | "port": null, 10 | "extra": "{\"host\": \"http://minio:9000\"}" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /assignments/.dockerignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .cache/ 3 | .direnv/ 4 | .pytest_cache/ 5 | .venv/ 6 | resultados/ 7 | resultados/ 8 | soluciones/ 9 | spark-warehouse/ 10 | venv/ 11 | *.pyc 12 | -------------------------------------------------------------------------------- /assignments/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .cache/ 3 | .direnv/ 4 | .pytest_cache/ 5 | .venv/ 6 | resultados/ 7 | resultados/ 8 | soluciones/ 9 | spark-warehouse/ 10 | venv/ 11 | *.pyc 12 | 
-------------------------------------------------------------------------------- /assignments/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM luisbelloch/spark 2 | LABEL maintainer="Luis Belloch " 3 | 4 | WORKDIR /opt/tests/ 5 | 6 | ENV DEBIAN_FRONTEND=noninteractive 7 | RUN apt-get update && \ 8 | apt-get upgrade -y python3 && \ 9 | apt-get install -y --no-install-recommends python3-venv python3-pip && \ 10 | rm -rf /var/lib/apt/lists/* 11 | 12 | COPY requirements.txt . 13 | 14 | RUN pip3 install wheel 15 | RUN pip3 install -r requirements.txt 16 | 17 | -------------------------------------------------------------------------------- /assignments/Makefile: -------------------------------------------------------------------------------- 1 | IMAGE_NAME:=luisbelloch/spark-assignments 2 | 3 | .PHONY: all build tag push list test 4 | 5 | test: 6 | ./test.sh 7 | 8 | all: build tag 9 | 10 | build: 11 | docker build -t $(IMAGE_NAME) . 12 | 13 | tag: 14 | docker tag $(IMAGE_NAME) $(IMAGE_NAME):2.4.5 15 | docker tag $(IMAGE_NAME) $(IMAGE_NAME):2020.1 16 | 17 | push: 18 | docker push $(IMAGE_NAME):2.4.5 19 | docker push $(IMAGE_NAME):2020.1 20 | docker push $(IMAGE_NAME) 21 | 22 | list: 23 | docker images $(IMAGE_NAME) 24 | 25 | -------------------------------------------------------------------------------- /assignments/README.md: -------------------------------------------------------------------------------- 1 | # Prácticas SPARK 2 | 3 | Las prácticas consisten en desarrollar una serie de ejercicios de procesado de datos con `PySpark`. 4 | 5 | Para completar las prácticas debe completarse un archivo llamado `contenedores.py` con los ejercicios abajo descritos. No es necesaria explicación alguna, únicamente se pide que el código esté limpio, bien estructurado y ejecute correctamente. 6 | 7 | Los archivos de datos vienen incluidos en este repositorio en la carpeta `data`. Entre los archivos de la práctica se ha incluido [un contenedor de Docker](https://hub.docker.com/r/luisbelloch/spark-assignments/) con todo lo necesario instalado. También se ha incluido una [batería de pruebas](pruebas) para que puedas comprobar los resultados antes de entregar la práctica. 8 | 9 | Cada ejercicio produce un resultado distinto. Los resultados deben guardarse en una carpeta denominada `resultados`, teniendo un único archivo por ejercicio con la nomenclatura `resultado_1`, `resultado_2` etc. La función `path_resultados` devuelve la ruta completa que puedes usar para guardar los datos procesados en cada ejercicio. En la mayoría de los casos debes devolver un DataFrame: 10 | 11 | ``` 12 | def ejercicio_3(sc, path_resultados): 13 | df = sq.sql(...) 14 | # ... otras operaciones 15 | # ... save(path_resultados(3)) 16 | return df 17 | ``` 18 | 19 | Los ejercicios se realizarán sobre un fichero en formato CSV que contiene una lista de barcos, identificados por la columna `ship_imo`. A su vez, cada barco tiene una lista de contenedores identificados por la columna `container_id`. 20 | 21 | Para el procesado del archivo puedes utilizar cualquier función disponible en el API de Python de Spark 2.2.1 22 | 23 | ## Plazo de entrega 24 | 25 | Los ejercicios hay que enviarlos antes del 1 de febrero. 26 | 27 | ## Criterios de evaluación 28 | 29 | 1. El alumno entiende y es capaz de ejecutar programas en PySpark, haciendo uso de el core de Spark 2.2 y Spark SQL. 30 | 2. 
El archivo `contenedores.py` producido por el alumno se puede ejecutar con `spark-submit` y, opcionalmente, con `pytest`. 31 | 3. El código está estructurado correctamente, es legible y tiene una intencionalidad clara. 32 | 33 | ## Ejercicios 34 | 35 | **Ejercicio 0**. Ejecutar el archivo `contenedores.py` y comprobar que se crea un archivo dentro de la carpeta `resultados` con números del 0 al 9. 36 | 37 | ``` 38 | $ spark-submit contenedores.py 39 | $ cat resultados/resultado_0 40 | 0,1,2,3,4,5,6,7,8,9 41 | ``` 42 | 43 | **Ejercicio 1**. Leer el archivo `data/containers.csv` y contar el número de líneas. 44 | 45 | **Ejercicio 2**. Leer el archivo `data/containers.csv` y filtrar aquellos contenedores cuyo `ship_imo` es `DEJ1128330` y el grupo del contenedor es `22P1`. Guardar los resultados en un archivo de texto en `resultados/resutado_2`. 46 | 47 | **Ejercicio 3**. Leer el archivo `data/containers.csv` y convertir a formato Parquet. Recuerda que puedes hacer uso de la funcion `parse_container` en `helpers.py` tal y como vimos en clase. Guarda los resultados en `resultados/resultado_3`. 48 | 49 | **Ejercicio 4**. Lee el archivo de Parquet guardado en el ejercicio 3 y filtra los barcos que tienen al menos un contenedor donde la columna `customs_ok` es igual a `false`. Extrae una lista con los identificadores de barco, `ship_imo`, sin duplicados y ordenados alfabéticamente, en formato `json`. 50 | 51 | **Ejercicio 5**. Crea una UDF para validar el [código de identificación](https://en.wikipedia.org/wiki/ISO_6346) del contenedor `container_id`. Para simplificar la validación, daremos como válidos aquellos códigos compuestos de 3 letras para el propietario, 1 letra para la categoría, 6 números y 1 dígito de control. Devuelve un `DataFrame` con los campos: `ship_imo`, `container_id`, `propietario`, `categoria`, `numero_serie` y `digito_control`. 52 | 53 | **Ejercicio 6**. Extrae una lista con peso total de cada barco, `net_weight`, sumando cada contenedor y agrupado por los campos `ship_imo` y `container_group`. Devuelve un DataFrame con la siguiente estructura: `ship_imo`, `ship_name`, `container`, `total_net_weight`. 54 | 55 | **Ejercicio 7**. Guarda los resultados del ejercicio anterior en formato Parquet. 56 | 57 | **Ejercicio 8**. ¿En qué casos crees que es más eficiente utilizar formatos como Parquet? ¿Existe alguna desventaja frente a formatos de texto como CSV? 58 | 59 | **Ejercicio 9**. ¿Es posible procesar XML mediante Spark? ¿Existe alguna restricción por la cual no sea eficiente procesar un único archivo en multiples nodos? ¿Se te ocurre alguna posible solución para _trocear_ archivos suficientemente grandes? ¿Existe la misma problemática con otros formatos de texto como JSON? 60 | 61 | **Ejercicio 10**. Spark SQL tiene [una función](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.avg) denominada `avg` que se utiliza para calcular el promedio de un conjunto de valores ¿Por qué los autores han creado esta función en lugar de usar el API estándar de Python o Scala? 62 | 63 | ## Pruebas 64 | 65 | Existe una batería de pruebas para comprobar los resultados de cada ejercicio, desarrollada sobre [pytest](http://pytest.org). Las pruebas no son exhaustivas y únicamente están orientadas a verificar los resultados de cada ejercicio. No es necesario que las pruebas pasen para entregar la práctica, aunque se valorará de forma positiva. Se deja como ejercicio optativo adaptar o ampliar la batería de pruebas. 
66 | 67 | ### Ejecución de pruebas en Docker 68 | 69 | De forma alternativa, hemos incluido una imágen de Docker con todas las dependencias necesarias. El directorio actual se montará como volumen dentro del contenedor, concretamente en `/opt/tests/assigments`. 70 | 71 | ``` 72 | $ ./test.sh 73 | ``` 74 | 75 | También es posible lanzar `bash` o `pyspark` para hacer comprobaciones manualmente: 76 | 77 | ``` 78 | $ docker run -v $(pwd):/opt/tests/assigments -ti luisbelloch/spark-assignments /bin/bash 79 | ``` 80 | 81 | Para simplificar el uso, hemos incluido un script llamado `spark` dentro de [la carpeta de ejemplos](../spark) que vimos en clase. El script es capaz de ejecutar cualquier script contenido dentro de esa carpeta, teniendo acceso también a los archivos de datos en `data`: 82 | 83 | ``` 84 | $ cd data_processing_course/spark 85 | $ ./spark compras_conversion_a_dolares.py 86 | ``` 87 | 88 | ### Ejecución local de pruebas 89 | 90 | Teniendo Spark instalado mediante `local_setup.sh`, puedes instalar `pytest` en local mediante `venv`: 91 | 92 | ``` 93 | $ python3 -m venv .venv 94 | $ source .venv/bin/activate 95 | $ pip install -r requirements.txt 96 | $ export SPARK_HOME=$(pwd)/../.spark 97 | ``` 98 | 99 | Y a partir de aquí puede ejecutarse la suite de pruebas: 100 | 101 | ``` 102 | $ pytest -v 103 | ``` 104 | 105 | Para ejecutar un único test añade el nombre al final, lo único que hay que tener en cuenta es que algunos ejercicios dependen de los datos de los anteriores: 106 | 107 | ``` 108 | $ pytest -v test_ejercicio_2.py 109 | ``` 110 | 111 | Happy hacking! 112 | -------------------------------------------------------------------------------- /assignments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luisbelloch/data_processing_course/0ab2e59f1e3bf3b16fa5440002b4b772b06ee80b/assignments/__init__.py -------------------------------------------------------------------------------- /assignments/conftest.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pytest 4 | import shutil 5 | import sys 6 | 7 | from glob import glob 8 | from .helpers import definir_path_resultados, comprobar_resultados_en_hdfs 9 | 10 | spark_home = os.environ.get('SPARK_HOME', None) 11 | if not spark_home: 12 | raise ValueError("Unable to find Spark, make sure SPARK_HOME environment variable is set") 13 | 14 | if not os.path.exists(spark_home): 15 | raise ValueError("Cannot find path set in SPARK_HOME: " + spark_home) 16 | 17 | spark_python = os.path.join(spark_home, 'python') 18 | py4j = glob(os.path.join(spark_python, 'lib', 'py4j-*.zip'))[0] 19 | sys.path[:0] = [spark_python, py4j] 20 | 21 | from pyspark.context import SparkContext 22 | 23 | @pytest.fixture(scope='session') 24 | def spark_context(request): 25 | sc = SparkContext('local', 'tests_practicas_spark') 26 | request.addfinalizer(lambda: sc.stop()) 27 | logger = logging.getLogger('py4j') 28 | logger.setLevel(logging.WARN) 29 | return sc 30 | 31 | @pytest.fixture(scope='session') 32 | def path_resultados(request): 33 | return definir_path_resultados('./resultados') 34 | 35 | @pytest.fixture(scope='session') 36 | def resultados_ejercicio_3(spark_context, path_resultados): 37 | from contenedores import ejercicio_3 38 | return ejercicio_3(spark_context, path_resultados) 39 | 40 | @pytest.fixture(scope='session') 41 | def comprobar_hdfs(path_resultados): 42 | def check(ejercicio_n): 43 | path = 
path_resultados(ejercicio_n) 44 | return comprobar_resultados_en_hdfs(path) 45 | return check 46 | 47 | @pytest.fixture(scope='session') 48 | def tiene_columnas(): 49 | def check(df, expected): 50 | assert df is not None, 'El DataFrame no existe ¿Olvidaste un "return df" al final del ejercicio?' 51 | assert sorted(expected) == sorted([column.lower() for column in df.columns]) 52 | return check 53 | 54 | -------------------------------------------------------------------------------- /assignments/contenedores.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from pyspark.sql import SQLContext, Row 3 | 4 | from .helpers import * 5 | 6 | path_containers = 'data/containers.csv' 7 | 8 | def ejercicio_0(sc, path_resultados): 9 | lineas = sc.parallelize(range(10)).collect() 10 | with open(path_resultados(0), 'w') as f: 11 | f.write("{}\n".format(",".join([str(s) for s in lineas]))) 12 | return lineas 13 | 14 | # Ejercicio 1. Leer el archivo data/containers.csv y contar el número de líneas. 15 | def ejercicio_1(sc, path_resultados): 16 | # COMPLETAR CÓDIGO AQUÍ 17 | # Devolver número de líneas 18 | return 0 19 | 20 | # Ejercicio 2. Leer el archivo data/containers.csv y filtrar aquellos 21 | # contenedores cuyo ship_imo es DEJ1128330 y el grupo del contenedor es 22P1. 22 | # Guardar los resultados en un archivo de texto en resultados/resutado_2. 23 | def ejercicio_2(sc, path_resultados): 24 | # COMPLETAR CÓDIGO AQUÍ 25 | # Guardar en resultados/resultado_2. La función path_resultados devuelve 26 | # la ruta donde se van a guardar los resultados, para que los tests puedan 27 | # ejecutar de forma correcta. Por ejemplo, path_resultados(2) devuelve la 28 | # ruta para el ejercicio 2, path_resultados(3) para el 3, etc. 29 | # Devolver rdd contenedores filtrados: 30 | # return rdd.collect() 31 | pass 32 | 33 | # Ejercicio 3. Leer el archivo data/containers.csv y convertir a formato 34 | # Parquet. Recuerda que puedes hacer uso de la funcion parse_container en 35 | # helpers.py tal y como vimos en clase. Guarda los resultados en 36 | # resultados/resultado_3. 37 | def ejercicio_3(sc, path_resultados): 38 | # COMPLETAR CÓDIGO AQUÍ 39 | # Guardar resultados y devolver DataFrame (return df) 40 | pass 41 | 42 | # Ejercicio 4. Lee el archivo de Parquet guardado en el ejercicio 3 y filtra 43 | # los barcos que tienen al menos un contenedor donde la columna customs_ok es 44 | # igual a false. Extrae un fichero de texto una lista con los identificadores 45 | # de barco, ship_imo, sin duplicados y ordenados alfabéticamente. 46 | def ejercicio_4(sc, path_resultados): 47 | # COMPLETAR CÓDIGO AQUÍ 48 | # Guardar resultados y devolver DataFrame (return df) 49 | pass 50 | 51 | # Ejercicio 5. Crea una UDF para validar el código de identificación del 52 | # contenedor container_id. Para simplificar la validación, daremos como 53 | # válidos aquellos códigos compuestos de 3 letras para el propietario, 1 54 | # letra para la categoría, 6 números y 1 dígito de control. Devuelve un 55 | # DataFrame con los campos: ship_imo, container_id, propietario, categoria, 56 | # numero_serie y digito_control. 57 | def ejercicio_5(sc, path_resultados): 58 | # COMPLETAR CÓDIGO AQUÍ 59 | # Guardar resultados y devolver DataFrame (return df) 60 | pass 61 | 62 | # Ejercicio 6. Extrae una lista con peso total de cada barco, `net_weight`, 63 | # sumando cada contenedor y agrupado por los campos `ship_imo` y `container_group`. 
64 | # Devuelve un DataFrame con la siguiente estructura: 65 | # `ship_imo`, `ship_name`, `container_group`, `total_net_weight`. 66 | def ejercicio_6(sc, path_resultados): 67 | # COMPLETAR CÓDIGO AQUÍ 68 | # Guardar resultados y devolver DataFrame (return df) 69 | pass 70 | 71 | # Ejercicio 7. Guarda los resultados del ejercicio anterior en formato Parquet. 72 | def ejercicio_7(sc, path_resultados): 73 | # COMPLETAR CÓDIGO AQUÍ 74 | # Guardar resultados y devolver DataFrame (return df) 75 | pass 76 | 77 | def main(): 78 | sc = SparkContext('local', 'practicas_spark') 79 | pr = definir_path_resultados('./resultados') 80 | ejercicio_0(sc, pr) 81 | ejercicio_1(sc, pr) 82 | ejercicio_2(sc, pr) 83 | ejercicio_3(sc, pr) 84 | ejercicio_4(sc, pr) 85 | ejercicio_5(sc, pr) 86 | ejercicio_6(sc, pr) 87 | ejercicio_7(sc, pr) 88 | 89 | if __name__ == '__main__': 90 | main() 91 | 92 | -------------------------------------------------------------------------------- /assignments/helpers.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | 5 | from collections import namedtuple 6 | from functools import partial 7 | from glob import glob 8 | 9 | item_fields = ['tx_id', 'tx_time', 'buyer', 'currency_code', 'payment_type', 'credit_card_number', 'country', 'department', 'product', 'item_price', 'coupon_code', 'was_returned'] 10 | Item = namedtuple('Item', item_fields) 11 | 12 | def parse_item(raw_string): 13 | f = raw_string.split('|') 14 | f += [None] * (len(item_fields) - len(f)) 15 | return Item(*f) 16 | 17 | # Uso básico de namedtuples: 18 | # item = parse_item(['one', 'two']) 19 | # new_item = item._replace(tx_id=1, buyer=5) 20 | 21 | # API http://fixer.io/ 22 | def get_usd_exchange_rates(): 23 | with open('./data/exchange_rates_usd.csv') as f: 24 | data = json.load(f) 25 | return data['rates'] 26 | 27 | container_fields = ['ship_imo', 'ship_name', 'country', 'departure', 'container_id', 'container_type', 'container_group', 'net_weight', 'gross_weight', 'owner', 'declared', 'contact', 'customs_ok'] 28 | Container = namedtuple('Container', container_fields) 29 | 30 | def parse_container(raw_string): 31 | f = raw_string.split(';') 32 | f += [None] * (len(container_fields) - len(f)) 33 | return Container(*f) 34 | 35 | stock_fields = ['simbolo', 'numero', 'precio_compra', 'ultimo_precio', 'returns'] 36 | Stock = namedtuple('Stock', stock_fields) 37 | def parse_stock(raw_string): 38 | f = raw_string.split(',') 39 | return Stock(simbolo=f[0], numero=None, precio_compra=None, ultimo_precio=float(f[1]), returns=0.0) 40 | 41 | def setup_checkpoint(streamingContext): 42 | checkpoint = './checkpoint' 43 | if (os.path.exists(checkpoint)): 44 | shutil.rmtree(checkpoint) 45 | os.mkdir(checkpoint) 46 | streamingContext.checkpoint(checkpoint) 47 | 48 | def definir_path_resultados(path): 49 | if os.path.exists(path): 50 | shutil.rmtree(path) 51 | os.makedirs(path) 52 | return partial(path_resultados_fn, path) 53 | 54 | def path_resultados_fn(basePath, testId, extra = None): 55 | if not extra: 56 | return os.path.join(basePath, 'resultado_' + str(testId)) 57 | return os.path.join(basePath, 'resultado_' + str(testId), extra) 58 | 59 | def comprobar_resultados_en_hdfs(path): 60 | if not os.path.exists(path): 61 | return 'No existe el directorio "{}", asegurate de guardar los datos al finalizar el ejercicio'.format(path) 62 | if not os.path.exists(os.path.join(path, '_SUCCESS')): 63 | return 'El trabajo no terminó correctamente' 64 | parts = 
glob(os.path.join(path, 'part*')) 65 | at_least_one = any(map(lambda p: os.stat(p).st_size > 0, parts)) 66 | if not parts or not at_least_one: 67 | return 'El trabajo terminó correctamente, pero no existen datos en la carpeta "{}"'.format(path) 68 | return True 69 | 70 | -------------------------------------------------------------------------------- /assignments/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | norecursedirs = .git data __pycache__ _build tmp* venv 3 | usefixtures = spark_context path_resultados 4 | -------------------------------------------------------------------------------- /assignments/requirements.txt: -------------------------------------------------------------------------------- 1 | attrs==17.4.0 2 | pluggy==0.6.0 3 | py==1.5.2 4 | pytest==3.4.0 5 | six==1.11.0 6 | -------------------------------------------------------------------------------- /assignments/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | readonly WORKDIR=/opt/tests/assignments 3 | docker run -v $(pwd):${WORKDIR} -w ${WORKDIR} -ti luisbelloch/spark-assignments pytest -v 4 | 5 | -------------------------------------------------------------------------------- /assignments/test_ejercicio_0.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from .contenedores import * 4 | 5 | def test_ejercicio_0_crea_secuencia_de_10_elementos(spark_context, path_resultados): 6 | resultado = ejercicio_0(spark_context, path_resultados) 7 | esperado = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] 8 | assert resultado == esperado 9 | 10 | -------------------------------------------------------------------------------- /assignments/test_ejercicio_1.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from .contenedores import * 4 | 5 | def test_ejercicio_1_cuenta_correctamente_el_numero_de_lineas(spark_context, path_resultados): 6 | resultado = ejercicio_1(spark_context, path_resultados) 7 | assert 614 == resultado 8 | 9 | -------------------------------------------------------------------------------- /assignments/test_ejercicio_2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from .contenedores import * 5 | 6 | @pytest.fixture(scope="session") 7 | def resultados(spark_context, path_resultados): 8 | return ejercicio_2(spark_context, path_resultados) 9 | 10 | def test_ejercicio_2_solo_quedan_dos_contenedores_despues_de_filtrar(resultados): 11 | assert 2 == len(resultados) 12 | 13 | def test_ejercicio_2_comprobar_que_las_matriculas_son_las_correctas(resultados): 14 | assert all([e[0] == 'DEJ1128330' for e in resultados]) 15 | assert 'GYFD1228113' in [e[4] for e in resultados] 16 | assert 'MBPF1909627' in [e[4] for e in resultados] 17 | 18 | def test_ejercicio_2_resultados_guardados(resultados, comprobar_hdfs): 19 | assert comprobar_hdfs(2) == True 20 | 21 | -------------------------------------------------------------------------------- /assignments/test_ejercicio_3.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from .contenedores import * 5 | from pyspark.sql import SQLContext 6 | 7 | def test_ejercicio_3_data_frame_tiene_613_filas(resultados_ejercicio_3): 8 | assert 613 == resultados_ejercicio_3.rdd.count() 9 | 10 | def 
test_ejercicio_3_data_frame_tiene_al_menos_una_fila_correcta(resultados_ejercicio_3): 11 | df = resultados_ejercicio_3 12 | assert 1 == df.filter(df.ship_imo == "JMP1637582").filter(df.container_id == "XPOG1294738").rdd.count() 13 | 14 | def test_ejercicio_3_resultados_guardados(resultados_ejercicio_3, comprobar_hdfs): 15 | assert comprobar_hdfs(3) == True 16 | 17 | def test_ejercicio_3_estructura_dataframe_correcta(resultados_ejercicio_3, tiene_columnas): 18 | tiene_columnas(resultados_ejercicio_3, ['contact', 'container_group', 'container_id', 'container_type', 'country', 'customs_ok', 'declared', 'departure', 'gross_weight', 'net_weight', 'owner', 'ship_imo', 'ship_name']) 19 | 20 | -------------------------------------------------------------------------------- /assignments/test_ejercicio_4.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from .contenedores import * 5 | from pyspark.sql import SQLContext 6 | 7 | @pytest.fixture(scope="session") 8 | def resultados(spark_context, path_resultados): 9 | return ejercicio_4(spark_context, path_resultados) 10 | 11 | def test_ejercicio_4_puede_filtrar_la_lista_de_contenedores(resultados): 12 | assert [row.ship_imo for row in resultados.rdd.collect()] == [ 13 | u'AEY1108363', 14 | u'AMC1861710', 15 | u'DEJ1128330', 16 | u'FUS1202266', 17 | u'GEU1548633', 18 | u'GLV1922612', 19 | u'GYR1192020', 20 | u'IWE1254579', 21 | u'JCI1797526', 22 | u'JET1053895', 23 | u'JMP1637582', 24 | u'KSP1096387', 25 | u'MBV1836745', 26 | u'NCZ1777367', 27 | u'NLH1771681', 28 | u'POG1615575', 29 | u'RYP1117603', 30 | u'SQH1155999', 31 | u'TCU1641123', 32 | u'YZX1455509'] 33 | 34 | def test_ejercicio_4_resultados_guardados(resultados, comprobar_hdfs): 35 | assert comprobar_hdfs(4) == True 36 | 37 | -------------------------------------------------------------------------------- /assignments/test_ejercicio_5.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from .contenedores import * 5 | 6 | @pytest.fixture(scope="session") 7 | def resultados(spark_context, path_resultados): 8 | return ejercicio_5(spark_context, path_resultados) 9 | 10 | def test_ejercicio_5_existen_605_contenedores(resultados): 11 | assert 605 == resultados.rdd.count() 12 | 13 | def test_ejercicio_5_al_menos_uno_de_los_contenedores_validos_existe_en_la_lista(resultados): 14 | assert any([e["propietario"] == "UFC" and e["numero_serie"] == "118653" for e in resultados.rdd.collect()]) 15 | 16 | def test_ejercicio_5_todos_los_contendores_invalidos_estan_excluidos(resultados): 17 | existentes = resultados.select(resultados["container_id"]).rdd.collect() 18 | excluidos = [u'GJFL14A2798', u'CTVU1506A832', u'IJWDR1216916', u'OKANR1240284', u'JMYG190Z978', u'DUKF166276', u''] 19 | assert all([(e not in excluidos) for e in existentes]) 20 | 21 | def test_ejercicio_5_resultados_guardados(resultados, comprobar_hdfs): 22 | assert comprobar_hdfs(5) == True 23 | 24 | def test_ejercicio_5_estructura_dataframe_correcta(resultados, tiene_columnas): 25 | tiene_columnas(resultados, ['categoria', 'container_id', 'digito_control', 'numero_serie', 'propietario', 'ship_imo']) 26 | 27 | -------------------------------------------------------------------------------- /assignments/test_ejercicio_6.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from .contenedores import * 5 | 6 | 
@pytest.fixture(scope="session") 7 | def resultados(spark_context, path_resultados): 8 | return ejercicio_6(spark_context, path_resultados) 9 | 10 | def test_ejercicio_6_existen_261_contenedores_agrupados(resultados): 11 | assert 261 == resultados.rdd.count() 12 | 13 | def test_ejercicio_6_al_menos_uno_de_los_contenedores_validos_existe_en_la_lista(resultados): 14 | esperados = [109383187.34, 14038620.92, 213307524.22, 26936712.06, 29567214.06, 36127305.83, 38100695.63, 57417325.75, 60934192.91, 723432237.28] 15 | assert sorted(esperados) == sorted([r["total_net_weight"] for r in resultados.rdd.collect() if r["ship_imo"] == u'GLV1922612'], key=float) 16 | 17 | def test_ejercicio_6_estructura_dataframe_correcta(resultados, tiene_columnas): 18 | tiene_columnas(resultados, ['container_group', 'ship_imo', 'ship_name', 'total_net_weight']) 19 | 20 | -------------------------------------------------------------------------------- /assignments/test_ejercicio_7.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from .contenedores import * 5 | 6 | @pytest.fixture(scope="session") 7 | def resultados(spark_context, path_resultados): 8 | return ejercicio_7(spark_context, path_resultados) 9 | 10 | def test_ejercicio_7_resultados_guardados(resultados, comprobar_hdfs): 11 | assert comprobar_hdfs(7) == True 12 | 13 | def test_ejercicio_7_estructura_dataframe_correcta(resultados, tiene_columnas): 14 | tiene_columnas(resultados, ['container_group', 'ship_imo', 'ship_name', 'total_net_weight']) 15 | 16 | -------------------------------------------------------------------------------- /beam/.gitignore: -------------------------------------------------------------------------------- 1 | out 2 | -------------------------------------------------------------------------------- /beam/basic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import absolute_import 4 | 5 | import argparse 6 | import logging 7 | 8 | import apache_beam as beam 9 | 10 | from apache_beam.options.pipeline_options import PipelineOptions 11 | from apache_beam.options.pipeline_options import SetupOptions 12 | 13 | def run(argv=None): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--input', dest='input') 16 | parser.add_argument('--output', dest='output') 17 | 18 | known_args, pipeline_args = parser.parse_known_args(argv) 19 | pipeline_args.extend(['--runner=DirectRunner']) 20 | pipeline_options = PipelineOptions(pipeline_args) 21 | pipeline_options.view_as(SetupOptions).save_main_session = True 22 | p = beam.Pipeline(options=pipeline_options) 23 | 24 | print("Input:", known_args.input) 25 | print("Output:", known_args.output) 26 | 27 | if __name__ == '__main__': 28 | logging.getLogger().setLevel(logging.DEBUG) 29 | run() 30 | 31 | -------------------------------------------------------------------------------- /beam/beam: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | if [[ $# -lt 1 ]]; then 5 | >&2 echo "USAGE: ./beam [SCRIPT_NAME]" 6 | >&2 echo "Sample: ./beam basic.py" 7 | exit 1 8 | fi 9 | 10 | readonly script=/opt/beam/$1 11 | readonly local_data=`cd "../data"; pwd` 12 | docker run --rm -v "${PWD}":/opt/beam -v "${local_data}":/data -ti luisbelloch/beam python ${script} ${@:2} 13 | 14 | -------------------------------------------------------------------------------- /beam/compras.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import 3 | 4 | import datetime 5 | import logging 6 | 7 | import apache_beam as beam 8 | 9 | logging.getLogger().setLevel(logging.INFO) 10 | 11 | def dump(line): 12 | logging.info(line) 13 | return line 14 | 15 | def isoDate(date): 16 | return datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ") 17 | 18 | p1 = beam.Pipeline() 19 | lines_collection = (p1 20 | | 'LecturaCompras' >> beam.io.ReadFromText("/data/compras_tiny.csv") 21 | | 'Split' >> beam.Map(lambda l: l.split("|")) 22 | | 'SkipHeader' >> beam.Filter(lambda l: l[0] != 'tx_id') 23 | | 'DosCampos' >> beam.Map(lambda f: { "tx_id": f[0], "tx_time": isoDate(f[1]) }) 24 | | 'DebugPrint' >> beam.Map(lambda x: dump(x))) 25 | 26 | p1.run().wait_until_finish() 27 | 28 | -------------------------------------------------------------------------------- /beam/compras_ptransform.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import 3 | 4 | import datetime 5 | import logging 6 | 7 | import apache_beam as beam 8 | 9 | # ./beam compras_ptransform.py 10 | # head ../data/compras_tiny.json/compras_tiny.json-00000-of-00018 11 | # {'tx_id': u'RHMLNJB157', 'tx_time': datetime.datetime(2010, 2, 3, 4, 12, 3)} 12 | # {'tx_id': u'VFJDQNX118', 'tx_time': datetime.datetime(2010, 10, 24, 3, 1, 9)} 13 | # {'tx_id': u'MYOIBZV163', 'tx_time': datetime.datetime(2010, 7, 26, 5, 23, 35)} 14 | 15 | logging.getLogger().setLevel(logging.INFO) 16 | 17 | def dump(line): 18 | logging.info(line) 19 | return line 20 | 21 | def isoDate(date): 22 | return datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ") 23 | 24 | class ParseCompras(beam.PTransform): 25 | def init(self): 26 | super(ParseCompras, self).__init__() 27 | 28 | def expand(self, pcol): 29 | return (pcol 30 | | 'SplitFields' >> beam.Map(lambda l: l.split("|")) 31 | | 'SkipHeader' >> beam.Filter(lambda l: l[0] != 'tx_id') 32 | | 'DosCampos' >> beam.Map(lambda f: { "tx_id": f[0], "tx_time": isoDate(f[1]) })) 33 | 34 | p1 = beam.Pipeline() 35 | lines_collection = (p1 36 | | 'LecturaCompras' >> beam.io.ReadFromText("/data/compras_tiny.csv") 37 | | ParseCompras() 38 | # | 'DebugPrint' >> beam.Map(lambda x: dump(x)) 39 | | 'Write' >> beam.io.WriteToText('/data/compras_tiny.json/compras_tiny.json')) 40 | 41 | p1.run().wait_until_finish() 42 | 43 | -------------------------------------------------------------------------------- /beam/compras_ptransform_condensed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import 3 | 4 | import datetime 5 | import json 6 | import logging 7 | 8 | import apache_beam as beam 9 | 10 | # ./beam compras_ptransform_condensed.py 11 | # head ../data/compras_tiny.json/compras_tiny.json-00000-of-00018 12 | # {'tx_id': u'RHMLNJB157', 'tx_time': datetime.datetime(2010, 2, 3, 4, 12, 3)} 13 | # {'tx_id': u'VFJDQNX118', 'tx_time': datetime.datetime(2010, 10, 24, 3, 1, 9)} 14 | # {'tx_id': u'MYOIBZV163', 'tx_time': datetime.datetime(2010, 7, 26, 5, 23, 35)} 15 | 16 | logging.getLogger().setLevel(logging.INFO) 17 | 18 | class DateTimeEncoder(json.JSONEncoder): 19 | def default(self, target): 20 | if isinstance(target, datetime.datetime): 21 | return target.isoformat() 22 | return json.JSONEncoder.default(self, target) 23 | 24 | class JsonCoder(object): 25 | def 
encode(self, x): 26 | return json.dumps(x, cls=DateTimeEncoder) 27 | 28 | def decode(self, x): 29 | return json.loads(x) 30 | 31 | def dump(line): 32 | logging.info(line) 33 | return line 34 | 35 | def isoDate(date): 36 | return datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ") 37 | 38 | @beam.ptransform_fn 39 | def ParseCompras(pcol): 40 | return (pcol 41 | | 'SplitFields' >> beam.Map(lambda l: l.split("|")) 42 | | 'SkipHeader' >> beam.Filter(lambda l: l[0] != 'tx_id') 43 | | 'Struct' >> beam.Map(lambda f: { "tx_id": f[0], "tx_time": isoDate(f[1]), "amount": float(f[9]) })) 44 | 45 | p1 = beam.Pipeline() 46 | lines_collection = (p1 47 | | 'LecturaCompras' >> beam.io.ReadFromText("/data/compras_tiny.csv") 48 | | ParseCompras() 49 | # | 'DebugPrint' >> beam.Map(lambda x: dump(x)) 50 | | 'Write' >> beam.io.WriteToText('/data/compras_tiny.json/compras_tiny.json', coder=JsonCoder())) 51 | 52 | p1.run().wait_until_finish() 53 | 54 | -------------------------------------------------------------------------------- /beam/compras_totales_por_pais.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import 3 | 4 | import datetime 5 | import logging 6 | 7 | import apache_beam as beam 8 | 9 | logging.getLogger().setLevel(logging.INFO) 10 | 11 | def dump(line): 12 | logging.info(line) 13 | return line 14 | 15 | def isoDate(date): 16 | return datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ") 17 | 18 | @beam.ptransform_fn 19 | def ParseCompras(pcol): 20 | return (pcol 21 | | 'SplitFields' >> beam.Map(lambda l: l.split("|")) 22 | | 'SkipHeader' >> beam.Filter(lambda l: l[0] != 'tx_id') 23 | | 'Struct' >> beam.Map(lambda f: (f[3], float(f[9])))) 24 | 25 | p = beam.Pipeline() 26 | compras = (p 27 | | beam.io.ReadFromText("/data/compras_tiny.csv") 28 | | ParseCompras()) 29 | 30 | totales = (compras | beam.CombinePerKey(sum)) 31 | cuentas = (compras | beam.combiners.Count.PerKey()) 32 | 33 | ({ "total": totales, "cuenta": cuentas} 34 | | 'Join' >> beam.CoGroupByKey() 35 | | 'Flatten' >> beam.Map(lambda e: "%s|%f|%d" % (e[0], e[1]["total"][0], e[1]["cuenta"][0])) 36 | | 'Dump' >> beam.Map(lambda x: dump(x)) 37 | | 'Write' >> beam.io.WriteToText('/data/compras_totales_por_pais')) 38 | 39 | p.run().wait_until_finish() 40 | 41 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | containers_partitioned 2 | peliculas0_trained_model 3 | compras_tiny.json 4 | compras_totales_por_pais-00000-of-00001 5 | 6 | -------------------------------------------------------------------------------- /data/Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | gem "faker" 3 | -------------------------------------------------------------------------------- /data/Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | concurrent-ruby (1.0.5) 5 | faker (1.8.7) 6 | i18n (>= 0.7) 7 | i18n (0.9.3) 8 | concurrent-ruby (~> 1.0) 9 | 10 | PLATFORMS 11 | ruby 12 | 13 | DEPENDENCIES 14 | faker 15 | 16 | BUNDLED WITH 17 | 1.16.1 18 | -------------------------------------------------------------------------------- /data/containers_tiny.parquet/.part-r-00000-699887c3-5b1f-4afb-ae91-da7750b810c7.gz.parquet.crc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/luisbelloch/data_processing_course/0ab2e59f1e3bf3b16fa5440002b4b772b06ee80b/data/containers_tiny.parquet/.part-r-00000-699887c3-5b1f-4afb-ae91-da7750b810c7.gz.parquet.crc -------------------------------------------------------------------------------- /data/containers_tiny.parquet/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luisbelloch/data_processing_course/0ab2e59f1e3bf3b16fa5440002b4b772b06ee80b/data/containers_tiny.parquet/_SUCCESS -------------------------------------------------------------------------------- /data/containers_tiny.parquet/_common_metadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luisbelloch/data_processing_course/0ab2e59f1e3bf3b16fa5440002b4b772b06ee80b/data/containers_tiny.parquet/_common_metadata -------------------------------------------------------------------------------- /data/containers_tiny.parquet/_metadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luisbelloch/data_processing_course/0ab2e59f1e3bf3b16fa5440002b4b772b06ee80b/data/containers_tiny.parquet/_metadata -------------------------------------------------------------------------------- /data/containers_tiny.parquet/part-r-00000-699887c3-5b1f-4afb-ae91-da7750b810c7.gz.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luisbelloch/data_processing_course/0ab2e59f1e3bf3b16fa5440002b4b772b06ee80b/data/containers_tiny.parquet/part-r-00000-699887c3-5b1f-4afb-ae91-da7750b810c7.gz.parquet -------------------------------------------------------------------------------- /data/country_codes.csv: -------------------------------------------------------------------------------- 1 | Afghanistan,AF 2 | Åland Islands,AX 3 | Albania,AL 4 | Algeria,DZ 5 | American Samoa,AS 6 | Andorra,AD 7 | Angola,AO 8 | Anguilla,AI 9 | Antarctica,AQ 10 | Antigua and Barbuda,AG 11 | Argentina,AR 12 | Armenia,AM 13 | Aruba,AW 14 | Australia,AU 15 | Austria,AT 16 | Azerbaijan,AZ 17 | Bahamas,BS 18 | Bahrain,BH 19 | Bangladesh,BD 20 | Barbados,BB 21 | Belarus,BY 22 | Belgium,BE 23 | Belize,BZ 24 | Benin,BJ 25 | Bermuda,BM 26 | Bhutan,BT 27 | Bolivia,BO 28 | Bonaire,BQ 29 | Bosnia and Herzegovina,BA 30 | Botswana,BW 31 | Bouvet Island,BV 32 | Brazil,BR 33 | British Indian Ocean Territory,IO 34 | Brunei Darussalam,BN 35 | Bulgaria,BG 36 | Burkina Faso,BF 37 | Burundi,BI 38 | Cambodia,KH 39 | Cameroon,CM 40 | Canada,CA 41 | Cape Verde,CV 42 | Cayman Islands,KY 43 | Central African Republic,CF 44 | Chad,TD 45 | Chile,CL 46 | China,CN 47 | Christmas Island,CX 48 | Cocos (Keeling) Islands,CC 49 | Colombia,CO 50 | Comoros,KM 51 | Congo,CG 52 | Congo,CD 53 | Cook Islands,CK 54 | Costa Rica,CR 55 | Côte d'Ivoire,CI 56 | Croatia,HR 57 | Cuba,CU 58 | Curaçao,CW 59 | Cyprus,CY 60 | Czech Republic,CZ 61 | Denmark,DK 62 | Djibouti,DJ 63 | Dominica,DM 64 | Dominican Republic,DO 65 | Ecuador,EC 66 | Egypt,EG 67 | El Salvador,SV 68 | Equatorial Guinea,GQ 69 | Eritrea,ER 70 | Estonia,EE 71 | Ethiopia,ET 72 | Falkland Islands (Malvinas),FK 73 | Faroe Islands,FO 74 | Fiji,FJ 75 | Finland,FI 76 | France,FR 77 | French Guiana,GF 78 | French Polynesia,PF 79 | French Southern Territories,TF 80 | Gabon,GA 81 | Gambia,GM 82 | Georgia,GE 83 | Germany,DE 84 
| Ghana,GH 85 | Gibraltar,GI 86 | Greece,GR 87 | Greenland,GL 88 | Grenada,GD 89 | Guadeloupe,GP 90 | Guam,GU 91 | Guatemala,GT 92 | Guernsey,GG 93 | Guinea,GN 94 | Guinea-Bissau,GW 95 | Guyana,GY 96 | Haiti,HT 97 | Heard Island and McDonald Islands,HM 98 | Holy See (Vatican City State),VA 99 | Honduras,HN 100 | Hong Kong,HK 101 | Hungary,HU 102 | Iceland,IS 103 | India,IN 104 | Indonesia,ID 105 | Iran,IR 106 | Iraq,IQ 107 | Ireland,IE 108 | Isle of Man,IM 109 | Israel,IL 110 | Italy,IT 111 | Jamaica,JM 112 | Japan,JP 113 | Jersey,JE 114 | Jordan,JO 115 | Kazakhstan,KZ 116 | Kenya,KE 117 | Kiribati,KI 118 | Korea,KP 119 | Korea,KR 120 | Kuwait,KW 121 | Kyrgyzstan,KG 122 | Lao People's Democratic Republic,LA 123 | Latvia,LV 124 | Lebanon,LB 125 | Lesotho,LS 126 | Liberia,LR 127 | Libya,LY 128 | Liechtenstein,LI 129 | Lithuania,LT 130 | Luxembourg,LU 131 | Macao,MO 132 | Macedonia,MK 133 | Madagascar,MG 134 | Malawi,MW 135 | Malaysia,MY 136 | Maldives,MV 137 | Mali,ML 138 | Malta,MT 139 | Marshall Islands,MH 140 | Martinique,MQ 141 | Mauritania,MR 142 | Mauritius,MU 143 | Mayotte,YT 144 | Mexico,MX 145 | Micronesia,FM 146 | Moldova,MD 147 | Monaco,MC 148 | Mongolia,MN 149 | Montenegro,ME 150 | Montserrat,MS 151 | Morocco,MA 152 | Mozambique,MZ 153 | Myanmar,MM 154 | Namibia,NA 155 | Nauru,NR 156 | Nepal,NP 157 | Netherlands,NL 158 | New Caledonia,NC 159 | New Zealand,NZ 160 | Nicaragua,NI 161 | Niger,NE 162 | Nigeria,NG 163 | Niue,NU 164 | Norfolk Island,NF 165 | Northern Mariana Islands,MP 166 | Norway,NO 167 | Oman,OM 168 | Pakistan,PK 169 | Palau,PW 170 | Palestine,PS 171 | Panama,PA 172 | Papua New Guinea,PG 173 | Paraguay,PY 174 | Peru,PE 175 | Philippines,PH 176 | Pitcairn,PN 177 | Poland,PL 178 | Portugal,PT 179 | Puerto Rico,PR 180 | Qatar,QA 181 | Réunion,RE 182 | Romania,RO 183 | Russian Federation,RU 184 | Rwanda,RW 185 | Saint Barthélemy,BL 186 | Saint Helena,SH 187 | Saint Kitts and Nevis,KN 188 | Saint Lucia,LC 189 | Saint Martin (French part),MF 190 | Saint Pierre and Miquelon,PM 191 | Saint Vincent and the Grenadines,VC 192 | Samoa,WS 193 | San Marino,SM 194 | Sao Tome and Principe,ST 195 | Saudi Arabia,SA 196 | Senegal,SN 197 | Serbia,RS 198 | Seychelles,SC 199 | Sierra Leone,SL 200 | Singapore,SG 201 | Sint Maarten (Dutch part),SX 202 | Slovakia,SK 203 | Slovenia,SI 204 | Solomon Islands,SB 205 | Somalia,SO 206 | South Africa,ZA 207 | South Georgia and the South Sandwich Islands,GS 208 | South Sudan,SS 209 | Spain,ES 210 | Sri Lanka,LK 211 | Sudan,SD 212 | Suriname,SR 213 | Svalbard and Jan Mayen,SJ 214 | Swaziland,SZ 215 | Sweden,SE 216 | Switzerland,CH 217 | Syrian Arab Republic,SY 218 | Taiwan,TW 219 | Tajikistan,TJ 220 | Tanzania,TZ 221 | Thailand,TH 222 | Timor-Leste,TL 223 | Togo,TG 224 | Tokelau,TK 225 | Tonga,TO 226 | Trinidad and Tobago,TT 227 | Tunisia,TN 228 | Turkey,TR 229 | Turkmenistan,TM 230 | Turks and Caicos Islands,TC 231 | Tuvalu,TV 232 | Uganda,UG 233 | Ukraine,UA 234 | United Arab Emirates,AE 235 | United Kingdom,GB 236 | United States,US 237 | United States Minor Outlying Islands,UM 238 | Uruguay,UY 239 | Uzbekistan,UZ 240 | Vanuatu,VU 241 | Venezuela,VE 242 | Viet Nam,VN 243 | Virgin Islands,VG 244 | Wallis and Futuna,WF 245 | Western Sahara,EH 246 | Yemen,YE 247 | Zambia,ZM 248 | Zimbabwe,ZW 249 | -------------------------------------------------------------------------------- /data/exchange_rates_usd.json: -------------------------------------------------------------------------------- 1 | 
{"base":"USD","date":"2016-02-05","rates":{"AUD":1.3911,"BGN":1.7459,"BRL":3.8791,"CAD":1.3751,"CHF":0.99098,"CNY":6.5724,"CZK":24.136,"DKK":6.6621,"GBP":0.68715,"HKD":7.7871,"HRK":6.8327,"HUF":276.69,"IDR":13549.0,"ILS":3.8756,"INR":67.654,"JPY":116.68,"KRW":1190.8,"MXN":18.21,"MYR":4.1535,"NOK":8.54,"NZD":1.4868,"PHP":47.555,"PLN":3.9398,"RON":4.0196,"RUB":76.776,"SEK":8.4204,"SGD":1.3958,"THB":35.51,"TRY":2.9011,"ZAR":15.853,"EUR":0.8927}} -------------------------------------------------------------------------------- /data/iso-container-codes.csv: -------------------------------------------------------------------------------- 1 | code,description,length,height,group 2 | 22B0,Bulk,20,8.5,22B0 3 | 22B1,Dry Bulk,20,8.5,22B0 4 | 22B3,Dry Bulk,20,8.5,22B0 5 | 22B4,Dry Bulk,20,8.5,22B0 6 | 22B5,Dry Bulk,20,8.5,22B0 7 | 22B6,Dry Bulk,20,8.5,22B0 8 | 22BK,Dry Bulk,20,8.5,22B0 9 | 2080,Dry Bulk,20,8,22B0 10 | 20B0,Dry Bulk,20,8,22B0 11 | 20B1,Dry Bulk,20,8,22B0 12 | 20B3,Dry Bulk,20,8,22B0 13 | 20B4,Dry Bulk,20,8,22B0 14 | 20B5,Dry Bulk,20,8,22B0 15 | 20B6,Dry Bulk,20,8,22B0 16 | 20BK,Dry Bulk,20,8,22B0 17 | 20BU,Dry Bulk,20,8,22B0 18 | 2280,Dry Bulk,20,8.5,22B0 19 | 2281,Dry Bulk,20,8.5,22B0 20 | 2299,Air/Surface,20,8.5,22B0 21 | 22BU,Dry Bulk,20,8.5,22B0 22 | 22G0,Standard Dry,20,8.5,22G0 23 | 22G1,Standard Dry,20,8.5,22G0 24 | 22G2,Standard Dry,20,8.5,22G0 25 | 22G3,Standard Dry,20,8.5,22G0 26 | 22V3,Standard Dry,20,8.5,22G0 27 | 2300,Standard Dry,20,8.5,22G0 28 | 2301,Standard Dry,20,8.5,22G0 29 | 2302,Standard Dry,20,8.5,22G0 30 | 2303,Standard Dry,20,8.5,22G0 31 | 2304,Standard Dry,20,8.5,22G0 32 | 2410,HIGH CUBE,20,9.5,22G0 33 | 24G0,Standard Dry,20,9,22G0 34 | 24G1,Standard Dry,20,9,22G0 35 | 24G2,Standard Dry,20,9,22G0 36 | 24G3,Standard Dry,20,9,22G0 37 | 24GP,Standard Dry,20,9,22G0 38 | 2500,Standard Dry,20,8.5,22G0 39 | 25G0,Standard Dry High Cube,20,9,22G0 40 | 2600,Standard Dry,20,4.25,22G0 41 | 26G0,Standard Dry,20,9.5,22G0 42 | 26G1,Standard Dry,20,9.5,22G0 43 | 26G2,Standard Dry,20,9.5,22G0 44 | 26G3,Standard Dry,20,9.5,22G0 45 | 26GP,Standard Dry,20,9.5,22G0 46 | 28G0,Standard Dry,20,4.25,22G0 47 | 28GP,Standard Dry,20,4.25,22G0 48 | 28U1,BIN HALF HEIGHT (OPEN TOP),20,4.25,22G0 49 | 28U2,"OPENING(S) AT ONE OR BOTH ENDS, PLUS REMV TOP MEMB",20,8.5,22G0 50 | 28UT,"OPENING(S) AT ONE OR BOTH ENDS, PLUS REMV TOP MEMB",20,8.5,22G0 51 | 2994,Air/Surface,20,4,22G0 52 | 2999,SLIDER CHASSIS,20,0,22G0 53 | 3000,Standard Dry,30,8,22G0 54 | 30G0,DRY CARGO/GENERAL PURPOSE,30,8,22G0 55 | 3200,Standard Dry,30,8.5,22G0 56 | 32G0,DRY CARGO/GENERAL PURPOSE,30,8.5,22G0 57 | 3399,TRIAXLE CHASSIS,23,0,22G0 58 | 7999,SLIDER CHASSIS,20,0,22G0 59 | B2G1,PASSIVE VENTS AT UPPER PART OF CARGO SPACE,24,8.5,22G0 60 | 1000,Standard Dry,10,8,22G0 61 | 10G0,DRY CARGO/GENERAL PURPOSE,10,8,22G0 62 | 1200,Standard Dry,10,8.5,22G0 63 | 12G0,DRY CARGO/GENERAL PURPOSE,10,8.5,22G0 64 | 2000,Standard Dry,20,8,22G0 65 | 2001,Standard Dry,20,8,22G0 66 | 2002,Standard Dry,20,8,22G0 67 | 2003,Standard Dry,20,8,22G0 68 | 2004,Standard Dry,20,8,22G0 69 | 2025,Livestock Carrier,20,8,22G0 70 | 20G0,Standard Dry,20,8,22G0 71 | 20G1,Standard Dry,20,8,22G0 72 | 20G2,Standard Dry,20,8,22G0 73 | 20G3,Standard Dry,20,8,22G0 74 | 20GP,Standard Dry,20,8,22G0 75 | 2101,Standard Dry,20,8,22G0 76 | 2102,Standard Dry,20,8,22G0 77 | 2103,Standard Dry,20,8,22G0 78 | 2104,Standard Dry,20,8,22G0 79 | 2125,Livestock Carrier,20,8,22G0 80 | 2200,Standard Dry,20,8.5,22G0 81 | 2201,Standard Dry,20,8.5,22G0 82 | 2202,Standard Dry,20,8.5,22G0 83 
| 2204,Standard Dry,20,8.5,22G0 84 | 2205,Standard Dry,20,8.5,22G0 85 | 2210,Standard Dry,20,8.5,22G0 86 | 2213,Standard Dry,20,8.5,22G0 87 | 2225,Livestock Carrier,20,8.5,22G0 88 | 22GP,Standard Dry,20,8.5,22G0 89 | 2212,General Purpose (Hanging Garments),20,8.5,22G0 90 | 25GP,High Cube,20,9.6,22G0 91 | 22H0,Insulated (Conair),20,8.5,22H0 92 | 22H1,Thermal Refrigerated/Heated,20,8.5,22H0 93 | 22H2,Thermal Refrigerated/Heated,20,8.5,22H0 94 | 22H5,Thermal Insulated,20,8.5,22H0 95 | 22H6,Thermal Insulated,20,8.5,22H0 96 | 22HI,Thermal Refrigerated/Heated,20,8.5,22H0 97 | 24H5,Thermal Insulated,20,9,22H0 98 | 24H6,Thermal Insulated,20,9,22H0 99 | 2020,Thermal Insulated,20,8,22H0 100 | 20H0,Thermal Refrigerated/Heated,20,8,22H0 101 | 20H1,Thermal Refrigerated/Heated,20,8,22H0 102 | 20H2,Thermal Refrigerated/Heated,20,8,22H0 103 | 20H5,Thermal Insulated,20,8,22H0 104 | 20H6,Thermal Insulated,20,8,22H0 105 | 20HI,Thermal Refrigerated/Heated,20,8,22H0 106 | 20HR,Thermal Refrigerated/Heated,20,8,22H0 107 | 2220,Thermal Insulated,20,8.5,22H0 108 | 2224,Insulated,20,8.5,22H0 109 | 22HR,Thermal Refrigerated/Heated,20,8.5,22H0 110 | 22P1,Flat Rack,20,8.5,22P1 111 | 22P2,Platform,20,8.5,22P1 112 | 22P4,Platform,20,8.5,22P1 113 | 22P5,Platform,20,8.5,22P1 114 | 22P7,PLATFORM FIXED,20,8.5,22P1 115 | 22P8,Platform,20,8.5,22P1 116 | 22P9,Platform,20,8.5,22P1 117 | 22PL,Platform,20,8.5,22P1 118 | 22PS,Platform,20,8.5,22P1 119 | 2361,Platform,20,8.5,22P1 120 | 2362,Platform,20,8.5,22P1 121 | 2363,Platform,20,8.5,22P1 122 | 2364,Platform,20,8.5,22P1 123 | 2365,Platform,20,8.5,22P1 124 | 2366,Platform,20,8.5,22P1 125 | 2367,Platform,20,8.5,22P1 126 | 2651,Open Top,20,4.25,22P1 127 | 2661,Platform,20,4.25,22P1 128 | 2761,Platform,20,4.25,22P1 129 | 2063,Flat,20,8,22P1 130 | 2066,Platform,20,8,22P1 131 | 2067,Platform,20,8,22P1 132 | 20P2,Platform,20,8,22P1 133 | 20P4,Platform,20,8,22P1 134 | 20P5,Platform,20,8,22P1 135 | 20PC,Platform,20,8,22P1 136 | 20PF,Platform,20,8,22P1 137 | 20PL,Platform,20,8,22P1 138 | 20PS,Platform,20,8,22P1 139 | 2160,Flat,20,8,22P1 140 | 2161,Platform,20,8,22P1 141 | 2162,Platform,20,8,22P1 142 | 2163,Platform,20,8,22P1 143 | 2164,Platform,20,8,22P1 144 | 2165,Platform,20,8,22P1 145 | 2166,Platform,20,8,22P1 146 | 2167,Platform,20,8,22P1 147 | 2260,Flat,20,8.5,22P1 148 | 2261,Flat,20,8.5,22P1 149 | 2262,Platform,20,8.5,22P1 150 | 2265,Platform,20,8.5,22P1 151 | 2266,Platform,20,8.5,22P1 152 | 2267,Platform,20,8.5,22P1 153 | 22PF,Platform,20,8.5,22P1 154 | 22PC,Platform,20,8.5,22P1 155 | 8888,Uncontainerised,0,0,8888 156 | 22P3,Collapsible Flat Rack,20,8.5,22P3 157 | 20P3,Platform,20,8,22P3 158 | 2263,Flat,20,8.5,22P3 159 | 2264,Platform,20,8.5,22P3 160 | 22R1,Reefer,20,8.5,22R1 161 | 22R2,Thermal Refrigerated/Heated,20,8.5,22R1 162 | 22R3,Thermal Refrigerated/Heated,20,8.5,22R1 163 | 22R9,Thermal Refrigerated/Heated,20,8.5,22R1 164 | 22RC,Thermal Refrigerated/Heated,20,8.5,22R1 165 | 22RE,Thermal Refrigerated,20,8.5,22R1 166 | 22RS,Thermal Refrigerated/Heated,20,8.5,22R1 167 | 2330,Thermal Refrigerated,20,8.5,22R1 168 | 2331,Thermal Refrigerated,20,8.5,22R1 169 | 2332,Thermal Refrigerated/Heated,20,8.5,22R1 170 | 2432,Thermal Refrigerated/Heated,20,9,22R1 171 | 24H0,Thermal Refrigerated/Heated,20,9,22R1 172 | 24H1,Thermal Refrigerated/Heated,20,9,22R1 173 | 24H2,Thermal Refrigerated/Heated,20,9,22R1 174 | 24HI,Thermal Refrigerated/Heated,20,9,22R1 175 | 24HR,Thermal Refrigerated/Heated,20,9,22R1 176 | 24R0,Thermal Refrigerated/Heated,20,9,22R1 177 | 24R1,Thermal 
Refrigerated/Heated,20,9,22R1 178 | 24R2,Thermal Refrigerated/Heated,20,9,22R1 179 | 24R3,Thermal Refrigerated/Heated,20,9,22R1 180 | 24RE,Thermal Refrigerated,20,9,22R1 181 | 24RS,Thermal Refrigerated/Heated,20,9,22R1 182 | 24RT,Thermal Refrigerated/Heated,20,9,22R1 183 | 2030,Thermal Refrigerated,20,8,22R1 184 | 2031,Thermal Refrigerated,20,8,22R1 185 | 2032,Thermal Refrigerated/Heated,20,8,22R1 186 | 2040,Thermal Refrigerated,20,8,22R1 187 | 2041,Thermal Refrigerated,20,8,22R1 188 | 2042,Thermal Refrigerated,20,8,22R1 189 | 2043,Thermal Refrigerated,20,8,22R1 190 | 20R0,Thermal Refrigerated,20,8,22R1 191 | 20R1,Thermal Refrigerated/Heated,20,8,22R1 192 | 20R2,Thermal Refrigerated/Heated,20,8,22R1 193 | 20R3,Thermal Refrigerated/Heated,20,8,22R1 194 | 20RE,Thermal Refrigerated,20,8,22R1 195 | 20RS,Thermal Refrigerated/Heated,20,8,22R1 196 | 20RT,Thermal Refrigerated/Heated,20,8,22R1 197 | 2130,Thermal Refrigerated,20,8,22R1 198 | 2131,Thermal Refrigerated,20,8,22R1 199 | 2132,Thermal Refrigerated,20,8,22R1 200 | 2230,Thermal Refrigerated,20,8.5,22R1 201 | 2231,Thermal Refrigerated,20,8.5,22R1 202 | 2232,Thermal Refrigerated/Heated,20,8.5,22R1 203 | 2240,Thermal Refrigerated,20,8.5,22R1 204 | 2242,Thermal Refrigerated,20,8.5,22R1 205 | 22R0,Thermal Refrigerated/Heated,20,8.5,22R1 206 | 22RT,Thermal Refrigerated/Heated,20,8.5,22R1 207 | 2234,"Thermal containers, Heated",20,8.5,22R1 208 | 22T0,Tank,20,8.5,22T0 209 | 22T1,Tank,20,8.5,22T0 210 | 22T2,Tank,20,8.5,22T0 211 | 22T3,Tank,20,8.5,22T0 212 | 22T4,Tank,20,8.5,22T0 213 | 22T5,Tank,20,8.5,22T0 214 | 22T6,Tank,20,8.5,22T0 215 | 22T7,Tank,20,8.5,22T0 216 | 22T8,Tank,20,8.5,22T0 217 | 22T9,Tank,20,8.5,22T0 218 | 22TD,Tank,20,8.5,22T0 219 | 22TG,Tank,20,8.5,22T0 220 | 2670,Tank,20,4.25,22T0 221 | 2671,Tank,20,4.25,22T0 222 | 2870,HALF HEIGHT THERMAL TANK,20,0,22T0 223 | 2070,Tank,20,8,22T0 224 | 2071,Tank,20,8,22T0 225 | 2072,Tank,20,8,22T0 226 | 2073,Tank,20,8,22T0 227 | 2074,Tank,20,8,22T0 228 | 2075,Tank,20,8,22T0 229 | 2076,Tank,20,8,22T0 230 | 2077,Tank,20,8,22T0 231 | 2078,Tank,20,8,22T0 232 | 2079,Tank,20,8,22T0 233 | 20T0,Tank,20,8,22T0 234 | 20T1,Tank,20,8,22T0 235 | 20T2,Tank,20,8,22T0 236 | 20T3,Tank,20,8,22T0 237 | 20T4,Tank,20,8,22T0 238 | 20T5,Tank,20,8,22T0 239 | 20T6,Tank,20,8,22T0 240 | 20T7,Tank,20,8,22T0 241 | 20T8,Tank,20,8,22T0 242 | 20T9,Tank,20,8,22T0 243 | 20TD,Tank,20,8,22T0 244 | 20TG,Tank,20,8,22T0 245 | 20TN,Tank,20,8,22T0 246 | 2270,Tank,20,8.5,22T0 247 | 2271,Tank,20,8.5,22T0 248 | 2272,Tank,20,8.5,22T0 249 | 2273,Tank,20,8.5,22T0 250 | 2274,Tank,20,8.5,22T0 251 | 2275,Tank,20,8.5,22T0 252 | 2276,Tank,20,8.5,22T0 253 | 2277,Tank,20,8.5,22T0 254 | 2278,Tank,20,8.5,22T0 255 | 2279,Tank,20,8.5,22T0 256 | 22TN,Tank,20,8.5,22T0 257 | 22U1,Open Top,20,8.5,22U1 258 | 22U2,Open Top,20,8.5,22U1 259 | 22U3,Open Top,20,8.5,22U1 260 | 22U4,Open Top,20,8.5,22U1 261 | 22U5,Open Top,20,8.5,22U1 262 | 22U6,Standard Dry,20,8.5,22U1 263 | 2650,Open Top,0,4.25,22U1 264 | 2750,Open Top,20,4.25,22U1 265 | 2770,Tank,20,4.25,22U1 266 | 2771,Tank,20,4.25,22U1 267 | 2851,HALF OPEN TOP,20,0,22U1 268 | 2050,Open Top,20,8,22U1 269 | 2051,Open Top,20,8,22U1 270 | 2052,Open Top,20,8,22U1 271 | 2053,Open Top,20,8,22U1 272 | 20U0,Open Top,20,8,22U1 273 | 20U1,Open Top,20,8,22U1 274 | 20U2,Open Top,20,8,22U1 275 | 20U3,Open Top,20,8,22U1 276 | 20U4,Open Top,20,8,22U1 277 | 20U5,Open Top,20,8,22U1 278 | 20UT,Open Top,20,8,22U1 279 | 2150,Open Top,20,8,22U1 280 | 2203,Standard Dry,20,8.5,22U1 281 | 2250,Open Top,20,8.5,22U1 282 | 2251,Open 
Top,20,8.5,22U1 283 | 2252,Open Top,20,8.5,22U1 284 | 2253,Open Top,20,8.5,22U1 285 | 2259,Open Top,20,8.5,22U1 286 | 22U0,Open Top,20,8.5,22U1 287 | 22UT,Open Top,20,8.5,22U1 288 | 22UP,Hard Top,20,8.5,22UP 289 | 22V0,Closed Vented,20,8.5,22VH 290 | 22V2,Closed Vented,20,8.5,22VH 291 | 22V4,Closed Vented,20,8.5,22VH 292 | 22VH,Ventilated,20,8.5,22VH 293 | 28VH,Vented,20,4.75,22VH 294 | 28VO,Vented,20,4.75,22VH 295 | 2010,Closed Vented,20,8,22VH 296 | 2011,Closed Vented,20,8,22VH 297 | 2013,Closed Ventilated,20,8,22VH 298 | 2015,Closed Ventilated,20,8,22VH 299 | 2017,Closed Ventilated,20,8,22VH 300 | 20V0,Closed Vented,20,8,22VH 301 | 20V2,Closed Vented,20,8,22VH 302 | 20V4,Closed Vented,20,8,22VH 303 | 20VH,Closed Vented,20,8,22VH 304 | 2113,Closed Ventilated,20,8,22VH 305 | 2117,Closed Ventilated,20,8,22VH 306 | 2211,Closed Vented,20,8.5,22VH 307 | 2215,Closed Ventilated,20,8.5,22VH 308 | 2216,Closed Ventilated,20,8.5,22VH 309 | 2217,Closed Ventilated,20,8.5,22VH 310 | 29P0,Platform,20,1,29P0 311 | 29P1,PLATFORM (CONTAINER),20,4,29P0 312 | 2060,Platform,20,8,29P0 313 | 2061,Platform,20,8,29P0 314 | 2062,Platform,20,8,29P0 315 | 2064,Platform,20,8,29P0 316 | 2065,Platform,20,8,29P0 317 | 20P0,Platform,20,8,29P0 318 | 20P1,Platform,20,8,29P0 319 | 22P0,Platform,20,8.5,29P0 320 | 2760,Platform,20,4.25,29P0 321 | 2960,Platform,20,4,29P0 322 | 2969,Platform,20,4,29P0 323 | 29PL,PLATFORM (CONTAINER),20,1,29P0 324 | 42G0,Standard Dry,40,8.5,42G0 325 | 42G1,Standard Dry,40,8.5,42G0 326 | 42G2,Standard Dry,40,8.5,42G0 327 | 42G3,Standard Dry,40,8.5,42G0 328 | 4300,Standard Dry,40,8.5,42G0 329 | 4301,Standard Dry,40,8.5,42G0 330 | 4302,Standard Dry,40,8.5,42G0 331 | 4303,Standard Dry,40,8.5,42G0 332 | 4304,Standard Dry,40,8.5,42G0 333 | 4305,Standard Dry,40,8.5,42G0 334 | 4310,Standard Dry,40,8.5,42G0 335 | 4311,Closed Vented,40,8.5,42G0 336 | 4312,General Purpose (Hanging Garments),40,8.5,42G0 337 | 4313,VENTILATED,40,0,42G0 338 | 4315,Closed Ventilated,40,8.5,42G0 339 | 4325,Livestock Carrier,40,8.5,42G0 340 | 4326,Automobile Carrier,40,8.5,42G0 341 | 4380,Dry Bulk,40,8.5,42G0 342 | 44G0,Standard Dry,40,9,42G0 343 | 44G1,Standard Dry,40,9,42G0 344 | 44G2,Standard Dry,40,9,42G0 345 | 44G3,Standard Dry,40,9,42G0 346 | 44GP,Standard Dry,40,9,42G0 347 | 4595,Air/Surface,40,8.5,42G0 348 | 4599,Air/Surface,40,9,42G0 349 | 4651,HALF HIGH,40,0,42G0 350 | 4699,Air/Surface,40,4.25,42G0 351 | 4886,Dry Bulk,40,4.25,42G0 352 | 48UI,HALF HEIGHT (OPEN TOP),40,4.25,42G0 353 | 4994,Air/Surface,40,8.5,42G0 354 | 4999,GOOSENECK CHASSIS,40,0,42G0 355 | 4CG0,OPENING(S) AT ONE OR BOTH ENDS,40,8.5,42G0 356 | 4CGP,OPENING(S) AT ONE OR BOTH ENDS,40,8.5,42G0 357 | 8500,Standard Dry,35,8.5,42G0 358 | 8599,Air/Surface,35,8.5,42G0 359 | 9995,Air/Surface,40,4,42G0 360 | 9998,Air/Surface,40,4,42G0 361 | 9999,Air/Surface,40,4,42G0 362 | M2G0,OPENING(S) AT ONE END OR BOTH ENDS,48,8.5,42G0 363 | P2G0,OPENING(S) AT ONE END OR BOTH ENDS,53,8.5,42G0 364 | 4000,Standard Dry,40,8,42G0 365 | 4001,Standard Dry,40,8,42G0 366 | 4002,Standard Dry,40,8,42G0 367 | 4003,Standard Dry,40,8,42G0 368 | 4004,Standard Dry,40,8,42G0 369 | 4020,Thermal Insulated,40,8,42G0 370 | 4025,Livestock Carrier,40,8,42G0 371 | 4026,Automobile Carrier,40,8,42G0 372 | 4080,Dry Bulk,40,8,42G0 373 | 4096,Air/Surface,40,8,42G0 374 | 40B0,Dry Bulk,40,8,42G0 375 | 40B1,Dry Bulk,40,8,42G0 376 | 40B3,Dry Bulk,40,8,42G0 377 | 40B4,Dry Bulk,40,8,42G0 378 | 40B5,Dry Bulk,40,8,42G0 379 | 40B6,Dry Bulk,40,8,42G0 380 | 40BK,Dry Bulk,40,8,42G0 381 | 40BU,Dry Bulk,40,8,42G0 
382 | 40G0,Standard Dry,40,8,42G0 383 | 40G1,Standard Dry,40,8,42G0 384 | 40G2,Standard Dry,40,8,42G0 385 | 40G3,Standard Dry,40,8,42G0 386 | 40GP,Standard Dry,40,8,42G0 387 | 4101,Standard Dry,40,8,42G0 388 | 4102,Standard Dry,40,8,42G0 389 | 4103,Standard Dry,40,8,42G0 390 | 4104,Standard Dry,40,8,42G0 391 | 4126,Automobile Carrier,40,8,42G0 392 | 4200,Standard Dry,40,8.5,42G0 393 | 4201,Standard Dry,40,8.5,42G0 394 | 4202,Standard Dry,40,8.5,42G0 395 | 4203,Standard Dry,40,8.5,42G0 396 | 4204,Standard Dry,40,8.5,42G0 397 | 4225,Livestock Carrier,40,8.5,42G0 398 | 4226,Automobile Carrier,40,8.5,42G0 399 | 4280,Dry Bulk,40,8.5,42G0 400 | 42B0,Dry Bulk,40,8.5,42G0 401 | 42B1,Dry Bulk,40,8.5,42G0 402 | 42B3,Dry Bulk,40,8.5,42G0 403 | 42B4,Dry Bulk,40,8.5,42G0 404 | 42B5,Dry Bulk,40,8.5,42G0 405 | 42B6,Dry Bulk,40,8.5,42G0 406 | 42BK,Dry Bulk,40,8.5,42G0 407 | 42BU,Dry Bulk,40,8.5,42G0 408 | 42GP,Standard Dry,40,8.5,42G0 409 | 42G4,General Purose (Hanging Garments),40,8.5,42G0 410 | 4CG1,PASSIVE VENTS AT UPPER PART OF CARGO SPACE,40,8.5,42G0 411 | 42H5,Thermal Insulated,40,8.5,42H0 412 | 42H6,Thermal Insulated,40,8.5,42H0 413 | 44H5,Thermal Insulated,40,9,42H0 414 | 44H6,Thermal Insulated,40,9,42H0 415 | 45H5,Thermal Insulated,45,9.5,42H0 416 | 45H6,Thermal Insulated,45,9.5,42H0 417 | L2H5,Thermal Insulated,45,8.5,42H0 418 | L2H6,Thermal Insulated,45,8.5,42H0 419 | L5H5,Thermal Insulated,45,9.5,42H0 420 | L5H6,Thermal Insulated,45,9.5,42H0 421 | 42H0,Insulated (Conair),40,8.5,42H0 422 | 42HI,Thermal Refrigerated/Heated,40,8.5,42H0 423 | 42P1,Flat Rack,40,8.5,42P1 424 | 42P2,Platform,40,8.5,42P1 425 | 42P4,Platform,40,8.5,42P1 426 | 42P5,Platform,40,8.5,42P1 427 | 42P8,Platform,40,8.5,42P1 428 | 42P9,Platform,40,8.5,42P1 429 | 42PL,Platform,40,8.5,42P1 430 | 42PS,Platform,40,8.5,42P1 431 | 4361,Flat,40,8.5,42P1 432 | 4362,Platform,40,8.5,42P1 433 | 4364,Platform,40,8.5,42P1 434 | 4365,Platform,40,8.5,42P1 435 | 4366,Platform,40,8.5,42P1 436 | 4367,Platform,40,8.5,42P1 437 | 4560,Platform,40,8.5,42P1 438 | 4561,Platform,40,8.5,42P1 439 | 4661,Platform,40,4.25,42P1 440 | 4761,Platform,40,4.25,42P1 441 | 48P1,Platform,40,4.25,42P1 442 | 48P5,Platform,40,4.25,42P1 443 | 48PC,Platform,40,4.25,42P1 444 | 48PF,Platform,40,4.25,42P1 445 | 48PL,Platform,40,4.25,42P1 446 | 4061,Platform,40,8,42P1 447 | 4062,Platform,40,8,42P1 448 | 4063,Platform,40,8,42P1 449 | 4064,Platform,40,8,42P1 450 | 4065,Platform,40,8,42P1 451 | 4066,Platform,40,8,42P1 452 | 4067,Platform,40,8,42P1 453 | 40P1,Platform,40,8,42P1 454 | 40P2,Platform,40,8,42P1 455 | 40P4,Platform,40,8,42P1 456 | 40P5,Platform,40,8,42P1 457 | 40PC,Platform,40,8,42P1 458 | 40PF,Platform,40,8,42P1 459 | 40PL,Platform,40,8,42P1 460 | 40PS,Platform,40,8,42P1 461 | 4161,Platform,40,8,42P1 462 | 4162,Platform,40,8,42P1 463 | 4163,Platform,40,8,42P1 464 | 4164,Platform,40,8,42P1 465 | 4165,Platform,40,8,42P1 466 | 4166,Platform,40,8,42P1 467 | 4167,Platform,40,8,42P1 468 | 4261,Platform,40,8.5,42P1 469 | 4262,Platform,40,8.5,42P1 470 | 4263,Flat,40,8.5,42P1 471 | 4264,Platform,40,8.5,42P1 472 | 4265,Platform,40,8.5,42P1 473 | 4266,Platform,40,8.5,42P1 474 | 4267,Platform,40,8.5,42P1 475 | 42PC,Platform,40,8.5,42P1 476 | 42PF,Platform,40,8.5,42P1 477 | 42P3,Collapsible Flat Rack,40,8.5,42P3 478 | 4363,Flat,40,8.5,42P3 479 | 48P3,Platform,40,4.25,42P3 480 | 40P3,Platform,40,8,42P3 481 | 42R1,Reefer,40,8.5,42R1 482 | 42R2,Thermal Refrigerated/Heated,40,8.5,42R1 483 | 42R3,Thermal Refrigerated/Heated,40,8.5,42R1 484 | 42R9,Thermal 
Refrigerated/Heated,40,8.5,42R1 485 | 42RC,Thermal Refrigerated/Heated,40,8.5,42R1 486 | 42RE,Thermal Refrigerated,40,8.5,42R1 487 | 42RS,Thermal Refrigerated/Heated,40,8.5,42R1 488 | 4320,Thermal Insulated,40,8.5,42R1 489 | 4330,Thermal Refrigerated,40,8.5,42R1 490 | 4332,Thermal Refrigerated/Heated,40,8.5,42R1 491 | 4333,Thermal Refrigerated/Heated,40,8.5,42R1 492 | 4340,Thermal Refrigerated,40,8.5,42R1 493 | 44H0,Thermal Refrigerated/Heated,40,9,42R1 494 | 44H1,Thermal Refrigerated/Heated,40,9,42R1 495 | 44H2,Thermal Refrigerated/Heated,40,9,42R1 496 | 44HI,Thermal Refrigerated/Heated,40,9,42R1 497 | 44HR,Thermal Refrigerated/Heated,40,9,42R1 498 | 44R0,Thermal Refrigerated/Heated,40,9,42R1 499 | 44R1,Thermal Refrigerated/Heated,40,9,42R1 500 | 44R2,Thermal Refrigerated/Heated,40,9,42R1 501 | 44R3,Thermal Refrigerated/Heated,40,9,42R1 502 | 44RE,Thermal Refrigerated,40,9,42R1 503 | 44RS,Thermal Refrigerated/Heated,40,9,42R1 504 | 44RT,Thermal Refrigerated/Heated,40,9,42R1 505 | 8520,Thermal Insulated,35,8.5,42R1 506 | 8532,Thermal Refrigerated/Heated,35,8.5,42R1 507 | 4030,Thermal Refrigerated,40,8,42R1 508 | 4031,Thermal Refrigerated,40,8,42R1 509 | 4032,Thermal Refrigerated/Heated,40,8,42R1 510 | 4040,Thermal Refrigerated,40,8,42R1 511 | 40H0,Thermal Refrigerated/Heated,40,8,42R1 512 | 40H1,Thermal Refrigerated/Heated,40,8,42R1 513 | 40H2,Thermal Refrigerated/Heated,40,8,42R1 514 | 40H5,Thermal Insulated,40,8,42R1 515 | 40H6,Thermal Insulated,40,8,42R1 516 | 40HI,Thermal Refrigerated/Heated,40,8,42R1 517 | 40HR,Thermal Refrigerated/Heated,40,8,42R1 518 | 40R0,Thermal Refrigerated/Heated,40,8,42R1 519 | 40R1,Thermal Refrigerated/Heated,40,8,42R1 520 | 40R2,Thermal Refrigerated/Heated,40,8,42R1 521 | 40R3,Thermal Refrigerated/Heated,40,8,42R1 522 | 40RE,Thermal Refrigerated,40,8,42R1 523 | 40RS,Thermal Refrigerated/Heated,40,8,42R1 524 | 40RT,Thermal Refrigerated/Heated,40,8,42R1 525 | 4130,Thermal Refrigerated,40,8,42R1 526 | 4131,Thermal Refrigerated,40,8,42R1 527 | 4132,Thermal Refrigerated/Heated,40,8,42R1 528 | 4224,Insulated,40,8.5,42R1 529 | 4230,Thermal Refrigerated,40,8.5,42R1 530 | 4231,Thermal Refrigerated,40,8.5,42R1 531 | 4232,Thermal Refrigerated/Heated,40,8.5,42R1 532 | 4240,Thermal Refrigerated,40,8.5,42R1 533 | 4243,Thermal Refrigerated,40,8.5,42R1 534 | 42H1,Thermal Refrigerated/Heated,40,8.5,42R1 535 | 42H2,Thermal Refrigerated/Heated,40,8.5,42R1 536 | 42HR,Thermal Refrigerated/Heated,40,8.5,42R1 537 | 42R0,Thermal Refrigerated/Heated,40,8.5,42R1 538 | 4432,Thermal Refrigerated/Heated,40,9,42R1 539 | 42RT,Thermal Refrigerated/Heated,40,8.5,42R1 540 | 42T0,Tank,40,8.5,42T0 541 | 42T1,Tank,40,8.5,42T0 542 | 42T2,Tank,40,8.5,42T0 543 | 42T3,Tank,40,8.5,42T0 544 | 42T4,Tank,40,8.5,42T0 545 | 42T5,Tank,40,8.5,42T0 546 | 42T6,Tank,40,8.5,42T0 547 | 42T7,Tank,40,8.5,42T0 548 | 42T8,Tank,40,8.5,42T0 549 | 42T9,Tank,40,8.5,42T0 550 | 42TD,Tank,40,8.5,42T0 551 | 42TG,Tank,40,8.5,42T0 552 | 4370,Tank,40,8.5,42T0 553 | 8770,Tank,35,4.25,42T0 554 | 4070,Tank,40,8,42T0 555 | 4071,Tank,40,8,42T0 556 | 40T0,Tank,40,8,42T0 557 | 40T1,Tank,40,8,42T0 558 | 40T2,Tank,40,8,42T0 559 | 40T3,Tank,40,8,42T0 560 | 40T4,Tank,40,8,42T0 561 | 40T5,Tank,40,8,42T0 562 | 40T6,Tank,40,8,42T0 563 | 40T7,Tank,40,8,42T0 564 | 40T8,Tank,40,8,42T0 565 | 40T9,Tank,40,8,42T0 566 | 40TD,Tank,40,8,42T0 567 | 40TG,Tank,40,8,42T0 568 | 40TN,Tank,40,8,42T0 569 | 4170,Tank,40,8,42T0 570 | 4270,Tank,40,8.5,42T0 571 | 4271,Tank,40,8.5,42T0 572 | 42TN,Tank,40,8.5,42T0 573 | 42U1,Open Top,40,8.5,42U1 574 | 42U2,Open 
Top,40,8.5,42U1 575 | 42U3,Open Top,40,8.5,42U1 576 | 42U4,Open Top,40,8.5,42U1 577 | 42U5,Open Top,40,8.5,42U1 578 | 42U6,Standard Dry,40,8.5,42U1 579 | 4350,Open Top,40,8.5,42U1 580 | 4351,Open Top,40,8.5,42U1 581 | 4650,Open Top,40,4.25,42U1 582 | 4750,Open Top,40,4.25,42U1 583 | 4751,Open Top,40,4.25,42U1 584 | 48U0,Open top,40,4.25,42U1 585 | 48UT,Open top,40,4.25,42U1 586 | 8550,Open top,35,8.5,42U1 587 | 4050,Open Top,40,8,42U1 588 | 4051,Open Top,40,8,42U1 589 | 4052,Open Top,40,8,42U1 590 | 4053,Open Top,40,8,42U1 591 | 40U0,Open Top,40,8,42U1 592 | 40U1,Open Top,40,8,42U1 593 | 40U2,Open Top,40,8,42U1 594 | 40U3,Open Top,40,8,42U1 595 | 40U4,Open Top,40,8,42U1 596 | 40U5,Open Top,40,8,42U1 597 | 40UT,Open Top,40,8,42U1 598 | 4250,Open Top,40,8.5,42U1 599 | 4251,Open Top,40,8.5,42U1 600 | 4252,Open Top,40,8.5,42U1 601 | 4253,Open Top,40,8.5,42U1 602 | 42P6,Open Top,40,8.5,42U1 603 | 42U0,Open Top,40,8.5,42U1 604 | 42UT,Open Top,40,8.5,42U1 605 | 4551,OPEN TOP HIGHCUBE,40,9.5,42U1 606 | 42UP,Hard Top,40,8.5,42UP 607 | 42V0,Closed Vented,40,8.5,42VH 608 | 42V2,Closed Vented,40,8.5,42VH 609 | 42V4,Closed Vented,40,8.5,42VH 610 | 42VH,Ventilated,40,8.5,42VH 611 | 4010,Closed Vented,40,8,42VH 612 | 4011,Closed Vented,40,8,42VH 613 | 4015,Closed Ventilated,40,8,42VH 614 | 40V0,Closed Vented,40,8,42VH 615 | 40V2,Closed Vented,40,8,42VH 616 | 40V4,Closed Vented,40,8,42VH 617 | 40VH,Closed Vented,40,8,42VH 618 | 4210,Closed Vented,40,8.5,42VH 619 | 4211,Closed Vented,40,8.5,42VH 620 | 4215,Closed Ventilated,40,8.5,42VH 621 | 45G0,High Cube,40,9.5,45G0 622 | 45G1,High Cube,40,9.5,45G0 623 | 45G2,Standard Dry,40,9.5,45G0 624 | 45G3,Standard Dry,40,9.5,45G0 625 | 45G4,Unrecognized container type,0,0,45G0 626 | 9200,Standard Dry,45,8.5,45G0 627 | 9400,Standard Dry,45,9.5,45G0 628 | 4400,Standard Dry,40,9,45G0 629 | 4410,HIGH CUBE,40,9.5,45G0 630 | 4420,Thermal Insulated,40,9,45G0 631 | 4426,Automobile Carrier,40,9,45G0 632 | 4500,Standard Dry,40,8.5,45G0 633 | 4505,Standard Dry,40,8.5,45G0 634 | 4510,Standard Dry,40,9.5,45G0 635 | 4511,Standard Dry,40,9.5,45G0 636 | 4514,HIGH CUBE,40,9.5,45G0 637 | 45GP,Standard Dry,40,9.5,45G0 638 | 45R0,Thermal Refrigerated/Heated,45,9.5,45R1 639 | 45R1,Reefer High Cube,40,9.5,45R1 640 | 45R2,Thermal Refrigerated/Heated,45,9.5,45R1 641 | 45R3,Thermal Refrigerated/Heated,45,9.5,45R1 642 | 45R9,Thermal Refrigerated,40,9.5,45R1 643 | 45RC,Thermal Refrigerated/Heated,40,9.5,45R1 644 | 45RE,Thermal Refrigerated,45,9.5,45R1 645 | 45RS,Thermal Refrigerated/Heated,45,9.5,45R1 646 | 4530,Thermal Refrigerated,40,8.5,45R1 647 | 4531,Thermal Refrigerated,40,8.5,45R1 648 | 4532,Thermal Refrigerated/Heated,40,8.5,45R1 649 | 4533,Thermal Refrigerated/Heated,40,8.5,45R1 650 | 45H2,Thermal Refrigerated/Heated,45,9.5,45R1 651 | 45RT,Thermal Refrigerated/Heated,45,9.5,45R1 652 | 4534,HIGHCUBE INTEGRATED REEFER,40,9.5,45R1 653 | 45U6,High Cube Hard Top,40,9.5,45UP 654 | 45UP,High Cube Hard Top,40,9.5,45UP 655 | 49P0,Platform,40,4,49P0 656 | 49P1,Platform,40,4,49P0 657 | 49P3,Platform,40,4,49P0 658 | 49P5,Platform,40,4,49P0 659 | 49PC,Platform,40,4,49P0 660 | 49PF,Platform,40,4,49P0 661 | 4060,Flat,40,8,49P0 662 | 40P0,Platform,40,8,49P0 663 | 4260,Flat,40,8.5,49P0 664 | 42P0,Platform,40,8.5,49P0 665 | 4360,Flat,40,8.5,49P0 666 | 45P3,FOLDING COMPLETE END STRUCTURE (PLATFORM),40,9.5,49P0 667 | 45P8,Platform,40,9.5,49P0 668 | 45PC,FOLDING COMPLETE END STRUCTURE (PLATFORM),40,9.5,49P0 669 | 48P0,Platform,40,4.25,49P0 670 | 4960,Platform,40,4,49P0 671 | 49PL,Platform,40,4,49P0 672 
| 9500,Standard Dry,45,9.5,L5G1 673 | 9510,Standard Dry,45,9.5,L5G1 674 | L5G1,45 High Cube,45,9,L5G1 675 | L5G2,Standard Dry,45,9.5,L5G1 676 | L5G3,Standard Dry,45,9.5,L5G1 677 | L5G9,Standard Dry,45,9.5,L5G1 678 | L0G9,Standard Dry,45,8,L5G1 679 | L0GP,HL: OPENING(S) AT ONE END OR BOTH ENDS,45,8,L5G1 680 | L2G0,Standard Dry,45,8.5,L5G1 681 | L2G1,Standard Dry,45,8.5,L5G1 682 | L2G2,Standard Dry,45,8.5,L5G1 683 | L2G3,Standard Dry,45,8.5,L5G1 684 | L2G9,Standard Dry,45,8.5,L5G1 685 | L2GP,Standard Dry,45,8.5,L5G1 686 | L5G0,Standard Dry,45,9,L5G1 687 | L5GP,Standard Dry,45,9.5,L5G1 688 | L5R1,45 Reefer High Cube,45,9.5,L5R1 689 | L5R2,Thermal Refrigerated/Heated,45,9.5,L5R1 690 | L5R3,Thermal Refrigerated/Heated,45,9.5,L5R1 691 | L5RE,Thermal Refrigerated,45,9.5,L5R1 692 | L5RS,Thermal Refrigerated/Heated,45,9.5,L5R1 693 | 45H0,Thermal Refrigerated/Heated,45,9.5,L5R1 694 | 45H1,Thermal Refrigerated/Heated,45,9.5,L5R1 695 | 45HI,Thermal Refrigerated/Heated,45,9.5,L5R1 696 | 45HR,Thermal Refrigerated/Heated,45,9.5,L5R1 697 | 9532,Thermal Refrigerated/Heated,45,9.5,L5R1 698 | L2H0,Thermal Refrigerated/Heated,45,8.5,L5R1 699 | L2H1,Thermal Refrigerated/Heated,45,8.5,L5R1 700 | L2H2,Thermal Refrigerated/Heated,45,8.5,L5R1 701 | L2HI,Thermal Refrigerated/Heated,45,8.5,L5R1 702 | L2HR,Thermal Refrigerated/Heated,45,8.5,L5R1 703 | L2R0,Thermal Refrigerated,45,8.5,L5R1 704 | L2R1,Thermal Refrigerated/Heated,45,8.5,L5R1 705 | L2R2,Thermal Refrigerated/Heated,45,8.5,L5R1 706 | L2R3,Thermal Refrigerated/Heated,45,8.5,L5R1 707 | L2RE,Thermal Refrigerated,45,8.5,L5R1 708 | L2RS,Thermal Refrigerated/Heated,45,8.5,L5R1 709 | L2RT,Thermal Refrigerated/Heated,45,8.5,L5R1 710 | L5H0,Thermal Refrigerated/Heated,45,9.5,L5R1 711 | L5H1,Thermal Refrigerated/Heated,45,9.5,L5R1 712 | L5H2,Thermal Refrigerated/Heated,45,9.5,L5R1 713 | L5HI,Thermal Refrigerated/Heated,45,9.5,L5R1 714 | L5HR,Thermal Refrigerated/Heated,45,9.5,L5R1 715 | L5R0,Thermal Refrigerated,45,9.5,L5R1 716 | L5RT,Thermal Refrigerated/Heated,45,9.5,L5R1 717 | 12TR,Flatbed,42,8,12TR 718 | -------------------------------------------------------------------------------- /data/iso-container-groups.csv: -------------------------------------------------------------------------------- 1 | code,description 2 | 22B0,20 Bulk 3 | 22G0,20 Standard Dry 4 | 22H0,20 Insulated (Conair) 5 | 22P1,20 Flat Rack 6 | 8888,Uncontainerised 7 | 22P3,20 Collapsible Flat Rack 8 | 22R1,20 Reefer 9 | 22T0,20 Tank 10 | 22U1,20 Open Top 11 | 22UP,20 Hard Top 12 | 22VH,20 Ventilated 13 | 29P0,20 Platform 14 | 42G0,40 Standard Dry 15 | 42H0,40 Insulated (Conair) 16 | 42P1,40 Flat Rack 17 | 42P3,40 Collapsible Flat Rack 18 | 42R1,40 Reefer 19 | 42T0,40 Tank 20 | 42U1,40 Open Top 21 | 42UP,40 Hard Top 22 | 42VH,40 Ventilated 23 | 45G0,40 High Cube 24 | 45R1,40 Reefer High Cube 25 | 45UP,40 High Cube Hard Top 26 | 49P0,40 Platform 27 | L5G1,45 High Cube 28 | L5R1,45 Reefer High Cube 29 | 12TR,Flatbed 30 | -------------------------------------------------------------------------------- /data/pelicula_ids.csv: -------------------------------------------------------------------------------- 1 | 01 2 | 02 3 | 03 4 | 04 5 | 05 6 | 06 7 | 07 8 | 08 9 | 09 10 | 10 11 | 11 12 | 12 13 | 13 14 | 14 15 | 15 16 | 18 17 | 19 18 | 20 19 | 21 20 | 22 21 | 23 22 | 24 23 | 25 24 | 26 25 | 28 26 | 29 27 | 31 28 | 33 29 | 34 30 | 35 31 | 36 32 | 37 33 | 38 34 | 39 35 | 41 36 | 42 37 | 43 38 | 44 39 | 45 40 | 46 41 | 47 42 | 48 43 | 49 44 | 61 45 | 62 46 | 63 47 | 64 48 | 65 49 | 66 50 | 68 51 | 69 
52 | 71 53 | 72 54 | 73 55 | 74 56 | 75 57 | 76 58 | 77 59 | 78 60 | 79 61 | 210 62 | 211 63 | 213 64 | 214 65 | 215 66 | 216 67 | 217 68 | 218 69 | 219 70 | 220 71 | 223 72 | 224 73 | 225 74 | 240 75 | 270 76 | 310 77 | 311 78 | 312 79 | 315 80 | 316 81 | 317 82 | 318 83 | 320 84 | 321 85 | 322 86 | 323 87 | 324 88 | 325 89 | 328 90 | 342 91 | 362 92 | 410 93 | 411 94 | 412 95 | 414 96 | 415 97 | 416 98 | 417 99 | 418 100 | 422 101 | 423 102 | 425 103 | 610 104 | 612 105 | 613 106 | 614 107 | 615 108 | 616 109 | 617 110 | 618 111 | 619 112 | 620 113 | 621 114 | 622 115 | 623 116 | 624 117 | 625 118 | 710 119 | 711 120 | 712 121 | 713 122 | 714 123 | 715 124 | 716 125 | 717 126 | 718 127 | 719 128 | 720 129 | 722 130 | 723 131 | 724 132 | 725 133 | 765 134 | 780 135 | 815 136 | 915 137 | -------------------------------------------------------------------------------- /data/peliculas.csv: -------------------------------------------------------------------------------- 1 | # Source http://www.theguardian.com/news/datablog/2010/oct/16/greatest-films-of-all-time 2 | Entry|Film|Director|Leading actors|Year of cinema release|No of Oscars won|IMDB link|Guardian film page|Country 3 | 01|Brief Encounter|David Lean |Celia Johnson, Cyril Raymond, Stanley Holloway, Trevor Howard|1945||http://www.imdb.com/title/tt0037558/|http://www.guardian.co.uk/film/movie/35664/brief.encounter|UK 4 | 02|Casablanca|Michael Curtiz|Claude Rains, Humphrey Bogart, Ingrid Bergman, Paul Henreid|1942|3|http://www.imdb.com/title/tt0034583/|http://www.guardian.co.uk/film/movie/36156/casablanca|USA 5 | 03|Before Sunset|Richard Linklater|Ethan Hawke and Julie Delpy|2004||http://www.imdb.com/title/tt0381681/awards|http://www.guardian.co.uk/film/movie/101181/before.sunset|USA 6 | 78|Let the Right One In|Tomas Alfredson| Henrik Dahl, Kare Hedebrant, Karin Bergquist, Lina Leandersson, Per Ragnar, Peter Carlberg|2008||http://www.imdb.com/title/tt1139797/|http://www.guardian.co.uk/film/movie/125671/let-the-right-one-in|Sweden 7 | 79|Vampyr|Carl Theodor Dreyer|Henriette Gérard, Henriette Gerard, Julian West, Sybille Schmitz|1932||http://www.imdb.com/title/tt0023649/|http://www.guardian.co.uk/film/movie/80562/vampyr|Germany 8 | 815|Roman Holiday|William Wyler|Audrey Hepburn, Gregory Peck|1953|3|http://www.imdb.com/title/tt0046250/|http://www.guardian.co.uk/film/movie/96156/roman-holiday|USA 9 | 29|Hidden| Michael Haneke|Annie Girardot, Daniel Auteuil, Juliette Binoche, Maurice Benichou|2005||http://www.imdb.com/title/tt0387898/|http://www.guardian.co.uk/film/movie/108597/hidden|France|Robinson, Fred MacMurray and Barbara Stanwyck|1944||http://www.imdb.com/title/tt0036775/|http://www.guardian.co.uk/film/movie/36162/double.indemnity|USA 10 | 310|Comedy|Monty Python’s Life of Brian|Terry Jones|Eric Idle, Graham Chapman, John Cleese, Michael Palin, Terry Gilliam, Terry Jones|1979||http://www.imdb.com/title/tt0079470/|http://www.guardian.co.uk/film/movie/78168/monty-python-s-life-of-brian|UK 11 | 04|Breathless|Jean-Luc Godard|Jean Seberg, Jean-Paul Belmondo|1960||http://www.imdb.com/title/tt0053472/|http://www.guardian.co.uk/film/movie/36219/a-bout-de-souffle|France 12 | 05|In the Mood for Love|Kar Wai Wong|Maggie Cheung Man-Yuk, Rebecca Pan, Tony Leung Chiu-Wai|2000||http://www.imdb.com/title/tt0118694/|http://www.guardian.co.uk/film/movie/85442/in.the.mood.for.love|Hong Kong 13 | 28|Pulp Fiction|Quentin Tarantino|Amanda Plummer, Bruce Willis, Eric Stoltz, Harvey Keitel, John Travolta, Rosanna Arquette, Samuel L Jackson, Steve Buscemi, Tim 
Roth, Uma Thurman|1994|1|http://www.imdb.com/title/tt0110912/|http://www.guardian.co.uk/film/movie/56612/pulp.fiction|USA 14 | 73|Don’t Look Now|Nicholas Roeg|Donald Sutherland, Hilary Mason, Julie Christie|1973||http://www.imdb.com/title/tt0069995/|http://www.guardian.co.uk/film/movie/35097/don.t.look.now|UK 15 | 74|The Wicker Man|Robin Hardy|Britt Ekland, Christopher Lee, Edward Woodward|1973||http://www.imdb.com/title/tt0070917/|http://www.guardian.co.uk/film/movie/36301/wicker.man|UK 16 | 75|The Shining|Stanley Kubrick|Danny Lloyd, Jack Nicholson, Shelley Duval|1980||http://www.imdb.com/title/tt0081505/|http://www.guardian.co.uk/film/movie/76626/shining|USA 17 | 765|Harold & Maude|Hal Ashby|Bud Cort, Cyril Cusack, Ruth Gordon, Vivian Pickles|1971||http://www.imdb.com/title/tt0067185/|http://www.guardian.co.uk/film/movie/78471/harold-and-maude|USA 18 | 76|The Exorcist|William Friedkin|Ellen Burstyn, Linda Blair, Max von Sydow|1973|2|http://www.imdb.com/title/tt0070047/|http://www.guardian.co.uk/film/movie/86477/exorcist|USA 19 | 77|Nosferatu (1922)|FW Mernau|Alexander Granach, Greta Schroder, Gustav von Wangenheim, Max Schreck|1922||http://www.imdb.com/title/tt0013442/|http://www.guardian.co.uk/film/movie/75839/nosferatu|Germany 20 | 780|Dracula (1958)|Terence Fisher|Christopher Lee, Melissa Stribling, Michael Gough, Peter Cushing|1958||http://www.imdb.com/title/tt0051554/|http://www.guardian.co.uk/film/movie/36215/dracula|UK 21 | 31|Comedy|Annie Hall|Woody Allen|Carol Kane, Diane Keaton, Paul Simon, Tony Roberts, Woody Allen|1977|4|http://www.imdb.com/title/tt0075686/|http://www.guardian.co.uk/film/movie/36314/annie.hall|USA 22 | 06|The Apartment|Billy Wilder|Fred MacMurray, Jack Lemmon, Ray Walston, Shirley MacLaine|1960||http://www.imdb.com/title/tt0053604/|http://www.guardian.co.uk/film/movie/36225/apartment|USA 23 | 07|Hannah & Her Sisters|Woody Allen |Barbara Hershey, Carrie Fisher, Dianne Wiest, Julie Kavner, Mia Farrow, Michael Caine, Woody Allen|1986|3|http://www.imdb.com/title/tt0091167/|http://www.guardian.co.uk/film/movie/89162/hannah-and-her-sisters|USA 24 | 08|Eternal Sunshine of the Spotless Mind|Michel Gondry|Elijah Wood, Jim Carrey, Kate Winslet, Kirsten Dunst, Mark Ruffalo, Tom Wilkinson|2004|1|http://www.imdb.com/title/tt0338013/|http://www.guardian.co.uk/film/movie/100140/eternal.sunshine.of.the.spotless.mind|USA 25 | 09|Room With a View|James Ivory|Helena Bonham Carter, Julian Sands, Maggie Smith|1985|3|http://www.imdb.com/title/tt0091867/|http://www.guardian.co.uk/film/movie/77615/room-with-a-view|UK 26 | 10|Jules et Jim|François Truffaut|Henri Serre, Jeanne Moreau, Oscar Werner, Oskar Werner|1962||http://www.imdb.com/title/tt0055032/|http://www.guardian.co.uk/film/movie/76699/jules.et.jim|France 27 | 11|All That Heaven Allows|Douglas Sirk|Jane Wyman, Rock Hudson|1955||http://www.imdb.com/title/tt0047811/|http://www.guardian.co.uk/film/movie/94875/all-that-heaven-allows|USA 28 | 12|Gone with the Wind|Victor Fleming|Anne Rutherford, Clark Gable, Hattie McDaniel, Leslie Howard, Olivia De Havilland, Vivien Leigh|1939|8|http://www.imdb.com/title/tt0031381/|http://www.guardian.co.uk/film/movie/36144/gone.with.the.wind|USA 29 | 13|An Affair to Remember|Leo McCarey|Cary Grant, Deborah Kerr, Richard Denning|1957||http://www.imdb.com/title/tt0050105/|http://www.guardian.co.uk/film/movie/82271/affair.to.remember|USA 30 | 14|Umbrellas of Cherbourg|Jaques Demy |Anne Vernon, Catherine Deneuve, Nino 
Castelnuovo|1964||http://www.imdb.com/title/tt0058450/|http://www.guardian.co.uk/film/movie/77848/umbrellas.of.cherbourg|France 31 | 15|Lost in Translation|Sofia Coppola|Bill Murray, Giovanni Ribisi, Scarlett Johansson|2003|1|http://www.imdb.com/title/tt0335266/|http://www.guardian.co.uk/film/movie/96936/lost.in.translation|USA 32 | 18|My Night With Maud|Eric Rohmer| Francoise Fabian, Jean-Louis Trintignant|1969||http://www.imdb.com/title/tt0064612/|http://www.guardian.co.uk/film/movie/77331/my-night-with-maud|France 33 | 19|Voyage to Italy|Roberto Rossellini|Ingrid Bergman|1954||http://www.imdb.com/title/tt0046511/|http://www.guardian.co.uk/film/movie/88522/voyage-to-italy|Italy 34 | 20|Dr Zhivago|David Lean|Geraldine Chaplin, Julie Christie, Omar Sharif|1965|5|http://www.imdb.com/title/tt0059113/|http://www.guardian.co.uk/film/movie/78519/dr-zhivago|USA 35 | 210|Goodfellas|Martin Scorsese| Frank Vincent, Joe Pesci, Lorraine Bracco, Ray Liotta, Robert De Niro|1990|1|http://www.imdb.com/title/tt0099685/|http://www.guardian.co.uk/film/movie/37702/goodfellas|USA 36 | 218|Hard Boiled|John Woo|Chow Yun Fat, Tony Leung|1992||http://www.imdb.com/title/tt0104684/|http://www.guardian.co.uk/film/movie/82687/hard-boiled|Hong Kong 37 | 219|Long Good Friday|John McKenzie|Bob Hoskins, Bryan Marshall, Dave King, Helen Mirren|1980||http://www.imdb.com/title/tt0081070/|http://www.guardian.co.uk/film/movie/36322/long.good.friday|UK 38 | 21|Chinatown|Roman Polanski|Faye Dunaway, Jack Nicholson, John Huston|1974|1|http://www.imdb.com/title/tt0071315/|http://www.guardian.co.uk/film/movie/36302/chinatown|USA 39 | 220|A Prophet|Jacques Audiard |Adel Bencherif, Niels Arestrup, Tahar Rahim, Tahar Ramin|2009||http://www.imdb.com/title/tt1235166/|http://www.guardian.co.uk/film/movie/129970/prophet|France 40 | 220|Scarface (1983)|Brian De Palma|Al Pacino, Mary Elizabeth Mastrantonio, Michelle Pfeiffer, Robert Loggia, Steven Bauer|1983||http://www.imdb.com/title/tt0086250/|http://www.guardian.co.uk/film/movie/78370/scarface|USA 41 | 223|Miller’s Crossing|Joel Coen|Albert Finney, Gabriel Byrne, Marcia| Gay Harden|1990||http://www.imdb.com/title/tt0100150/|http://www.guardian.co.uk/film/movie/78569/miller.s.crossing|USA 42 | 224|Postman Always Rings Twice (1942)|Tay Garnett|Cecil Kellaway, John Garfield, Lana Turner|1946||http://www.imdb.com/title/tt0038854/|http://www.guardian.co.uk/film/movie/90190/postman-always-rings-twice|USA 43 | 225|Jour Se Leve|Marcel Carne|Annabella, Arletty, Jean Gabin|1939||http://www.imdb.com/title/tt0031514/|http://www.guardian.co.uk/film/movie/76684/jour-se-leve|France 44 | 22|Touch of Evil|Orson Welles|Charlton Heston, Janet Leigh, Marlene Dietrich, Orson Welles, Zsa Zsa Gabor|1958||http://www.imdb.com/title/tt0052311/|http://www.guardian.co.uk/film/movie/36217/touch.of.evil|USA 45 | 23|Say Anything....|Cameron crowe|John Cusack, Ione Skye, John Mahoney|1989||http://www.imdb.com/title/tt0098258/||USA 46 | 214|French Connection|William Friedkin|Fernando Rey, Gene Hackman, Roy Schieder, Tony Lo Bianco|1971|5|http://www.imdb.com/title/tt0067116/|http://www.guardian.co.uk/film/movie/36293/french-connection|USA 47 | 215|The Big Sleep|Howard Hawkes|Bob Steele, Elisha Cook Jr, Elisha Cook Jr., Humphrey Bogart, Lauren Bacall|1946||http://www.imdb.com/title/tt0038355/|http://www.guardian.co.uk/film/movie/34621/big-sleep|USA 48 | 216|La Ceremonie|Claude Chabrol|Isabelle Huppert, Jacqueline Bisset, Sandrine 
Bonnaire|1995||http://www.imdb.com/title/tt0112769/|http://www.guardian.co.uk/film/movie/80763/ceremonie|France 49 | 217|Point Blank|John Boorman|Angie Dickinson, Keenan Wynn, Lee Marvin|1967||http://www.imdb.com/title/tt0062138/|http://www.guardian.co.uk/film/movie/36266/point-blank|USA 50 | 23|Vertigo|Alfred Hitchcock|Barbara Bel Geddes, James Stewart, Kim Novak|1958||http://www.imdb.com/title/tt0052357/|http://www.guardian.co.uk/film/movie/34909/vertigo|USA 51 | 240|When Harry Met Sally|Rob Reiner|Billy Crystal, Bruno Kirby, Carrie Fisher, Meg Ryan|1989||http://www.imdb.com/title/tt0098635/|http://www.guardian.co.uk/film/movie/75869/when-harry-met-sally.|USA 52 | 24|Badlands|Terrence Malik|Alan Vint, Martin Sheen, Ramon Bieri, Sissy Spacek, Warren Oates|1973||http://www.imdb.com/title/tt0069762/|http://www.guardian.co.uk/film/movie/76181/badlands|USA 53 | 24|Fabulous Baker Boys|Steve Kloves|Beau Bridges, Jeff Bridges, Michelle Pfeiffer|1989||http://www.imdb.com/title/tt0097322/|http://www.guardian.co.uk/film/movie/134648/fabulous-baker-boys|USA 54 | 25|A Matter of Life & Death| Emeric Pressburger, Michael Powell| David Niven, Kim Hunter, Raymond Massey, Richard Attenborough, Roger Livesey|1946||http://www.imdb.com/title/tt0038733/|http://www.guardian.co.uk/film/movie/36173/matter.of.life.and.death|UK 55 | 26|Rashomon|Akira Kurosawa|Machiko Kyo, Masayuki Mori, Toshiro Mifune|1950||http://www.imdb.com/title/tt0042876/|http://www.guardian.co.uk/film/movie/83179/rashomon|Japan 56 | 270|Heat|Michael Mann|Al Pacino, Ashley Judd, Jon Voight, Robert De Niro, Tom Sizemore, Val Kilmer|1995||http://www.imdb.com/title/tt0113277/|http://www.guardian.co.uk/film/movie/60365/heat|USA 57 | 320|Comedy|Groundhog Day|Harold Ramis|Andie MacDowell, Bill Murray, Chris Elliott, Stephen Tobolowsky|1993||http://www.imdb.com/title/tt0107048/|http://www.guardian.co.uk/film/movie/79383/groundhog-day|USA 58 | 321|Comedy|Clueless|Amy Heckerling|Alicia Silverstone, Dan Hedaya, Stacey Dash|1995||http://www.imdb.com/title/tt0112697/|http://www.guardian.co.uk/film/movie/59257/clueless|USA 59 | 322|Comedy|The Great Dictator|Charlie Chaplin|Charlie Chaplin, Jack Oakie, Paulette Goddard|1940||http://www.imdb.com/title/tt0032553/|http://www.guardian.co.uk/film/movie/96585/great.dictator|USA 60 | 323|Comedy|Clerks|Kevin Smith|Brian O'Halloran, Jeff Anderson, Marilyn Ghigliotti|1994||http://www.imdb.com/title/tt0109445/|http://www.guardian.co.uk/film/movie/53831/clerks|USA 61 | 211|Bonnie & Clyde|Arthur Penn|Faye Dunaway, Gene Hackman, Michael J Pollard, Warren Beatty|1967|2|http://www.imdb.com/title/tt0061418/|http://www.guardian.co.uk/film/movie/76253/bonnie-and-clyde|USA 62 | 211|The Conversation|Francis Coppola, Francis Ford Coppola|Allen Garfield, Gene Hackman, John Cazale|1974||http://www.imdb.com/title/tt0071360/|http://www.guardian.co.uk/film/movie/77114/conversation|USA 63 | 213|The Killing|Stanley Kubrick| Coleen Gray, Elisha Cook Junior, Jay C Flippen, Sterling Hayden, Vince Edwards|1956||http://www.imdb.com/title/tt0049406/|http://www.guardian.co.uk/film/movie/87920/killing|USA 64 | 324|Comedy|The Jerk|Carl Reiner|Steve Martin|1979||http://www.imdb.com/title/tt0079367/|http://www.guardian.co.uk/film/movie/88834/jerk|USA 65 | 311|Comedy|Airplane!|Jim Abrahams, David Zucker and Jerry Zucker|Julie Hagerty, Leslie Nielsen, Robert Hays|1980||http://www.imdb.com/title/tt0080339/|http://www.guardian.co.uk/film/movie/83228|USA 66 | 312|Comedy|Election|Alexander Payne|Chris Klein, Matthew Broderick, Reese 
Witherspoon|1999||http://www.imdb.com/title/tt0126886/|http://www.guardian.co.uk/film/movie/79657/election|USA 67 | 315|Comedy|This Is Spinal Tap|Rob Reiner| Christopher Guest, Harry Shearer, Michael McKean, Rob Reiner|1984||http://www.imdb.com/title/tt0088258/|http://www.guardian.co.uk/film/movie/81384/this.is.spinal.tap|USA 68 | 316|Comedy|Bringing Up Baby|Howard Hawkes|Cary Grant, Katharine Hepburn, Katherine Hepburn|1938||http://www.imdb.com/title/tt0029947/|http://www.guardian.co.uk/film/movie/36143/bringing-up-baby|USA 69 | 418|Last of the Mohicans|Michael Mann|Daniel Day-Lewis, Jodhi May, Madeleine Stowe|1992|1|http://www.imdb.com/title/tt0104691/|http://www.guardian.co.uk/film/movie/79330/last-of-the-mohicans| 70 | 41|Apocalypse Now|Francis Coppola|Dennis Hopper, Frederic Forrest, Laurence Fishburne, Marlon Brando, Martin Sheen, Robert Duvall, Rpobert Duvall|1979|2|http://www.imdb.com/title/tt0078788/|http://www.guardian.co.uk/film/movie/36320/apocalypse.now|USA 71 | 422|Deer Hunter|Michael Cimino|Christopher Walken, Meryl Streep, Robert De Niro|1978|5|http://www.imdb.com/title/tt0077416/|http://www.guardian.co.uk/film/movie/36318/deer-hunter|USA 72 | 422|Gladiator|Ridley Scott|Connie Nielsen, Joaquin Phoenix, Oliver Reed, Russell Crowe|2000|5|http://www.imdb.com/title/tt0172495/|http://www.guardian.co.uk/film/movie/83550/gladiator|USA 73 | 422|Rome Open City|Roberto Rossellini|Aldo Fabrizi, Anna Magnani, Marcello Pagliero|1945||http://www.imdb.com/title/tt0038890/|http://www.guardian.co.uk/film/movie/78859/rome-open-city|Italy 74 | 423|Butch Cassidy|George Roy Hill|Katharine Ross, Paul Newman, Robert Redford|1969|4|http://www.imdb.com/title/tt0064115/|http://www.guardian.co.uk/film/movie/36276/butch-cassidy-and-the-sundance-kid|USA 75 | 423|Where Eagles Dare|Brian G. 
Hutton|Clint Eastwood, Mary Ure, Richard Burton|1968||http://www.imdb.com/title/tt0065207/|http://www.guardian.co.uk/film/movie/83199/where-eagles-dare|USA 76 | 425|The Incredibles|Brad Bird|Craig T Nelson, Holly Hunter, Jason Lee, Samuel L Jackson|2004|2|http://www.imdb.com/title/tt0317705/|http://www.guardian.co.uk/film/movie/102423/incredibles|USA 77 | 42|North by Northwest|Alfred Hitchcock| Cary Grant, Eva Marie Saint, Eva Marie Saint, James Mason, Jessie Royce Landis, Leo G Carroll, Martin Landau|1959||http://www.imdb.com/title/tt0053125/|http://www.guardian.co.uk/film/movie/35095/north-by-northwest|USA 78 | 43|Once Upon a Time in the West|Sergio Leone|Charles Bronson, Claudia Cardinale, Henry Fonda, Jason Robards|1968||http://www.imdb.com/title/tt0064116/|http://www.guardian.co.uk/film/movie/36274/once.upon.a.time.in.the.west|Italy 79 | 44|The Wild Bunch|Sam Pekinpah|Ernest Borgnine, Robert Ryan, William Holden|1969||http://www.imdb.com/title/tt0065214/|http://www.guardian.co.uk/film/movie/36285/wild.bunch|USA 80 | 45|Deliverance|John Boorman |Burt Reynolds, Jon Voight, Ned Beatty|1972||http://www.imdb.com/title/tt0068473/|http://www.guardian.co.uk/film/movie/76560/deliverance|USA 81 | 317|Comedy|There’s Something About Mary|Peter & Bob Farrelly|Ben Stiller, Cameron Diaz, Lee Evans, Matt Dillon|1998||http://www.imdb.com/title/tt0129387/|http://www.guardian.co.uk/film/movie/34359/there.s.something.about.mary|USA 82 | 318|Comedy|Dazed and Confused|Richard Linklater|Adam Goldberg, Jason London, Joey Lauren Adams, Joey Lauren Adams, Milla Jovovich, Rory Cochrane, Shawn Andrew|1993||http://www.imdb.com/title/tt0106677/|http://www.guardian.co.uk/film/movie/49047/dazed-and-confused|USA 83 | 325|Comedy|Shaun of the Dead|Edgar Wright|Dylan Moran, Kate Ashfield, Nick Frost, Simon Pegg|2004||http://www.imdb.com/title/tt0365748/|http://www.guardian.co.uk/film/movie/99960/shaun.of.the.dead|UK 84 | 328|Comedy|MASH|Robert Altman|Donald Sutherland, Elliott Gould, Sally Kellerman|1970|1|http://www.imdb.com/title/tt0066026/|http://www.guardian.co.uk/film/movie/84547|USA 85 | 33|Comedy|Borat|Larry Charles|Ken Davitian, Pamela Anderson , Sacha Baron Cohen|2006||http://www.imdb.com/title/tt0443453/|http://www.guardian.co.uk/film/movie/114557/borat|USA 86 | 33|Comedy|Some Like it Hot|Billy Wilder|George Raft, Jack Lemmon, Joe E Brown, Marilyn Monroe, Tony Curtis|1959|1|http://www.imdb.com/title/tt0053291/|http://www.guardian.co.uk/film/movie/36223/some.like.it.hot|USA 87 | 342|Comedy|The Big Lebowski|Joel Coen|Jeff Bridges, John Goodman, Julianne Moore, Steve Buscemi|1998||http://www.imdb.com/title/tt0118715/|http://www.guardian.co.uk/film/movie/77069/big.lebowski|USA 88 | 34|Comedy|Team America|Trey Parker|Kristen Miller, Matt Stone, Trey Parker|2004||http://www.imdb.com/title/tt0372588/|http://www.guardian.co.uk/film/movie/103000/team.america|USA 89 | 35|Comedy|Dr Strangelove|Stanley Kubrick|George C Scott, Peter Sellers, Sterling Hayden|1964||http://www.imdb.com/title/tt0057012/|http://www.guardian.co.uk/film/movie/76390/dr-strangelove|UK 90 | 362|Comedy|His Girl Friday|Howard Hawkes|Cary Grant, Gene Lockhart, Ralph Bellamy, Rosalind Russell|1940||http://www.imdb.com/title/tt0032599/|http://www.guardian.co.uk/film/movie/76369/his-girl-friday|USA 91 | 36|Comedy|The Ladykillers|Alexander Mackendrick|Alec Guinness, Cecil Parker, Herbert Lom, Peter Sellers|1955||http://www.imdb.com/title/tt0048281/|http://www.guardian.co.uk/film/movie/36206/ladykillers|UK 92 | 61|2001|Stanley Kubrick|Daniel Richter, Gary 
Lockwood, Keir Dullea, William Sylvester|1968|1|http://www.imdb.com/title/tt0062622/|http://www.guardian.co.uk/film/movie/36269/2001|USA 93 | 620|Day the Earth Stood Still|Robert Wise|Hugh Marlowe, Lock Martin, Michael Rennie, Patricia Neal|1951||http://www.imdb.com/title/tt0043456/|http://www.guardian.co.uk/film/movie/82253/day-the-earth-stood-still|USA 94 | 621|Edward Scissorhands|Tim Burton|Dianne Wiest, Johnny Depp, Winona Ryder|1990||http://www.imdb.com/title/tt0099487/|http://www.guardian.co.uk/film/movie/82335/edward.scissorhands|USA 95 | 622|Akira|Katsuhiro Otomo|Mitsuo Iwata, Nozomu Sasaki, Mami Koyama, Tessho Genda|1988||http://www.imdb.com/title/tt0094625/|http://www.guardian.co.uk/film/movie/76882/akira|Japan 96 | 623|Princess Bride|Rob reiner|Billy Crystal, Carty Elwes, Cary Elwes, Mandy Patinkin, Peter Falk, Robin Wright|1987||http://www.imdb.com/title/tt0093779/|http://www.guardian.co.uk/film/movie/77070/princess-bride|USA 97 | 624|Pan’s Labyrinth|Guillermo del Toro| Ariadna Gil, Doug Jones, Ivana Baquero, Maribel Verdu, Sergi Lopez|2006|3|http://www.imdb.com/title/tt0457430/|http://www.guardian.co.uk/film/movie/112345/pan.s.labyrinth|Spain 98 | 625|Starship Troopers|Paul Verhoeven|Casper Van Dien, Clancy Brown, Dina Meyer, Jake Busey, Michael Ironside|1997||http://www.imdb.com/title/tt0120201/|http://www.guardian.co.uk/film/movie/71806/starship-troopers|USA 99 | 62|Metropolis|Fritz Lang|Alfred Abel, Brigitte Helm, Gustav Frohlich, Gustav Fruhlich|1927||http://www.imdb.com/title/tt0017136/|http://www.guardian.co.uk/film/movie/75782/metropolis|Germany 100 | 63|Blade Runner|Ridley Scott|Harrison Ford, Rutger Hauer, Sean Young|1982||http://www.imdb.com/title/tt0083658/|http://www.guardian.co.uk/film/movie/76627/blade-runner|USA 101 | 64|Alien|Ridley Scott|Ian Holm, John Hurt, Sigourney Weaver, Tom Skerritt|1979|1|http://www.imdb.com/title/tt0078748/|http://www.guardian.co.uk/film/movie/75860/alien|USA 102 | 65|The Wizard of Oz|Victor Fleming|Bert Lahr, Frank Morgan, Jack Haley, Judy Garland, Ray Bolger|1939|2|http://www.imdb.com/title/tt0032138/|http://www.guardian.co.uk/film/movie/36148/wizard.of.oz|USA 103 | 66|ET|Steven Spielberg|Dee Wallace, Drew Barrymore, Henry Thomas, Peter Coyote|1982|4|http://www.imdb.com/title/tt0083866/|http://www.guardian.co.uk/film/movie/92910/e.t.the.extra-terrestrial|USA 104 | 66|Solaris| Andrei Tarkovsky|Donatas Banionis, Juri Jarvet, Nataly Bondarchuk, Natalya Bondarchuk|1972||http://www.imdb.com/title/tt0069293/|http://www.guardian.co.uk/film/movie/76558/solaris|USA 105 | 68|Spirited Away|Hayao Miyazaki|Daveigh Chase, Jason Marsden, Jason Marsdon, Mari Natsuki, Miyu Irino, Rumi Hiragi, Suzanne Pleshette|2001|1|http://www.imdb.com/title/tt0245429/|http://www.guardian.co.uk/film/movie/96263/spirited.away|Japan 106 | 37|Comedy|Duck Soup|Leo McCarey|Chico Marx, Groucho Marx, Harpo Marx, Margaret Dumont, The Marx Brothers, Zeppo Marx|1933||http://www.imdb.com/title/tt0023969/|http://www.guardian.co.uk/film/movie/36133/duck.soup|USA 107 | 38|Comedy|Rushmore|Wes Anderson| Bill Murray, Brian Cox, Jason Schwartzman, Olivia Williams|1998||http://www.imdb.com/title/tt0128445/|http://www.guardian.co.uk/film/movie/79577/rushmore|USA 108 | 39|Comedy|Kind Hearts & Coronets|Robert Hamer|Alec Guinness, Dennis Price, Joan Greenwood|1949||http://www.imdb.com/title/tt0041546/|http://www.guardian.co.uk/film/movie/36180/kind-hearts-and-coronets|UK 109 | 410| The Thin Red Line|Terrence Malik|Adrien Brody, Ben Chaplin, Nick Nolte, Sean 
Penn|1998||http://www.imdb.com/title/tt0120863/|http://www.guardian.co.uk/film/movie/74795/thin.red.line|USA 110 | 411|Raiders of the Lost Ark|Steven Spielberg|Harrison Ford, Karen Allen, Paul Freeman, Ronald Lacey|1981|4|http://www.imdb.com/title/tt0082971/|http://www.guardian.co.uk/film/movie/36332/raiders-of-the-lost-ark|USA 111 | 712|Ringu|Hideo Nakata|Nanako Matsushima, Hiroyuki Sanada, Rikiya Otaka|1998||http://www.imdb.com/title/tt0178868/|http://www.guardian.co.uk/film/movie/121191/ringu|Japan 112 | 713|The Haunting|Robert Wise|Claire Bloom, Julie Harris, Richard Johnson|1963||http://www.imdb.com/title/tt0057129/|http://www.guardian.co.uk/film/movie/99697/haunting|USA 113 | 714|Texas Chainsaw Massacre|Tobe Hooper| Edwin Neal, Jim Siedow, Marilyn Burns, Paul A Partain|1974||http://www.imdb.com/title/tt0072271/|http://www.guardian.co.uk/film/movie/82763/texas-chainsaw-massacre|USA 114 | 715|Dead of Night|Alberto Cavalcanti, Charles Crichton|Googie Withers, Mervyn Johns, Michael Redgrave|1945||http://www.imdb.com/title/tt0037635/|http://www.guardian.co.uk/film/movie/79561/dead.of.night|UK 115 | 716|The Cabinet of Dr Caligari|Robert Wiene|Conrad Veidt, Lil Dagover, Werner Krauss|1920||http://www.imdb.com/title/tt0010323/|http://www.guardian.co.uk/film/movie/77300/cabinet-of-dr-caligari|Germany 116 | 717|Halloween|John Carpenter|Donald Pleasance, Donald Pleasence, Jamie Lee Curtis, Nancy Loomis, Tony Moran|1978||http://www.imdb.com/title/tt0077651/|http://www.guardian.co.uk/film/movie/104810/halloween|USA 117 | 718|Bride of Frankenstein|James Whale|Boris Karloff, Colin Clive, Elsa Lanchester|1935||http://www.imdb.com/title/tt0026138/|http://www.guardian.co.uk/film/movie/34577/bride-of-frankenstein|USA 118 | 719|Les Diaboliques|Henri-Georges Clouzot|Paul Meurisse, Simone Signoret, Vera Clouzot|1955||http://www.imdb.com/title/tt0046911/|http://www.guardian.co.uk/film/movie/75862/diaboliques|France 119 | 71|Psycho|Alfred Hitchcock|Anthony Perkins, Janet Leigh, Vera Miles|1960||http://www.imdb.com/title/tt0054215/|http://www.guardian.co.uk/film/movie/34630/psycho|USA 120 | 720|Audition|Miike Takashi|Eihi Shiina, Ishibashi Renji, Ishibashi Ryo, Matsuda Miyuki, Renji Ishibashi, Ryo Ishibashi, Shiina Eihi|1999||http://www.imdb.com/title/tt0235198/|http://www.guardian.co.uk/film/movie/84815/audition|Korea 121 | 412| Bullitt|Peter Yates|Jacqueline Bisset, Robert Vaughn, Steve McQueen|1968|1|http://www.imdb.com/title/tt0062765/|http://www.guardian.co.uk/film/movie/76966/bullitt|USA 122 | 412|Ran|Akira Kurosawa|Akira Terao, Daisuke Ryu, Mieko Harada, Tatsuya Nakadai|1985|1|http://www.imdb.com/title/tt0089881/|http://www.guardian.co.uk/film/movie/76633/ran|Japan 123 | 414|Die Hard|John McTeirnan|Alan Rickman, Bonnie Bedelia, Bruce Willis|1988||http://www.imdb.com/title/tt0095016/|http://www.guardian.co.uk/film/movie/80851/die-hard|Japan 124 | 415|The Adventures of Robin Hood|Michael Curtiz, William Keighley|Basil Rathbone, Claude Rains, Errol Flynn, Olivia De Havilland, Olivia de Havilland, William Keighley|1938|3|http://www.imdb.com/title/tt0029843/|http://www.guardian.co.uk/film/movie/34500/adventures-of-robin-hood|USA 125 | 416| The Searchers|John Ford|Jeffrey Hunter, John Wayne, Natalie Wood, Vera Miles, Ward Bond|1956||http://www.imdb.com/title/tt0049730/|http://www.guardian.co.uk/film/movie/115097/searchers|USA 126 | 417|Goldfinger|Guy Hamilton| Bernard Lee, Gert Frobe, Harold Sakata, Honor Blackman, Lois Maxwell, Sean Connery, Shirley Eaton, Tania 
Mallet|1964|1|http://www.imdb.com/title/tt0058150/|http://www.guardian.co.uk/film/movie/79341/goldfinger|UK 127 | 418|Full Metal Jacket|Stanley Kubrick|Adam Baldwin, Lee Ermey, Matthew Modine, Vincent D'Onofrio|1987||http://www.imdb.com/title/tt0093058/|http://www.guardian.co.uk/film/movie/76429/full-metal-jacket|USA 128 | 46|City of God|Fernando Meirelles|Alexandre Rodrigues, Leandro Firmino da Hora, Matheus Nachtergaele, Phelipe Haagensen|2002||http://www.imdb.com/title/tt0317248/|http://www.guardian.co.uk/film/movie/94028/city.of.god|Brazil 129 | 47|Paths of Glory|Stanley Kubrick|Adolphe Menjou, Kirk Douglas, Ralph Meeker|1957||http://www.imdb.com/title/tt0050825/|http://www.guardian.co.uk/film/movie/76931/paths.of.glory|USA 130 | 48|The Wages of Fear|Henri-Georges Clouzot|Charles Vanel, Folco Lulli, Yves Montand|1953||http://www.imdb.com/title/tt0046268/|http://www.guardian.co.uk/film/movie/78592/wages-of-fear|France 131 | 49|Crouching Tiger Hidden Dragon|Ang Lee|Chang Chen, Chow Yun-Fat, Michelle Yeoh, Zhang Ziyi, Ziyi Zhang|2000|4|http://www.imdb.com/title/tt0190332/|http://www.guardian.co.uk/film/movie/86383/crouching.tiger.hidden.dragon|Taiwan 132 | 610|Close Encounters|Steven Spielberg|Melinda Dillon, Richard Dreyfuss|1977|1|http://www.imdb.com/title/tt0075860/|http://www.guardian.co.uk/film/movie/36315/close-encounters-of-the-third-kind|USA 133 | 610|King Kong|Ernest B Schoedsack, Merian C Cooper|Bruce Cabot, Ernest B Schoedsack, Fay Wray, Frank Reicher, James Flavin, John Armstrong, Noble Jhonson, Robert Armstrong|1933||http://www.imdb.com/title/tt0024216/|http://www.guardian.co.uk/film/movie/36134/king.kong|USA 134 | 612|Terminator/Terminator 2|James Cameron|Arnold Schwarzenegger, Linda Hamilton, Michael Biehn|1984/1991|4 altogether|http://www.imdb.com/title/tt0088247/|http://www.guardian.co.uk/film/movie/88018/terminator|USA 135 | 613|The Matrix|Andy & Larry Wachowski|Carrie-Anne Moss, Keanu Reeves, Laurence Fishburne|1999|4|http://www.imdb.com/title/tt0133093/|http://www.guardian.co.uk/film/movie/77528/matrix|USA 136 | 614|Alphaville|Jean Luc-Godard|Anna Karina, Eddie Constantine|1965||http://www.imdb.com/title/tt0058898/|http://www.guardian.co.uk/film/movie/75764/alphaville|France 137 | 615|Back to the Future|Robert Zemeckis|Christopher Lloyd, Crispin Glover, Lea Thompson, Michael J Fox, Michael J. 
Fox|1985|1|http://www.imdb.com/title/tt0088763/|http://www.guardian.co.uk/film/movie/78042/back-to-the-future|USA 138 | 616|Planet of the Apes|Franklin J Schaffner |Charlton Heston, Kim Hunter, Roddy McDowell|1968|1|http://www.imdb.com/title/tt0063442/|http://www.guardian.co.uk/film/movie/95819/planet-of-the-apes|USA 139 | 617|Brazil|Terry Gilliam|Jonathan Pryce, Michael Palin, Robert De Niro|1985||http://www.imdb.com/title/tt0088846/|http://www.guardian.co.uk/film/movie/79920/brazil|UK 140 | 618|The Lord of the Rings trilogy|Peter Jackson|Cate Blanchett, Dominic Monaghan, Elijah Wood, Hugo Weaving, John Rhys-Davies, Liv Tyler, Miranda Otto, Orlando Bloom, Sean Astin, Sir Ian McKellen, Viggo Mortensen, William Boyd|2001-2003|17 altogether|http://www.imdb.com/title/tt0167260/|http://www.guardian.co.uk/film/movie/92716/lord.of.the.rings|New Zealand 141 | 619|Dark Star|John Carpenter|Brian Narelle, Dan O'Bannon, Dre Pahich|1974||http://www.imdb.com/title/tt0069945/|http://www.guardian.co.uk/film/movie/77501/dark-star|USA 142 | 69|Star Wars (1977)|George Lucas|Alec Guinness, Carrie Fisher, David Prowse, Harrison Ford, Mark Hamill, Peter Cushing, Peter Mayhew|1977|6|http://www.imdb.com/title/tt0076759/|http://www.guardian.co.uk/film/movie/36316/star.wars|USA 143 | 710|Peeping Tom|Michael Powell|Anna Massey, Carl Boehm, Esmond Knight, Karl Bohm, Maxine Audley, Moira Shearer|1960||http://www.imdb.com/title/tt0054167/|http://www.guardian.co.uk/film/movie/36228/peeping-tom|UK 144 | 711|The Innocents|Jack Clayton|Clytie Jessop, Deborah Kerr, Michael Redgrave, Peter Wyngarde|1961||http://www.imdb.com/title/tt0055018/|http://www.guardian.co.uk/film/movie/77279/innocents|USA 145 | 722|The Blair Witch Project|Daniel Myrick, E Sanchez|Heather Donahue, Joshua Leonard, Michael C. 
Williams|1999||http://www.imdb.com/title/tt0185937/|http://www.guardian.co.uk/film/movie/79459/blair.witch.project|USA 146 | 723|Evil Dead/Evil Dead II|Sam Raimi|Betsy Baker, Bruce Campbell, Ellen Sandweiss|1981/ 1987||http://www.imdb.com/title/tt0083907/|http://www.guardian.co.uk/film/movie/34582/evil-dead|USA 147 | 724|Carrie|Brian De Palma|John Travolta, Piper Laurie, Sissy Spacek|1976||http://www.imdb.com/title/tt0074285/|http://www.guardian.co.uk/film/movie/81489/carrie|USA 148 | 725|Les Vampires (1915)|Louis Feuillade|Edouard Mathe, Marcel Levesque|1915||http://www.imdb.com/title/tt0006206/|http://www.guardian.co.uk/film/movie/117077/vampires|France 149 | 72|Rosemary’s Baby|Roman Polanski|John Cassavetes, Mia Farrow, Ruth Gordon|1968|1|http://www.imdb.com/title/tt0063522/|http://www.guardian.co.uk/film/movie/80947/rosemary-s-baby|USA 150 | 915|Wall-E|Andrew Stanton|Ben Burtt, Fred Willard, Jeff Garlin, Kathy Najimy, Sigourney Weaver|2008|1|http://www.imdb.com/title/tt0910970/|http://www.guardian.co.uk/film/movie/125194/wall-e|USA 151 | -------------------------------------------------------------------------------- /data/poors_man_routes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | # Poor's man big data - bash edition 5 | 6 | function clean_up { 7 | rm -rf random_ships* 8 | } 9 | trap clean_up EXIT 10 | 11 | # Read ship_imo and ship_name 12 | mkfifo random_ships 13 | cut -d ";" -f1,2 containers_tiny.csv | uniq | tail -n +2 | sed "s/;/|/g" > random_ships & 14 | 15 | # Place random numbers 16 | while read f; do 17 | shuf country_codes.csv | head -n $(((RANDOM % 10) + 1)) | \ 18 | sed "s/,/|/g" | awk '{ printf "%d|'"$f"'|%s\n", i++, $0 }' 19 | done < random_ships 20 | 21 | -------------------------------------------------------------------------------- /data/random_data.rb: -------------------------------------------------------------------------------- 1 | require 'csv' 2 | require 'faker' 3 | require 'time' 4 | 5 | def gen_id alpha = 3, num = 10000 6 | ('A'..'Z').to_a.shuffle[0,alpha].join + (num + Random.rand(num - 1)).to_s 7 | end 8 | 9 | def time_rand from = 0.0, to = Time.now 10 | Time.at(from + rand * (to.to_f - from.to_f)) 11 | end 12 | 13 | def credit_card 14 | (1..4).map { |i| 1000 + Random.rand(999) }.join('-') 15 | end 16 | 17 | def funds(m) 18 | (1..m).each do |n| 19 | amount = Random.rand(1000.0) 20 | parent = gen_id 21 | parent_name = Faker::Company.name 22 | 23 | divisions = Random.rand(5) 24 | positions = [0] + (0..divisions-1).map { |d| Random.rand(1.0) }.sort + [1] 25 | currency = ['USD', 'EUR', 'JPY', 'AUD', 'CAD', 'GBP'].sample 26 | positions.each_cons(2) do |pos| 27 | percent = pos[1] - pos[0] 28 | identifier = gen_id 29 | tx_time = time_rand Time.local(2010, 1, 1), Time.local(2010, 12, 31) 30 | puts [tx_time, parent_name, parent, identifier, currency, percent.round(3), amount*percent].join('|') 31 | end 32 | end 33 | end 34 | 35 | def ratings(m) 36 | peliculas = ["01", "02", "03", "78", "79", "815", "29", "310", "04", "05", "28", "73", "74", "75", "765", "76", "77", "780", "31", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "18", "19", "20", "210", "218", "219", "21", "220", "220", "223", "224", "225", "22", "23", "214", "215", "216", "217", "23", "240", "24", "24", "25", "26", "270", "320", "321", "322", "323", "211", "211", "213", "324", "311", "312", "315", "316", "418", "41", "422", "422", "422", "423", "423", "425", "42", "43", "44", "45", "317", "318", "325", 
"328", "33", "33", "342", "34", "35", "362", "36", "61", "620", "621", "622", "623", "624", "625", "62", "63", "64", "65", "66", "66", "68", "37", "38", "39", "410", "411", "712", "713", "714", "715", "716", "717", "718", "719", "71", "720", "412", "412", "414", "415", "416", "417", "418", "46", "47", "48", "49", "610", "610", "612", "613", "614", "615", "616", "617", "618", "619", "69", "710", "711", "722", "723", "724", "725", "72", "915"] 37 | (1..m).each do |n| 38 | fecha = (time_rand Time.local(2016, 2, 15), Time.local(2016, 2, 21)).utc.iso8601 39 | puts "#{peliculas.sample},#{Random.rand(100000)},#{Random.rand(5)},#{fecha}" 40 | end 41 | end 42 | 43 | def ships_and_containers(m, p) 44 | puts ['ship_imo', 'ship_name', 'country', 'departure', 'container_id', 'container_type', 'container_group', 'net_weight', 'gross_weight', 'owner', 'declared', 'contact', 'customs_ok'].join(";") 45 | container_codes = CSV.read('./iso-container-codes.csv').map { |m| m[0] }.drop(1) 46 | container_groups = CSV.read('./iso-container-groups.csv').map { |m| m[0] }.drop(1) 47 | (1..m).each do |n| 48 | ship_imo = gen_id(3, 1000000) 49 | ship_name = [Faker::Name.first_name, Faker::Address.city].sample 50 | divisions = (p*10) + Random.rand((p*10)-1) 51 | positions = [0] + (0..divisions-1).map { |d| Random.rand(1.0) }.sort + [1] 52 | total_weight = (1000*1000*1000) + Random.rand(999999999) 53 | country = Faker::Address.country_code 54 | departure = (time_rand Time.local(2016, 2, 15), Time.local(2016, 2, 21)).strftime("%Y%m%d#{n}") 55 | positions.each_cons(2) do |pos| 56 | container_id = gen_id(4, 1000000) # ISO 6346 57 | container_type = container_codes.sample 58 | container_group = container_groups.sample 59 | owner = Faker::Company.name 60 | percent = pos[1] - pos[0] 61 | net_weight = (total_weight*percent).round(2) 62 | gross_weight = ([0.05, 0.1, 0.03].sample * net_weight).round(2) 63 | declared = Faker::Commerce.department(5) 64 | contact = Faker::Internet.email 65 | customs_ok = ((1..10).to_a.map { |n| true } + [false]).sample 66 | puts [ship_imo, ship_name, country, departure, container_id, container_type, container_group, net_weight, gross_weight, owner, declared, contact, customs_ok].join(";") 67 | end 68 | end 69 | end 70 | 71 | def shop(m) 72 | puts ['tx_id', 'tx_time', 'buyer', 'currency_code', 'payment_type', 'credit_card_number', 'country', 'department', 'product', 'item_price', 'coupon_code', 'was_returned'].join('|') 73 | (1..m).each do |n| 74 | buyer = Faker::Name.name 75 | tx_id = gen_id(7, 100) 76 | tx_time = time_rand Time.local(2010, 1, 1), Time.local(2010, 12, 31) 77 | cc = credit_card() 78 | price = Faker::Commerce.price 79 | currency = ['USD', 'EUR', 'JPY', 'AUD', 'CAD', 'GBP'].sample 80 | payment = ['VISA', 'MASTERCARD', 'AMERICAN_EXPRESS', 'DANKORT', 'JCB', 'FORBRUGSFORENINGEN'].sample 81 | country = Faker::Address.country_code 82 | divisions = [0, 0, Random.rand(5)].sample 83 | positions = [0] + (0..divisions-1).map { |d| Random.rand(1.0) }.sort + [1] 84 | positions.each_cons(2) do |pos| 85 | percent = pos[1] - pos[0] 86 | item_price = (price*percent).round(2) 87 | department = Faker::Commerce.department(1, true) 88 | product = Faker::Commerce.product_name 89 | coupon = [false, false, true, false].sample 90 | coupon_code = '' 91 | if (coupon) 92 | coupon_code = gen_id(3,2) 93 | end 94 | returned = [false, false, false, 'defect', 'bounce', false, false, false, false].sample 95 | puts [tx_id, tx_time.utc.iso8601, buyer, currency, payment, cc, country, department, product, item_price, 
coupon_code, returned].join('|') 96 | end 97 | 98 | end 99 | end 100 | 101 | # shop(1000) 102 | # ships_and_containers(20, 2) 103 | # ratings(10000) 104 | -------------------------------------------------------------------------------- /data/ship_routes.csv: -------------------------------------------------------------------------------- 1 | 0|GUI1871167|Kirlinland|Estonia|EE 2 | 1|GUI1871167|Kirlinland|Mauritania|MR 3 | 2|GUI1871167|Kirlinland|Dominica|DM 4 | 3|GUI1871167|Kirlinland|Puerto Rico|PR 5 | 4|GUI1871167|Kirlinland|Gabon|GA 6 | 5|GUI1871167|Kirlinland|Lao People's Democratic Republic|LA 7 | 6|GUI1871167|Kirlinland|Kazakhstan|KZ 8 | 7|GUI1871167|Kirlinland|Grenada|GD 9 | 8|GUI1871167|Kirlinland|Bonaire Sint Eustatius and Saba|BQ 10 | 9|GUI1871167|Kirlinland|Greece|GR 11 | 0|COB1191390|St. Elena|Mauritius|MU 12 | 1|COB1191390|St. Elena|Gabon|GA 13 | 2|COB1191390|St. Elena|United Kingdom|GB 14 | 3|COB1191390|St. Elena|Jersey|JE 15 | 4|COB1191390|St. Elena|Mongolia|MN 16 | 5|COB1191390|St. Elena|Guatemala|GT 17 | 6|COB1191390|St. Elena|Korea|KR 18 | 0|KRO1091605|Ike|Kuwait|KW 19 | 1|KRO1091605|Ike|Pitcairn|PN 20 | 2|KRO1091605|Ike|Uruguay|UY 21 | 3|KRO1091605|Ike|Zimbabwe|ZW 22 | 0|JMP1211539|John Navy|Egypt|EG 23 | 0|QEF1881275|Simone|Austria|AT 24 | 1|QEF1881275|Simone|Tokelau|TK 25 | 2|QEF1881275|Simone|Cayman Islands|KY 26 | 3|QEF1881275|Simone|South Georgia and the South Sandwich Islands|GS 27 | 4|QEF1881275|Simone|United Arab Emirates|AE 28 | 5|QEF1881275|Simone|Lao People's Democratic Republic|LA 29 | 6|QEF1881275|Simone|Honduras|HN 30 | 7|QEF1881275|Simone|Svalbard and Jan Mayen|SJ 31 | 0|QPU1694193|Prestige|Guadeloupe|GP 32 | 1|QPU1694193|Prestige|Cayman Islands|KY 33 | 2|QPU1694193|Prestige|Niger|NE 34 | 3|QPU1694193|Prestige|Trinidad and Tobago|TT 35 | 4|QPU1694193|Prestige|Cameroon|CM 36 | 5|QPU1694193|Prestige|Uganda|UG 37 | 0|YIL1516412|Abner|Saint Martin (French part)|MF 38 | 1|YIL1516412|Abner|Bangladesh|BD 39 | 2|YIL1516412|Abner|Bosnia and Herzegovina|BA 40 | 0|XJM1059834|Margaretteview|Marshall Islands|MH 41 | 1|XJM1059834|Margaretteview|Afghanistan|AF 42 | 0|YKX1212832|Danyka|Virgin Islands British|VG 43 | 1|YKX1212832|Danyka|Burundi|BI 44 | 2|YKX1212832|Danyka|Qatar|QA 45 | 3|YKX1212832|Danyka|South Africa|ZA 46 | 4|YKX1212832|Danyka|Belarus|BY 47 | 5|YKX1212832|Danyka|Réunion|RE 48 | 6|YKX1212832|Danyka|United Arab Emirates|AE 49 | 7|YKX1212832|Danyka|Grenada|GD 50 | 8|YKX1212832|Danyka|Niue|NU 51 | 0|AKO1391643|Keara|Uganda|UG 52 | 1|AKO1391643|Keara|New Caledonia|NC 53 | 2|AKO1391643|Keara|Pakistan|PK 54 | 3|AKO1391643|Keara|Côte d'Ivoire|CI 55 | 4|AKO1391643|Keara|Macedonia|MK 56 | 5|AKO1391643|Keara|Bhutan|BT 57 | 6|AKO1391643|Keara|Bahamas|BS 58 | 0|PKJ1313228|Nelson|Afghanistan|AF 59 | 1|PKJ1313228|Nelson|Suriname|SR 60 | 2|PKJ1313228|Nelson|Tonga|TO 61 | 3|PKJ1313228|Nelson|Guyana|GY 62 | 0|MHE1939455|Magdalenstad|Ethiopia|ET 63 | 0|CAT1031760|Calistaborough|Trinidad and Tobago|TT 64 | 1|CAT1031760|Calistaborough|Congo|CG 65 | 2|CAT1031760|Calistaborough|French Southern Territories|TF 66 | 3|CAT1031760|Calistaborough|Jersey|JE 67 | 4|CAT1031760|Calistaborough|Gambia|GM 68 | 5|CAT1031760|Calistaborough|Azerbaijan|AZ 69 | 6|CAT1031760|Calistaborough|Heard Island and McDonald Islands|HM 70 | 7|CAT1031760|Calistaborough|Myanmar|MM 71 | 0|ZEW1505964|East Pierre|Afghanistan|AF 72 | 1|ZEW1505964|East Pierre|Romania|RO 73 | 2|ZEW1505964|East Pierre|Somalia|SO 74 | 3|ZEW1505964|East Pierre|Netherlands|NL 75 | 4|ZEW1505964|East Pierre|Saint Pierre and Miquelon|PM 
76 | 0|RWK1014975|Princess|Mali|ML 77 | 1|RWK1014975|Princess|Peru|PE 78 | 2|RWK1014975|Princess|Aruba|AW 79 | 0|BXE1370077|Shaun|Pakistan|PK 80 | 1|BXE1370077|Shaun|Eritrea|ER 81 | 2|BXE1370077|Shaun|Austria|AT 82 | 3|BXE1370077|Shaun|Tajikistan|TJ 83 | 4|BXE1370077|Shaun|Serbia|RS 84 | 5|BXE1370077|Shaun|Chile|CL 85 | 6|BXE1370077|Shaun|Indonesia|ID 86 | 7|BXE1370077|Shaun|Equatorial Guinea|GQ 87 | 8|BXE1370077|Shaun|Nicaragua|NI 88 | 9|BXE1370077|Shaun|Bulgaria|BG 89 | 0|QCJ1879622|Urca da Lima|Micronesia|FM 90 | 1|QCJ1879622|Urca da Lima|Canada|CA 91 | 2|QCJ1879622|Urca da Lima|South Georgia and the South Sandwich Islands|GS 92 | 3|QCJ1879622|Urca da Lima|Saint Vincent and the Grenadines|VC 93 | 4|QCJ1879622|Urca da Lima|Guinea-Bissau|GW 94 | 5|QCJ1879622|Urca da Lima|Samoa|WS 95 | 6|QCJ1879622|Urca da Lima|Viet Nam|VN 96 | 0|EJQ1935333|Prestige|Cameroon|CM 97 | 1|EJQ1935333|Prestige|Montserrat|MS 98 | 2|EJQ1935333|Prestige|Lao People's Democratic Republic|LA 99 | 3|EJQ1935333|Prestige|Niger|NE 100 | 0|IFD1255823|North Cobyville|Mozambique|MZ 101 | 1|IFD1255823|North Cobyville|Spain|ES 102 | 0|FNV1248771|Lauryn|El Salvador|SV 103 | 1|FNV1248771|Lauryn|Estonia|EE 104 | 2|FNV1248771|Lauryn|Pitcairn|PN 105 | 3|FNV1248771|Lauryn|Luxembourg|LU 106 | 4|FNV1248771|Lauryn|Turks and Caicos Islands|TC 107 | 5|FNV1248771|Lauryn|Timor-Leste|TL 108 | 6|FNV1248771|Lauryn|Viet Nam|VN 109 | 7|FNV1248771|Lauryn|Belgium|BE 110 | 8|FNV1248771|Lauryn|Spain|ES 111 | -------------------------------------------------------------------------------- /infra/beam/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:jessie-slim 2 | LABEL maintainer="Luis Belloch " 3 | 4 | ENV DEBIAN_FRONTEND=noninteractive 5 | RUN apt-get update && \ 6 | apt-get install -y build-essential python-software-properties python-pip python-dev && \ 7 | pip install --upgrade setuptools && \ 8 | rm -rf /var/lib/apt/lists/* ~/.cache/* 9 | 10 | RUN pip install --upgrade apache-beam && \ 11 | rm -rf ~/.cache/* 12 | 13 | RUN mkdir -p /data /opt/beam 14 | WORKDIR /opt/beam 15 | 16 | -------------------------------------------------------------------------------- /infra/beam/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all build tag push list 2 | 3 | all: build tag 4 | 5 | build: 6 | docker build -t luisbelloch/beam . 7 | 8 | tag: 9 | docker tag luisbelloch/beam luisbelloch/beam:2018.1 10 | 11 | push: 12 | docker push luisbelloch/beam:2018.1 13 | docker push luisbelloch/beam 14 | 15 | list: 16 | docker images luisbelloch/beam 17 | 18 | -------------------------------------------------------------------------------- /infra/beam/README.md: -------------------------------------------------------------------------------- 1 | beam.md -------------------------------------------------------------------------------- /infra/beam/beam.md: -------------------------------------------------------------------------------- 1 | # Apache Beam Docker Image 2 | 3 | ## Basic usage 4 | 5 | This folder contains a simple docker container to execute Apache Beam using python SDK, under direct runner. 
The image has been published in Docker Hub as [luisbelloch/beam:python2](https://hub.docker.com/r/luisbelloch/beam/): 6 | 7 | ``` 8 | $ docker pull luisbelloch/beam:python2 9 | ``` 10 | 11 | A simple word count sample can be run as: 12 | 13 | ``` 14 | $ docker run luisbelloch/beam:python2 python -m apache_beam.examples.wordcount \ 15 | --input /etc/hosts --output /tmp/output.txt 16 | ``` 17 | 18 | We've included a script that will mount the current folder as a volume in `/data`: 19 | 20 | ``` 21 | $ ./beam -m apache_beam.examples.wordcount --input /etc/hosts --output /data/wordcount.txt 22 | ``` 23 | 24 | To run any script from the [samples](../../beam/) folder: 25 | 26 | ``` 27 | $ ./beam basic.py --input /data/compras_tiny.csv --output /data/purchases_summary.json 28 | ``` 29 | 30 | ## Building the container 31 | 32 | ``` 33 | docker build -t luisbelloch/beam:python2 . 34 | ``` 35 | 36 | -------------------------------------------------------------------------------- /infra/dataproc.md: -------------------------------------------------------------------------------- 1 | # Tutorial Dataproc (Spark) 2 | 3 | Dataproc es la versión gestionada de Spark en Google Cloud. En este tutorial vamos a cubrir cómo subir archivos a Cloud Storage (S3) y lanzar un trabajo de Spark para procesarlo. 4 | 5 | Duración estimada: 6 | 7 | ## Selecciona un proyecto 8 | 9 | 10 | 11 | ## Preparación 12 | 13 | ### 1. Habilita las APIs necesarias 14 | 15 | Antes de continuar es necesario habilitar las APIs de Cloud Storage y Dataproc. 16 | 17 | Habilitar APIs 18 | 19 | ### 2. Abre una terminal 20 | 21 | La mayoría de los comandos pueden ejecutarse desde la interfaz de usuario, pero en el tutorial utilizaremos la consola de cloudshell. 22 | 23 | Si no está abierta ya en la parte inferior puedes abrirla mediante el icono 24 | arriba a la derecha, o utilizando el siguiente enlace: 25 | 26 | 27 | 28 | ### 3. Materiales de clase 29 | 30 | Asegúrate de que la carpeta `cloudshell_open/data_processing_course` se ha creado y la terminal apunta a esa carpeta. 31 | 32 | ```sh 33 | cd ~/cloudshell_open/data_processing_course 34 | ``` 35 | 36 | Si no, puedes abrir de nuevo el proyecto desde [bigdata.luisbelloch.es](http://bigdata.luisbelloch.es) y seleccionando [Open in Cloud Shell](https://console.cloud.google.com/cloudshell/editor?cloudshell_git_repo=https://github.com/luisbelloch/data_processing_course.git). 37 | 38 | Alternativamente puedes clonar el repositorio mediante `git`: 39 | 40 | ```sh 41 | git clone https://github.com/luisbelloch/data_processing_course.git && cd data_processing_course 42 | ``` 43 | 44 | ## Paso 1: Crear un bucket en Cloud Storage 45 | 46 | El bucket se puede también crear desde [la UI de Google Cloud Storage](https://cloud.google.com/storage/docs/creating-buckets). 47 | 48 | En nuestro caso podemos usar la terminal para crearlo: 49 | 50 | ```sh 51 | gsutil mb -c regional -l europe-west1 gs://NOMBRE_BUCKET 52 | ``` 53 | 54 | Recuerda que el nombre del bucket `NOMBRE_BUCKET` debe ser único en internet. 55 | 56 | Para copiar datos puede utilizarse también `gsutil` con `cp`: 57 | 58 | ```sh 59 | gsutil cp data/compras_tiny.csv gs://NOMBRE_BUCKET 60 | ``` 61 | 62 | En el caso de que queramos sincronizar un directorio entero, podemos utilizar `rsync`: 63 | 64 | ```sh 65 | gsutil -m rsync data/ gs://NOMBRE_BUCKET 66 | ``` 67 | 68 | ## Paso 2: Crear un cluster en Dataproc 69 | 70 | Lo primero que debemos hacer es crear un cluster de Spark. 
Para las pruebas usaremos un único nodo, pero es posible crear varios también. En nuestro caso, vamos a crear un cluster llamado `dataproc1`. 71 | 72 | ```sh 73 | gcloud dataproc clusters create dataproc1 --region europe-west1 --single-node --enable-component-gateway 74 | ``` 75 | 76 | Una vez esté creado, podemos ver el estado del cluster en la [interfaz de usuario de Dataproc](https://console.cloud.google.com/dataproc/clusters). 77 | 78 | Es interesante ver que Dataproc ha creado distintas máquinas virtuales [en Compute Engine](https://console.cloud.google.com/compute/instances). 79 | 80 | Recuerda eliminar el cluster al finalizar el tutorial. 81 | 82 | ## Paso 3: Crear un trabajo de ejemplo de Spark 83 | 84 | Como ejemplo, vamos a crear un script que cuente las líneas del archivo `compras_tiny.csv`, llamado `prueba_dataproc.py`. 85 | 86 | ```python 87 | from os import path 88 | from pyspark import SparkContext 89 | 90 | sc = SparkContext('local', 'hello') 91 | rdd = sc.textFile('gs://bigdataupv_data/compras_tiny.csv') 92 | 93 | print("Count:", rdd.count()) 94 | ``` 95 | 96 | Puedes crear el script en cualquier carpeta, pero asegúrate de especificar la ruta al ejecutar el trabajo en el paso siguiente. 97 | 98 | ## Paso 4: Ejecutar el trabajo de Spark 99 | 100 | Para ejecutar el script `prueba_dataproc.py` que acabamos de crear es necesario enviarlo al cluster: 101 | 102 | ```sh 103 | gcloud dataproc jobs submit pyspark prueba_dataproc.py --cluster dataproc1 --region europe-west1 104 | ``` 105 | 106 | Esto creará un `job` (trabajo) en el cluster, ejecutado por Spark. 107 | 108 | Verás el progreso en la propia consola; en algún punto debería aparecer el número de filas del trabajo cuando termine: 109 | 110 | ```terminal 111 | Count: 1723 112 | ``` 113 | 114 | ### Adjuntar archivos adicionales 115 | 116 | En clase hemos trabajado haciendo uso de un archivo llamado `helpers.py`. Si se referencia el código de ese archivo desde cualquier script, es necesario adjuntarlo al trabajo mediante la opción `--files`: 117 | 118 | ```sh 119 | gcloud dataproc jobs submit pyspark prueba_dataproc.py --cluster dataproc1 --region europe-west1 --files=helpers.py 120 | ``` 121 | 122 | Los scripts pueden también residir en un bucket de Cloud Storage; simplemente reemplaza los nombres por la ruta completa de los archivos: 123 | 124 | ```terminal 125 | gs://bigdataupv_code/prueba_dataproc.py 126 | gs://bigdataupv_code/helpers.py 127 | ``` 128 | 129 | ## Paso 5: Determinar el estado de los trabajos lanzados 130 | 131 | Los trabajos ejecutados también son accesibles desde [la interfaz de usuario de Dataproc](https://console.cloud.google.com/dataproc/clusters/dataproc1/jobs), desde donde pueden consultarse los resultados. 
132 | 133 | Alternativamente se pueden listar todos los trabajos de una región, en nuestro caso `europe-west1`: 134 | 135 | ```sh 136 | gcloud dataproc jobs list --region=europe-west1 137 | ``` 138 | 139 | Tras ejecutarlo debería mostrar una lista de trabajos: 140 | 141 | ```terminal 142 | JOB_ID: 2c5c402a995e424ca24087498d559731 143 | TYPE: pyspark 144 | STATUS: DONE 145 | ``` 146 | 147 | ### Consultar un determinado trabajo 148 | 149 | Utilizando ese `JOB_ID` podemos también consultar el estado y los logs del trabajo, incluso antes de que finalize: 150 | 151 | ```sh 152 | gcloud dataproc jobs wait 2c5c402a995e424ca24087498d559731 --project bigdataupv2022 --region europe-west1 153 | ``` 154 | 155 | ## Paso 6: Eliminar el cluster 156 | 157 | Para finalizar el ejercicio eliminaremos el cluster creado, de forma que se detendrá la facturación por uso de los recursos involucrados: 158 | 159 | ```sh 160 | gcloud dataproc clusters delete dataproc1 --region=europe-west1 161 | ``` 162 | 163 | También es posible eliminarlo desde la consola de Google Cloud. 164 | 165 | ![](https://cloud.google.com/dataproc/images/dataproc-1-delete.png) 166 | 167 | ## Completado! 168 | 169 | Recuerda eliminar el cluster de Dataproc al completar el ejercicio. 170 | 171 | 172 | -------------------------------------------------------------------------------- /infra/docker/.envrc: -------------------------------------------------------------------------------- 1 | export DOCKER_BUILDKIT=1 2 | export COMPOSE_DOCKER_CLI_BUILD=1 3 | -------------------------------------------------------------------------------- /infra/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM amazoncorretto:11 as corretto-jdk 2 | RUN $JAVA_HOME/bin/jlink \ 3 | --verbose \ 4 | --add-modules ALL-MODULE-PATH \ 5 | --strip-debug \ 6 | --no-man-pages \ 7 | --no-header-files \ 8 | --compress=2 \ 9 | --output /opt/jre 10 | 11 | FROM debian:stable-slim 12 | LABEL maintainer="Luis Belloch " 13 | ENV JAVA_HOME=/opt/jre 14 | ENV PATH="${JAVA_HOME}/bin:${PATH}" 15 | COPY --from=corretto-jdk /opt/jre $JAVA_HOME 16 | 17 | ENV DEBIAN_FRONTEND=noninteractive 18 | RUN apt-get update && \ 19 | apt-get install -y --no-install-recommends ca-certificates procps python3-software-properties python3-numpy curl && \ 20 | rm -rf /var/lib/apt/lists/* 21 | 22 | ARG SPARK_VERSION=3.3.1 23 | ENV SPARK_HOME=/opt/spark 24 | RUN mkdir -p /opt/spark && curl -s https://downloads.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz | tar -xz -C "${SPARK_HOME}" --strip-components=1 25 | ENV PATH="${SPARK_HOME}/bin:${SPARK_HOME}/sbin:${PATH}" 26 | 27 | RUN cp "${SPARK_HOME}/conf/log4j2.properties.template" "${SPARK_HOME}/conf/log4j2.properties" && \ 28 | sed -ibak 's/rootLogger.level = info/rootLogger.level = error/g' "${SPARK_HOME}/conf/log4j2.properties" 29 | 30 | ENV SPARK_NO_DAEMONIZE=true 31 | ENV PYSPARK_PYTHON=/usr/bin/python3 32 | ENV PYSPARK_DRIVER_PYTHON=/usr/bin/python3 33 | EXPOSE 4040 7077 8080 34 | 35 | CMD ["pyspark"] 36 | -------------------------------------------------------------------------------- /infra/docker/Makefile: -------------------------------------------------------------------------------- 1 | SPARK_VERSION:=3.3.1 2 | COURSE_VERSION:=2022.12 3 | IMAGE_NAME:=luisbelloch/spark 4 | 5 | .PHONY: help 6 | help: 7 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(lastword $(MAKEFILE_LIST)) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' 8 | 9 | .PHONY: 
all 10 | all: build tag ## Builds and tags an image 11 | 12 | .PHONY: build 13 | build: ## Assembles image from Spark binaries 14 | docker build --build-arg SPARK_VERSION=${SPARK_VERSION} -t $(IMAGE_NAME) . 15 | 16 | .PHONY: tag 17 | tag: ## Adds tags to current latest image 18 | docker tag $(IMAGE_NAME) $(IMAGE_NAME):$(SPARK_VERSION) 19 | docker tag $(IMAGE_NAME) $(IMAGE_NAME):$(COURSE_VERSION) 20 | 21 | .PHONY: push 22 | push: ## Uploads images to registry 23 | docker push $(IMAGE_NAME):$(SPARK_VERSION) 24 | docker push $(IMAGE_NAME):$(COURSE_VERSION) 25 | docker push $(IMAGE_NAME) 26 | 27 | .PHONY: list 28 | list: ## Lists local generated images 29 | docker images $(IMAGE_NAME) 30 | 31 | -------------------------------------------------------------------------------- /infra/docker/README.md: -------------------------------------------------------------------------------- 1 | docker.md -------------------------------------------------------------------------------- /infra/docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | master: 4 | image: luisbelloch/spark 5 | ports: 6 | - 8080:8080 7 | - 7077:7077 8 | command: /opt/spark/sbin/start-master.sh 9 | environment: 10 | MASTER: spark://master:7077 11 | SPARK_PUBLIC_DNS: localhost 12 | SPARK_NO_DAEMONIZE: 1 13 | worker: 14 | image: luisbelloch/spark 15 | command: /opt/spark/sbin/start-slave.sh spark://master:7077 16 | environment: 17 | SPARK_PUBLIC_DNS: localhost 18 | SPARK_NO_DAEMONIZE: 1 19 | ports: 20 | - 8081:8081 21 | -------------------------------------------------------------------------------- /infra/docker/docker.md: -------------------------------------------------------------------------------- 1 | # Spark-on-Docker Samples 2 | 3 | This folder will be used to see how we could provision a Spark cluster using Docker. While this is an interesting exercise to reason about some of the implications, ask yourself first if this makes sense at all before going to production. 4 | 5 | ## Bare-bones Docker Image 6 | 7 | By default, the image's command is set to `pyspark`, so running it without parameters will drop you directly into the Python REPL: 8 | 9 | ``` 10 | $ docker run -ti luisbelloch/spark 11 | ``` 12 | 13 | ### Running PySpark samples 14 | 15 | We've included a script to easily run the scripts in the [spark](../../spark) folder. To run any of the scripts, simply do: 16 | 17 | ``` 18 | $ cd data_processing_course/spark 19 | $ ./spark compras_conversion_a_dolares.py 20 | ``` 21 | 22 | Please pay attention to the `./` prefix before the name of the script, `./spark`. The docker container has access to all the scripts in that folder, including the `data` folder inside it: 23 | 24 | ```python 25 | txt = sc.textFile('./data/compras_tiny.csv') 26 | ``` 27 | 28 | ### Using the image without the "spark" helper script 29 | 30 | Remember that inside the container you won't have access to the samples or data files we'll use in the classroom. You'll have to mount a volume with them, [using the -v option](https://docs.docker.com/engine/tutorials/dockervolumes). The local folder path cannot be relative; use the `readlink` command to convert it to an absolute one. 31 | 32 | ``` 33 | $ docker run \ 34 | -v $(readlink -f ../../spark):/opt/samples \ 35 | -w /opt/samples \ 36 | -ti luisbelloch/spark spark-submit /opt/samples/compras_con_mas_de_un_descuento.py 37 | ``` 38 | 39 | That should spawn a new container and run the job inside it. 
We've also mounted the samples folder in `/opt/samples` inside the container. All the executables from the Spark distribution are available in the container's path. 40 | 41 | ### How to build the images 42 | 43 | Images are available in [Docker Hub](https://hub.docker.com/r/luisbelloch/spark/); you can easily modify and rebuild them: 44 | 45 | ``` 46 | $ docker build -t luisbelloch/spark . 47 | $ docker tag luisbelloch/spark:2.10 luisbelloch/spark:latest 48 | ``` 49 | 50 | ### Running Spark Master \ Workers 51 | 52 | The `SPARK_NO_DAEMONIZE` variable is already set in the `Dockerfile`; it makes the start scripts run in the foreground instead of leaving the process in the background. 53 | 54 | The first step is to start the master node. We've exposed ports 8080 (UI) and 7077 (Spark). 55 | 56 | ``` 57 | $ docker run -p 8080:8080 -p 7077:7077 -d luisbelloch/spark start-master.sh 58 | ``` 59 | 60 | Note that workers connect to the master node through port 7077, which is exposed to the actual physical machine. Remember to configure port forwarding if you run docker inside a virtual machine. 61 | 62 | After it starts, go to [localhost:8080](http://localhost:8080) and get the master URL. In our case it is `spark://11168790f9c1:7077`. You will also need the container alias, `nervous_noyce`, to enable a link between master and worker containers. List containers with `docker ps` to retrieve it. 63 | 64 | ``` 65 | $ docker ps 66 | CONTAINER ID IMAGE NAMES 67 | 11168790f9c1 luisbelloch/spark nervous_noyce 68 | 69 | $ docker run -p 8081:8081 \ 70 | --link nervous_noyce \ 71 | -d luisbelloch/spark start-worker.sh spark://11168790f9c1:7077 72 | ``` 73 | 74 | The worker node should be displayed in the master UI. 75 | 76 | Remember that if you want to run jobs against those containers you need to point `spark-submit` or `pyspark` to the master node. To do it, add the `--master` option and set the URL that you copied from the master node web page: 77 | 78 | ``` 79 | $ docker run -p 8081:8081 \ 80 | --link nervous_noyce \ 81 | -ti luisbelloch/spark pyspark \ 82 | --master spark://11168790f9c1:7077 83 | ``` 84 | 85 | ## Using Docker Compose 86 | 87 | To bring up a mini-cluster with a master node and one worker: 88 | 89 | ``` 90 | $ docker compose up 91 | ``` 92 | 93 | The master UI should be available at [localhost:8080](http://localhost:8080). 94 | 95 | Then you can also connect to it via `pyspark`: 96 | 97 | ``` 98 | $ docker compose run -p 4040:4040 master pyspark --master spark://master:7077 99 | ``` 100 | 101 | Running `docker ps` will show the containers and their mapped ports. Workers can connect to the master using internal DNS resolution, since we've exposed the master node as `master`. Note that exposing the worker nodes' ports is not straightforward; we'll discuss that in class. 102 | 103 | To scale up/down the cluster: 104 | 105 | ``` 106 | $ docker compose scale worker=3 107 | ``` 108 | 109 | Beware that the desired state persists between runs. 
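As a quick sanity check that the compose cluster is really doing the work, you can reuse the `pyspark` session shown above and run a trivial job from it. This is only a sketch; it assumes the master and at least one worker are up, and the count should come back from the cluster:

```
$ docker compose run master pyspark --master spark://master:7077
>>> sc.parallelize(range(1000)).count()
1000
```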
110 | -------------------------------------------------------------------------------- /infra/kubernetes/README.md: -------------------------------------------------------------------------------- 1 | kubernetes.md -------------------------------------------------------------------------------- /infra/kubernetes/kubernetes.md: -------------------------------------------------------------------------------- 1 | # Spark-on-Kubernetes 2 | 3 | The code is based on [official Kubernetes examples](https://github.com/kubernetes/kubernetes/tree/master/examples/spark) and the [Spark 2.1 docker image](../docker/docker.md) used during the course. 4 | 5 | ## Prerequisites 6 | 7 | We recommend working locally using `minikube`. Install [kubectl](https://kubernetes.io/docs/user-guide/prereqs/) and [minikube](https://kubernetes.io/docs/getting-started-guides/minikube/) from the official sources. 8 | 9 | After installing `minikube` we will push our Spark docker image to the internal Kubernetes registry. Use `minikube docker-env` to point the current docker client to our cluster. 10 | 11 | ``` 12 | $ eval $(minikube docker-env) 13 | $ docker build -t luisbelloch/spark ../infra/docker 14 | $ docker push luisbelloch/spark 15 | ``` 16 | 17 | Alternatively you could use GCR images; just point the container images to `gcr.io/google_containers/spark:1.5.2_v1`. 18 | 19 | ## Cluster provisioning 20 | 21 | First of all, we'll create a new namespace for our cluster and configure a context for `kubectl`. From this point, all the `kubectl` commands will be confined to that namespace. 22 | 23 | ``` 24 | $ kubectl create -f namespace.yaml 25 | $ kubectl config set-context spark --namespace=bigdataupv-spark --user=minikube --cluster=minikube 26 | $ kubectl config use-context spark 27 | ``` 28 | 29 | ### Master node 30 | 31 | The first thing we'll deploy is the Spark master. We've defined a replication controller that will create just one container to host it. Note that if the master goes down, Kubernetes will automatically respawn the container. 32 | 33 | ``` 34 | $ kubectl create -f master-controller.yaml 35 | $ kubectl get pods 36 | NAME READY STATUS RESTARTS AGE 37 | spark-master-controller-5pzdb 0/1 ContainerCreating 0 1s 38 | ``` 39 | 40 | If you want to submit the configuration again after some changes, use the `apply` command and Kubernetes will reconfigure the controller. Although you can use the UI for this, note that the best practice is to reapply configurations. 41 | 42 | ``` 43 | $ kubectl apply -f master-controller.yaml 44 | ``` 45 | 46 | Now we can check that the pod is up and running and that the master has been elected as leader: 47 | 48 | ``` 49 | $ kubectl get pods 50 | NAME READY STATUS RESTARTS AGE 51 | spark-master-controller-78dqq 1/1 Running 0 2m 52 | 53 | $ kubectl logs spark-master-controller-78dqq 54 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 55 | 17/02/09 20:42:57 INFO Master: Started daemon with process name: 12@spark-master-controller-78dqq 56 | ... 57 | 17/02/09 20:42:58 INFO Master: I have been elected leader! New state: ALIVE 58 | ``` 59 | 60 | Note that the replication controller has `replicas: 1`, so only one pod will be created to act as the master. 
The master node should declare two services: one on port 7077 to communicate with workers, and another on port 8080 serving the web UI: 61 | 62 | ``` 63 | $ kubectl apply -f master-service.yaml 64 | ``` 65 | 66 | The Spark UI can be accessed by starting `kubectl proxy` and navigating directly to this URL: 67 | 68 | ``` 69 | http://127.0.0.1:8001/api/v1/proxy/namespaces/bigdataupv-spark/pods/spark-master-controller-78dqq:8080/ 70 | ``` 71 | 72 | ### Slaves 73 | 74 | Starting slaves is pretty straightforward. Remember we've exposed the master node under the name `spark-master`, and therefore it will be accessible from other pods using simple DNS calls. The following command will create a replication controller for the slaves, starting with one pod: 76 | 77 | ``` 77 | $ kubectl apply -f slave-controller.yaml 78 | 79 | $ kubectl get rc -o wide 80 | NAME DESIRED CURRENT READY AGE CONTAINER(S) IMAGE(S) SELECTOR 81 | spark-master-controller 1 1 1 36m spark-master luisbelloch/spark component=spark-master 82 | spark-worker-controller 1 1 1 3m spark-worker luisbelloch/spark component=spark-worker 83 | ``` 84 | 85 | ### Accessing PySpark 86 | 87 | We can open a `PySpark` session directly in the master node, using the `exec` command: 88 | 89 | ``` 90 | $ kubectl exec spark-master-controller-78dqq -ti -- pyspark --master=spark://spark-master-controller-78dqq:7077 91 | ``` 92 | 93 | If you ever need an interactive login, simply replace `pyspark` with `/bin/bash`. 94 | 95 | ### Scaling the cluster 96 | 97 | ``` 98 | $ kubectl scale --replicas=4 rc/spark-worker-controller 99 | replicationcontroller "spark-worker-controller" scaled 100 | 101 | $ kubectl get pods 102 | NAME READY STATUS RESTARTS AGE 103 | spark-master-controller-78dqq 1/1 Running 0 40m 104 | spark-worker-controller-9r9vd 1/1 Running 0 8s 105 | spark-worker-controller-sp3tt 1/1 Running 0 1m 106 | spark-worker-controller-srvdm 0/1 ContainerCreating 0 8s 107 | ``` 108 | 109 | ## Problems not addressed 110 | 111 | As we've seen in class, this has been an exercise to play with Spark deployment options, and much deeper thought is needed before going to production. Generally speaking, Spark needs a bit more work to make it aware of the environment it executes in, particularly the UIs. In the Kubernetes repository there are a few issues that you may follow closely to get more information: 112 | 113 | - [#16517](kubernetes/kubernetes#16517) Has a good compendium of problems and things that don't work out of the box. 114 | - [#34377](kubernetes/kubernetes#34377) Describes some ideas to support other Spark deployment modes than the "standalone" one. 115 | - [#16949](kubernetes/kubernetes#16949) Talks about the problem with slave UIs ports and how it may be resolved. 
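Finally, once you are done experimenting, everything can be torn down by deleting the controllers, the service and the namespace created earlier. This is a minimal cleanup sketch, assuming the `bigdataupv-spark` namespace from `namespace.yaml` and the default `minikube` context:

```
$ kubectl delete -f slave-controller.yaml -f master-service.yaml -f master-controller.yaml
$ kubectl delete -f namespace.yaml
$ kubectl config use-context minikube
```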
116 | -------------------------------------------------------------------------------- /infra/kubernetes/master-controller.yaml: -------------------------------------------------------------------------------- 1 | kind: ReplicationController 2 | apiVersion: v1 3 | metadata: 4 | name: spark-master-controller 5 | spec: 6 | replicas: 1 7 | selector: 8 | component: spark-master 9 | template: 10 | metadata: 11 | labels: 12 | component: spark-master 13 | spec: 14 | containers: 15 | - name: spark-master 16 | image: luisbelloch/spark 17 | imagePullPolicy: Never 18 | command: ["/opt/spark/sbin/start-master.sh"] 19 | env: 20 | - name: SPARK_NO_DAEMONIZE 21 | value: "true" 22 | ports: 23 | - containerPort: 7077 24 | - containerPort: 8080 25 | resources: 26 | requests: 27 | cpu: 100m 28 | -------------------------------------------------------------------------------- /infra/kubernetes/master-service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: spark-master 5 | spec: 6 | type: NodePort 7 | ports: 8 | - port: 7077 9 | targetPort: 7077 10 | name: spark 11 | - port: 8080 12 | targetPort: 8080 13 | name: http 14 | selector: 15 | component: spark-master 16 | 17 | -------------------------------------------------------------------------------- /infra/kubernetes/namespace.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: "bigdataupv-spark" 5 | labels: 6 | name: "bigdataupv-spark" 7 | -------------------------------------------------------------------------------- /infra/kubernetes/slave-controller.yaml: -------------------------------------------------------------------------------- 1 | kind: ReplicationController 2 | apiVersion: v1 3 | metadata: 4 | name: spark-worker-controller 5 | spec: 6 | replicas: 1 7 | selector: 8 | component: spark-worker 9 | template: 10 | metadata: 11 | labels: 12 | component: spark-worker 13 | spec: 14 | containers: 15 | - name: spark-worker 16 | image: luisbelloch/spark 17 | command: ["/opt/spark/sbin/start-slave.sh", "spark://spark-master:7077"] 18 | env: 19 | - name: SPARK_NO_DAEMONIZE 20 | value: "true" 21 | resources: 22 | requests: 23 | cpu: 100m 24 | -------------------------------------------------------------------------------- /infra/minio/config/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "8", 3 | "hosts": { 4 | "local": { 5 | "url": "http://localhost:9000", 6 | "accessKey": "JX6SNZEW2CLYM66UDHT7", 7 | "secretKey": "NHtuFRcy8XnRuqbASsHTK65oxYMQ7sNvwTnA1oX0", 8 | "api": "S3v4" 9 | }, 10 | "minio": { 11 | "url": "http://minio:9000", 12 | "accessKey": "JX6SNZEW2CLYM66UDHT7", 13 | "secretKey": "NHtuFRcy8XnRuqbASsHTK65oxYMQ7sNvwTnA1oX0", 14 | "api": "s3v4" 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /infra/minio/config/config.json.old: -------------------------------------------------------------------------------- 1 | { 2 | "version": "8", 3 | "hosts": { 4 | "gcs": { 5 | "url": "https://storage.googleapis.com", 6 | "accessKey": "YOUR-ACCESS-KEY-HERE", 7 | "secretKey": "YOUR-SECRET-KEY-HERE", 8 | "api": "S3v2" 9 | }, 10 | "local": { 11 | "url": "http://localhost:9000", 12 | "accessKey": "", 13 | "secretKey": "", 14 | "api": "S3v4" 15 | }, 16 | "play": { 17 | "url": "https://play.minio.io:9000", 18 | "accessKey": "Q3AM3UQ867SPQQA43P2F", 19 | 
"secretKey": "zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG", 20 | "api": "S3v4" 21 | }, 22 | "s3": { 23 | "url": "https://s3.amazonaws.com", 24 | "accessKey": "YOUR-ACCESS-KEY-HERE", 25 | "secretKey": "YOUR-SECRET-KEY-HERE", 26 | "api": "S3v4" 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /infra/minio/config/share/downloads.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "1", 3 | "shares": {} 4 | } -------------------------------------------------------------------------------- /infra/minio/config/share/uploads.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "1", 3 | "shares": {} 4 | } -------------------------------------------------------------------------------- /infra/minio/data: -------------------------------------------------------------------------------- 1 | ../../data/ -------------------------------------------------------------------------------- /infra/minio/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | minio: 4 | image: minio/minio 5 | command: server /data 6 | volumes: 7 | - minio1:/data 8 | networks: [block] 9 | ports: 10 | - 9000:9000 11 | environment: 12 | MINIO_ACCESS_KEY: JX6SNZEW2CLYM66UDHT7 13 | MINIO_SECRET_KEY: NHtuFRcy8XnRuqbASsHTK65oxYMQ7sNvwTnA1oX0 14 | 15 | networks: 16 | block: 17 | 18 | volumes: 19 | minio1: 20 | 21 | -------------------------------------------------------------------------------- /infra/minio/mc: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eou pipefail 3 | 4 | readonly MC=(docker run -v "$(PWD)/data":/data -v "$(PWD)/config":/root/.mc --network="host" -ti minio/mc) 5 | 6 | if [[ $# -lt 1 ]]; then 7 | >&2 ${MC[@]} 8 | exit 1 9 | fi 10 | 11 | ${MC[@]} $@ 12 | 13 | -------------------------------------------------------------------------------- /infra/minio/mirror.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eou pipefail 3 | readonly BUCKET=local/data 4 | ./mc mb -p "${BUCKET}" 5 | ./mc mirror --remove data "${BUCKET}" 6 | 7 | -------------------------------------------------------------------------------- /infra/pyspark-jupyter/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM jupyter/pyspark-notebook 2 | LABEL maintainer="Luis Belloch " 3 | ENV JUPYTER_ENABLE_LAB=yes 4 | RUN git clone https://github.com/luisbelloch/data_processing_course.git && \ 5 | mv data_processing_course/data . && \ 6 | mv data_processing_course/spark ./ejemplos && \ 7 | rm -rf data_processing_course 8 | -------------------------------------------------------------------------------- /infra/pyspark-jupyter/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build tag push list run 2 | 3 | all: build tag 4 | 5 | build: 6 | docker build -t luisbelloch/pyspark-jupyter . 
7 | 8 | tag: 9 | docker tag luisbelloch/pyspark-jupyter luisbelloch/pyspark-jupyter:2021.10 10 | 11 | push: 12 | docker push luisbelloch/pyspark-jupyter:2021.10 13 | docker push luisbelloch/pyspark-jupyter 14 | 15 | run: 16 | docker run -p 8888:8888 -p 4040:4040 luisbelloch/pyspark-jupyter 17 | 18 | list: 19 | docker images luisbelloch/pyspark-jupyter 20 | 21 | -------------------------------------------------------------------------------- /infra/pyspark-jupyter/README.md: -------------------------------------------------------------------------------- 1 | # PySpark + Jupyter 2 | 3 | This folder contains a Docker image with PySpark ready to be run from a Jupyter Notebook, specifically customized for the course. 4 | 5 | For more general uses, we recommend using the official [Jupyter Docker Stacks](https://jupyter-docker-stacks.readthedocs.io/en/latest/index.html). This image itself is derived from the `jupyter/pyspark-notebook` image. 6 | 7 | To run it, simply do: 8 | 9 | ```bash 10 | docker run -p 8888:8888 -ti luisbelloch/pyspark-jupyter 11 | ``` 12 | 13 | Then navigate to [http://localhost:8888](http://localhost:8888). The access token will be displayed in the terminal. 14 | 15 | This image contains the `data` folder used in the examples. You can easily access it from the notebook: 16 | 17 | ```python 18 | rdd = sc.textFile('./data/compras_tiny.csv') 19 | rdd.take(2) 20 | ``` 21 | 22 | -------------------------------------------------------------------------------- /infra/single-node.md: -------------------------------------------------------------------------------- 1 | # Setting up single-node Spark 2 | 3 | This document describes how to download and set up Spark on your machine _without_ requiring a cluster setup. 4 | 5 | > :warning: This is only intended for demo and learning purposes, please refer to the [official deployment guide](https://spark.apache.org/docs/latest/cluster-overview.html) for further information on how to properly deploy a Spark cluster. 6 | 7 | In this repository you will also find other options to run Spark locally: 8 | 9 | - [Spark on Docker](docker/docker.md) 10 | - [Spark on Kubernetes](kubernetes/kubernetes.md) 11 | - [Spark on Vagrant](vagrant.md) 12 | - [Spark on Google Cloud Dataproc](dataproc.md) 13 | - [PySpark Jupyter Notebook](pyspark-jupyter/README.md) 14 | 15 | ## Requirements 16 | 17 | This setup assumes you have a Linux machine with Java 8 and Python 3 installed. Assuming a Debian _stretch_ distribution, you can install the required dependencies with the following commands: 18 | 19 | ```bash 20 | sudo apt-get update 21 | sudo apt-get install -y openjdk-8-jdk-headless python3-software-properties python3-numpy curl 22 | ``` 23 | 24 | ## Downloading and unpacking Spark 25 | 26 | We recommend installing Spark in `/opt/spark`. To download the Spark package, you can use the following commands: 27 | 28 | ```bash 29 | mkdir /opt/spark 30 | curl http://apache.rediris.es/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz | tar -xz -C /opt/spark --strip-components=1 31 | ``` 32 | 33 | To make the Spark binaries accessible, add `/opt/spark/bin` to the `PATH` by appending the following lines to your `.bashrc` file: 34 | 35 | ```bash 36 | export PYSPARK_PYTHON=python3 37 | export PATH=$PATH:/opt/spark/bin 38 | ``` 39 | 40 | After that, restart the current shell to make sure the `PATH` changes are applied.
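For instance, to apply the change in the current session and confirm that the binaries resolve, something along these lines should work (a minimal check, assuming the default bash profile; adjust the file name if your shell uses a different one):

```bash
source ~/.bashrc
which spark-submit      # should print /opt/spark/bin/spark-submit
spark-submit --version  # prints the Spark and Scala versions and exits
```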
41 | 42 | ## Testing the installation 43 | 44 | Simply run the following command; you should get a value like `res0: Long = 100` in the console: 45 | 46 | ```bash 47 | echo 'sc.parallelize(1 to 100).count()' | spark-shell 48 | ``` 49 | 50 | ## Reducing log level 51 | 52 | By default Spark is quite verbose and will output a lot of information in the terminal. Optionally, you can reduce the log level as follows: 53 | 54 | 1. Rename the file `/opt/spark/conf/log4j.properties.template` to `log4j.properties`, in the same directory. 55 | 2. Edit the file and set the `rootCategory` property to `ERROR` instead of `INFO`. 56 | 57 | Use this command to do both steps automatically: 58 | 59 | ```bash 60 | sed 's/rootCategory=INFO/rootCategory=ERROR/g' < /opt/spark/conf/log4j.properties.template > /opt/spark/conf/log4j.properties 61 | ``` 62 | 63 | ## TL;DR Using helper script 64 | 65 | All of this can be accomplished with a simple script included in the [classroom repository](https://github.com/luisbelloch/data_processing_course). Just clone the repository and run [`local_setup.sh`](../local_setup.sh): 66 | 67 | ```bash 68 | git clone https://github.com/luisbelloch/data_processing_course.git 69 | cd data_processing_course 70 | ./local_setup.sh 71 | ``` 72 | 73 | Spark will be installed in `data_processing_course/.spark`. Do not forget to add the `bin` folder to the `$PATH`. 74 | -------------------------------------------------------------------------------- /infra/vagrant.md: -------------------------------------------------------------------------------- 1 | # Using PySpark inside a Vagrant machine 2 | 3 | We have created a Vagrant setup using Ansible that will download and unpack Spark inside the generated machine. 4 | 5 | > :warning: This is only intended for demo and learning purposes, please refer to the [official deployment guide](https://spark.apache.org/docs/latest/cluster-overview.html) for further information on how to properly deploy a Spark cluster. 6 | 7 | To bootstrap the machine, do: 8 | 9 | ```bash 10 | git clone https://github.com/luisbelloch/data_processing_course.git 11 | cd data_processing_course 12 | vagrant up 13 | ``` 14 | 15 | Once the process completes, you can access the machine using: 16 | 17 | ```bash 18 | vagrant ssh 19 | ``` 20 | 21 | Remember that you can access the _host_ machine files using the `/vagrant` folder from inside the VM. 22 | 23 | ## Testing the installation 24 | 25 | Make sure the machine is up and running with `vagrant up`, and that you can access the virtual machine with `vagrant ssh`.
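If you are not sure whether the box is already running, `vagrant status` reports its current state (mentioned here only as a quick check; it is not part of the original setup steps):

```bash
vagrant status   # the default machine should be reported as "running"
```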
26 | 27 | To test the setup, run the following command; you should get a value like `res0: Long = 100` in the console: 28 | 29 | ```bash 30 | echo 'sc.parallelize(1 to 100).count()' | spark-shell 31 | ``` 32 | 33 | ## Running samples 34 | 35 | The samples we discussed in class are available in the folder `/vagrant/spark` inside the virtual machine: 36 | 37 | ```bash 38 | vagrant@buster:~$ cd /vagrant/spark/ 39 | vagrant@buster:/vagrant/spark$ spark-submit compras_con_mas_de_un_descuento.py 40 | ``` 41 | 42 | You may want to start the `pyspark` REPL as well: 43 | 44 | ```bash 45 | vagrant@buster:~$ cd /vagrant/spark/ 46 | vagrant@buster:/vagrant/spark$ pyspark 47 | ``` 48 | -------------------------------------------------------------------------------- /local_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | SPARK_URL=${SPARK_URL:-https://downloads.apache.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz} 4 | SPARK_PKG=${SPARK_URL##*/} 5 | SPARK_HOME=${SPARK_HOME:-$(pwd)/.spark} 6 | 7 | if [ -t 1 ]; then 8 | readonly colors=$(tput colors) 9 | if [ -n "$colors" ]; then 10 | readonly c_step="$(tput setaf 6)" 11 | readonly c_error="$(tput setaf 1)" 12 | readonly c_norm="$(tput sgr0)" 13 | fi 14 | fi 15 | 16 | stderr() { >&2 echo "$@"; } 17 | 18 | if [[ -d "${SPARK_HOME}" ]]; then 19 | stderr "${c_error}ERROR${c_norm}: Folder already exists '$SPARK_HOME'" 20 | stderr "Set SPARK_HOME to an empty folder before running this script or make sure there's no 'spark' folder in the current directory." 21 | exit 1 22 | fi 23 | 24 | stderr "${c_step}[0] Destination: ${SPARK_HOME}${c_norm}" 25 | stderr "${c_step}[1] Downloading and unpacking $SPARK_PKG${c_norm}" 26 | mkdir -p "${SPARK_HOME}" 27 | curl -s "${SPARK_URL}" | tar -xz -C "${SPARK_HOME}" --strip-components=1 28 | 29 | stderr "${c_step}[2] Reducing log level${c_norm}" 30 | cp "${SPARK_HOME}"/conf/log4j2.properties.template "${SPARK_HOME}"/conf/log4j2.properties 31 | sed -ibak 's/rootLogger.level = info/rootLogger.level = error/g' "${SPARK_HOME}/conf/log4j2.properties" 32 | 33 | stderr "${c_step}[3] Testing setup${c_norm}" 34 | echo 'sc.parallelize(1 to 100).count()' | "${SPARK_HOME}"/bin/spark-shell 35 | rm -rf derby.log metastore_db 36 | 37 | stderr 38 | stderr "${c_step}DONE! Local setup completed${c_norm}" 39 | stderr "Spark unpacked properly. 
You can now modify your path:" 40 | echo "export PATH=${SPARK_HOME// /\\ /}/bin:\$PATH" 41 | 42 | -------------------------------------------------------------------------------- /playbook.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | vars: 4 | spark_home: /opt/spark 5 | spark_pkg_name: spark-3.3.1-bin-hadoop3 6 | spark_pkg_url: https://downloads.apache.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz 7 | 8 | tasks: 9 | - name: Update all packages to the latest version 10 | become: true 11 | apt: 12 | upgrade: dist 13 | update_cache: yes 14 | 15 | - name: Basic dependencies 16 | become: true 17 | apt: 18 | name: ['software-properties-common', 'python3-software-properties', 'curl', 'git', 'vim'] 19 | state: latest 20 | update_cache: yes 21 | force_apt_get: true 22 | 23 | - name: Install AdoptOpenJDK 11 24 | become: true 25 | block: 26 | - name: Import keys 27 | apt_key: 28 | url: https://adoptopenjdk.jfrog.io/adoptopenjdk/api/gpg/key/public 29 | state: present 30 | - name: Add repository 31 | apt_repository: 32 | repo: deb https://adoptopenjdk.jfrog.io/adoptopenjdk/deb xenial main 33 | state: present 34 | - name: Install package 35 | apt: 36 | name: ['adoptopenjdk-11-hotspot', 'ca-certificates'] 37 | state: latest 38 | update_cache: yes 39 | force_apt_get: true 40 | 41 | - name: Clone classrom repository 42 | git: 43 | repo: 'https://github.com/luisbelloch/data_processing_course.git' 44 | dest: '{{ ansible_env.HOME }}/data_processing_course' 45 | 46 | - stat: 47 | path: '/opt/{{ spark_pkg_name }}' 48 | register: spark_dest 49 | 50 | - name: Install SPARK 51 | when: spark_dest.stat.islnk is not defined 52 | block: 53 | - name: Download Spark 54 | become: true 55 | unarchive: 56 | src: '{{ spark_pkg_url }}' 57 | dest: /opt 58 | remote_src: yes 59 | 60 | - name: Link to latest version 61 | become: true 62 | file: 63 | state: link 64 | src: '/opt/{{ spark_pkg_name }}' 65 | dest: '{{ spark_home }}' 66 | 67 | - name: Add Spark to PATH 68 | lineinfile: 69 | path: '{{ ansible_env.HOME }}/.bashrc' 70 | line: 'export PATH=$PATH:/opt/{{ spark_pkg_name }}/bin' 71 | 72 | - name: Set PySpark Python version to 3 73 | lineinfile: 74 | path: '{{ ansible_env.HOME }}/.bashrc' 75 | line: 'export PYSPARK_PYTHON=python3' 76 | 77 | # https://bugs.python.org/issue19846 78 | - name: Update locale 79 | become: true 80 | command: update-locale LC_ALL=en_US.UTF-8 81 | 82 | -------------------------------------------------------------------------------- /spark/.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | live/live.html 3 | live/live_mod.py 4 | _work.py 5 | -------------------------------------------------------------------------------- /spark/_template_rdd: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from helpers import item_fields, parse_item 3 | 4 | sc = SparkContext('local', 'playground') 5 | txt = sc.textFile('./data/compras_tiny.csv') 6 | no_header = txt.filter(lambda s: not s.startswith(item_fields[0])) 7 | parsed = no_header.map(lambda s: parse_item(s)).cache() 8 | 9 | print(parsed.take(1)) 10 | print(parsed.toDebugString().decode('utf-8')) 11 | 12 | -------------------------------------------------------------------------------- /spark/_template_sql: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | spark = 
SparkSession.builder.master('local').appName('SQL').getOrCreate() 4 | df = spark.read.load('./data/containers_tiny.parquet') 5 | 6 | -------------------------------------------------------------------------------- /spark/compras_con_mas_de_un_descuento.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from helpers import parse_item, item_fields 3 | 4 | sc = SparkContext('local', 'compras') 5 | txt = sc.textFile('data/compras_tiny.csv') 6 | no_header = txt.filter(lambda s: not s.startswith(item_fields[0])) 7 | parsed = no_header.map(lambda s: parse_item(s)).cache() 8 | 9 | # Brief talk on why color in terminal should not be abused, logs get destroyed 10 | 11 | # Primera aproximación 12 | mas_de_un_cupon = parsed \ 13 | .map(lambda i: (i.tx_id, i.coupon_code)) \ 14 | .filter(lambda t: t[1]) \ 15 | .map(lambda t: (t[0], 1)) \ 16 | .reduceByKey(lambda a, b: a + b) \ 17 | .filter(lambda t: t[1] > 1) 18 | print("\033[35mPlan de ejecución (v1):\033[0m") 19 | print(mas_de_un_cupon.toDebugString().decode('utf-8')) 20 | print("\033[35mCon más de un descuento (v1):\033[0m", mas_de_un_cupon.count()) 21 | 22 | # Segunda aproximación, código equivalente 23 | mas_de_un_cupon2 = parsed \ 24 | .map(lambda i: (i.tx_id, 1 if i.coupon_code else 0)) \ 25 | .filter(lambda t: t[1] == 1) \ 26 | .reduceByKey(lambda a, b: a + b) \ 27 | .filter(lambda t: t[1] > 1) 28 | print("\n\033[36mPlan de ejecución (v2):\033[0m") 29 | print(mas_de_un_cupon2.toDebugString().decode('utf-8')) 30 | print("\033[36mCon más de un descuento (v2):\033[0m", mas_de_un_cupon2.count()) 31 | 32 | total = parsed.count() 33 | p_descuentos = mas_de_un_cupon2.count() / float(total) 34 | print("\n\x1b[38;5;214mPorcentaje:\x1b[0m", p_descuentos, "\n") 35 | 36 | -------------------------------------------------------------------------------- /spark/compras_conversion_a_dolares.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from helpers import get_usd_exchange_rates, item_fields, parse_item 3 | 4 | sc = SparkContext('local', 'compras') 5 | txt = sc.textFile('data/compras_tiny.csv') 6 | no_header = txt.filter(lambda s: not s.startswith(item_fields[0])) 7 | parsed = no_header.map(lambda s: parse_item(s)).cache() 8 | 9 | rates = get_usd_exchange_rates() 10 | 11 | # El archivo puede tener múltiples problemas, incluso con algo 12 | # sencillo como una simple conversion a dólares: 13 | # - el precio ya está en dólares 14 | # - item_price no viene como float 15 | # - no existe tasa de cambio para ese item 16 | # - ¿Cómo descartamos la linea? -> None 17 | # - ¿Cómo recogemos las filas que han fallado? ¿debemos? 18 | def convert_to_usd(item): 19 | if (item.currency_code == 'USD'): 20 | return item 21 | if (not item.currency_code in rates): 22 | return None # error? 
23 | new_price = rates[item.currency_code] * float(item.item_price) 24 | new_item = item._replace(currency_code='USD', item_price = new_price) 25 | return new_item 26 | 27 | in_usd = parsed.map(convert_to_usd) 28 | print(in_usd.take(2)) 29 | 30 | -------------------------------------------------------------------------------- /spark/compras_importe_total_agrupado_por_tx_id.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from helpers import item_fields, parse_item 3 | 4 | sc = SparkContext('local', 'compras') 5 | txt = sc.textFile('data/compras_tiny.csv') 6 | no_header = txt.filter(lambda s: not s.startswith(item_fields[0])) 7 | parsed = no_header.map(lambda s: parse_item(s)).cache() 8 | 9 | importes = parsed \ 10 | .map(lambda i: (i.tx_id, float(i.item_price))) \ 11 | .reduceByKey(lambda elemento, acumulado: elemento + acumulado) 12 | 13 | print(importes.take(10)) 14 | 15 | -------------------------------------------------------------------------------- /spark/compras_sql.py: -------------------------------------------------------------------------------- 1 | from os import walk 2 | 3 | from pyspark.sql import SparkSession 4 | 5 | spark = SparkSession.builder.master("local").appName("SQL").getOrCreate() 6 | 7 | df = spark.read.option("delimiter", "|").option("header", "true").csv('./data/compras_tiny.csv') 8 | df.printSchema() 9 | df.show() 10 | 11 | df.createOrReplaceTempView("compras") 12 | spark.sql("SELECT tx_id, SUM(item_price) as tx_total FROM compras GROUP BY tx_id").show() 13 | -------------------------------------------------------------------------------- /spark/compras_top_ten_countries.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark import SparkContext 3 | from helpers import dataUrl, item_fields, parse_item 4 | 5 | sc = SparkContext('local', 'compras') 6 | txt = sc.textFile(dataUrl('compras_tiny.csv')) 7 | no_header = txt.filter(lambda s: not s.startswith(item_fields[0])) 8 | parsed = no_header.map(lambda s: parse_item(s)).cache() 9 | 10 | countries_rdd = sc \ 11 | .textFile(dataUrl('country_codes.csv')) \ 12 | .map(lambda c: tuple(reversed(c.split(',')))) 13 | 14 | join_rdd = parsed \ 15 | .filter(lambda i: i.currency_code == 'USD') \ 16 | .map(lambda i: (i.country, float(i.item_price))) \ 17 | .reduceByKey(lambda a, b: a + b) \ 18 | .leftOuterJoin(countries_rdd) \ 19 | .sortBy(lambda i: i[1][0], ascending=False) 20 | 21 | print(join_rdd.take(10)) 22 | 23 | # print map(lambda i: (i[0], i[1][1], i[1][0]), join_rdd.take(10)) 24 | # join_rdd.saveAsTextFile(dataUrl('out/top10countries'), 'org.apache.hadoop.io.compress.GzipCodec') 25 | 26 | -------------------------------------------------------------------------------- /spark/container.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | spark = SparkSession.builder.master("local").appName("container").getOrCreate() 4 | 5 | df = spark.read.load('data/containers_tiny.parquet') 6 | df.printSchema() 7 | 8 | # Using API 9 | df.select("ship_imo", "ship_name", "country").filter(df['country'] == 'DK').show() 10 | 11 | # Register table alias to allow SQL use 12 | df.createOrReplaceTempView("container") 13 | spark.sql("SELECT ship_imo, ship_name FROM container WHERE country = 'DK'").show() 14 | 15 | # ship_imo, num of containers, total ship weight 16 | total_weight_rdd = spark.sql("SELECT ship_imo, 
count(container_id) number, sum(net_weight) total_weight FROM container GROUP BY ship_imo") 17 | total_weight_rdd.printSchema() 18 | total_weight_rdd.show() 19 | # print total_weight_rdd.map(lambda r: r['number']).collect() 20 | 21 | # UDFs 22 | spark.udf.register('en_toneladas', lambda c: float(c) / 1000.0) 23 | spark.sql("SELECT en_toneladas(net_weight) toneladas, net_weight FROM container WHERE container_id = 'FMBV1684747'").show() 24 | 25 | # JOINs: Extract description of container codes 26 | codes = spark.read.json('data/iso-container-codes.json') 27 | codes.createOrReplaceTempView('codes') 28 | codes.printSchema() 29 | codes.show() 30 | 31 | w_desc = spark.sql("SELECT c.container_id, s.code, s.description FROM container c JOIN codes s on c.container_type = s.code") 32 | w_desc.show() 33 | print(w_desc.groupBy("code").count().take(3)) 34 | 35 | -------------------------------------------------------------------------------- /spark/container_caching.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from pyspark.sql import SQLContext, Row 3 | 4 | sc = SparkContext("local", "barcos") 5 | sq = SQLContext(sc) 6 | 7 | df = sq.read.load("data/containers_tiny.parquet") 8 | df.registerTempTable("container") 9 | sq.cacheTable("container") 10 | 11 | df.select("ship_imo", "ship_name", "country").filter(df['country'] == 'DK').show() 12 | sq.sql("SELECT ship_imo, ship_name FROM container WHERE country = 'DK'").show() 13 | sq.sql("SELECT ship_imo, count(container_id) number, sum(net_weight) total_weight FROM container GROUP BY ship_imo").show() 14 | 15 | input("Press Enter to continue... http://localhost:4040/storage") 16 | 17 | -------------------------------------------------------------------------------- /spark/container_convertir_a_parquet.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from pyspark.sql import SQLContext, Row 3 | 4 | from helpers import container_fields, parse_container 5 | 6 | sc = SparkContext('local', 'barcos') 7 | sq = SQLContext(sc) 8 | 9 | csv_source = sc \ 10 | .textFile('data/containers_tiny.csv') \ 11 | .filter(lambda s: not s.startswith(container_fields[0])) \ 12 | .map(parse_container) \ 13 | .map(lambda c: Row(**dict(c._asdict()))) 14 | 15 | print(csv_source.count()) 16 | 17 | # Python 2.7.6 to 3.5 18 | # http://stackoverflow.com/a/26180604 19 | # .map(lambda c: Row(**dict(c.__dict__))) 20 | 21 | # Convert RDD to a DataFrame (in scala, DataSet[Row]) 22 | # It will preserve types from the RDD ones. Note it 23 | # won't do anything fancy, since the namedtuple types 24 | # are just strings. 
25 | containerSchema = sq.createDataFrame(csv_source) 26 | containerSchema.createOrReplaceTempView('container') 27 | containerSchema.printSchema() 28 | 29 | denmark_only = sq.sql("SELECT ship_name FROM container WHERE country = 'DK'") 30 | print(denmark_only.first()) 31 | 32 | todo_df = sq.sql("SELECT * FROM container") 33 | todo_df.printSchema() 34 | 35 | outpath = 'data/containers_tiny.parquet' 36 | todo_df.write.mode('overwrite').parquet(outpath) 37 | print("\nDatos guardados en", outpath) 38 | 39 | -------------------------------------------------------------------------------- /spark/container_databricks_csv.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | spark = SparkSession.builder.master("local").appName("SQL").getOrCreate() 4 | 5 | # https://github.com/databricks/spark-csv#python-api 6 | df = spark.read \ 7 | .format("com.databricks.spark.csv") \ 8 | .options(header='true', inferschema='true', delimiter=";") \ 9 | .load('data/containers_tiny.csv') 10 | 11 | df.printSchema() 12 | df.show() 13 | 14 | df.select("container_id", "container_type", "gross_weight") \ 15 | .filter(df["country"] == "DK") \ 16 | .show() 17 | 18 | df.groupBy("country").count().show() 19 | 20 | df.createOrReplaceTempView("container") 21 | spark.sql("SELECT ship_name FROM container WHERE country = 'DK'").show() 22 | 23 | -------------------------------------------------------------------------------- /spark/container_partition.py: -------------------------------------------------------------------------------- 1 | from os import walk 2 | 3 | from pyspark.sql import SparkSession 4 | 5 | spark = SparkSession.builder.master("local").appName("SQL").getOrCreate() 6 | 7 | def segment(df, field, value, num = 5): 8 | df.filter(df[field] == value).limit(num) \ 9 | .write.mode('overwrite') \ 10 | .parquet('data/containers_partitioned/{}={}'.format(field, value)) 11 | 12 | def main(): 13 | df = spark.read.load('data/containers_tiny.parquet') 14 | segment(df, "country", "DK") 15 | segment(df, "country", "SB") 16 | 17 | for path, dirs, files in walk('data/containers_partitioned/'): 18 | print("\x1b[38;5;214m"+path+"\033[0m") 19 | for f in files: 20 | print(" |-- ", f) 21 | 22 | partitioned = spark.read.load("data/containers_partitioned") 23 | partitioned.select("container_id", "country").show() 24 | 25 | if __name__ == '__main__': 26 | main() 27 | 28 | -------------------------------------------------------------------------------- /spark/container_rdd_to_dataset.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.types import * 3 | 4 | spark = SparkSession.builder.master("local").appName("SQL").getOrCreate() 5 | 6 | csv_source = spark.sparkContext \ 7 | .textFile('data/containers_tiny.csv') \ 8 | .filter(lambda s: not s.startswith("ship_imo")) \ 9 | .map(lambda i: i.split(";")) \ 10 | .map(lambda i: (i[4], i[5], float(i[7]))) \ 11 | .cache() 12 | 13 | print(csv_source.take(1)) 14 | 15 | # Set schema 16 | container_id_field = StructField("container_id", StringType(), True) 17 | container_type_field = StructField("container_type", StringType(), True) 18 | net_weight_field = StructField("net_weight", FloatType(), True) 19 | schemaDef = StructType([container_id_field, container_type_field, net_weight_field]) 20 | 21 | schema = spark.createDataFrame(csv_source, schemaDef) 22 | schema.printSchema() 23 | 24 | 
-------------------------------------------------------------------------------- /spark/data: -------------------------------------------------------------------------------- 1 | ../data -------------------------------------------------------------------------------- /spark/enable_history.properties: -------------------------------------------------------------------------------- 1 | # spark-submit --verbose --master spark://MASTER:7077 --properties-file enable_history.properties --py-files helpers.py SCRIPT 2 | spark.eventLog.enabled=true 3 | spark.history.fs.logDirectory=/tmp/spark-events 4 | 5 | -------------------------------------------------------------------------------- /spark/friends.py: -------------------------------------------------------------------------------- 1 | # RUN: ./graphframes.sh ship_routes.py 2 | 3 | from pyspark import SparkContext 4 | from pyspark.sql import SQLContext 5 | 6 | from graphframes import * 7 | from graphframes.examples import Graphs 8 | 9 | sc = SparkContext('local', 'friends') 10 | sq = SQLContext(sc) 11 | friends = Graphs(sq).friends() 12 | 13 | friends.vertices.show() 14 | friends.edges.show() 15 | 16 | over30 = friends.vertices.filter("age > 30") 17 | only_friends = friends.edges.filter("relationship = 'friend'") 18 | friends_over_30 = GraphFrame(over30, only_friends) 19 | friends_over_30.triplets.show() 20 | 21 | -------------------------------------------------------------------------------- /spark/graphframes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | readonly PKG="graphframes:graphframes:0.8.2-spark3.2-s_2.12" 5 | if [ $# -eq 0 ]; then 6 | pyspark --packages $PKG 7 | else 8 | spark-submit --packages $PKG "$*" 9 | fi 10 | 11 | exit $? 
12 | 13 | -------------------------------------------------------------------------------- /spark/hello1.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | 3 | sc = SparkContext('local', 'hello') 4 | rdd = sc.textFile('./data/compras_tiny.csv') 5 | 6 | print(rdd.count()) 7 | 8 | # Also spark-submit hello1.py --conf spark.logLineage=true 9 | print(rdd.toDebugString().decode('utf-8')) 10 | 11 | -------------------------------------------------------------------------------- /spark/hello2.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | 3 | sc = SparkContext('local', 'hello') 4 | rdd = sc.textFile('./data/compras_tiny.csv') 5 | 6 | solo_en_euros = rdd.filter(lambda fila: 'EUR' in fila) 7 | 8 | print(solo_en_euros.toDebugString().decode('utf-8')) 9 | print(solo_en_euros.count()) 10 | print(solo_en_euros.take(10)) 11 | 12 | -------------------------------------------------------------------------------- /spark/helpers.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | 5 | from collections import namedtuple 6 | from datetime import datetime 7 | 8 | item_fields = ['tx_id', 'tx_time', 'buyer', 'currency_code', 'payment_type', 'credit_card_number', 'country', 'department', 'product', 'item_price', 'coupon_code', 'was_returned'] 9 | Item = namedtuple('Item', item_fields) 10 | 11 | def parse_item(raw_string): 12 | f = raw_string.split('|') 13 | f += [None] * (len(item_fields) - len(f)) 14 | return Item(*f) 15 | 16 | # Thing = namedtuple('Item', ['foo', 'bar']) 17 | # some = Thing(foo=42, bar='hello') 18 | # some.foo 19 | # item = parse_item(["one", "two"]) 20 | # new_item = item._replace(tx_id=1, buyer=5) 21 | 22 | # API http://fixer.io/ 23 | def get_usd_exchange_rates(): 24 | with open('./data/exchange_rates_usd.json') as f: 25 | data = json.load(f) 26 | return data['rates'] 27 | 28 | container_fields = ['ship_imo', 'ship_name', 'country', 'departure', 'container_id', 'container_type', 'container_group', 'net_weight', 'gross_weight', 'owner', 'declared', 'contact', 'customs_ok'] 29 | Container = namedtuple('Container', container_fields) 30 | 31 | def parse_container(raw_string): 32 | f = raw_string.split(';') 33 | f += [None] * (len(container_fields) - len(f)) 34 | return Container(*f) 35 | 36 | stock_fields = ['simbolo', 'numero', 'precio_compra', 'ultimo_precio', 'returns'] 37 | Stock = namedtuple('Stock', stock_fields) 38 | def parse_stock(raw_string): 39 | f = raw_string.split(',') 40 | return Stock(simbolo=f[0], numero=None, precio_compra=None, ultimo_precio=float(f[1]), returns=0.0) 41 | 42 | def setup_checkpoint(streamingContext): 43 | checkpoint = './checkpoint' 44 | if (os.path.exists(checkpoint)): 45 | shutil.rmtree(checkpoint) 46 | os.mkdir(checkpoint) 47 | streamingContext.checkpoint(checkpoint) 48 | 49 | def isoDate(raw_string): 50 | try: 51 | return datetime.strptime(raw_string, "%Y-%m-%dT%H:%M:%SZ") 52 | except Exception: 53 | return None 54 | 55 | def dataUrl(fileName): 56 | base = "./data" 57 | # base = "gs://bigdataupv_data" 58 | return os.path.join(base, fileName) 59 | 60 | -------------------------------------------------------------------------------- /spark/hft.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from pyspark import SparkContext 4 | from pyspark.streaming import 
StreamingContext 5 | 6 | from helpers import * 7 | 8 | sc = SparkContext("local[2]", "NetworkWordCount") 9 | st = StreamingContext(sc, 1) 10 | setup_checkpoint(st) 11 | 12 | portfolio = { u'MSFT': Stock('MSFT', 1, 150.06, None, 0.0), u'APPL': Stock('APPL', 4, 70.23, None, 0.0), u'GOOG': Stock('GOOG', 2, 104.55, None, 0.0) } 13 | 14 | def actualizar_portfolio(stocks): 15 | actualizaciones = stocks.filter(lambda s: s.simbolo in portfolio).collect() 16 | al_menos_una_actualizacion = False 17 | for a in actualizaciones: 18 | al_menos_una_actualizacion = True 19 | actual = portfolio[a.simbolo] 20 | nuevo = actual._replace( \ 21 | ultimo_precio = a.ultimo_precio, \ 22 | returns = (a.ultimo_precio - actual.precio_compra) / actual.precio_compra) 23 | portfolio[a.simbolo] = nuevo 24 | if al_menos_una_actualizacion: 25 | print(list(map(lambda s: list(s), portfolio.values()))) 26 | 27 | stocks = st.socketTextStream("localhost", 9999) \ 28 | .map(parse_stock) \ 29 | .foreachRDD(actualizar_portfolio) 30 | 31 | # stocks.pprint() 32 | # stocks.reduceByKey(lambda a,b: a + b) 33 | 34 | st.start() 35 | st.awaitTermination() 36 | 37 | -------------------------------------------------------------------------------- /spark/live.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "hello world!\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "print(\"hello world!\")" 18 | ] 19 | } 20 | ], 21 | "metadata": { 22 | "kernelspec": { 23 | "display_name": "Python 3", 24 | "language": "python", 25 | "name": "python3" 26 | }, 27 | "language_info": { 28 | "codemirror_mode": { 29 | "name": "ipython", 30 | "version": 3 31 | }, 32 | "file_extension": ".py", 33 | "mimetype": "text/x-python", 34 | "name": "python", 35 | "nbconvert_exporter": "python", 36 | "pygments_lexer": "ipython3", 37 | "version": "3.7.2" 38 | } 39 | }, 40 | "nbformat": 4, 41 | "nbformat_minor": 2 42 | } 43 | -------------------------------------------------------------------------------- /spark/live.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | spark = SparkSession.builder.master("local").appName("SQL").getOrCreate() 4 | df = spark.read.load('data/containers_tiny.parquet') 5 | df.select("ship_imo", "container_id", "net_weight").show() 6 | 7 | -------------------------------------------------------------------------------- /spark/live/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: watch 2 | watch: 3 | ./live.sh 4 | 5 | .PHONY: auth 6 | auth: 7 | docker run -ti --name gcloud-config google/cloud-sdk gcloud auth login 8 | 9 | .PHONY: set-project 10 | set-project: 11 | ./gcloud config set project bigdataupv2021 12 | 13 | -------------------------------------------------------------------------------- /spark/live/gcloud: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | docker run --rm -ti --volumes-from gcloud-config google/cloud-sdk gcloud "$@" 3 | 4 | -------------------------------------------------------------------------------- /spark/live/gsutil: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | docker run --rm --volumes-from gcloud-config -w /tmp/current -v $(pwd):/tmp/current google/cloud-sdk 
gsutil "$@" 3 | 4 | -------------------------------------------------------------------------------- /spark/live/live.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | fswatch ../live.py | while read -r fpath; do \ 5 | echo -e "\033[0;36mRELOAD\033[0m $fpath $(date +"%H%M%S")" 6 | echo -e "# $(date +"%H:%M:%S")\n" | cat - ../live.py > live_mod.py 7 | sed -e '/-python">/r./live_mod.py' live_template.html > live.html 8 | ./gsutil -h "Cache-Control:no-cache,max-age=0" \ 9 | cp /tmp/current/live.html gs://bigdata.luisbelloch.es/en_directo.html 10 | 11 | # echo -e "# $(date +"%H:%M:%S")\n" | cat - live.py | pygmentize -f html -O full,linenos=1 -o live.html 12 | # scp live.html root@live.luisbelloch.es:/var/www/html/index.html 13 | done 14 | 15 | -------------------------------------------------------------------------------- /spark/live/live_jupyter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eou pipefail 3 | 4 | fswatch ../live.ipynb | while read -r fpath; do \ 5 | echo -e "\033[0;36mRELOAD\033[0m $fpath $(date +"%H%m%S")" 6 | jupyter nbconvert ../live.ipynb --to html --output-dir="$(pwd)" 7 | gsutil -h "Cache-Control:no-cache,max-age=0" \ 8 | cp live.html gs://bigdata.luisbelloch.es/en_directo.html 9 | # scp live/live.html root@live.luisbelloch.es:/var/www/html/index.html 10 | done 11 | 12 | -------------------------------------------------------------------------------- /spark/live/live_template.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | #bidataupv - live 9 | 10 | 11 | 24 | 25 | 26 |
27 |

28 |       
29 |
30 | 31 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /spark/peliculas_0_ml.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.ml.recommendation import ALS 3 | 4 | # ALTERNATIVE 5 | # from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics 6 | # from pyspark.mllib.recommendation import ALS, Rating 7 | 8 | spark = SparkSession.builder.master("local").appName("SQL").getOrCreate() 9 | 10 | print("\033[36mInitial data\033[0m") 11 | columns = ["user", "item", "rating"] 12 | data = [(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 1, 1.0), (2, 2, 5.0)] 13 | df = spark.createDataFrame(data, columns) 14 | df.show() 15 | 16 | print("\033[36mTraining model...\033[0m") 17 | als = ALS() 18 | model = als.fit(df) 19 | 20 | output_model_path = "data/peliculas0_trained_model" 21 | print("\033[36mSaving model to '{}'...\033[0m".format(output_model_path)) 22 | model.write().overwrite().save(output_model_path) 23 | 24 | print("\033[36mTesting some user/item pairs...:\033[0m") 25 | test = spark.createDataFrame([(0, 2), (1, 0), (2, 0), (3, 0)], ["user", "item"]) 26 | model.transform(test).show() 27 | 28 | -------------------------------------------------------------------------------- /spark/peliculas_1_mllib.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics 3 | from pyspark.mllib.recommendation import ALS, Rating 4 | 5 | sc = SparkContext() 6 | 7 | # Generar recomendaciones para todos los usuarios 8 | # - Clasificación: suma(votos) / numero_votos 9 | # - Clasificación con tiempo: inventar, algoritmo de Reddit p.e. 10 | # - Descartar votos duplicados 11 | # - Report para la web, necesario orden por pelicula: usuario_id, pelicula_id, titulo, rating_medio 12 | # - Guardar en parquet 13 | 14 | peliculas = sc.textFile("data/peliculas.csv") \ 15 | .filter(lambda l: not l.startswith(u'#') and not l.startswith(u'Entry|')) \ 16 | .map(lambda l: l.split("|")) 17 | # print peliculas.take(2) 18 | 19 | def parseLine(line): 20 | fields = line.split("|") 21 | return Rating(int(fields[0]), int(fields[1]), float(fields[2]) - 2.5) 22 | 23 | ratings = sc.textFile("data/ratings.csv") \ 24 | .filter(lambda l: not l.startswith('pelicula_id')) \ 25 | .map(lambda l: l.split(",")) \ 26 | .map(lambda l: Rating(int(l[1]), int(l[0]), float(l[2]))) 27 | # print ratings.take(2) 28 | 29 | media_ratings = ratings \ 30 | .map(lambda r: (r.product, (r.rating, 1))) \ 31 | .reduceByKey(lambda a,b: (a[0]+b[0], a[1]+b[1])) \ 32 | .map(lambda p: (p[0], p[1][0] / float(p[1][1]))) 33 | # print media_ratings.collect() 34 | 35 | # Entrenar modelo 36 | model = ALS.train(ratings, 1) 37 | 38 | # generar posibles pares de usuario / pelicula 39 | # VER bash en shell_trans.sh 40 | ids_pelicula = sc.textFile('data/pelicula_ids.csv') 41 | ids_usuario = sc.textFile('data/pelicula_usuarios.csv') 42 | publico_objetivo = ids_usuario.cartesian(ids_pelicula) #ids_pelicula.cartesian(ids_usuario) 43 | # print posibles_pares.take(10) 44 | 45 | # Crear predicciones 46 | predicciones = model.predictAll(publico_objetivo) 47 | # print predicciones.take(4) 48 | 49 | # Convertir a DF para manipular 50 | # POR QUÉ no hemos de hacer el sort/join aquí, mejor en una BBDD relacional 51 | # Cálculo número de filas + espacio 52 | # Cómo se realizaría la inserción? 
53 | from pyspark.sql import SQLContext, Row 54 | sq = SQLContext(sc) 55 | df = sq.createDataFrame(predicciones) 56 | df.registerTempTable('predicciones') 57 | df.show() 58 | 59 | # ¿Tenemos un modelo correcto? 60 | # R-Squared 0, indicates that the model explains none of the variability of the response data around its mean. 61 | # R-Squared 1, indicates that the model explains all the variability of the response data around its mean. 62 | ratingsTuple = ratings.map(lambda r: ((r.user, r.product), r.rating)) 63 | scoreAndLabels = predicciones \ 64 | .map(lambda r: ((r.user, r.product), r.rating)) \ 65 | .join(ratingsTuple) \ 66 | .map(lambda tup: tup[1]) 67 | 68 | metrics = RegressionMetrics(scoreAndLabels) 69 | print("RMSE = %s" % metrics.rootMeanSquaredError) 70 | print("R-squared = %s" % metrics.r2) 71 | 72 | -------------------------------------------------------------------------------- /spark/peliculas_calculo_de_medias_por_key.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from helpers import * 3 | 4 | sc = SparkContext('local', 'compras') 5 | 6 | ratings = sc.textFile("data/ratings.csv") \ 7 | .filter(lambda l: not l.startswith('pelicula_id')) \ 8 | .map(lambda l: l.split(",")) 9 | 10 | media_ratings = ratings \ 11 | .map(lambda r: (r[0], (float(r[2]), 1))) \ 12 | .reduceByKey(lambda a,b: (a[0]+b[0], a[1]+b[1])) \ 13 | .map(lambda p: (int(p[0]), p[1][0] / float(p[1][1]))) 14 | 15 | print(media_ratings.take(5)) 16 | 17 | -------------------------------------------------------------------------------- /spark/reload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | fswatch live.py | while read -r fpath; do \ 5 | clear 6 | echo -e "\033[0;36mRELOAD\033[0m $fpath $(date +"%H%M%S")" 7 | spark-submit live.py 8 | done 9 | 10 | -------------------------------------------------------------------------------- /spark/run_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | readonly c_step="$(tput setaf 6)" 4 | readonly c_norm="$(tput sgr0)" 5 | readonly excluded=(helpers.py hft.py container_caching.py ship_routes.py) 6 | 7 | for file in *.py; do 8 | if [[ ! 
" ${excluded[*]} " =~ " ${file} " ]]; then 9 | echo -e "${c_step}Running${c_norm} $file" 10 | spark-submit $file 2>/dev/null 11 | fi 12 | done 13 | 14 | ./graphframes.sh ship_routes.py 15 | -------------------------------------------------------------------------------- /spark/ship_routes.py: -------------------------------------------------------------------------------- 1 | # RUN: ./graphframes.sh ship_routes.py 2 | 3 | from pyspark import SparkContext 4 | from pyspark.sql import SQLContext 5 | from pyspark.sql.types import * 6 | from pyspark.sql.functions import lead, col, explode 7 | from pyspark.sql.window import Window 8 | 9 | from graphframes import * 10 | from graphframes.examples import Graphs 11 | 12 | sc = SparkContext('local', 'barcos') 13 | sq = SQLContext(sc) 14 | 15 | csv = sc.textFile("data/ship_routes.csv") \ 16 | .map(lambda c: c.split("|")) \ 17 | .map(lambda c: (c[0], c[1], c[4])) 18 | sequential_route = sq.createDataFrame(csv, ["order", "ship_imo", "country_code"]) 19 | sequential_route.orderBy("ship_imo", "order").show() 20 | 21 | w = Window().partitionBy("ship_imo").orderBy(col("order")) 22 | routes = sequential_route.select("*", lead("country_code").over(w).alias("dst")).na.drop() 23 | routes.orderBy("ship_imo", "order").show() 24 | 25 | edges = routes.select(col("country_code").alias("src"), col("dst"), col("ship_imo")) 26 | # edges.show(100) 27 | 28 | countries_rdd = sc \ 29 | .textFile('./data/country_codes.csv') \ 30 | .map(lambda c: tuple(reversed(c.split(',')))) 31 | vertices = sq.createDataFrame(countries_rdd, ["id", "country_label"]) 32 | # vertices.show(100) 33 | 34 | g = GraphFrame(vertices, edges) 35 | results = g.shortestPaths(landmarks=["AT", "GS"]) \ 36 | .select("id", "country_label", explode("distances")) 37 | results.show(200) 38 | 39 | -------------------------------------------------------------------------------- /spark/spark: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | # Alternative: SPARK_DOCKER_IMAGE=apache/spark-py ./spark-submit script.py 5 | readonly SPARK_DOCKER_IMAGE=${SPARK_DOCKER_IMAGE:-luisbelloch/spark} 6 | readonly SPARK_SUBMIT=/opt/spark/bin/spark-submit 7 | readonly DATA_DIR=/tmp/bigdataupv/data 8 | readonly WORK_DIR=/tmp/bigdataupv/scripts 9 | 10 | if [[ $# -lt 1 ]]; then 11 | >&2 echo "USAGE: ./spark [SCRIPT_NAME]" 12 | >&2 echo "Sample: ./spark hello1.py" 13 | exit 1 14 | fi 15 | 16 | abs_path() { 17 | echo "$(cd "$(dirname "$1")" && pwd)/$(basename "$1")" 18 | } 19 | 20 | get_data_volume() { 21 | # Probe for source folder first, if it doesn't 22 | # exists then it'll try with current folder 23 | if [[ -d "${0}" ]]; then 24 | echo "-v $(abs_path $0):"${DATA_DIR}"" 25 | elif [[ -d "./data" ]]; then 26 | echo "-v $(abs_path "./data"):"${DATA_DIR}"" 27 | elif [[ -d "../data" ]]; then 28 | echo "-v $(abs_path "../data"):"${DATA_DIR}"" 29 | else 30 | >&2 echo "WARN: ./data directoy not found!" 
31 | echo "" 32 | fi 33 | } 34 | 35 | readonly source_folder="$(cd "$(dirname "$1")" && pwd)" 36 | readonly data_volume=$(get_data_volume "${source_folder}") 37 | 38 | docker run --rm -ti \ 39 | -w "${WORK_DIR}" \ 40 | -v "${source_folder}":"${WORK_DIR}" \ 41 | $data_volume \ 42 | ${SPARK_DOCKER_IMAGE} "${SPARK_SUBMIT}" "${WORK_DIR}"/$1 ${@:2} 43 | 44 | -------------------------------------------------------------------------------- /spark/stock_server.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'socket' 4 | require 'csv' 5 | 6 | @simbolos = ["MSFT", "IBM", "GOOG", "YHOO", "APPL", "SIFI", "NWBO", "CRTO", "LAMR", "EYES", "ONTX", "FWP", "XXIA", "ASBB", "FTHI", "LSCC", "MRTN", "MBII", "EARS", "FTLB", "PBSK", "PRPH", "VRTU", "QUIK", "RYAAY", "WPRT", "HNNA", "CBSHP", "ADHD", "SGEN", "EZCH", "ADXS", "SNMX", "AXAS", "ASEI", "PME", "AGII", "HABT", "SCAI", "WMAR", "BKSC", "ORBK", "FTSL", "JRVR", "PMTS", "PRTO", "BLVDU", "XCRA", "LIND", "DTLK", "CERS", "TSC", "SONA", "CFGE", "CMFN", "PHIIK", "ASCMA", "HCAP", "HBANP", "WOWO", "KWEB", "CRDS", "EMIF", "MAUI", "LIVE", "ADRD", "AMAT", "EXLS", "FEIC", "QUNR", "LABL", "CDOR", "FRSH", "MTSI", "PCYO", "GOODN", "PRGX", "VXUS", "PCRX", "MAGS", "ALOG", "CYTR", "WHLR", "XBKS", "JRJC", "MDM", "HFBC", "CHY", "WSBF", "WOOD", "GULF", "FNWB", "GMLP", "NATR", "RDI", "RPRX", "EMMS", "ZFGN", "ADI", "BBH"] 7 | # @simbolos = CSV.read('data/nasdaq.csv', {:col_sep => "|"}).drop(2).map { |s| s[0] } 8 | @emitted = {} 9 | 10 | def generar_stock 11 | name = @simbolos.sample 12 | price = 20 + Random.rand(200.0) 13 | if @emitted.has_key? name 14 | current = @emitted[name] 15 | price = current + (current * ([1,-1].sample * Random.rand(0.01))) 16 | end 17 | @emitted[name] = price 18 | "#{name},#{price.round(2)}" 19 | end 20 | 21 | def envio_continuo(cliente) 22 | loop do 23 | [1,3,5,7].sample.times do 24 | stock = generar_stock 25 | puts stock 26 | cliente.puts stock 27 | end 28 | sleep Random.rand(2.0) 29 | end 30 | end 31 | 32 | def main 33 | server = TCPServer.new 9999 34 | puts "Escuchando en tcp://localhost:9999..." 35 | 36 | loop do 37 | Thread.start(server.accept) do |cliente| 38 | envio_continuo cliente 39 | end 40 | end 41 | end 42 | 43 | if __FILE__ == $0 44 | main() 45 | 50.times { |n| puts generar_stock } 46 | end 47 | --------------------------------------------------------------------------------