├── .bashrc ├── .dockerignore ├── .gitignore ├── Dockerfile ├── LICENSE ├── Part_II_-_Climbing_the_Pyramid.ipynb ├── README.md ├── Welcome.ipynb ├── airflow.env ├── aws ├── block.device.mappings.json └── ec2_bootstrap.sh ├── bin ├── download_list.txt ├── get_files_from_ec2.sh ├── get_student_work.sh ├── send_files_to_ec2.sh ├── start_notebook.sh ├── stop_flask.sh ├── stop_notebook.sh └── upgrade.sh ├── ch02 ├── Agile_Tools.ipynb ├── Introduction_to_PySpark.ipynb ├── airflow_test.py ├── data │ └── example_name_titles_daily.json │ │ └── 2016-12-01 │ │ └── test.jsonl ├── elasticsearch.sh ├── flatmap.py ├── groupby.py ├── histogram.py ├── images │ ├── ads2_0201.png │ ├── ads2_0202.png │ ├── ads2_0209.png │ ├── ads2_0211.png │ ├── ads2_0212.png │ ├── ads2_0215.png │ ├── ads2_0217.png │ ├── ads2_0219.png │ ├── ads2_0220.png │ ├── ads2_0401.png │ ├── ads2_0402.png │ ├── ads2_0403.png │ ├── ads2_0405.png │ ├── ads2_0406.png │ ├── ads2_0408.png │ ├── ads_bootstrap.png │ ├── faa_table.png │ ├── flask_terminal.png │ └── json.png ├── load_on_time_performance.py ├── mongo.js ├── pyspark_elasticsearch.py ├── pyspark_mongodb.py ├── pyspark_streaming.py ├── pyspark_task_one.py ├── pyspark_task_two.py ├── python_kafka.py ├── setup_airflow_test.sh ├── spark.py ├── sql.py ├── test_elasticsearch.py ├── test_elasticsearch.sh ├── test_json.py ├── test_pymongo.py ├── test_pymongo_2.py └── web │ ├── flask_pymongo.py │ ├── static │ ├── bootstrap-theme.min.css │ ├── bootstrap.min.css │ ├── bootstrap.min.js │ └── d3.v3.min.js │ ├── templates │ └── table.html │ ├── test_flask.py │ ├── test_flask_bootstrap.py │ └── test_flask_pymongo.py ├── ch04 ├── Collecting_and_Displaying_Records.ipynb ├── convert_data.py ├── download.sh ├── images │ ├── ads2_0201.png │ ├── ads2_0202.png │ ├── ads2_0209.png │ ├── ads2_0211.png │ ├── ads2_0212.png │ ├── ads2_0215.png │ ├── ads2_0217.png │ ├── ads2_0219.png │ ├── ads2_0220.png │ ├── ads2_0401.png │ ├── ads2_0402.png │ ├── ads2_0403.png │ ├── ads2_0405.png │ ├── ads2_0406.png │ ├── ads2_0408.png │ ├── ads2_0409.png │ ├── ads_bootstrap.png │ ├── ags2_0402.png │ ├── airline_data_fields.png │ ├── faa_table.png │ ├── flask_terminal.png │ ├── json.png │ ├── parquet_logo.jpg │ └── row_format_column_format.png ├── load_on_time_pyspark.py ├── mongo.js ├── pyspark_to_elasticsearch.py ├── pyspark_to_mongo.py └── web │ ├── config.py │ ├── on_time_flask.py │ ├── on_time_flask_template.py │ ├── static │ ├── bootstrap-theme.min.css │ ├── bootstrap.min.css │ ├── bootstrap.min.js │ └── d3.v3.min.js │ └── templates │ ├── flight.html │ ├── flights.html │ ├── layout.html │ ├── macros.jnj │ └── search.html ├── ch05 ├── Visualizing_Data_with_Charts_and_Tables.ipynb ├── assess_airplanes.py ├── assess_faa.py ├── extract_airplanes.py ├── images │ ├── ads2_0501.png │ ├── ads2_0502.png │ ├── ads2_0503.png │ ├── ads2_0504.png │ ├── ads2_0505.png │ ├── ads2_0507.png │ ├── ads2_0508.png │ ├── ads2_0509.png │ ├── ads2_0510.png │ ├── ads2_0511.png │ ├── ads2_0512.png │ ├── first_order_form.png │ ├── flight_search_with_tail_num_link.png │ ├── mapreduce.png │ └── total_flights_2.png ├── install.sh ├── mongo.js ├── save_tail_numbers.py ├── total_flights.py └── web │ ├── chart_flask.py │ ├── config.py │ ├── flights_per_airplane.html │ ├── static │ ├── app.js │ ├── app2.js │ ├── app3.js │ ├── bootstrap-theme.min.css │ ├── bootstrap.min.css │ ├── bootstrap.min.js │ ├── d3.v3.min.js │ ├── images │ │ └── .exist │ ├── jquery-1.12.2.min.js │ ├── nv.d3.css │ └── nv.d3.min.js │ └── templates │ ├── flight.html │ ├── 
flights.html │ ├── flights_per_airplane.html │ ├── flights_per_airplane_2.html │ ├── layout.html │ ├── macros.jnj │ ├── search.html │ ├── top_routes.html │ ├── top_routes_chart.html │ ├── total_flights.html │ ├── total_flights_chart.html │ └── total_flights_chart_2.html ├── ch06 ├── Exploring_Data_with_Reports.ipynb ├── add_name_to_airlines.py ├── airplanes_mapping.json ├── airplanes_to_elasticsearch.py ├── analyze_airplanes.py ├── analyze_airplanes_again.py ├── create_airplanes_index.sh ├── enrich_airlines_wikipedia.py ├── extract_airlines.py ├── extract_airports.py ├── images │ ├── ads2_0601.png │ ├── ads2_0602.png │ ├── ads2_0603.png │ ├── ads2_0604.png │ ├── ads2_0605.png │ ├── ads2_0606.png │ ├── ads2_0607.png │ ├── ads2_0608.png │ └── ads2_0609.png ├── import_airlines.sh ├── prepare_airplanes.py ├── resolve_airplane_manufacturers.py ├── scrape_faa.py ├── test_elastic_airplanes.sh └── web │ ├── __init__.py │ ├── config.py │ ├── report_flask.py │ ├── search_helpers.py │ ├── static │ ├── airplanes.js │ ├── app.js │ ├── bootstrap-theme.min.css │ ├── bootstrap.min.css │ ├── bootstrap.min.js │ ├── d3.v3.min.js │ ├── jquery-1.12.2.min.js │ ├── nv.d3.css │ └── nv.d3.min.js │ └── templates │ ├── airlines.html │ ├── airlines2.html │ ├── airport.html │ ├── all_airlines.html │ ├── all_airplanes.html │ ├── flight.html │ ├── flights.html │ ├── flights_per_airplane.html │ ├── layout.html │ ├── macros.jnj │ ├── search.html │ ├── total_flights.html │ └── total_flights_chart.html ├── ch07 ├── Making_Predictions.ipynb ├── Predicting flight delays with sklearn.ipynb ├── explore_delays.py ├── extract_features.py ├── images │ ├── ads2_0701.png │ └── ads2_0702.png ├── train_sklearn_model.py └── train_spark_mllib_model.py ├── ch08 ├── Deploying_Predictive_Systems.ipynb ├── airflow │ └── setup.py ├── download_data.sh ├── extract_features.py ├── fetch_prediction_requests.py ├── images │ ├── ads2_0807.png │ ├── ads2_0808.png │ ├── ads2_0809.png │ ├── ads2_0810.png │ └── ads2_0811.png ├── import_distances.sh ├── kafka_test.py ├── links.txt ├── load_prediction_results.py ├── make_predictions.py ├── make_predictions_streaming.py ├── origin_dest_distances.py ├── python_kafka_consumer.py ├── python_kafka_producer.py ├── streaming_test.py ├── test_airflow.sh ├── test_classification_api.sh ├── test_regression_api.sh ├── train_spark_mllib_model.py └── web │ ├── __init__.py │ ├── config.py │ ├── predict_flask.py │ ├── predict_utils.py │ ├── static │ ├── airplanes.js │ ├── app.js │ ├── bar.css │ ├── barchart.js │ ├── bootstrap-theme.min.css │ ├── bootstrap.min.css │ ├── bootstrap.min.js │ ├── d3.v3.min.js │ ├── flight_delay_predict_polling.js │ ├── jquery-1.12.2.min.js │ ├── nv.d3.css │ └── nv.d3.min.js │ └── templates │ ├── airlines.html │ ├── all_airlines.html │ ├── all_airplanes.html │ ├── delays.html │ ├── flight.html │ ├── flight_delays_predict.html │ ├── flight_delays_predict_batch.html │ ├── flight_delays_predict_batch_results.html │ ├── flight_delays_predict_kafka.html │ ├── flights.html │ ├── flights_per_airplane.html │ ├── layout.html │ ├── macros.jnj │ ├── search.html │ ├── total_flights.html │ └── total_flights_chart.html ├── ch09 ├── Debugging Prediction Problems.ipynb ├── Improving flight delay predictions with sklearn.ipynb ├── Improving_Predictions.ipynb ├── baseline_spark_mllib_model.py ├── explore_delays.py ├── extract_features.py ├── extract_features_with_airplanes.py ├── extract_features_with_flight_time.py ├── improve_sklearn_model.py ├── improved_spark_mllib_model.py ├── make_predictions_final.py 
├── make_predictions_streaming_final.py ├── spark_model_with_airplanes.py ├── spark_model_with_flight_time.py └── train_spark_mllib_model.py ├── ch10 ├── convert_observations.py ├── explore_weather.py ├── load_weather.py ├── match_airport_with_weather_station.py ├── match_reports_with_flights.py ├── spark_model_with_weather.py └── web │ ├── __init__.py │ ├── config.py │ ├── predict_flask.py │ ├── predict_utils.py │ ├── static │ ├── airplanes.js │ ├── app.js │ ├── bar.css │ ├── barchart.js │ ├── bootstrap-theme.min.css │ ├── bootstrap.min.css │ ├── bootstrap.min.js │ ├── calendar.js │ ├── calendar.min.css │ ├── d3.v3.min.js │ ├── flight_delay_predict_polling.js │ ├── jquery-1.12.2.min.js │ ├── nv.d3.css │ └── nv.d3.min.js │ └── templates │ ├── airlines.html │ ├── all_airlines.html │ ├── all_airplanes.html │ ├── daily_weather_station.html │ ├── delays.html │ ├── flight.html │ ├── flight_delays_predict.html │ ├── flight_delays_predict_batch.html │ ├── flight_delays_predict_batch_results.html │ ├── flight_delays_predict_kafka.html │ ├── flights.html │ ├── flights_per_airplane.html │ ├── layout.html │ ├── macros.jnj │ ├── search.html │ ├── total_flights.html │ ├── total_flights_chart.html │ └── weather_station.html ├── dags └── .exists ├── docker-compose.yml ├── download.sh ├── download_weather.sh ├── elastic_scripts ├── create.sh ├── drop.sh └── query.sh ├── images ├── DeepDiscoveryTechnicalLogo.png ├── airline_page_enriched_wikipedia.png ├── airplanes_page_chart_v1_v2.png ├── back_end_realtime_architecture.png ├── climbing_the_pyramid_chapter_intro.png ├── data_syndrome_logo.png ├── flight_delay_chart_2.0.png ├── front_end_realtime_architecture.png ├── predicting_flight_kafka_waiting.png ├── ubuntu_images.png └── video_course_cover.png ├── install └── phantomjs.sh ├── intro_download.sh ├── jupyter_notebook_config.py ├── lib ├── data │ ├── example.csv │ └── faa_tail_number_inquiry.jsonl ├── pyspark_csv.py ├── setup_spark.py └── utils.py ├── logs └── .exists ├── old.Dockerfile ├── poetry.lock ├── pyproject.toml ├── requirements.txt └── scripts └── .exists /.bashrc: -------------------------------------------------------------------------------- 1 | 2 | # >>> conda initialize >>> 3 | # !! Contents within this block are managed by 'conda init' !! 4 | __conda_setup="$('/opt/conda/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" 5 | if [ $? -eq 0 ]; then 6 | eval "$__conda_setup" 7 | else 8 | if [ -f "/opt/conda/etc/profile.d/conda.sh" ]; then 9 | . 
"/opt/conda/etc/profile.d/conda.sh" 10 | else 11 | export PATH="/opt/conda/bin:$PATH" 12 | fi 13 | fi 14 | unset __conda_setup 15 | # <<< conda initialize <<< 16 | 17 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | data 2 | .git 3 | spark 4 | hadoop 5 | elasticsearch 6 | kafka 7 | mongodb 8 | mongo-hadoop 9 | spark-warehouse 10 | tmp 11 | zeppelin 12 | elasticsearch-hadoop 13 | models 14 | lib 15 | .ivy2 16 | .bash_history 17 | .dbshell 18 | .mongodb 19 | .mongorc.js 20 | .cache 21 | logs 22 | .wget-hsts 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .ipynb_checkpoints 3 | mongodb 4 | spark 5 | hadoop 6 | data 7 | mongo-hadoop 8 | lib/mongo* 9 | lib/pymongo_spark.py 10 | elasticsearch 11 | elasticsearch-hadoop 12 | lib 13 | ch03/static 14 | ch06/web/static 15 | *.pyc 16 | tmp 17 | .idea 18 | kafka 19 | zeppelin 20 | ch05/scrape_faa.py 21 | models 22 | .vagrant 23 | *.pem 24 | .reservation_id 25 | .ec2_hostname 26 | .ec2_deep_hostname 27 | .deep_reservation_id 28 | deep/data 29 | .vscode 30 | ghostdriver.log 31 | cassandra 32 | janusgraph 33 | deep 34 | ch05/web/static/images/ 35 | .local 36 | .ipython 37 | .jupyter 38 | .ivy2 39 | .bash_history 40 | .dbshell 41 | .mongodb 42 | .mongorc.js 43 | .cache 44 | logs 45 | .wget-hsts 46 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG OWNER=jupyter 2 | ARG BASE_CONTAINER=$OWNER/pyspark-notebook:spark-3.2.0 3 | FROM $BASE_CONTAINER 4 | 5 | LABEL maintainer="Russell Jurney " 6 | 7 | # Fix DL4006 8 | SHELL ["/bin/bash", "-o", "pipefail", "-c"] 9 | 10 | USER root 11 | 12 | # Install the MongoDB Client CLI 13 | RUN apt-get update --yes && \ 14 | sudo apt-get install -y iputils-ping gnupg curl jq && \ 15 | wget -qO - https://www.mongodb.org/static/pgp/server-5.0.asc | sudo apt-key add - && \ 16 | echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu focal/mongodb-org/5.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-5.0.list && \ 17 | sudo apt-get update && \ 18 | sudo apt-get install -y mongodb-mongosh mongodb-org-tools && \ 19 | echo "mongodb-mongosh hold" | sudo dpkg --set-selections && \ 20 | echo "mongodb-org-tools hold" | sudo dpkg --set-selections && \ 21 | apt-get clean 22 | 23 | RUN pip install poetry 24 | 25 | COPY pyproject.toml /home/jovyan/pyproject.toml 26 | COPY poetry.lock /home/jovyan/poetry.lock 27 | COPY requirements.txt /home/jovyan/requirements.txt 28 | 29 | RUN poetry install && pip install -r requirements.txt 30 | 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission 
notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /airflow.env: -------------------------------------------------------------------------------- 1 | AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow 2 | AIRFLOW__CORE__EXECUTOR=LocalExecutor -------------------------------------------------------------------------------- /aws/block.device.mappings.json: -------------------------------------------------------------------------------- 1 | { 2 | "DeviceName": "/dev/sda1", 3 | "Ebs": { 4 | "Status": "attached", 5 | "DeleteOnTermination": true, 6 | "VolumeSize": 1024 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /bin/get_files_from_ec2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copy all Python code and model files from the EC2 instance to the local filesystem 4 | rsync -ruv -e "ssh -i ./agile_data_science.pem" \ 5 | --exclude=cassandra \ 6 | --exclude=data \ 7 | --exclude=janusgraph \ 8 | --exclude=hadoop \ 9 | --exclude=spark \ 10 | --exclude=kafka \ 11 | --exclude=lib \ 12 | --exclude=elasticsearch-hadoop \ 13 | --exclude=elasticsearch \ 14 | --exclude=mongo-hadoop \ 15 | --exclude=mongodb \ 16 | --exclude=tmp \ 17 | --exclude=zeppelin \ 18 | ubuntu@`cat .ec2_hostname`:Agile_Data_Code_2/* . 
19 | -------------------------------------------------------------------------------- /bin/get_student_work.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | scp -i ./agile_data_science.pem bin/download_list.txt ubuntu@$(cat .ec2_hostname):Agile_Data_Code_2/ 4 | 5 | ssh -i ./agile_data_science.pem ubuntu@$(cat .ec2_hostname) << SSH_COMMANDS 6 | 7 | cd Agile_Data_Code_2 8 | tar -cvzf agile_data_science_student_code.tar.gz -T download_list.txt 9 | 10 | SSH_COMMANDS 11 | 12 | scp -i ./agile_data_science.pem ubuntu@$(cat .ec2_hostname):Agile_Data_Code_2/agile_data_science_student_code.tar.gz ./ads_student_$(cat .ec2_hostname).tar.gz 13 | -------------------------------------------------------------------------------- /bin/send_files_to_ec2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copy all files from this directory on localhost to EC2 host Agile_Data_Code_2 directory 4 | rsync -ruv -e "ssh -i ./agile_data_science.pem" \ 5 | --exclude=cassandra \ 6 | --exclude=data \ 7 | --exclude=janusgraph \ 8 | --exclude=hadoop \ 9 | --exclude=spark \ 10 | --exclude=kafka \ 11 | --exclude=lib \ 12 | --exclude=elasticsearch-hadoop \ 13 | --exclude=elasticsearch \ 14 | --exclude=mongo-hadoop \ 15 | --exclude=mongodb \ 16 | --exclude=tmp \ 17 | --exclude=zeppelin \ 18 | * ubuntu@`cat .ec2_hostname`:Agile_Data_Code_2/ 19 | -------------------------------------------------------------------------------- /bin/start_notebook.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd $PROJECT_HOME 4 | 5 | # Stop all existing jupyter notebooks 6 | ps aux | grep -i jupyter | grep -v grep | tr -s ' ' | cut -d ' ' -f2 | xargs -I {} sudo kill -9 {} 7 | 8 | # Start a new Jupyter Notebook 9 | nohup jupyter notebook --ip=0.0.0.0 --NotebookApp.token= --allow-root --no-browser & 10 | 11 | echo "Jupyter notebook started!" 12 | -------------------------------------------------------------------------------- /bin/stop_flask.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ps aux|grep -i flask|tr -s ' '|cut -d ' ' -f2|xargs kill -9 4 | sudo netstat -ap|grep 5000|tr -s ' '|cut -d ' ' -f7|cut -d '/' -f1|xargs sudo kill -9 5 | -------------------------------------------------------------------------------- /bin/stop_notebook.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ps aux| grep jupyter | grep -v grep | tr -s ' '| cut -d ' ' -f2 | xargs -I {} kill -9 {} 4 | 5 | echo "Killed Jupyter Notebook!" 6 | -------------------------------------------------------------------------------- /bin/upgrade.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export LOG_FILE="/home/vagrant/upgrade.sh.log" 4 | 5 | echo "Removing Spark 2.2.1 ..." | tee -a $LOG_FILE 6 | rm -rf /home/vagrant/spark 7 | 8 | echo "Downloading and installing Spark 2.4.4 ..." 
| tee -a $LOG_FILE 9 | curl -Lko /tmp/spark-2.4.4-bin-hadoop2.7.tgz https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz 10 | mkdir -p /home/vagrant/spark 11 | cd /home/vagrant 12 | tar -xvf /tmp/spark-2.4.4-bin-hadoop2.7.tgz -C spark --strip-components=1 13 | 14 | # Have to set spark.io.compression.codec in Spark local mode 15 | cp /home/vagrant/spark/conf/spark-defaults.conf.template /home/vagrant/spark/conf/spark-defaults.conf 16 | echo 'spark.io.compression.codec org.apache.spark.io.SnappyCompressionCodec' | sudo tee -a /home/vagrant/spark/conf/spark-defaults.conf 17 | 18 | # Give Spark 8GB of RAM, use Python3 19 | echo "spark.driver.memory 8g" | sudo tee -a $SPARK_HOME/conf/spark-defaults.conf 20 | echo "spark.executor.cores 2" | sudo tee -a $SPARK_HOME/conf/spark-defaults.conf 21 | echo "PYSPARK_PYTHON=python3" | sudo tee -a $SPARK_HOME/conf/spark-env.sh 22 | echo "PYSPARK_DRIVER_PYTHON=python3" | sudo tee -a $SPARK_HOME/conf/spark-env.sh 23 | 24 | # Setup log4j config to reduce logging output 25 | cp $SPARK_HOME/conf/log4j.properties.template $SPARK_HOME/conf/log4j.properties 26 | sed -i 's/INFO/ERROR/g' $SPARK_HOME/conf/log4j.properties 27 | 28 | # Give to vagrant 29 | sudo chown -R vagrant /home/vagrant/spark 30 | sudo chgrp -R vagrant /home/vagrant/spark 31 | 32 | echo "spark.speculation false" | sudo tee -a /home/vagrant/spark/conf/spark-defaults.conf 33 | 34 | echo "spark.jars /home/vagrant/Agile_Data_Code_2/lib/mongo-hadoop-spark-2.0.2.jar,/home/vagrant/Agile_Data_Code_2/lib/mongo-java-driver-3.6.1.jar,/home/vagrant/Agile_Data_Code_2/lib/mongo-hadoop-2.0.2.jar,/home/vagrant/Agile_Data_Code_2/lib/elasticsearch-spark-20_2.11-6.1.2.jar,/home/vagrant/Agile_Data_Code_2/lib/snappy-java-1.1.7.1.jar,/home/vagrant/Agile_Data_Code_2/lib/lzo-hadoop-1.0.5.jar,/home/vagrant/Agile_Data_Code_2/lib/commons-httpclient-3.1.jar" | sudo tee -a /home/vagrant/spark/conf/spark-defaults.conf 35 | 36 | -------------------------------------------------------------------------------- /ch02/airflow_test.py: -------------------------------------------------------------------------------- 1 | import sys, os, re 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash_operator import BashOperator 5 | 6 | from datetime import datetime, timedelta 7 | import iso8601 8 | 9 | project_home = os.environ["PROJECT_HOME"] 10 | 11 | default_args = { 12 | 'owner': 'airflow', 13 | 'depends_on_past': False, 14 | 'start_date': iso8601.parse_date("2016-12-01"), 15 | 'email': ['russell.jurney@gmail.com'], 16 | 'email_on_failure': True, 17 | 'email_on_retry': True, 18 | 'retries': 3, 19 | 'retry_delay': timedelta(minutes=5), 20 | } 21 | 22 | # Timedelta 1 is 'run daily' 23 | dag = DAG( 24 | 'agile_data_science_airflow_test', 25 | default_args=default_args, 26 | schedule_interval=timedelta(1) 27 | ) 28 | 29 | # Run a simple PySpark Script 30 | pyspark_local_task_one = BashOperator( 31 | task_id = "pyspark_local_task_one", 32 | bash_command = """spark-submit \ 33 | --master {{ params.master }} \ 34 | {{ params.base_path }}/{{ params.filename }} {{ ts }} {{ params.base_path }}""", 35 | params = { 36 | "master": "local[8]", 37 | "filename": "ch02/pyspark_task_one.py", 38 | "base_path": "{}/".format(project_home) 39 | }, 40 | dag=dag 41 | ) 42 | 43 | # Run another simple PySpark Script that depends on the previous one 44 | pyspark_local_task_two = BashOperator( 45 | task_id = "pyspark_local_task_two", 46 | bash_command = """spark-submit \ 47 | --master {{ params.master }} \ 48 | {{ 
params.base_path }}/{{ params.filename }} {{ ts }} {{ params.base_path }}""", 49 | params = { 50 | "master": "local[8]", 51 | "filename": "ch02/pyspark_task_two.py", 52 | "base_path": "{}/".format(project_home) 53 | }, 54 | dag=dag 55 | ) 56 | 57 | # Add the dependency from the second to the first task 58 | pyspark_local_task_two.set_upstream(pyspark_local_task_one) 59 | -------------------------------------------------------------------------------- /ch02/data/example_name_titles_daily.json/2016-12-01/test.jsonl: -------------------------------------------------------------------------------- 1 | {"name": "Russell Jurney", "title": "Data Scientist"} 2 | {"name": "Russell Jurney", "title": "Author"} 3 | {"name": "Russell Jurney", "title": "Dog Lover"} 4 | {"name": "Bob Jones", "title": "CEO"} 5 | {"name": "Susan Shu", "title": "Attorney"} 6 | -------------------------------------------------------------------------------- /ch02/elasticsearch.sh: -------------------------------------------------------------------------------- 1 | curl -XPUT 'localhost:9200/customer/external/1?pretty' -d ' 2 | { 3 | "name": "Russell Jurney" 4 | }' 5 | 6 | curl -XGET 'localhost:9200/customer/external/1?pretty' 7 | -------------------------------------------------------------------------------- /ch02/flatmap.py: -------------------------------------------------------------------------------- 1 | csv_lines = sc.textFile("data/example.csv") 2 | 3 | # Compute a relation of words by line 4 | words_by_line = csv_lines\ 5 | .map(lambda line: line.split(",")) 6 | 7 | words_by_line.collect() 8 | 9 | # Compute a relation of words 10 | flattened_words = csv_lines\ 11 | .map(lambda line: line.split(","))\ 12 | .flatMap(lambda x: x) 13 | 14 | flattened_words.collect() 15 | -------------------------------------------------------------------------------- /ch02/groupby.py: -------------------------------------------------------------------------------- 1 | csv_lines = sc.textFile("data/example.csv") 2 | 3 | # Turn the CSV lines into objects 4 | def csv_to_record(line): 5 | parts = line.split(",") 6 | record = { 7 | "name": parts[0], 8 | "company": parts[1], 9 | "title": parts[2] 10 | } 11 | return record 12 | 13 | # Apply the function to every record 14 | records = csv_lines.map(csv_to_record) 15 | 16 | # Inspect the first item in the dataset 17 | records.first() 18 | 19 | # Group the records by the name of the person 20 | grouped_records = records.groupBy(lambda x: x["name"]) 21 | 22 | # Show the first group 23 | grouped_records.first() 24 | 25 | # Count the groups 26 | job_counts = grouped_records.map( 27 | lambda x: { 28 | "name": x[0], 29 | "job_count": len(x[1]) 30 | } 31 | ) 32 | 33 | job_counts.first() 34 | 35 | job_counts.collect() 36 | -------------------------------------------------------------------------------- /ch02/histogram.py: -------------------------------------------------------------------------------- 1 | # Load the parquet file containing flight delay records 2 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet') 3 | 4 | # Register the data for Spark SQL 5 | on_time_dataframe.registerTempTable("on_time_performance") 6 | 7 | # Compute a histogram of departure delays 8 | on_time_dataframe\ 9 | .select("DepDelay")\ 10 | .rdd\ 11 | .flatMap(lambda x: x)\ 12 | .histogram(10) 13 | 14 | import numpy as np 15 | import matplotlib.mlab as mlab 16 | import matplotlib.pyplot as plt 17 | 18 | # Function to plot a histogram using pyplot 19 | def create_hist(rdd_histogram_data): 20 | """Given 
an RDD.histogram, plot a pyplot histogram""" 21 | heights = np.array(rdd_histogram_data[1]) 22 | full_bins = rdd_histogram_data[0] 23 | mid_point_bins = full_bins[:-1] 24 | widths = [abs(i - j) for i, j in zip(full_bins[:-1], full_bins[1:])] 25 | bar = plt.bar(mid_point_bins, heights, width=widths, color='b') 26 | return bar 27 | 28 | # Compute a histogram of departure delays, this time with explicit bucket boundaries 29 | departure_delay_histogram = on_time_dataframe\ 30 | .select("DepDelay")\ 31 | .rdd\ 32 | .flatMap(lambda x: x)\ 33 | .histogram([-60,-30,-15,-10,-5,0,5,10,15,30,60,90,120,180]) 34 | 35 | create_hist(departure_delay_histogram) 36 | -------------------------------------------------------------------------------- /ch02/images/ads2_0201.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0201.png -------------------------------------------------------------------------------- /ch02/images/ads2_0202.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0202.png -------------------------------------------------------------------------------- /ch02/images/ads2_0209.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0209.png -------------------------------------------------------------------------------- /ch02/images/ads2_0211.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0211.png -------------------------------------------------------------------------------- /ch02/images/ads2_0212.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0212.png -------------------------------------------------------------------------------- /ch02/images/ads2_0215.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0215.png -------------------------------------------------------------------------------- /ch02/images/ads2_0217.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0217.png -------------------------------------------------------------------------------- /ch02/images/ads2_0219.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0219.png -------------------------------------------------------------------------------- /ch02/images/ads2_0220.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0220.png -------------------------------------------------------------------------------- 
/ch02/images/ads2_0401.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0401.png -------------------------------------------------------------------------------- /ch02/images/ads2_0402.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0402.png -------------------------------------------------------------------------------- /ch02/images/ads2_0403.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0403.png -------------------------------------------------------------------------------- /ch02/images/ads2_0405.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0405.png -------------------------------------------------------------------------------- /ch02/images/ads2_0406.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0406.png -------------------------------------------------------------------------------- /ch02/images/ads2_0408.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0408.png -------------------------------------------------------------------------------- /ch02/images/ads_bootstrap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads_bootstrap.png -------------------------------------------------------------------------------- /ch02/images/faa_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/faa_table.png -------------------------------------------------------------------------------- /ch02/images/flask_terminal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/flask_terminal.png -------------------------------------------------------------------------------- /ch02/images/json.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/json.png -------------------------------------------------------------------------------- /ch02/load_on_time_performance.py: -------------------------------------------------------------------------------- 1 | # Load the parquet file containing flight delay records 2 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet') 3 | 4 | # Register the data for Spark SQL 5 | on_time_dataframe.registerTempTable("on_time_performance") 6 | 7 | # Check 
out the columns 8 | on_time_dataframe.columns 9 | 10 | # Check out some data 11 | on_time_dataframe\ 12 | .select("FlightDate", "TailNum", "Origin", "Dest", "Carrier", "DepDelay", "ArrDelay")\ 13 | .show() 14 | 15 | # Trim the fields and keep the result 16 | trimmed_on_time = on_time_dataframe\ 17 | .select( 18 | "FlightDate", 19 | "TailNum", 20 | "Origin", 21 | "Dest", 22 | "Carrier", 23 | "DepDelay", 24 | "ArrDelay" 25 | ) 26 | 27 | # Sample 0.01% of the data and show 28 | trimmed_on_time.sample(False, 0.0001).show() 29 | -------------------------------------------------------------------------------- /ch02/mongo.js: -------------------------------------------------------------------------------- 1 | db.test_collection.insert({'name': 'Russell Jurney', 'email': 'russell.jurney@gmail.com'}) 2 | db.test_collection.findOne({'name': 'Russell Jurney'}) 3 | -------------------------------------------------------------------------------- /ch02/pyspark_elasticsearch.py: -------------------------------------------------------------------------------- 1 | csv_lines = sc.textFile("data/example.csv") 2 | data = csv_lines.map(lambda line: line.split(",")) 3 | schema_data = data.map(lambda x: ('key', {'name': x[0], 'company': x[1], 'title': x[2]})) 4 | 5 | schema_data.saveAsNewAPIHadoopFile( 6 | path='-', 7 | outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat", 8 | keyClass="org.apache.hadoop.io.NullWritable", 9 | valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable", 10 | conf={ "es.resource" : "agile_data_science/executives" }) 11 | -------------------------------------------------------------------------------- /ch02/pyspark_mongodb.py: -------------------------------------------------------------------------------- 1 | # This code sample is meant to be executed line-by-line in a 2 | # pyspark session. 3 | # 4 | # Prior to launching pyspark, run the following line in the 5 | # shell where pyspark will be launched. 6 | # 7 | # export PYSPARK_DRIVER_PYTHON=ipython 8 | # 9 | # The pyspark launch command needs to have additional command line 10 | # arguments passed to ensure that Java classes used to connect to 11 | # MongoDB are found. 12 | # 13 | # The Java classes reside in JAR files that were 14 | # preinstalled via the bootstrap.sh script and placed in the 15 | # lib directory. You will need to note the version of the 16 | # libraries by inspecting the JAR filenames. For example, 17 | # if running the following shell command: 18 | # 19 | # $ ls Agile_Data_Code_2/lib/mongo*.jar 20 | # 21 | # yields the following listing: 22 | # 23 | # Agile_Data_Code_2/lib/mongo-hadoop-2.0.2.jar 24 | # Agile_Data_Code_2/lib/mongo-hadoop-spark-2.0.2.jar 25 | # Agile_Data_Code_2/lib/mongo-java-driver-3.6.1.jar 26 | # 27 | # then the mongo-hadoop version would be 2.0.2, and the 28 | # Mongo-Java version would be 3.6.1. 29 | # 30 | # Choosing to set these versions as environment variables 31 | # will make the invocation of the command much less error 32 | # prone. 33 | # 34 | # MONGOHADOOP_VERSION=2.0.2 35 | # MONGOJAVA_VERSION=3.6.1 36 | # 37 | # The names of the JAR files can then be pieced together 38 | # from the version strings. 
39 | # 40 | # MONGOHADOOPSPARK_JAR=./lib/mongo-hadoop-spark-$MONGOHADOOP_VERSION.jar 41 | # MONGOJAVADRIVER_JAR=./lib/mongo-java-driver-$MONGOJAVA_VERSION.jar 42 | # MONGOHADOOP_JAR=./lib/mongo-hadoop-$MONGOHADOOP_VERSION.jar 43 | # 44 | # You can then launch the pyspark session using the following 45 | # shell command from the Agile_Data_Code_2 directory: 46 | # 47 | # pyspark \ 48 | # --jars $MONGOHADOOPSPARK_JAR,$MONGOJAVADRIVER_JAR,$MONGOHADOOP_JAR \ 49 | # --driver-class-path $MONGOHADOOPSPARK_JAR:$MONGOJAVADRIVER_JAR:$MONGOHADOOP_JAR 50 | 51 | import pymongo_spark 52 | # Important: activate pymongo_spark. 53 | pymongo_spark.activate() 54 | 55 | csv_lines = sc.textFile("data/example.csv") 56 | data = csv_lines.map(lambda line: line.split(",")) 57 | schema_data = data.map(lambda x: {'name': x[0], 'company': x[1], 'title': x[2]}) 58 | schema_data.saveToMongoDB('mongodb://localhost:27017/agile_data_science.executives') 59 | 60 | -------------------------------------------------------------------------------- /ch02/pyspark_streaming.py: -------------------------------------------------------------------------------- 1 | import sys, os, re 2 | import json 3 | 4 | from pyspark import SparkContext, SparkConf 5 | from pyspark.streaming import StreamingContext 6 | from pyspark.streaming.kafka import KafkaUtils, OffsetRange, TopicAndPartition 7 | 8 | 9 | # Process data every 10 seconds 10 | PERIOD = 10 11 | BROKERS = 'localhost:9092' 12 | TOPIC = 'test' 13 | 14 | conf = SparkConf().set("spark.default.parallelism", 1) 15 | 16 | # Stop the default SparkContext before creating a new one. 17 | sc.stop() 18 | sc = SparkContext(appName="Agile Data Science: PySpark Streaming 'Hello, World!'", conf = conf) 19 | ssc = StreamingContext(sc, PERIOD) 20 | 21 | stream = KafkaUtils.createDirectStream( 22 | ssc, 23 | [TOPIC], 24 | { 25 | "metadata.broker.list": BROKERS, 26 | "group.id": "0", 27 | } 28 | ) 29 | object_stream = stream.map(lambda x: json.loads(x[1])) 30 | object_stream.pprint() 31 | 32 | ssc.start() 33 | -------------------------------------------------------------------------------- /ch02/pyspark_task_one.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys, os, re 4 | import json 5 | import datetime, iso8601 6 | 7 | # Pass date and base path to main() from airflow 8 | def main(iso_date, base_path): 9 | APP_NAME = "pyspark_task_one.py" 10 | 11 | # If there is no SparkSession, create the environment 12 | try: 13 | sc and spark 14 | except NameError as e: 15 | import findspark 16 | findspark.init() 17 | import pyspark 18 | import pyspark.sql 19 | 20 | sc = pyspark.SparkContext() 21 | spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate() 22 | 23 | # Get today's date 24 | today_dt = iso8601.parse_date(iso_date) 25 | rounded_today = today_dt.date() 26 | 27 | # Load today's data 28 | today_input_path = "{}/ch02/data/example_name_titles_daily.json/{}".format( 29 | base_path, 30 | rounded_today.isoformat() 31 | ) 32 | 33 | # Otherwise load the data and proceed... 34 | people_titles = spark.read.json(today_input_path) 35 | people_titles.show() 36 | 37 | # Group by as an RDD 38 | titles_by_name = people_titles.rdd.groupBy(lambda x: x["name"]) 39 | 40 | # Accept the group key/grouped data and concatenate the various titles... 
41 | # into a master title 42 | def concatenate_titles(people_titles): 43 | name = people_titles[0] 44 | title_records = people_titles[1] 45 | master_title = "" 46 | for title_record in sorted(title_records): 47 | title = title_record["title"] 48 | master_title += "{}, ".format(title) 49 | master_title = master_title[:-2] 50 | record = {"name": name, "master_title": master_title} 51 | return record 52 | 53 | people_with_concatenated_titles = titles_by_name.map(concatenate_titles) 54 | people_output_json = people_with_concatenated_titles.map(json.dumps) 55 | 56 | # Get today's output path 57 | today_output_path = "{}/ch02/data/example_master_titles_daily.json/{}".format( 58 | base_path, 59 | rounded_today.isoformat() 60 | ) 61 | 62 | # Write/replace today's output path 63 | os.system("rm -rf {}".format(today_output_path)) 64 | people_output_json.saveAsTextFile(today_output_path) 65 | 66 | if __name__ == "__main__": 67 | main(sys.argv[1], sys.argv[2]) 68 | -------------------------------------------------------------------------------- /ch02/pyspark_task_two.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys, os, re 4 | import json 5 | import datetime, iso8601 6 | 7 | # Pass date and base path to main() from airflow 8 | def main(iso_date, base_path): 9 | APP_NAME = "pyspark_task_two.py" 10 | 11 | # If there is no SparkSession, create the environment 12 | try: 13 | sc and spark 14 | except NameError as e: 15 | import findspark 16 | findspark.init() 17 | import pyspark 18 | import pyspark.sql 19 | 20 | sc = pyspark.SparkContext() 21 | spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate() 22 | 23 | import pymongo 24 | import pymongo_spark 25 | # Important: activate pymongo_spark. 26 | pymongo_spark.activate() 27 | 28 | # Get today's date 29 | today_dt = iso8601.parse_date(iso_date) 30 | rounded_today = today_dt.date() 31 | 32 | # Load today's data 33 | today_input_path = "{}/ch02/data/example_master_titles_daily.json/{}".format( 34 | base_path, 35 | rounded_today.isoformat() 36 | ) 37 | 38 | # Otherwise load the data and proceed... 
39 | people_master_titles_raw = sc.textFile(today_input_path) 40 | people_master_titles = people_master_titles_raw.map(json.loads) 41 | print(people_master_titles.first()) 42 | 43 | people_master_titles.saveToMongoDB('mongodb://localhost:27017/agile_data_science.people_master_titles') 44 | 45 | if __name__ == "__main__": 46 | main(sys.argv[1], sys.argv[2]) 47 | -------------------------------------------------------------------------------- /ch02/python_kafka.py: -------------------------------------------------------------------------------- 1 | import sys, os, re 2 | import json 3 | 4 | from kafka import KafkaConsumer, TopicPartition 5 | consumer = KafkaConsumer() 6 | consumer.assign([TopicPartition('test', 0)]) 7 | consumer.seek_to_beginning() 8 | 9 | for message in consumer: 10 | message_bytes = message.value 11 | message_string = message_bytes.decode() 12 | message_object = json.loads(message_string) 13 | print(message_object) 14 | 15 | -------------------------------------------------------------------------------- /ch02/setup_airflow_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | sudo ln -s $PROJECT_HOME/ch02/airflow_test.py ~/airflow/dags/ 4 | -------------------------------------------------------------------------------- /ch02/spark.py: -------------------------------------------------------------------------------- 1 | # Load the text file using the SparkContext 2 | csv_lines = sc.textFile("data/example.csv") 3 | 4 | # Map the data to split the lines into a list 5 | data = csv_lines.map(lambda line: line.split(",")) 6 | 7 | # Collect the dataset into local RAM 8 | data.collect() 9 | -------------------------------------------------------------------------------- /ch02/sql.py: -------------------------------------------------------------------------------- 1 | csv_lines = sc.textFile("data/example.csv") 2 | 3 | from pyspark.sql import Row 4 | 5 | # Convert the CSV into a pyspark.sql.Row 6 | def csv_to_row(line): 7 | parts = line.split(",") 8 | row = Row( 9 | name=parts[0], 10 | company=parts[1], 11 | title=parts[2] 12 | ) 13 | return row 14 | 15 | # Apply the function to get rows in an RDD 16 | rows = csv_lines.map(csv_to_row) 17 | 18 | # Convert to a pyspark.sql.DataFrame 19 | rows_df = rows.toDF() 20 | 21 | # Register the DataFrame for Spark SQL 22 | rows_df.registerTempTable("executives") 23 | 24 | # Generate a new DataFrame with SQL using the SparkSession 25 | job_counts = spark.sql(""" 26 | SELECT 27 | name, 28 | COUNT(*) AS total 29 | FROM executives 30 | GROUP BY name 31 | """) 32 | job_counts.show() 33 | 34 | # Go back to an RDD 35 | job_counts.rdd.collect() 36 | -------------------------------------------------------------------------------- /ch02/test_elasticsearch.py: -------------------------------------------------------------------------------- 1 | from pyelasticsearch import ElasticSearch 2 | es = ElasticSearch('http://localhost:9200/') 3 | es.search('name:Russell', index='agile_data_science') -------------------------------------------------------------------------------- /ch02/test_elasticsearch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -XPOST 'localhost:9200/agile_data_science/_search?pretty' -d ' 4 | { 5 | "query": { "match_all": {} } 6 | } 7 | ' 8 | -------------------------------------------------------------------------------- /ch02/test_json.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # How to read and write JSON and JSON Lines files using Python 4 | # 5 | import sys, os, re 6 | import json 7 | import codecs 8 | 9 | ary_of_objects = [ 10 | {'name': 'Russell Jurney', 'title': 'CEO'}, 11 | {'name': 'Muhammad Imran', 'title': 'VP of Marketing'}, 12 | {'name': 'Fe Mata', 'title': 'Chief Marketing Officer'}, 13 | ] 14 | 15 | path = "/tmp/test.jsonl" 16 | 17 | # 18 | # Write our objects to jsonl 19 | # 20 | f = codecs.open(path, 'w', 'utf-8') 21 | for row_object in ary_of_objects: 22 | # ensure_ascii=False is essential or errors/corruption will occur 23 | json_record = json.dumps(row_object, ensure_ascii=False) 24 | f.write(json_record + "\n") 25 | f.close() 26 | 27 | print("Wrote JSON Lines file /tmp/test.jsonl") 28 | 29 | # 30 | # Read this jsonl file back into objects 31 | # 32 | ary_of_objects = [] 33 | f = codecs.open(path, "r", "utf-8") 34 | for line in f: 35 | record = json.loads(line.rstrip("\n|\r")) 36 | ary_of_objects.append(record) 37 | print(ary_of_objects) 38 | print("Read JSON Lines file /tmp/test.jsonl") 39 | 40 | # 41 | # Utility functions to read and write json and jsonl files 42 | # 43 | def write_json_file(obj, path): 44 | '''Dump an object and write it out as json to a file.''' 45 | f = codecs.open(path, 'w', 'utf-8') 46 | f.write(json.dumps(obj, ensure_ascii=False)) 47 | f.close() 48 | 49 | def write_json_lines_file(ary_of_objects, path): 50 | '''Dump a list of objects out as a json lines file.''' 51 | f = codecs.open(path, 'w', 'utf-8') 52 | for row_object in ary_of_objects: 53 | json_record = json.dumps(row_object, ensure_ascii=False) 54 | f.write(json_record + "\n") 55 | f.close() 56 | 57 | def read_json_file(path): 58 | '''Turn a normal json file (no CRs per record) into an object.''' 59 | text = codecs.open(path, 'r', 'utf-8').read() 60 | return json.loads(text) 61 | 62 | def read_json_lines_file(path): 63 | '''Turn a json cr file (CRs per record) into an array of objects''' 64 | ary = [] 65 | f = codecs.open(path, "r", "utf-8") 66 | for line in f: 67 | record = json.loads(line.rstrip("\n|\r")) 68 | ary.append(record) 69 | return ary 70 | -------------------------------------------------------------------------------- /ch02/test_pymongo.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | client = MongoClient() 3 | db = client.agile_data_science 4 | list(db.executives.find({"name": "Russell Jurney"})) 5 | -------------------------------------------------------------------------------- /ch02/test_pymongo_2.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | 3 | client = MongoClient() 4 | 5 | record = {"foo": "bar"} 6 | 7 | client.agile_data_science.collection_two.insert_one(record) 8 | 9 | # Read the record back from the same collection we wrote to 10 | record2 = client.agile_data_science.collection_two.find_one( 11 | { 12 | "foo": "bar" 13 | } 14 | ) 15 | -------------------------------------------------------------------------------- /ch02/web/flask_pymongo.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request 2 | from pymongo import MongoClient 3 | import bson.json_util 4 | 5 | # Set up Flask 6 | app = Flask(__name__) 7 | 8 | # Set up Mongo 9 | client = MongoClient() # defaults to localhost 10 | db = client.agile_data_science 11 | 12 | # Fetch an executive's records, given their name 13 | 
@app.route("/executive/") 14 | def executive(name): 15 | executive = db.executives.find({"name": name}) 16 | return bson.json_util.dumps(list(executive)) 17 | 18 | 19 | if __name__ == "__main__": 20 | app.run(debug=True, host="0.0.0.0") 21 | -------------------------------------------------------------------------------- /ch02/web/templates/table.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Agile Data Science 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 | 16 | 17 |
18 | 21 |

Executives

22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | {% for executive in executives -%} 30 | 31 | 32 | 33 | 34 | 35 | {% endfor -%} 36 | 37 |
NameCompanyTitle
{{executive.name}}{{executive.company}}{{executive.title}}
38 |
39 | 40 |
41 |
42 | 43 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /ch02/web/test_flask.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request 2 | 3 | 4 | app = Flask(__name__) 5 | 6 | 7 | @app.route("/") 8 | def hello(input): 9 | return input 10 | 11 | 12 | if __name__ == "__main__": 13 | app.run(debug=True, host="0.0.0.0") 14 | -------------------------------------------------------------------------------- /ch02/web/test_flask_bootstrap.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, request 2 | from pymongo import MongoClient 3 | import bson.json_util 4 | 5 | # Set up Flask 6 | app = Flask(__name__) 7 | 8 | # Set up Mongo 9 | client = MongoClient("mongo") # defaults to localhost 10 | db = client.agile_data_science 11 | 12 | # Fetch from/to totals, given a pair of email addresses 13 | @app.route("/executive/") 14 | def executive(name): 15 | executives = db.executives.find({"name": name}) 16 | return render_template("table.html", executives=list(executives)) 17 | 18 | 19 | if __name__ == "__main__": 20 | app.run(debug=True, host="0.0.0.0") 21 | -------------------------------------------------------------------------------- /ch02/web/test_flask_pymongo.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request 2 | from pymongo import MongoClient 3 | import bson.json_util 4 | 5 | # Set up Flask 6 | app = Flask(__name__) 7 | 8 | # Set up Mongo 9 | client = MongoClient("mongo") # defaults to localhost 10 | db = client.agile_data_science 11 | 12 | # Fetch from/to totals, given a pair of email addresses 13 | @app.route("/executive/") 14 | def executive(name): 15 | executive = db.executives.find({"name": name}) 16 | return bson.json_util.dumps(list(executive)) 17 | 18 | 19 | if __name__ == "__main__": 20 | app.run(debug=True, host="0.0.0.0") 21 | -------------------------------------------------------------------------------- /ch04/convert_data.py: -------------------------------------------------------------------------------- 1 | # Loads CSV with header parsing and type inference, in one line! 
2 | on_time_dataframe = spark.read.format('com.databricks.spark.csv')\ 3 | .options( 4 | header='true', 5 | treatEmptyValuesAsNulls='true', 6 | )\ 7 | .load('data/On_Time_On_Time_Performance_2015.csv.bz2') 8 | on_time_dataframe.registerTempTable("on_time_performance") 9 | 10 | trimmed_cast_performance = spark.sql(""" 11 | SELECT 12 | Year, Quarter, Month, DayofMonth, DayOfWeek, FlightDate, 13 | Carrier, TailNum, FlightNum, 14 | Origin, OriginCityName, OriginState, 15 | Dest, DestCityName, DestState, 16 | DepTime, cast(DepDelay as float), cast(DepDelayMinutes as int), 17 | cast(TaxiOut as float), cast(TaxiIn as float), 18 | WheelsOff, WheelsOn, 19 | ArrTime, cast(ArrDelay as float), cast(ArrDelayMinutes as float), 20 | cast(Cancelled as int), cast(Diverted as int), 21 | cast(ActualElapsedTime as float), cast(AirTime as float), 22 | cast(Flights as int), cast(Distance as float), 23 | cast(CarrierDelay as float), cast(WeatherDelay as float), cast(NASDelay as float), 24 | cast(SecurityDelay as float), cast(LateAircraftDelay as float), 25 | CRSDepTime, CRSArrTime 26 | FROM 27 | on_time_performance 28 | """) 29 | 30 | # Replace on_time_performance table with our new, trimmed table and show its contents 31 | trimmed_cast_performance.registerTempTable("on_time_performance") 32 | trimmed_cast_performance.show() 33 | 34 | # Verify we can sum numeric columns 35 | spark.sql("""SELECT 36 | SUM(WeatherDelay), SUM(CarrierDelay), SUM(NASDelay), 37 | SUM(SecurityDelay), SUM(LateAircraftDelay) 38 | FROM on_time_performance 39 | """).show() 40 | 41 | # Save records as gzipped json lines 42 | trimmed_cast_performance.toJSON()\ 43 | .saveAsTextFile( 44 | 'data/on_time_performance.jsonl.gz', 45 | 'org.apache.hadoop.io.compress.GzipCodec' 46 | ) 47 | 48 | # View records on filesystem 49 | # gunzip -c data/on_time_performance.jsonl.gz/part-00000.gz | head 50 | 51 | # Save records using Parquet 52 | trimmed_cast_performance.write.mode("overwrite").parquet("data/on_time_performance.parquet") 53 | 54 | # Load JSON records back 55 | on_time_dataframe = spark.read.json('data/on_time_performance.jsonl.gz') 56 | on_time_dataframe.show() 57 | 58 | # Load the parquet file back 59 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet') 60 | on_time_dataframe.show() 61 | -------------------------------------------------------------------------------- /ch04/download.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Script to download data for book 3 | # 4 | mkdir ../data 5 | 6 | # Get on-time records for all flights in 2015 - 273MB 7 | # wget -P ../data/ http://s3.amazonaws.com/agile_data_science/On_Time_On_Time_Performance_2015.csv.gz 8 | 9 | # Get openflights data 10 | wget -P /tmp/ https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat 11 | mv /tmp/airports.dat ../data/airports.csv 12 | 13 | wget -P /tmp/ https://raw.githubusercontent.com/jpatokal/openflights/master/data/airlines.dat 14 | mv /tmp/airlines.dat ../data/airlines.csv 15 | 16 | wget -P /tmp/ https://raw.githubusercontent.com/jpatokal/openflights/master/data/routes.dat 17 | mv /tmp/routes.dat ../data/routes.csv 18 | 19 | wget -P /tmp/ https://raw.githubusercontent.com/jpatokal/openflights/master/data/countries.dat 20 | mv /tmp/countries.dat ../data/countries.csv 21 | 22 | # Get FAA data 23 | wget -P ../data/ http://av-info.faa.gov/data/ACRef/tab/aircraft.txt 24 | wget -P ../data/ http://av-info.faa.gov/data/ACRef/tab/ata.txt 25 | wget -P ../data/ 
http://av-info.faa.gov/data/ACRef/tab/compt.txt 26 | wget -P ../data/ http://av-info.faa.gov/data/ACRef/tab/engine.txt 27 | wget -P ../data/ http://av-info.faa.gov/data/ACRef/tab/prop.txt 28 | 29 | # Get Aircraft database 30 | # wget -P /tmp/ http://registry.faa.gov/database/AR042016.zip 31 | # unzip -d ../data/ /tmp/AR042016.zip 32 | 33 | # Get FAA Registration data 34 | # wget -P /tmp/ http://registry.faa.gov/database/AR042016.zip 35 | # unzip -d ../data/ /tmp/AR042016.zip 36 | -------------------------------------------------------------------------------- /ch04/images/ads2_0201.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0201.png -------------------------------------------------------------------------------- /ch04/images/ads2_0202.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0202.png -------------------------------------------------------------------------------- /ch04/images/ads2_0209.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0209.png -------------------------------------------------------------------------------- /ch04/images/ads2_0211.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0211.png -------------------------------------------------------------------------------- /ch04/images/ads2_0212.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0212.png -------------------------------------------------------------------------------- /ch04/images/ads2_0215.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0215.png -------------------------------------------------------------------------------- /ch04/images/ads2_0217.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0217.png -------------------------------------------------------------------------------- /ch04/images/ads2_0219.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0219.png -------------------------------------------------------------------------------- /ch04/images/ads2_0220.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0220.png -------------------------------------------------------------------------------- /ch04/images/ads2_0401.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0401.png -------------------------------------------------------------------------------- /ch04/images/ads2_0402.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0402.png -------------------------------------------------------------------------------- /ch04/images/ads2_0403.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0403.png -------------------------------------------------------------------------------- /ch04/images/ads2_0405.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0405.png -------------------------------------------------------------------------------- /ch04/images/ads2_0406.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0406.png -------------------------------------------------------------------------------- /ch04/images/ads2_0408.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0408.png -------------------------------------------------------------------------------- /ch04/images/ads2_0409.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0409.png -------------------------------------------------------------------------------- /ch04/images/ads_bootstrap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads_bootstrap.png -------------------------------------------------------------------------------- /ch04/images/ags2_0402.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ags2_0402.png -------------------------------------------------------------------------------- /ch04/images/airline_data_fields.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/airline_data_fields.png -------------------------------------------------------------------------------- /ch04/images/faa_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/faa_table.png -------------------------------------------------------------------------------- /ch04/images/flask_terminal.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/flask_terminal.png -------------------------------------------------------------------------------- /ch04/images/json.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/json.png -------------------------------------------------------------------------------- /ch04/images/parquet_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/parquet_logo.jpg -------------------------------------------------------------------------------- /ch04/images/row_format_column_format.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/row_format_column_format.png -------------------------------------------------------------------------------- /ch04/load_on_time_pyspark.py: -------------------------------------------------------------------------------- 1 | # Loads CSV with header parsing and type inference, in one line! 2 | # Must use 'pyspark --packages com.databricks:spark-csv_2.10:1.4.0' for this to work 3 | on_time_dataframe = spark.read.format('com.databricks.spark.csv')\ 4 | .options(header='true', inferschema='true')\ 5 | .load('data/On_Time_On_Time_Performance_2015.csv.bz2') 6 | 7 | # Check out the data - very wide so hard to see 8 | on_time_dataframe.show() 9 | 10 | # Use SQL to query data - what airport pairs have the most flights? 11 | on_time_dataframe.registerTempTable("on_time_dataframe") 12 | airport_pair_totals = spark.sql("""SELECT 13 | Origin, Dest, COUNT(*) AS total 14 | FROM on_time_dataframe 15 | GROUP BY Origin, Dest 16 | ORDER BY total DESC""" 17 | ) 18 | 19 | # Use dataflows 20 | airport_pair_totals.limit(10).show() 21 | 22 | # We can go back and forth as we see fit! 
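# For example -- an illustrative sketch, not part of the original script -- we can
# drop from the SQL result to the RDD API for a quick reshaping, then come right
# back to a DataFrame for display:
top_routes = airport_pair_totals.limit(10)
route_labels = top_routes.rdd.map(
    lambda row: {'Route': '{} -> {}'.format(row.Origin, row.Dest), 'Total': row.total}
)
route_labels.toDF().show()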
23 | 
24 | 
--------------------------------------------------------------------------------
/ch04/mongo.js:
--------------------------------------------------------------------------------
1 | db.on_time_performance.findOne({Carrier: 'DL', FlightDate: '2015-01-01', FlightNum: 478}) // Slow
2 | db.on_time_performance.ensureIndex({Carrier: 1, FlightDate: 1, FlightNum: 1})
3 | db.on_time_performance.findOne({Carrier: 'DL', FlightDate: '2015-01-01', FlightNum: 478}) // Fast
4 | 
5 | db.on_time_performance.find({Origin: 'ATL', Dest: 'SFO', FlightDate: '2015-01-01'}).sort({DepTime: 1, ArrTime: 1}) // Slow or broken
6 | db.on_time_performance.ensureIndex({Origin: 1, Dest: 1, FlightDate: 1}) // Build a compound index to support the query
7 | db.on_time_performance.find({Origin: 'ATL', Dest: 'SFO', FlightDate: '2015-01-01'}).sort({DepTime: 1, ArrTime: 1}) // Fast
8 | 
--------------------------------------------------------------------------------
/ch04/pyspark_to_elasticsearch.py:
--------------------------------------------------------------------------------
1 | # Load the parquet file
2 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet')
3 | 
4 | # Save the DataFrame to Elasticsearch
5 | on_time_dataframe.write.format("org.elasticsearch.spark.sql")\
6 |   .option("es.resource","agile_data_science/on_time_performance")\
7 |   .option("es.batch.size.entries","100")\
8 |   .mode("overwrite")\
9 |   .save()
10 | 
11 | # Older alternative: format data for Elasticsearch as a tuple with a dummy key in the first field
12 | # on_time_performance = on_time_dataframe.rdd.map(lambda x: ('ignored_key', x.asDict()))
13 | #
14 | # on_time_performance.saveAsNewAPIHadoopFile(
15 | #   path='-',
16 | #   outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
17 | #   keyClass="org.apache.hadoop.io.NullWritable",
18 | #   valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
19 | #   conf={ "es.resource" : "agile_data_science/on_time_performance" })
--------------------------------------------------------------------------------
/ch04/pyspark_to_mongo.py:
--------------------------------------------------------------------------------
1 | import pymongo
2 | import pymongo_spark
3 | # Important: activate pymongo_spark.
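# (activate() monkey-patches Spark's RDD class so that the saveToMongoDB()
#  method used below becomes available on plain RDDs.)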
4 | pymongo_spark.activate()
5 | 
6 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet')
7 | 
8 | # Note we have to convert the row to a dict to avoid https://jira.mongodb.org/browse/HADOOP-276
9 | as_dict = on_time_dataframe.rdd.map(lambda row: row.asDict())
10 | as_dict.saveToMongoDB('mongodb://localhost:27017/agile_data_science.on_time_performance')
11 | 
--------------------------------------------------------------------------------
/ch04/web/config.py:
--------------------------------------------------------------------------------
1 | # config.py, a configuration file for the ch04 web app
2 | RECORDS_PER_PAGE = 15
3 | ELASTIC_URL = "http://elastic:9200"
4 | 
--------------------------------------------------------------------------------
/ch04/web/on_time_flask.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, render_template, request
2 | from pymongo import MongoClient
3 | from bson import json_util
4 | 
5 | 
6 | # Set up Flask and Mongo
7 | app = Flask(__name__)
8 | client = MongoClient("mongo")
9 | 
10 | 
11 | # Controller: fetch a flight record and display it
12 | @app.route("/on_time_performance")
13 | def on_time_performance():
14 | 
15 |   carrier = request.args.get("Carrier")
16 |   flight_date = request.args.get("FlightDate")
17 |   flight_num = request.args.get("FlightNum")  # cast with int() if FlightNum was stored as a number
18 | 
19 |   flight = client.agile_data_science.on_time_performance.find_one(
20 |     {"Carrier": carrier, "FlightDate": flight_date, "FlightNum": flight_num}
21 |   )
22 | 
23 |   print(flight)
24 | 
25 |   return json_util.dumps(flight)
26 | 
27 | 
28 | if __name__ == "__main__":
29 | 
30 |   app.run(debug=True, host="0.0.0.0")
31 | 
--------------------------------------------------------------------------------
/ch04/web/templates/flight.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <div>
4 |   <p class="lead">Flight {{flight.FlightNum}}</p>
5 |   <table class="table">
6 |     <thead>
7 |       <tr>
8 |         <th>Airline</th>
9 |         <th>Origin</th>
10 |         <th>Destination</th>
11 |         <th>Tail Number</th>
12 |         <th>Date</th>
13 |         <th>Air Time</th>
14 |         <th>Distance</th>
15 |       </tr>
16 |     </thead>
17 |     <tbody>
18 |       <tr>
19 |         <td>{{flight.Carrier}}</td>
20 |         <td>{{flight.Origin}}</td>
21 |         <td>{{flight.Dest}}</td>
22 |         <td>{{flight.TailNum}}</td>
23 |         <td>{{flight.FlightDate}}</td>
24 |         <td>{{flight.AirTime}}</td>
25 |         <td>{{flight.Distance}}</td>
26 |       </tr>
27 |     </tbody> </table> </div>
28 | {% endblock %}
--------------------------------------------------------------------------------
/ch04/web/templates/flights.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <div>
4 |   <p class="lead">{{flight_count}} Flights on {{flight_date}}</p>
5 |   <table class="table">
6 |     <thead>
7 |       <tr>
8 |         <th>Airline</th>
9 |         <th>Flight Number</th>
10 |         <th>Origin</th>
11 |         <th>Destination</th>
12 |         <th>Departure Time</th>
13 |         <th>Tail Number</th>
14 |         <th>Air Time</th>
15 |         <th>Distance</th>
16 |       </tr>
17 |     </thead>
18 |     <tbody>
19 |     {% for flight in flights %}
20 |       <tr>
21 |         <td>{{flight.Carrier}}</td>
22 |         <td>{{flight.FlightNum}}</td>
23 |         <td>{{flight.Origin}}</td>
24 |         <td>{{flight.Dest}}</td>
25 |         <td>{{flight.DepTime}}</td>
26 |         <td>{{flight.TailNum}}</td>
27 |         <td>{{flight.AirTime}}</td>
28 |         <td>{{flight.Distance}}</td>
29 |       </tr>
30 |     {% endfor %}
31 |     </tbody> </table> </div>
32 | {% import "macros.jnj" as common %}
33 | {% if nav_offsets and nav_path -%}
34 |   {{ common.display_nav(nav_offsets, nav_path, flight_count, query)|safe }}
35 | {% endif -%}
36 | {% endblock %}
--------------------------------------------------------------------------------
/ch04/web/templates/layout.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html lang="en">
3 | <head>
4 |   <meta charset="utf-8">
5 |   <title>Agile Data Science</title>
6 |   <meta name="viewport" content="width=device-width, initial-scale=1">
7 |   <!-- Bootstrap styles served from /static -->
8 |   <link href="/static/bootstrap.min.css" rel="stylesheet">
9 |   <link href="/static/bootstrap-theme.min.css" rel="stylesheet">
10 |   <style>
11 |     body { padding-top: 20px; }
12 |   </style>
13 | </head>
14 | <body>
15 |   <div class="container">
16 |     <div class="row">
17 |       <div class="col-md-12">
18 | 
19 | 
20 | 
21 |         {% block body %}{% endblock %}
22 |       </div>
23 |     </div>
24 |   </div>
25 | 
26 |   <!-- JavaScript served from /static: d3 and Bootstrap -->
27 |   <script src="/static/d3.v3.min.js"></script>
28 |   <script src="/static/bootstrap.min.js"></script>
29 | 
30 | 
31 | 
32 | 
33 | </body>
34 | </html>
--------------------------------------------------------------------------------
/ch04/web/templates/macros.jnj:
--------------------------------------------------------------------------------
1 | ;
2 | {% macro display_nav(offsets, path, count, query) -%}
3 | <div>
4 |   {% for key, values in offsets.items() -%}
5 |     {%- if values['bottom_offset'] >= 0 and values['top_offset'] > 0 and count > values['bottom_offset'] -%}
6 |       <a href="{{ path }}?start={{ values['bottom_offset'] }}&end={{ values['top_offset'] }}">{{ key }}</a>
7 |     {% else -%}
8 |       {{ key }}
9 |     {% endif %}
10 |   {% endfor -%}
11 | </div>
12 | {%- endmacro %}
13 | 
--------------------------------------------------------------------------------
/ch04/web/templates/search.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <div>
4 |   <p class="lead">{{flight_count['value']}} Flights</p>
5 |   <form method="get" action="/flights/search">
6 |     <input type="text" name="Carrier" placeholder="Carrier" />
7 | 
8 |     <input type="text" name="FlightDate" placeholder="FlightDate" />
9 | 
10 |     <input type="text" name="Origin" placeholder="Origin" />
11 | 
12 |     <input type="text" name="Dest" placeholder="Dest" />
13 | 
14 |     <input type="text" name="TailNum" placeholder="TailNum" />
15 | 
16 |     <input type="text" name="FlightNum" placeholder="FlightNum" />
17 | 
18 |     <input type="submit" value="Search" class="btn btn-default" />
19 |   </form>
20 | 
21 | 
22 | 
23 | 
24 | 
25 |   <table class="table">
26 |     <thead>
27 |       <tr>
28 |         <th>Airline</th>
29 |         <th>Flight Number</th>
30 |         <th>Origin</th>
31 |         <th>Destination</th>
32 |         <th>Date</th>
33 |         <th>Departure Time</th>
34 |         <th>Tail Number</th>
35 |         <th>Air Time</th>
36 |         <th>Distance</th>
37 |       </tr>
38 |     </thead>
39 |     <tbody>
40 |     {% for flight in flights %}
41 |       <tr>
42 |         <td>{{flight.Carrier}}</td>
43 |         <td><a href="/on_time_performance?Carrier={{flight.Carrier}}&FlightDate={{flight.FlightDate}}&FlightNum={{flight.FlightNum}}">{{flight.FlightNum}}</a></td>
44 |         <td>{{flight.Origin}}</td>
45 |         <td>{{flight.Dest}}</td>
46 |         <td>{{flight.FlightDate}}</td>
47 |         <td>{{flight.DepTime}}</td>
48 |         <td>{{flight.TailNum}}</td>
49 |         <td>{{flight.AirTime}}</td>
50 |         <td>{{flight.Distance}}</td>
51 |       </tr>
52 |     {% endfor %}
53 |     </tbody>
54 |   </table>
55 | </div>
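{# Paging: display_nav() from macros.jnj renders the Previous/Next links below,
   driven by the nav_offsets and nav_path values the controller passes in. #}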
56 | 
57 | {% import "macros.jnj" as common %}
58 | {% if nav_offsets and nav_path -%}
59 |   {{ common.display_nav(nav_offsets, nav_path, flight_count)|safe }}
60 | {% endif -%}
61 | 
62 | {% endblock %}
--------------------------------------------------------------------------------
/ch05/assess_airplanes.py:
--------------------------------------------------------------------------------
1 | # Load the parquet file
2 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet')
3 | on_time_dataframe.registerTempTable("on_time_performance")
4 | 
5 | # Dump the unneeded fields and drop empty tail numbers
6 | tail_numbers = on_time_dataframe.rdd.map(lambda x: x.TailNum)
7 | tail_numbers = tail_numbers.filter(lambda x: x != '')
8 | 
9 | # distinct() gets us unique tail numbers
10 | unique_tail_numbers = tail_numbers.distinct()
11 | 
12 | # now we need a count() of unique tail numbers
13 | airplane_count = unique_tail_numbers.count()
14 | print("Total airplanes: {}".format(airplane_count))
15 | 
--------------------------------------------------------------------------------
/ch05/assess_faa.py:
--------------------------------------------------------------------------------
1 | # Load the FAA N-Number Inquiry Records
2 | faa_tail_number_inquiry = spark.read.json('data/faa_tail_number_inquiry.jsonl')
3 | faa_tail_number_inquiry.show()
4 | 
5 | # Count the records
6 | faa_tail_number_inquiry.count()
7 | 
8 | # Load our unique tail numbers
9 | unique_tail_numbers = spark.read.json('data/tail_numbers.jsonl')
10 | unique_tail_numbers.show()
11 | 
12 | # Left outer join tail numbers to our inquiries to see how many came through
13 | tail_num_plus_inquiry = unique_tail_numbers.join(
14 |   faa_tail_number_inquiry,
15 |   unique_tail_numbers.TailNum == faa_tail_number_inquiry.TailNum,
16 |   'left_outer'
17 | )
18 | tail_num_plus_inquiry.show()
19 | 
20 | # Now compute the total records and the successfully joined records
21 | total_records = tail_num_plus_inquiry.count()
22 | join_hits = tail_num_plus_inquiry.filter(
23 |   tail_num_plus_inquiry.owner.isNotNull()
24 | ).count()
25 | 
26 | # This being Python, we can now compute and print a join percent...
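# (Aside, an equivalent not used by the original: COUNT(column) skips NULLs in
#  SQL, so after registering tail_num_plus_inquiry as a temp table the same
#  figure could come from
#  SELECT ROUND(100.0 * COUNT(owner) / COUNT(*), 2) FROM tail_num_plus_inquiry.)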
27 | hit_ratio = float(join_hits)/float(total_records) 28 | hit_pct = hit_ratio * 100 29 | print("Successful joins: {:.2f}%".format(hit_pct)) 30 | -------------------------------------------------------------------------------- /ch05/extract_airplanes.py: -------------------------------------------------------------------------------- 1 | # Load the parquet file 2 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet') 3 | on_time_dataframe.registerTempTable("on_time_performance") 4 | 5 | # Filter down to the fields we need to identify and link to a flight 6 | flights = on_time_dataframe.rdd.map( 7 | lambda x: 8 | { 9 | 'Carrier': x.Carrier, 10 | 'FlightDate': x.FlightDate, 11 | 'FlightNum': x.FlightNum, 12 | 'Origin': x.Origin, 13 | 'Dest': x.Dest, 14 | 'TailNum': x.TailNum 15 | } 16 | ) 17 | flights.first() 18 | 19 | # Group flights by tail number, sorted by flight number, date, then origin/dest 20 | flights_per_airplane = flights\ 21 | .map(lambda record: (record['TailNum'], [record]))\ 22 | .reduceByKey(lambda a, b: a + b)\ 23 | .map(lambda tuple: 24 | { 25 | 'TailNum': tuple[0], 26 | 'Flights': sorted(tuple[1], key=lambda x: (x['FlightNum'], x['FlightDate'], x['Origin'], x['Dest'])) 27 | } 28 | ) 29 | flights_per_airplane.first() 30 | 31 | # Save to Mongo 32 | import pymongo_spark 33 | pymongo_spark.activate() 34 | flights_per_airplane.saveToMongoDB('mongodb://localhost:27017/agile_data_science.flights_per_airplane') 35 | -------------------------------------------------------------------------------- /ch05/images/ads2_0501.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/ads2_0501.png -------------------------------------------------------------------------------- /ch05/images/ads2_0502.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/ads2_0502.png -------------------------------------------------------------------------------- /ch05/images/ads2_0503.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/ads2_0503.png -------------------------------------------------------------------------------- /ch05/images/ads2_0504.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/ads2_0504.png -------------------------------------------------------------------------------- /ch05/images/ads2_0505.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/ads2_0505.png -------------------------------------------------------------------------------- /ch05/images/ads2_0507.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/ads2_0507.png -------------------------------------------------------------------------------- /ch05/images/ads2_0508.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/ads2_0508.png -------------------------------------------------------------------------------- /ch05/images/ads2_0509.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/ads2_0509.png -------------------------------------------------------------------------------- /ch05/images/ads2_0510.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/ads2_0510.png -------------------------------------------------------------------------------- /ch05/images/ads2_0511.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/ads2_0511.png -------------------------------------------------------------------------------- /ch05/images/ads2_0512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/ads2_0512.png -------------------------------------------------------------------------------- /ch05/images/first_order_form.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/first_order_form.png -------------------------------------------------------------------------------- /ch05/images/flight_search_with_tail_num_link.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/flight_search_with_tail_num_link.png -------------------------------------------------------------------------------- /ch05/images/mapreduce.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/mapreduce.png -------------------------------------------------------------------------------- /ch05/images/total_flights_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/total_flights_2.png -------------------------------------------------------------------------------- /ch05/install.sh: -------------------------------------------------------------------------------- 1 | # Get bootstrap 2 | mkdir web/static 3 | cd web/static 4 | wget 'https://code.jquery.com/jquery-1.12.2.min.js' 5 | wget 'https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css' 6 | wget 'https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap-theme.min.css' 7 | wget 'https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js' 8 | wget 'http://d3js.org/d3.v3.min.js' 9 | wget 'https://cdn.rawgit.com/novus/nvd3/v1.8.1/build/nv.d3.min.js' 10 | wget 'https://cdn.rawgit.com/novus/nvd3/v1.8.1/build/nv.d3.css' 11 | cd ../.. 
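# Optional sanity check (not part of the original script): confirm the assets landed
ls -lh web/static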
12 | -------------------------------------------------------------------------------- /ch05/mongo.js: -------------------------------------------------------------------------------- 1 | db.flights_per_airplane.findOne({TailNum: 'N249AU'}) 2 | 3 | db.flights_per_airplane.ensureIndex({TailNum: 1}) 4 | 5 | db.flights_per_airplane.findOne({TailNum: 'N249AU'}) 6 | -------------------------------------------------------------------------------- /ch05/save_tail_numbers.py: -------------------------------------------------------------------------------- 1 | # Load the parquet file 2 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet') 3 | on_time_dataframe.registerTempTable("on_time_performance") 4 | 5 | # Dump the unneeded fields and filter nulls 6 | tail_numbers = on_time_dataframe.rdd.map(lambda x: x.TailNum) 7 | tail_numbers = tail_numbers.filter(lambda x: x != '') 8 | 9 | # distinct() gets us unique tail numbers 10 | unique_tail_numbers = tail_numbers.distinct() 11 | 12 | # Store as JSON objects via a dataframe. Repartition to 1 to get 1 json file. 13 | unique_records = unique_tail_numbers.map(lambda x: {'TailNum': x}).toDF() 14 | unique_records.repartition(1).write.mode("overwrite").json("data/tail_numbers.json") 15 | 16 | # Now from bash: ls data/tail_numbers.json/part* 17 | -------------------------------------------------------------------------------- /ch05/total_flights.py: -------------------------------------------------------------------------------- 1 | # Load the parquet file 2 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet') 3 | 4 | # Use SQL to look at the total flights by month across 2015 5 | on_time_dataframe.registerTempTable("on_time_dataframe") 6 | total_flights_by_month = spark.sql( 7 | """SELECT Month, Year, COUNT(*) AS total_flights 8 | FROM on_time_dataframe 9 | GROUP BY Year, Month 10 | ORDER BY Year, Month""" 11 | ) 12 | 13 | # This map/asDict trick makes the rows print a little prettier. It is optional. 14 | flights_chart_data = total_flights_by_month.rdd.map(lambda row: row.asDict()) 15 | flights_chart_data.collect() 16 | 17 | # Save chart to MongoDB 18 | import pymongo_spark 19 | pymongo_spark.activate() 20 | flights_chart_data.saveToMongoDB( 21 | 'mongodb://localhost:27017/agile_data_science.flights_by_month' 22 | ) 23 | 24 | -------------------------------------------------------------------------------- /ch05/web/config.py: -------------------------------------------------------------------------------- 1 | # config.py, a configuration file for index.py 2 | RECORDS_PER_PAGE = 15 3 | ELASTIC_URL = "http://elastic:9200" 4 | -------------------------------------------------------------------------------- /ch05/web/flights_per_airplane.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | <p class="lead">Flights by Tail Number {{tail_number}}</p>
5 | <table class="table">
6 |   <thead>
7 |     <tr>
8 |       <th>Images</th>
9 |     </tr>
10 |   </thead>
11 |   <tbody>
12 |     {% for image in images['Images'] %}
13 |       <tr><td><img src="{{image}}" /></td></tr>
14 |     {% endfor %}
15 |   </tbody>
16 | </table>
17 | 
18 | <table class="table">
19 |   <thead>
20 |     <tr>
21 |       <th>Serial Number</th>
22 |       <th>Manufacturer</th>
23 |       <th>Model</th>
24 |       <th>MFR Year</th>
25 |       <th>Owner</th>
26 |       <th>Owner State</th>
27 |       <th>Engine Manufacturer</th>
28 |       <th>Engine Model</th>
29 |     </tr>
30 |   </thead>
31 |   <tbody>
32 |     <tr>
33 |       <td>{{descriptions['serial_number']}}</td>
34 |       <td>{{descriptions['manufacturer']}}</td>
35 |       <td>{{descriptions['model']}}</td>
36 |       <td>{{descriptions['mfr_year']}}</td>
37 |       <td>{{descriptions['owner']}}</td>
38 |       <td>{{descriptions['owner_state']}}</td>
39 |       <td>{{descriptions['engine_manufacturer']}}</td>
40 |       <td>{{descriptions['engine_model']}}</td>
41 |     </tr>
42 |   </tbody>
43 | </table>
44 | 
45 | <table class="table">
46 |   <thead>
47 |     <tr>
48 |       <th>Carrier</th>
49 |       <th>Date</th>
50 |       <th>Flight Number</th>
51 |       <th>Origin</th>
52 |       <th>Destination</th>
53 |     </tr>
54 |   </thead>
55 |   <tbody>
56 |   {% for flight in flights['Flights'] %}
57 |     <tr>
58 |       <td>{{flight['Carrier']}}</td> <td>{{flight['FlightDate']}}</td> <td>{{flight['FlightNum']}}</td>
59 |       <td>{{flight['Origin']}}</td> <td>{{flight['Dest']}}</td>
60 |     </tr>
61 |   {% endfor %}
62 |   </tbody>
63 | </table>
64 | </div>
65 | {% endblock %}
--------------------------------------------------------------------------------
/ch05/web/static/app.js:
--------------------------------------------------------------------------------
1 | // Define the width and height of our chart
2 | var width = 960,
3 |   height = 350;
4 | 
5 | // Define the y scale, which is linear and maps the data to the range from the chart height down to 0
6 | var y = d3.scale.linear()
7 |   .range([height, 0]);
8 | // We define the domain once we get our data in d3.json, below
9 | 
10 | // Our chart object is defined using the height and width
11 | var chart = d3.select(".chart")
12 |   .attr("width", width)
13 |   .attr("height", height);
14 | 
15 | // We fetch the JSON from our controller, then process the resulting data
16 | d3.json("/total_flights.json", function (data) {
17 | 
18 |   // We define a color for the bars
19 |   var barColor = 'steelblue';
20 | 
21 |   // We compute the maximum value for the bars, then set the domain for the y axis.
22 |   // This means that y will now map from [0 -> maxY] to [height -> 0].
23 |   var maxY = d3.max(data, function (d) { return d.total_flights; });
24 |   y.domain([0, maxY]);
25 | 
26 |   // Divide the width by the number of bars to get the bar width
27 |   var barWidth = width / data.length;
28 |   var bar = chart.selectAll("g")
29 |     .data(data)
30 |     .enter()
31 |     .append("g")
32 |     .attr("transform", function (d, i) { return "translate(" + i * barWidth + ",0)"; });
33 | 
34 |   // Now we define a rectangle for each group, with the height mapped from the total_flights
35 |   // data point to the y axis, and the width barWidth - 1 pixels. We fill it with the bar color.
36 |   bar.append("rect")
37 |     .attr("y", function (d) { return y(d.total_flights); })
38 |     .attr("height", function (d) { return height - y(d.total_flights); })
39 |     .attr("width", barWidth - 1)
40 |     .style("fill", barColor);
41 | 
42 |   // We then label each bar with the raw value in the top middle of the bar. The label is
43 |   // offset by 3px so it sits just inside the bar, where the template's CSS renders it in
44 |   // white to stand out against the blue.
45 |   bar.append("text")
46 |     .attr("x", barWidth / 2)
47 |     .attr("y", function (d) { return y(d.total_flights) + 3; })
48 |     .attr("dy", ".75em")
49 |     .text(function (d) { return d.total_flights; });
50 | });
51 | 
--------------------------------------------------------------------------------
/ch05/web/static/app3.js:
--------------------------------------------------------------------------------
1 | // Define the width and height of our chart
2 | var width = 960,
3 |   height = 350;
4 | 
5 | // Define the y scale, which is linear and maps the data to the range from the chart height down to 0
6 | var y = d3.scale.linear()
7 |   .range([height, 0]);
8 | // We define the domain once we get our data in d3.json, below
9 | 
10 | // Our chart object is defined using the height and width
11 | var chart = d3.select(".chart")
12 |   .attr("width", width)
13 |   .attr("height", height);
14 | 
15 | // We fetch the JSON from our controller, then process the resulting data
16 | d3.json("/top_routes.json", function (data) {
17 | 
18 |   // We define a color for the bars
19 |   var barColor = 'steelblue';
20 | 
21 |   // We compute the maximum value for the bars, then set the domain for the y axis.
22 |   // This means that y will now map from [0 -> maxY] to [height -> 0].
23 |   var maxY = d3.max(data, function (d) { return d.total; });
24 |   y.domain([0, maxY]);
25 | 
26 |   // Divide the width by the number of bars to get the bar width
27 |   var barWidth = width / data.length;
28 |   var bar = chart.selectAll("g")
29 |     .data(data)
30 |     .enter()
31 |     .append("g")
32 |     .attr("transform", function (d, i) { return "translate(" + i * barWidth + ",0)"; });
33 | 
34 |   // Now we define a rectangle for each group, with the height mapped from the total
35 |   // data point to the y axis, and the width barWidth - 1 pixels. We fill it with the bar color.
36 |   bar.append("rect")
37 |     .attr("y", function (d) { return y(d.total); })
38 |     .attr("height", function (d) { return height - y(d.total); })
39 |     .attr("width", barWidth - 1)
40 |     .style("fill", barColor);
41 | 
42 |   // We then label each bar with the raw value in the top middle of the bar. The label is
43 |   // offset by 3px so it sits just inside the bar, where the template's CSS renders it in
44 |   // white to stand out against the blue.
45 |   bar.append("text")
46 |     .attr("x", barWidth / 2)
47 |     .attr("y", function (d) { return y(d.total) + 3; })
48 |     .attr("dy", ".75em")
49 |     .text(function (d) { return d.total; });
50 | 
51 |   // A second label shows the route (origin and destination) just below the value.
52 |   bar.append("text")
53 |     .attr("x", barWidth / 2)
54 |     .attr("y", function (d) { return y(d.total) + 13; })
55 |     .attr("dy", ".75em")
56 |     .text(function (d) { return d.Origin + ' ' + d.Dest; });
57 | });
58 | 
--------------------------------------------------------------------------------
/ch05/web/static/images/.exist:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/web/static/images/.exist
--------------------------------------------------------------------------------
/ch05/web/templates/flight.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <div>
4 |   <p class="lead">Flight {{flight.FlightNum}}</p>
5 |   <table class="table">
6 |     <thead>
7 |       <tr>
8 |         <th>Airline</th>
9 |         <th>Origin</th>
10 |         <th>Destination</th>
11 |         <th>Tail Number</th>
12 |         <th>Date</th>
13 |         <th>Air Time</th>
14 |         <th>Distance</th>
15 |       </tr>
16 |     </thead>
17 |     <tbody>
18 |       <tr>
19 |         <td>{{flight.Carrier}}</td>
20 |         <td>{{flight.Origin}}</td>
21 |         <td>{{flight.Dest}}</td>
22 |         <td>{{flight.TailNum}}</td>
23 |         <td>{{flight.FlightDate}}</td>
24 |         <td>{{flight.AirTime}}</td>
25 |         <td>{{flight.Distance}}</td>
26 |       </tr>
27 |     </tbody> </table> </div>
28 | {% endblock %}
--------------------------------------------------------------------------------
/ch05/web/templates/flights.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <div>
4 |   <p class="lead">{{flight_count}} Flights on {{flight_date}}</p>
5 |   <table class="table">
6 |     <thead>
7 |       <tr>
8 |         <th>Airline</th>
9 |         <th>Flight Number</th>
10 |         <th>Origin</th>
11 |         <th>Destination</th>
12 |         <th>Departure Time</th>
13 |         <th>Tail Number</th>
14 |         <th>Air Time</th>
15 |         <th>Distance</th>
16 |       </tr>
17 |     </thead>
18 |     <tbody>
19 |     {% for flight in flights %}
20 |       <tr>
21 |         <td>{{flight.Carrier}}</td>
22 |         <td>{{flight.FlightNum}}</td>
23 |         <td>{{flight.Origin}}</td>
24 |         <td>{{flight.Dest}}</td>
25 |         <td>{{flight.DepTime}}</td>
26 |         <td>{{flight.TailNum}}</td>
27 |         <td>{{flight.AirTime}}</td>
28 |         <td>{{flight.Distance}}</td>
29 |       </tr>
30 |     {% endfor %}
31 |     </tbody> </table> </div>
32 | {% endblock %}
--------------------------------------------------------------------------------
/ch05/web/templates/flights_per_airplane.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <div>
4 |   <p class="lead">Flights by Tail Number {{tail_number}}</p>
5 |   <table class="table">
6 |     <thead>
7 |       <tr>
8 |         <th>Carrier</th>
9 |         <th>Date</th>
10 |         <th>Flight Number</th>
11 |         <th>Origin</th>
12 |         <th>Destination</th>
13 |       </tr>
14 |     </thead>
15 |     <tbody>
16 |     {% for flight in flights['Flights'] %}
17 |       <tr>
18 |         <td>{{flight['Carrier']}}</td>
19 |         <td>{{flight['FlightDate']}}</td>
20 |         <td>{{flight['FlightNum']}}</td>
21 |         <td>{{flight['Origin']}}</td>
22 |         <td>{{flight['Dest']}}</td>
23 |       </tr>
24 |     {% endfor %}
25 |     </tbody> </table> </div>
26 | {% endblock %}
--------------------------------------------------------------------------------
/ch05/web/templates/flights_per_airplane_2.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <div>
4 | <p class="lead">Flights by Tail Number {{tail_number}}</p>
5 | <table class="table">
6 |   <thead>
7 |     <tr>
8 |       <th>Images</th>
9 |     </tr>
10 |   </thead>
11 |   <tbody>
12 |     {% for image in images['Images'] %}
13 |       <tr><td><img src="{{image}}" /></td></tr>
14 |     {% endfor %}
15 |   </tbody>
16 | </table>
17 | 
18 | <table class="table">
19 |   <thead>
20 |     <tr>
21 |       <th>Serial Number</th>
22 |       <th>Manufacturer</th>
23 |       <th>Model</th>
24 |       <th>MFR Year</th>
25 |       <th>Owner</th>
26 |       <th>Owner State</th>
27 |       <th>Engine Manufacturer</th>
28 |       <th>Engine Model</th>
29 |     </tr>
30 |   </thead>
31 |   <tbody>
32 |     <tr>
33 |       <td>{{descriptions['serial_number']}}</td>
34 |       <td>{{descriptions['manufacturer']}}</td>
35 |       <td>{{descriptions['model']}}</td>
36 |       <td>{{descriptions['mfr_year']}}</td>
37 |       <td>{{descriptions['owner']}}</td>
38 |       <td>{{descriptions['owner_state']}}</td>
39 |       <td>{{descriptions['engine_manufacturer']}}</td>
40 |       <td>{{descriptions['engine_model']}}</td>
41 |     </tr>
42 |   </tbody>
43 | </table>
44 | 
45 | <table class="table">
46 |   <thead>
47 |     <tr>
48 |       <th>Carrier</th>
49 |       <th>Date</th>
50 |       <th>Flight Number</th>
51 |       <th>Origin</th>
52 |       <th>Destination</th>
53 |     </tr>
54 |   </thead>
55 |   <tbody>
56 |   {% for flight in flights['Flights'] %}
57 |     <tr>
58 |       <td>{{flight['Carrier']}}</td> <td>{{flight['FlightDate']}}</td> <td>{{flight['FlightNum']}}</td>
59 |       <td>{{flight['Origin']}}</td> <td>{{flight['Dest']}}</td>
60 |     </tr>
61 |   {% endfor %}
62 |   </tbody>
63 | </table>
64 | </div>
65 | {% endblock %}
--------------------------------------------------------------------------------
/ch05/web/templates/layout.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html lang="en">
3 | <head>
4 |   <meta charset="utf-8">
5 |   <title>Agile Data Science</title>
6 |   <meta name="viewport" content="width=device-width, initial-scale=1">
7 |   <!-- Bootstrap and NVD3 styles served from /static -->
8 |   <link href="/static/bootstrap.min.css" rel="stylesheet">
9 |   <link href="/static/bootstrap-theme.min.css" rel="stylesheet">
10 |   <link href="/static/nv.d3.css" rel="stylesheet">
11 |   <style>
12 |     body { padding-top: 20px; }
13 |   </style>
14 | </head>
15 | <body>
16 |   <div class="container">
17 |     <div class="row">
18 |       <div class="col-md-12">
19 | 
20 | 
21 |         {% block body %}{% endblock %}
22 |       </div>
23 |     </div>
24 |   </div>
25 | 
26 |   <!-- JavaScript served from /static: jQuery, Bootstrap, d3, NVD3 -->
27 |   <script src="/static/jquery-1.12.2.min.js"></script>
28 |   <script src="/static/bootstrap.min.js"></script>
29 |   <script src="/static/d3.v3.min.js"></script>
30 |   <script src="/static/nv.d3.min.js"></script>
31 | 
32 | 
33 | 
34 | 
35 | </body>
36 | </html>
--------------------------------------------------------------------------------
/ch05/web/templates/macros.jnj:
--------------------------------------------------------------------------------
1 | ;
2 | {% macro display_nav(offsets, path, count, query) -%}
3 | <div>
4 |   {% for key, values in offsets.items() -%}
5 |     {%- if values['bottom_offset'] >= 0 and values['top_offset'] > 0 and count['value'] > values['bottom_offset'] -%}
6 |       <a href="{{ path }}?start={{ values['bottom_offset'] }}&end={{ values['top_offset'] }}">{{ key }}</a>
7 |     {% else -%}
8 |       {{ key }}
9 |     {% endif %}
10 |   {% endfor -%}
11 | </div>
12 | {%- endmacro %}
13 | 
--------------------------------------------------------------------------------
/ch05/web/templates/search.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <div>
4 |   <p class="lead">{{flight_count['value']}} Flights</p>
5 |   <form method="get" action="/flights/search">
6 |     <input type="text" name="Carrier" placeholder="Carrier" />
7 | 
8 |     <input type="text" name="FlightDate" placeholder="FlightDate" />
9 | 
10 |     <input type="text" name="Origin" placeholder="Origin" />
11 | 
12 |     <input type="text" name="Dest" placeholder="Dest" />
13 | 
14 |     <input type="text" name="TailNum" placeholder="TailNum" />
15 | 
16 |     <input type="text" name="FlightNum" placeholder="FlightNum" />
17 | 
18 |     <input type="submit" value="Search" class="btn btn-default" />
19 |   </form>
20 | 
21 | 
22 | 
23 | 
24 | 
25 |   <table class="table">
26 |     <thead>
27 |       <tr>
28 |         <th>Airline</th>
29 |         <th>Flight Number</th>
30 |         <th>Origin</th>
31 |         <th>Destination</th>
32 |         <th>Date</th>
33 |         <th>Departure Time</th>
34 |         <th>Tail Number</th>
35 |         <th>Air Time</th>
36 |         <th>Distance</th>
37 |       </tr>
38 |     </thead>
39 |     <tbody>
40 |     {% for flight in flights %}
41 |       <tr>
42 |         <td>{{flight.Carrier}}</td>
43 |         <td><a href="/on_time_performance?Carrier={{flight.Carrier}}&FlightDate={{flight.FlightDate}}&FlightNum={{flight.FlightNum}}">{{flight.FlightNum}}</a></td>
44 |         <td>{{flight.Origin}}</td>
45 |         <td>{{flight.Dest}}</td>
46 |         <td>{{flight.FlightDate}}</td>
47 |         <td>{{flight.DepTime}}</td>
48 |         <td>{{flight.TailNum}}</td>
49 |         <td>{{flight.AirTime}}</td>
50 |         <td>{{flight.Distance}}</td>
51 |       </tr>
52 |     {% endfor %}
53 |     </tbody>
54 |   </table>
55 | </div>
56 | {% import "macros.jnj" as common %}
57 | {% if nav_offsets and nav_path -%}
58 |   {{ common.display_nav(nav_offsets, nav_path, flight_count)|safe }}
59 | {% endif -%}
60 | {% endblock %}
--------------------------------------------------------------------------------
/ch05/web/templates/top_routes.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <div>
4 |   <p class="lead">Total Flights by Month</p>
5 |   <table class="table">
6 |     <thead>
7 |       <tr>
8 |         <th>Month</th>
9 |         <th>Total Flights</th>
10 |       </tr>
11 |     </thead>
12 |     <tbody>
13 |     {% for month in total_flights %}
14 |       <tr>
15 |         <td>{{month.Month}}</td>
16 |         <td>{{month.total_flights}}</td>
17 |       </tr>
18 |     {% endfor %}
19 |     </tbody> </table> </div>
20 | {% endblock %}
--------------------------------------------------------------------------------
/ch05/web/templates/top_routes_chart.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <style>
4 |   .chart rect {
5 |     fill: steelblue;
6 |   }
7 | 
8 |   .chart text {
9 |     fill: white;
10 |     font: 10px sans-serif;
11 |     text-anchor: middle;
12 |   }
13 | 
14 | 
15 | 
16 | </style>
17 | <div>
18 |   <p class="lead">National Top Routes</p>
19 |   <svg class="chart"></svg>
20 | </div>
21 | <script src="/static/app3.js"></script>
22 | {% endblock %}
23 | 
--------------------------------------------------------------------------------
/ch05/web/templates/total_flights.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <div>
4 |   <p class="lead">Total Flights by Month</p>
5 |   <table class="table">
6 |     <thead>
7 |       <tr>
8 |         <th>Month</th>
9 |         <th>Total Flights</th>
10 |       </tr>
11 |     </thead>
12 |     <tbody>
13 |     {% for month in total_flights %}
14 |       <tr>
15 |         <td>{{month.Month}}</td>
16 |         <td>{{month.total_flights}}</td>
17 |       </tr>
18 |     {% endfor %}
19 |     </tbody> </table> </div>
20 | {% endblock %}
--------------------------------------------------------------------------------
/ch05/web/templates/total_flights_chart.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <style>
4 |   .chart rect {
5 |     fill: steelblue;
6 |   }
7 | 
8 |   .chart text {
9 |     fill: white;
10 |     font: 10px sans-serif;
11 |     text-anchor: middle;
12 |   }
13 | 
14 | 
15 | 
16 | </style>
17 | <div>
18 |   <p class="lead">Total Flights by Month</p>
19 |   <svg class="chart"></svg>
20 | </div>
21 | <script src="/static/app.js"></script>
22 | {% endblock %}
23 | 
--------------------------------------------------------------------------------
/ch05/web/templates/total_flights_chart_2.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <style>
4 |   .chart rect {
5 |     fill: steelblue;
6 |   }
7 | 
8 |   .chart text {
9 |     fill: white;
10 |     font: 10px sans-serif;
11 |     text-anchor: middle;
12 |   }
13 | 
14 | 
15 | 
16 | </style>
17 | <div>
18 |   <p class="lead">Total Flights by Month</p>
19 |   <svg class="chart"></svg>
20 | </div>
21 | <script src="/static/app2.js"></script>
22 | {% endblock %}
23 | 
--------------------------------------------------------------------------------
/ch06/add_name_to_airlines.py:
--------------------------------------------------------------------------------
1 | import sys, os, re
2 | 
3 | # Load the on-time parquet file
4 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet')
5 | 
6 | # The first step is easily expressed as SQL: get all unique carrier codes
7 | on_time_dataframe.registerTempTable("on_time_performance")
8 | carrier_codes = spark.sql(
9 |   "SELECT DISTINCT Carrier FROM on_time_performance"
10 | )
11 | carrier_codes.collect()
12 | 
13 | from pyspark.sql.types import StringType, IntegerType
14 | from pyspark.sql.types import StructType, StructField
15 | 
16 | schema = StructType([
17 |   StructField("ID", IntegerType(), True),
18 |   StructField("Name", StringType(), True),
19 |   StructField("Alias", StringType(), True),
20 |   StructField("IATA", StringType(), True),
21 |   StructField("ICAO", StringType(), True),
22 |   StructField("CallSign", StringType(), True),
23 |   StructField("Country", StringType(), True),
24 |   StructField("Active", StringType(), True),
25 | ])
26 | 
27 | airlines = spark.read.format('com.databricks.spark.csv')\
28 |   .options(header='false', nullValue='\\N')\
29 |   .schema(schema)\
30 |   .load('data/airlines.csv')
31 | airlines.show()
32 | 
33 | # Is Delta around?
34 | airlines.filter(airlines.IATA == 'DL').show()
35 | 
36 | # Drop all fields except Name and IATA (the carrier code)
37 | airlines.registerTempTable("airlines")
38 | airlines = spark.sql("SELECT Name, IATA AS CarrierCode from airlines")
39 | 
40 | # Join our 14 carrier codes to the airlines table to get our set of airlines
41 | our_airlines = carrier_codes.join(airlines, carrier_codes.Carrier == airlines.CarrierCode)
42 | our_airlines = our_airlines.select('Name', 'CarrierCode')
43 | our_airlines.show()
44 | 
45 | # Store as JSON objects via a dataframe. Repartition to 1 to get 1 json file.
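# (Note: repartition(1) funnels every record through a single task. That is fine
#  for a handful of airline records, but for a large DataFrame write without
#  repartitioning and merge the part files afterwards.)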
46 | our_airlines.repartition(1).write.mode("overwrite").json("data/our_airlines.json")
47 | 
48 | os.system("cp data/our_airlines.json/part* data/our_airlines.jsonl")
49 | 
50 | #wikidata = spark.read.json('data/wikidata-20160404-all.json.bz2')
--------------------------------------------------------------------------------
/ch06/airplanes_mapping.json:
--------------------------------------------------------------------------------
1 | {
2 |   "airplane" : {
3 |     "properties" : {
4 |       "Owner" : {
5 |         "type": "string",
6 |         "analyzer": "english",
7 |         "fields": {
8 |           "raw": {
9 |             "type": "string",
10 |             "index": "not_analyzed"
11 |           }
12 |         }
13 |       },
14 |       "TailNum": {
15 |         "type": "string",
16 |         "analyzer": "english"
17 |       },
18 |       "EngineManufacturer": {
19 |         "type": "string",
20 |         "analyzer": "english"
21 |       },
22 |       "EngineModel": {
23 |         "type": "string",
24 |         "analyzer": "english"
25 |       },
26 |       "Manufacturer": {
27 |         "type": "string",
28 |         "analyzer": "english"
29 |       },
30 |       "ManufacturerYear": {
31 |         "type": "string",
32 |         "analyzer": "english"
33 |       },
34 |       "Model": {
35 |         "type": "string",
36 |         "analyzer": "english"
37 |       },
38 |       "OwnerState": {
39 |         "type": "string",
40 |         "analyzer": "english"
41 |       },
42 |       "SerialNumber": {
43 |         "type": "string",
44 |         "analyzer": "english"
45 |       }
46 |     }
47 |   }
48 | }
--------------------------------------------------------------------------------
/ch06/airplanes_to_elasticsearch.py:
--------------------------------------------------------------------------------
1 | # Load our airplanes
2 | airplanes = spark.read.json("data/airplanes.json")
3 | airplanes.show()
4 | 
5 | airplanes.write.format("org.elasticsearch.spark.sql")\
6 |   .option("es.resource","agile_data_science/airplane")\
7 |   .mode("overwrite")\
8 |   .save()
9 | 
10 | # Older alternative: format data for Elasticsearch as a tuple with a dummy key in the first field
11 | # airplanes_dict = airplanes.rdd.map(lambda x: ('ignored_key', x.asDict()))
12 | #
13 | # airplanes_dict.saveAsNewAPIHadoopFile(
14 | #   path='-',
15 | #   outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
16 | #   keyClass="org.apache.hadoop.io.NullWritable",
17 | #   valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
18 | #   conf={ "es.resource" : "agile_data_science/airplanes" })
19 | 
--------------------------------------------------------------------------------
/ch06/analyze_airplanes.py:
--------------------------------------------------------------------------------
1 | airplanes = spark.read.json('data/airplanes.json')
2 | 
3 | #
4 | # Who makes the airplanes in the US commercial fleet, as a %
5 | #
6 | 
7 | # How many airplanes are made by each manufacturer?
8 | airplanes.registerTempTable("airplanes")
9 | manufacturer_counts = spark.sql("""SELECT
10 |   Manufacturer,
11 |   COUNT(*) AS Total
12 | FROM
13 |   airplanes
14 | GROUP BY
15 |   Manufacturer
16 | ORDER BY
17 |   Total DESC"""
18 | )
19 | manufacturer_counts.show(10)  # show top 10
20 | 
21 | # How many airplanes total?
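# (The DataFrame API equivalent -- not used here -- would be airplanes.count();
#  the SQL below keeps the whole analysis in one idiom.)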
22 | total_airplanes = spark.sql(
23 |   """SELECT
24 |   COUNT(*) AS OverallTotal
25 |   FROM airplanes"""
26 | )
27 | print("Total airplanes: {}".format(total_airplanes.collect()[0].OverallTotal))
28 | 
29 | mfr_with_totals = manufacturer_counts.crossJoin(total_airplanes)
30 | mfr_with_totals = mfr_with_totals.rdd.map(
31 |   lambda x: {
32 |     'Manufacturer': x.Manufacturer,
33 |     'Total': x.Total,
34 |     'Percentage': round(
35 |       (
36 |         float(x.Total)/float(x.OverallTotal)
37 |       ) * 100,
38 |       2
39 |     )
40 |   }
41 | )
42 | mfr_with_totals.toDF().show()
43 | 
44 | #
45 | # Same with sub-queries
46 | #
47 | relative_manufacturer_counts = spark.sql("""SELECT
48 |   Manufacturer,
49 |   COUNT(*) AS Total,
50 |   ROUND(
51 |     100 * (
52 |       COUNT(*)/(SELECT COUNT(*) FROM airplanes)
53 |     ),
54 |     2
55 |   ) AS PercentageTotal
56 | FROM
57 |   airplanes
58 | GROUP BY
59 |   Manufacturer
60 | ORDER BY
61 |   Total DESC, Manufacturer
62 | LIMIT 10"""
63 | )
64 | relative_manufacturer_counts.show(30)  # the query is LIMITed to 10 rows, so this shows them all
65 | 
66 | #
67 | # Now get these things on the web
68 | #
69 | relative_manufacturer_counts = relative_manufacturer_counts.rdd.map(lambda row: row.asDict())
70 | grouped_manufacturer_counts = relative_manufacturer_counts.groupBy(lambda x: 1)
71 | 
72 | # Save to Mongo in the airplane_manufacturer_totals collection
73 | import pymongo_spark
74 | pymongo_spark.activate()
75 | grouped_manufacturer_counts.saveToMongoDB(
76 |   'mongodb://localhost:27017/agile_data_science.airplane_manufacturer_totals'
77 | )
78 | 
--------------------------------------------------------------------------------
/ch06/analyze_airplanes_again.py:
--------------------------------------------------------------------------------
1 | airplanes = spark.read.json('data/resolved_airplanes.json')
2 | 
3 | #
4 | # Who makes the airplanes in the US commercial fleet, as a %
5 | #
6 | 
7 | # How many airplanes are made by each manufacturer?
8 | airplanes.registerTempTable("airplanes")
9 | manufacturer_counts = spark.sql("""SELECT
10 |   Manufacturer,
11 |   COUNT(*) AS Total
12 | FROM
13 |   airplanes
14 | GROUP BY
15 |   Manufacturer
16 | ORDER BY
17 |   Total DESC"""
18 | )
19 | manufacturer_counts.show(10)  # show top 10
20 | 
21 | # How many airplanes total?
22 | total_airplanes = spark.sql(
23 |   """SELECT
24 |   COUNT(*) AS OverallTotal
25 |   FROM airplanes"""
26 | )
27 | print("Total airplanes: {}".format(total_airplanes.collect()[0].OverallTotal))
28 | 
29 | mfr_with_totals = manufacturer_counts.crossJoin(total_airplanes)
30 | mfr_with_totals = mfr_with_totals.rdd.map(
31 |   lambda x: {
32 |     'Manufacturer': x.Manufacturer,
33 |     'Total': x.Total,
34 |     'Percentage': round(
35 |       (
36 |         float(x.Total)/float(x.OverallTotal)
37 |       ) * 100,
38 |       2
39 |     )
40 |   }
41 | )
42 | mfr_with_totals.toDF().show()
43 | 
44 | #
45 | # Same with sub-queries
46 | #
47 | relative_manufacturer_counts = spark.sql("""SELECT
48 |   Manufacturer,
49 |   COUNT(*) AS Total,
50 |   ROUND(
51 |     100 * (
52 |       COUNT(*)/(SELECT COUNT(*) FROM airplanes)
53 |     ),
54 |     2
55 |   ) AS PercentageTotal
56 | FROM
57 |   airplanes
58 | GROUP BY
59 |   Manufacturer
60 | ORDER BY
61 |   Total DESC, Manufacturer
62 | LIMIT 10"""
63 | )
64 | relative_manufacturer_counts.show(30)  # the query is LIMITed to 10 rows, so this shows them all
65 | 
66 | #
67 | # Now get these things on the web
68 | #
69 | relative_manufacturer_counts_dict = relative_manufacturer_counts.rdd.map(lambda row: row.asDict())
70 | grouped_manufacturer_counts = relative_manufacturer_counts_dict.groupBy(lambda x: 1)
71 | 
72 | # Save to Mongo in the airplane_manufacturer_totals collection
73 | import pymongo_spark
74 | pymongo_spark.activate()
75 | grouped_manufacturer_counts.saveToMongoDB(
76 |   'mongodb://localhost:27017/agile_data_science.airplane_manufacturer_totals'
77 | )
78 | 
--------------------------------------------------------------------------------
/ch06/create_airplanes_index.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Create the entire agile_data_science index
4 | curl -XPUT 'http://localhost:9200/agile_data_science/'
5 | 
6 | # Create the mapping to make search results sort right
7 | curl -XPUT 'http://localhost:9200/agile_data_science/_mapping/airplane' --data @airplanes_mapping.json
8 | 
9 | # Get the mapping we just put in
10 | curl -XGET 'http://localhost:9200/agile_data_science/_mapping/airplane'
11 | 
--------------------------------------------------------------------------------
/ch06/enrich_airlines_wikipedia.py:
--------------------------------------------------------------------------------
1 | import sys, os, re
2 | sys.path.append("lib")
3 | import utils
4 | 
5 | import wikipedia
6 | from bs4 import BeautifulSoup
7 | import tldextract
8 | 
9 | # Load our airlines...
10 | our_airlines = utils.read_json_lines_file('data/our_airlines.jsonl') 11 | 12 | # Build a new list that includes wikipedia data 13 | with_url = [] 14 | for airline in our_airlines: 15 | # Get the wikipedia page for the airline name 16 | wikipage = wikipedia.page(airline['Name']) 17 | 18 | # Get the summary 19 | summary = wikipage.summary 20 | airline['summary'] = summary 21 | 22 | # Get the HTML of the page 23 | page = BeautifulSoup(wikipage.html()) 24 | 25 | # Task: get the logo from the right 'vcard' column 26 | # 1) Get the vcard table 27 | vcard_table = page.find_all('table', class_='vcard')[0] 28 | # 2) The logo is always the first image inside this table 29 | first_image = vcard_table.find_all('img')[0] 30 | # 3) Set the url to the image 31 | logo_url = 'http:' + first_image.get('src') 32 | airline['logo_url'] = logo_url 33 | 34 | # Task: Get the company website 35 | # 1) Find the 'Website' table header 36 | th = page.find_all('th', text='Website')[0] 37 | # 2) find the parent tr element 38 | tr = th.parent 39 | # 3) find the a (link) tag within the tr 40 | a = tr.find_all('a')[0] 41 | # 4) finally get the href of the a tag 42 | url = a.get('href') 43 | airline['url'] = url 44 | 45 | # Get the domain to display with the url 46 | url_parts = tldextract.extract(url) 47 | airline['domain'] = url_parts.domain + '.' + url_parts.suffix 48 | 49 | with_url.append(airline) 50 | 51 | utils.write_json_lines_file(with_url, 'data/our_airlines_with_wiki.jsonl') 52 | 53 | -------------------------------------------------------------------------------- /ch06/extract_airlines.py: -------------------------------------------------------------------------------- 1 | # Load the on-time parquet file 2 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet') 3 | 4 | # The first step is easily expressed as SQL: get all unique tail numbers for each airline 5 | on_time_dataframe.registerTempTable("on_time_performance") 6 | carrier_airplane = spark.sql( 7 | "SELECT DISTINCT Carrier, TailNum FROM on_time_performance" 8 | ) 9 | 10 | # Now we need to store a sorted group for each Carrier, along with a fleet count 11 | airplanes_per_carrier = carrier_airplane.rdd\ 12 | .map(lambda nameTuple: (nameTuple[0], [nameTuple[1]]))\ 13 | .reduceByKey(lambda a, b: a + b)\ 14 | .map(lambda tuple: 15 | { 16 | 'Carrier': tuple[0], 17 | 'TailNumbers': sorted( 18 | filter( 19 | lambda x: x is not None and x != '', tuple[1] # empty string tail numbers were getting through 20 | ) 21 | ), 22 | 'FleetCount': len(tuple[1]) 23 | } 24 | ) 25 | airplanes_per_carrier.count() # 14 26 | 27 | # Save to Mongo in the airplanes_per_carrier relation 28 | import pymongo_spark 29 | pymongo_spark.activate() 30 | airplanes_per_carrier.saveToMongoDB( 31 | 'mongodb://localhost:27017/agile_data_science.airplanes_per_carrier' 32 | ) 33 | -------------------------------------------------------------------------------- /ch06/extract_airports.py: -------------------------------------------------------------------------------- 1 | # Load the on-time parquet file 2 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet') 3 | 4 | # The first step is easily expressed as SQL: get all unique tail numbers for each airline 5 | on_time_dataframe.registerTempTable("on_time_performance") 6 | carrier_airplane = spark.sql( 7 | "SELECT DISTINCT Carrier, TailNum FROM on_time_performance" 8 | ) 9 | 10 | -------------------------------------------------------------------------------- /ch06/images/ads2_0601.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch06/images/ads2_0601.png -------------------------------------------------------------------------------- /ch06/images/ads2_0602.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch06/images/ads2_0602.png -------------------------------------------------------------------------------- /ch06/images/ads2_0603.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch06/images/ads2_0603.png -------------------------------------------------------------------------------- /ch06/images/ads2_0604.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch06/images/ads2_0604.png -------------------------------------------------------------------------------- /ch06/images/ads2_0605.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch06/images/ads2_0605.png -------------------------------------------------------------------------------- /ch06/images/ads2_0606.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch06/images/ads2_0606.png -------------------------------------------------------------------------------- /ch06/images/ads2_0607.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch06/images/ads2_0607.png -------------------------------------------------------------------------------- /ch06/images/ads2_0608.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch06/images/ads2_0608.png -------------------------------------------------------------------------------- /ch06/images/ads2_0609.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch06/images/ads2_0609.png -------------------------------------------------------------------------------- /ch06/import_airlines.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Import our enriched airline data as the 'airlines' collection 4 | mongoimport -d agile_data_science -c airlines --file data/our_airlines_with_wiki.jsonl 5 | -------------------------------------------------------------------------------- /ch06/prepare_airplanes.py: -------------------------------------------------------------------------------- 1 | # Load the FAA N-Number Inquiry Records 2 | faa_tail_number_inquiry = spark.read.json('data/faa_tail_number_inquiry.jsonl') 3 | faa_tail_number_inquiry.show() 4 | 5 | # Count the records 6 | faa_tail_number_inquiry.count() 7 | 8 | # Load our 
unique tail numbers 9 | unique_tail_numbers = spark.read.json('data/tail_numbers.jsonl') 10 | unique_tail_numbers.show() 11 | 12 | # join tail numbers to our inquries 13 | tail_num_plus_inquiry = unique_tail_numbers.join( 14 | faa_tail_number_inquiry, 15 | unique_tail_numbers.TailNum == faa_tail_number_inquiry.TailNum, 16 | ) 17 | tail_num_plus_inquiry = tail_num_plus_inquiry.drop(unique_tail_numbers.TailNum) 18 | tail_num_plus_inquiry.show() 19 | 20 | # Dump extra field and store tail_numbers plus inquiry 21 | tail_num_plus_inquiry.registerTempTable("tail_num_plus_inquiry") 22 | airplanes = spark.sql("""SELECT 23 | TailNum AS TailNum, 24 | engine_manufacturer AS EngineManufacturer, 25 | engine_model AS EngineModel, 26 | manufacturer AS Manufacturer, 27 | mfr_year AS ManufacturerYear, 28 | model AS Model, 29 | owner AS Owner, 30 | owner_state AS OwnerState, 31 | serial_number AS SerialNumber 32 | FROM 33 | tail_num_plus_inquiry""") 34 | 35 | airplanes.repartition(1).write.mode("overwrite").json('data/airplanes.json') 36 | -------------------------------------------------------------------------------- /ch06/scrape_faa.py: -------------------------------------------------------------------------------- 1 | import sys, os, re 2 | import time 3 | 4 | sys.path.append("lib") 5 | import utils 6 | 7 | import requests 8 | from bs4 import BeautifulSoup 9 | 10 | tail_number_records = utils.read_json_lines_file('data/tail_numbers.jsonl') 11 | 12 | aircraft_records = [] 13 | # Loop through the tail numbers, fetching 14 | for tail_number_record in tail_number_records: 15 | time.sleep(0.1) # essential to sleep FIRST in loop or you will flood sites 16 | 17 | # Parameterize the URL with the tail number 18 | BASE_URL = 'http://registry.faa.gov/aircraftinquiry/NNum_Results.aspx?NNumbertxt={}' 19 | tail_number = tail_number_record['TailNum'] 20 | url = BASE_URL.format(tail_number) 21 | 22 | # Fetch the page, parse the HTML 23 | r = requests.get(url) 24 | 25 | html = r.text 26 | soup = BeautifulSoup(html) 27 | 28 | # The table structure is constant for all pages that contain data 29 | try: 30 | aircraft_description = soup.find_all('table')[4] 31 | craft_tds = aircraft_description.find_all('td') 32 | serial_number = craft_tds[1].text.strip() 33 | manufacturer = craft_tds[5].text.strip() 34 | model = craft_tds[9].text.strip() 35 | mfr_year = craft_tds[25].text.strip() 36 | 37 | registered_owner = soup.find_all('table')[5] 38 | reg_tds = registered_owner.find_all('td') 39 | owner = reg_tds[1].text.strip() 40 | owner_state = reg_tds[9].text.strip() 41 | 42 | airworthiness = soup.find_all('table')[6] 43 | worthy_tds = airworthiness.find_all('td') 44 | engine_manufacturer = worthy_tds[1].text.strip() 45 | engine_model = worthy_tds[5].text.strip() 46 | 47 | aircraft_record = { 48 | 'TailNum': tail_number, 49 | 'serial_number': serial_number, 50 | 'manufacturer': manufacturer, 51 | 'model': model, 52 | 'mfr_year': mfr_year, 53 | 'owner': owner, 54 | 'owner_state': owner_state, 55 | 'engine_manufacturer': engine_manufacturer, 56 | 'engine_model': engine_model, 57 | } 58 | aircraft_records.append( 59 | aircraft_record 60 | ) 61 | print(aircraft_record) 62 | 63 | except IndexError as e: 64 | print("Missing {} record: {}".format(tail_number, e)) 65 | 66 | utils.write_json_lines_file( 67 | aircraft_records, 'data/faa_tail_number_inquiry.jsonl' 68 | ) 69 | -------------------------------------------------------------------------------- /ch06/test_elastic_airplanes.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -XGET 'localhost:9200/agile_data_science/airplanes/_search?q=*' 4 | -------------------------------------------------------------------------------- /ch06/web/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch06/web/__init__.py -------------------------------------------------------------------------------- /ch06/web/config.py: -------------------------------------------------------------------------------- 1 | # config.py, a configuration file for index.py 2 | RECORDS_PER_PAGE = 15 3 | AIRPLANE_RECORDS_PER_PAGE = 5 4 | ELASTIC_URL = "http://elastic:9200" 5 | -------------------------------------------------------------------------------- /ch06/web/search_helpers.py: -------------------------------------------------------------------------------- 1 | import sys, os, re 2 | 3 | # Process elasticsearch hits and return flights records 4 | def process_search(results): 5 | records = [] 6 | total = 0 7 | if results['hits'] and results['hits']['hits']: 8 | total = results['hits']['total'] 9 | hits = results['hits']['hits'] 10 | for hit in hits: 11 | record = hit['_source'] 12 | records.append(record) 13 | return records, total 14 | 15 | # Calculate offsets for fetching lists of flights from MongoDB 16 | def get_navigation_offsets(offset1, offset2, increment): 17 | offsets = {} 18 | offsets['Next'] = {'top_offset': offset2 + increment, 'bottom_offset': 19 | offset1 + increment} 20 | offsets['Previous'] = {'top_offset': max(offset2 - increment, 0), 21 | 'bottom_offset': max(offset1 - increment, 0)} # Don't go < 0 22 | return offsets 23 | 24 | # Strip the existing start and end parameters from the query string 25 | def strip_place(url): 26 | try: 27 | p = re.match('(.+)\?start=.+&end=.+', url).group(1) 28 | except AttributeError as e: 29 | return url 30 | return p 31 | -------------------------------------------------------------------------------- /ch06/web/static/airplanes.js: -------------------------------------------------------------------------------- 1 | var margin = {top: 20, right: 30, bottom: 30, left: 40}, 2 | width = 900 - margin.left - margin.right, 3 | height = 300 - margin.top - margin.bottom; 4 | 5 | var x = d3.scale.ordinal() 6 | .rangeRoundBands([0, width], .1); 7 | var y = d3.scale.linear() 8 | .range([height, 0]); 9 | 10 | var xAxis = d3.svg.axis() 11 | .scale(x) 12 | .orient("bottom") 13 | .tickFormat(function(d) { 14 | return truncate(d, 14); 15 | }); 16 | var yAxis = d3.svg.axis() 17 | .scale(y) 18 | .orient("left"); 19 | 20 | var chart = d3.select(".chart") 21 | .attr("width", width + margin.left + margin.right) 22 | .attr("height", height + margin.top + margin.bottom) 23 | .append("g") 24 | .attr("transform", "translate(" + margin.left + "," + margin.top + ")"); 25 | 26 | d3.json("/airplanes/chart/manufacturers.json", function(error, data) { 27 | var data = data.data; 28 | 29 | x.domain(data.map(function(d) { return d.Manufacturer; })); 30 | y.domain([0, d3.max(data, function(d) { return d.Total; })]); 31 | 32 | chart.append("g") 33 | .attr("class", "x axis") 34 | .attr("transform", "translate(0," + height + ")") 35 | .call(xAxis); 36 | 37 | chart.append("g") 38 | .attr("class", "y axis") 39 | .call(yAxis); 40 | 41 | chart.selectAll(".bar") 42 | .data(data) 43 | .enter().append("rect") 44 | .attr("class", "bar") 45 | 
.attr("x", function(d) { return x(d.Manufacturer); }) 46 | .attr("y", function(d) { return y(d.Total); }) 47 | .attr("height", function(d) { return height - y(d.Total); }) 48 | .attr("width", x.rangeBand()); 49 | }); 50 | 51 | function truncate(d, l) { 52 | if(d.length > l) 53 | return d.substring(0,l)+'...'; 54 | else 55 | return d; 56 | } 57 | -------------------------------------------------------------------------------- /ch06/web/static/app.js: -------------------------------------------------------------------------------- 1 | var width = 960, 2 | height = 500; 3 | 4 | var y = d3.scale.linear() 5 | .range([height, 0]); 6 | // We define the domain once we get our data in d3.json, below 7 | 8 | var chart = d3.select(".chart") 9 | .attr("width", width) 10 | .attr("height", height); 11 | 12 | d3.json("/total_flights.json", function(data) { 13 | y.domain([0, d3.max(data, function(d) { return d.total_flights; })]); 14 | 15 | var barWidth = width / data.length; 16 | 17 | var bar = chart.selectAll("g") 18 | .data(data) 19 | .enter() 20 | .append("g") 21 | .attr("transform", function(d, i) { return "translate(" + i * barWidth + ",0)"; }); 22 | 23 | bar.append("rect") 24 | .attr("y", function(d) { return y(d.total_flights); }) 25 | .attr("height", function(d) { return height - y(d.total_flights); }) 26 | .attr("width", barWidth - 1); 27 | 28 | bar.append("text") 29 | .attr("x", barWidth / 2) 30 | .attr("y", function(d) { return y(d.total_flights) + 3; }) 31 | .attr("dy", ".75em") 32 | .text(function(d) { return d.total_flights; }); 33 | }); -------------------------------------------------------------------------------- /ch06/web/templates/airlines.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Airlines / {{carrier_code}} 5 | 6 | 7 |

Airline {{ carrier_code }}

8 | 9 |

Fleet: {{airline_airplanes.FleetCount}} Planes

10 | 17 | 18 |

Airports: {{airline_airports.length}}

19 | 26 | {% endblock %} -------------------------------------------------------------------------------- /ch06/web/templates/airlines2.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Airlines / {{carrier_code}} 5 | 6 | 7 | 8 | 9 |

10 | 11 | {{airline_summary.Name}} 12 | / {{airline_summary.domain}} 13 |

14 | 15 | 16 |

{{airline_summary.summary}}

17 | 18 | 19 |

Airports: {{airline_airports.length}}

20 | 27 | 28 | 29 |

Fleet: {{airline_airplanes.FleetCount}} Planes

30 | 37 | 38 | {% endblock %} -------------------------------------------------------------------------------- /ch06/web/templates/airport.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Airport / {{airport_code}} 5 | 6 | 7 |

{{airport_metadata.City}} / {{airport_metadata.Name}} / {{ airport_code }}

8 | 9 |

Coordinates: {{airport_metadata.Latitude}} lat {{airport_metadata.Longitude}} lon

10 |

Timezone: {{airport_metadata.TZ_DB}} / {{airport_metadata.Timezone}} from GMT

11 | 12 |

Carriers: {{carriers_per_airport.Carriers.length}}

13 | 20 | 21 | {% endblock %} -------------------------------------------------------------------------------- /ch06/web/templates/all_airlines.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Airlines 5 | 6 |

US Domestic Airlines

7 | 14 | {% endblock %} 15 | -------------------------------------------------------------------------------- /ch06/web/templates/all_airplanes.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Airplanes 5 | 6 |

7 | 8 | US Commercial Fleet 9 |

10 | 11 | 12 |
13 |

Airplanes by Manufacturer

14 |
15 |
16 | 17 | 18 | 19 |
20 | {% for item in search_config %} 21 | {% if 'label' in item %} 22 | 23 | {% else %} 24 | 25 | {% endif %} 26 | 27 | {% endfor %} 28 | 29 |
30 | 31 | 32 | 33 | 34 | {% for item in search_config %} 35 | {% if 'label' in item %} 36 | 37 | {% else %} 38 | 39 | {% endif %} 40 | {% endfor %} 41 | 42 | 43 | 44 | 45 | {% for airplane in airplanes %} 46 | 47 | {% for item in search_config %} 48 | 49 | {% endfor %} 50 | 51 | {% endfor %} 52 | 53 |
{{item['label']}}{{item['field']}}
{{airplane[item['field']]}}
54 | 55 | 56 | 72 | 73 | {% import "macros.jnj" as common %} 74 | {% if nav_offsets and nav_path -%} 75 | {{ common.display_nav(nav_offsets, nav_path, airplane_count)|safe }} 76 | {% endif -%} 77 | {% endblock %} 78 | -------------------------------------------------------------------------------- /ch06/web/templates/flight.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | 5 | / Flights / {{flight.TailNum}} 6 | 7 |

Flight {{flight.FlightNum}}

8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 |
AirlineOriginDestinationTail NumberDateAir TimeDistance
{{flight.Carrier}}{{flight.Origin}}{{flight.Dest}}{{flight.TailNum}}{{flight.FlightDate}}{{flight.AirTime}}{{flight.Distance}}
30 |
31 | {% endblock %} 32 | -------------------------------------------------------------------------------- /ch06/web/templates/flights.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | 5 | / Flights 6 | 7 |

{{flight_count}} Flights on {{flight_date}}

8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | {% for flight in flights %} 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | {% endfor %} 32 | 33 |
AirlineFlight NumberOriginDestinationDeparture TimeTail NumberAir TimeDistance
{{flight.Carrier}}{{flight.FlightNum}}{{flight.Origin}}{{flight.Dest}}{{flight.DepTime}}{{flight.TailNum}}{{flight.AirTime}}{{flight.Distance}}
34 |
35 | {% endblock %} 36 | -------------------------------------------------------------------------------- /ch06/web/templates/flights_per_airplane.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | 5 | / Airplanes / {{tail_number}} 6 | 7 |

Flights by Tail Number {{tail_number}}

8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | {% for flight in flights['Flights'] %} 18 | 19 | 20 | 21 | 24 | 25 | 26 | 27 | {% endfor %} 28 | 29 |
CarrierDateFlight NumberOriginDestination
{{flight['Carrier']}}{{flight['FlightDate']}}{{flight['FlightNum']}} 23 | {{flight['Origin']}}{{flight['Dest']}}
30 |
31 | {% endblock %} -------------------------------------------------------------------------------- /ch06/web/templates/layout.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Agile Data Science 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
18 | 19 | 20 |
21 | 24 | {% block body %}{% endblock %} 25 |
26 | 27 |
28 |
29 | 30 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /ch06/web/templates/macros.jnj: -------------------------------------------------------------------------------- 1 | 2 | {% macro display_nav(offsets, path, count) -%} 3 |
4 | {% for key, values in offsets.items() -%} 5 | {%- if values['bottom_offset'] >= 0 and values['top_offset'] > 0 and count > values['bottom_offset'] -%} 6 | {{ key }} 9 | {% else -%} 10 | {{ key }} 11 | {% endif %} 12 | {% endfor -%} 13 |
14 | {% endmacro -%} 15 | -------------------------------------------------------------------------------- /ch06/web/templates/total_flights.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 |

Total Flights by Month

5 | 6 | 7 | 8 | 9 | 10 | 11 | {% for month in total_flights %} 12 | 13 | 14 | 15 | 16 | {% endfor %} 17 | 18 |
MonthTotal Flights
{{month.Month}}{{month.total_flights}}
19 |
20 | {% endblock %} -------------------------------------------------------------------------------- /ch06/web/templates/total_flights_chart.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 16 | 17 |
18 |

Total Flights by Month

19 |
20 |
21 | 22 | 25 | {% endblock %} -------------------------------------------------------------------------------- /ch07/images/ads2_0701.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch07/images/ads2_0701.png -------------------------------------------------------------------------------- /ch07/images/ads2_0702.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch07/images/ads2_0702.png -------------------------------------------------------------------------------- /ch08/download_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Get the absolute path of this script, see http://bit.ly/find_path 4 | ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")" 5 | ABSOLUTE_DIR=$(dirname "${ABSOLUTE_PATH}") 6 | 7 | # Download the flight delay features to Agile_Data_Code_2/data/, wherever we are executed from 8 | cd $ABSOLUTE_DIR/../data/ 9 | curl -Lko ./simple_flight_delay_features.jsonl.bz2 http://s3.amazonaws.com/agile_data_science/simple_flight_delay_features.jsonl.bz2 10 | 11 | # Get the distances between pairs of airports 12 | curl -Lko ./origin_dest_distances.jsonl http://s3.amazonaws.com/agile_data_science/origin_dest_distances.jsonl 13 | 14 | # Get the models to make ch08/web/predict_flask.py go 15 | cd $ABSOLUTE_DIR/.. 16 | mkdir -p models 17 | curl -Lko ./models/sklearn_vectorizer.pkl http://s3.amazonaws.com/agile_data_science/sklearn_vectorizer.pkl 18 | curl -Lko ./models/sklearn_regressor.pkl http://s3.amazonaws.com/agile_data_science/sklearn_regressor.pkl 19 | -------------------------------------------------------------------------------- /ch08/fetch_prediction_requests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys, os, re 4 | import json 5 | import datetime, iso8601 6 | 7 | # Save to Mongo 8 | from bson import json_util 9 | import pymongo_spark 10 | pymongo_spark.activate() 11 | 12 | # Pass date and base path to main() from airflow 13 | def main(iso_date, base_path): 14 | 15 | APP_NAME = "fetch_prediction_requests.py" 16 | 17 | # If there is no SparkSession, create the environment 18 | try: 19 | sc and spark 20 | except NameError as e: 21 | import findspark 22 | findspark.init() 23 | import pyspark 24 | import pyspark.sql 25 | 26 | sc = pyspark.SparkContext() 27 | spark = pyspark.sql.SparkSession.builder.appName(APP_NAME).getOrCreate() 28 | 29 | # Get today and tomorrow's dates as iso strings to scope query 30 | today_dt = iso8601.parse_date(iso_date) 31 | rounded_today = today_dt.date() 32 | iso_today = rounded_today.isoformat() 33 | rounded_tomorrow_dt = rounded_today + datetime.timedelta(days=1) 34 | iso_tomorrow = rounded_tomorrow_dt.isoformat() 35 | 36 | # Create mongo query string for today's data 37 | mongo_query_string = """{{ 38 | "Timestamp": {{ 39 | "$gte": "{iso_today}", 40 | "$lte": "{iso_tomorrow}" 41 | }} 42 | }}""".format( 43 | iso_today=iso_today, 44 | iso_tomorrow=iso_tomorrow 45 | ) 46 | mongo_query_string = mongo_query_string.replace('\n', '') 47 | 48 | # Create the config object with the query string 49 | mongo_query_config = dict() 50 | mongo_query_config["mongo.input.query"] = mongo_query_string 51
| 52 | # Load the day's requests using pymongo_spark 53 | prediction_requests = sc.mongoRDD( 54 | 'mongodb://localhost:27017/agile_data_science.prediction_tasks', 55 | config=mongo_query_config 56 | ) 57 | 58 | # Build the day's output path: a date-based primary key directory structure 59 | today_output_path = "{}/data/prediction_tasks_daily.json/{}".format( 60 | base_path, 61 | iso_today 62 | ) 63 | 64 | # Generate json records 65 | prediction_requests_json = prediction_requests.map(json_util.dumps) 66 | 67 | # Write/replace today's output path 68 | os.system("rm -rf {}".format(today_output_path)) 69 | prediction_requests_json.saveAsTextFile(today_output_path) 70 | 71 | if __name__ == "__main__": 72 | main(sys.argv[1], sys.argv[2]) 73 | -------------------------------------------------------------------------------- /ch08/images/ads2_0807.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch08/images/ads2_0807.png -------------------------------------------------------------------------------- /ch08/images/ads2_0808.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch08/images/ads2_0808.png -------------------------------------------------------------------------------- /ch08/images/ads2_0809.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch08/images/ads2_0809.png -------------------------------------------------------------------------------- /ch08/images/ads2_0810.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch08/images/ads2_0810.png -------------------------------------------------------------------------------- /ch08/images/ads2_0811.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch08/images/ads2_0811.png -------------------------------------------------------------------------------- /ch08/import_distances.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Import the airport-pair distances as the 'origin_dest_distances' collection and index them by route 4 | mongoimport -d agile_data_science -c origin_dest_distances --file data/origin_dest_distances.jsonl 5 | mongo agile_data_science --eval 'db.origin_dest_distances.createIndex({Origin: 1, Dest: 1})' 6 | -------------------------------------------------------------------------------- /ch08/kafka_test.py: -------------------------------------------------------------------------------- 1 | from kafka import KafkaProducer 2 | producer = KafkaProducer() 3 | 4 | producer.send( 5 | 'flight_delay_classification_request', 6 | '{"Hello": "World!"}'.encode() 7 | ) 8 | 9 | # Block until the asynchronous send completes, or the script may exit first 10 | producer.flush() 11 | -------------------------------------------------------------------------------- /ch08/links.txt: -------------------------------------------------------------------------------- 1 | https://davidwalsh.name/curl-post-file 2 | http://localhost:5000/on_time_performance?Carrier=AA&FlightDate=2015-01-01&FlightNum=1519 3 | http://blog.luisrei.com/articles/flaskrest.html 4 |
http://stackoverflow.com/questions/7172784/how-to-post-json-data-with-curl-from-terminal-commandline-to-test-spring-rest 5 | https://airflow.incubator.apache.org/cli.html 6 | https://en.wikipedia.org/wiki/Scientific_method 7 | http://www.slideshare.net/xamat/agile-science 8 | https://www.tutorialspoint.com/python/dictionary_update.htm 9 | https://docs.python.org/3/library/string.html#formatstrings 10 | http://stackoverflow.com/questions/2943222/find-objects-between-two-dates-mongodb 11 | https://spark.apache.org/docs/2.0.2/api/java/org/apache/spark/sql/SparkSession.html 12 | http://stackoverflow.com/questions/3908156/grep-output-to-show-only-matching-file 13 | https://github.com/mongodb/mongo-hadoop/tree/master/spark/src/main/python 14 | http://stackoverflow.com/questions/2943222/find-objects-between-two-dates-mongodb 15 | http://stackoverflow.com/questions/19819870/date-query-with-isodate-in-mongodb-doesnt-seem-to-work 16 | http://stackoverflow.com/questions/27523337/how-to-query-to-mongo-using-spark 17 | https://github.com/mongodb/mongo-hadoop/blob/master/spark/src/main/python/pymongo_spark.py 18 | -------------------------------------------------------------------------------- /ch08/load_prediction_results.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys, os, re 4 | import json 5 | import datetime, iso8601 6 | 7 | # Save to Mongo 8 | from bson import json_util 9 | import pymongo_spark 10 | pymongo_spark.activate() 11 | 12 | # Pass date and base path to main() from airflow 13 | def main(iso_date, base_path): 14 | 15 | APP_NAME = "load_prediction_results.py" 16 | 17 | # If there is no SparkSession, create the environment 18 | try: 19 | sc and spark 20 | except NameError as e: 21 | import findspark 22 | findspark.init() 23 | import pyspark 24 | import pyspark.sql 25 | 26 | sc = pyspark.SparkContext() 27 | spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate() 28 | 29 | # Get today and tomorrow's dates as iso strings to scope query 30 | today_dt = iso8601.parse_date(iso_date) 31 | rounded_today = today_dt.date() 32 | iso_today = rounded_today.isoformat() 33 | 34 | input_path = "{}/data/prediction_results_daily.json/{}".format( 35 | base_path, 36 | iso_today 37 | ) 38 | 39 | # Load and JSONize text 40 | prediction_results_raw = sc.textFile(input_path) 41 | prediction_results = prediction_results_raw.map(json_util.loads) 42 | 43 | # Store to MongoDB 44 | prediction_results.saveToMongoDB( 45 | "mongodb://localhost:27017/agile_data_science.prediction_results" 46 | ) 47 | 48 | if __name__ == "__main__": 49 | main(sys.argv[1], sys.argv[2]) 50 | -------------------------------------------------------------------------------- /ch08/origin_dest_distances.py: -------------------------------------------------------------------------------- 1 | import sys, os, re 2 | import json 3 | 4 | # Load the on-time parquet file 5 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet') 6 | on_time_dataframe.registerTempTable("on_time_performance") 7 | 8 | origin_dest_distances = spark.sql(""" 9 | SELECT Origin, Dest, AVG(Distance) AS Distance 10 | FROM on_time_performance 11 | GROUP BY Origin, Dest 12 | ORDER BY Distance 13 | """) 14 | origin_dest_distances.repartition(1).write.mode("overwrite").json("data/origin_dest_distances.json") 15 | os.system("cp data/origin_dest_distances.json/part* data/origin_dest_distances.jsonl") 16 | 
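The origin_dest_distances.jsonl export written above is what import_distances.sh loads into MongoDB and what predict_utils.get_flight_distance() later queries by airport pair. As a minimal sketch (not a repository file; the relative path and the sample SFO/ATL pair are illustrative assumptions), the same lookup can be read directly in Python to inspect the record shape:

import json

# Build an (Origin, Dest) -> average Distance lookup from the JSON Lines export
distances = {}
with open('data/origin_dest_distances.jsonl') as f:
    for line in f:
        record = json.loads(line)
        distances[(record['Origin'], record['Dest'])] = record['Distance']

# The same lookup get_flight_distance() performs against MongoDB, if the pair exists
print(distances.get(('SFO', 'ATL')))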
-------------------------------------------------------------------------------- /ch08/python_kafka_consumer.py: -------------------------------------------------------------------------------- 1 | import sys, os, re 2 | import json 3 | 4 | from kafka import KafkaConsumer, TopicPartition 5 | consumer = KafkaConsumer() 6 | consumer.assign([TopicPartition('flight_delay_classification_request', 0)]) 7 | consumer.seek_to_beginning() 8 | 9 | for message in consumer: 10 | message_bytes = message.value 11 | message_string = message_bytes.decode() 12 | message_object = json.loads(message_string) 13 | print(message_object) 14 | -------------------------------------------------------------------------------- /ch08/python_kafka_producer.py: -------------------------------------------------------------------------------- 1 | from kafka import KafkaProducer 2 | producer = KafkaProducer() 3 | 4 | producer.send( 5 | 'flight_delay_classification_request', 6 | '{"Hello": "Producer!"}'.encode() 7 | ) 8 | 9 | # Block until the asynchronous send completes, or the script may exit first 10 | producer.flush() 11 | -------------------------------------------------------------------------------- /ch08/streaming_test.py: -------------------------------------------------------------------------------- 1 | import sys, os, re 2 | import json 3 | 4 | from pyspark import SparkContext, SparkConf 5 | from pyspark.streaming import StreamingContext 6 | from pyspark.streaming.kafka import KafkaUtils, OffsetRange, TopicAndPartition 7 | 8 | # Process data every 10 seconds 9 | PERIOD=10 10 | BROKERS='localhost:9092' 11 | TOPIC='flight_delay_classification_request' 12 | 13 | conf = SparkConf().set("spark.default.parallelism", 1) 14 | sc = SparkContext(appName = "Agile Data Science: PySpark Streaming 'Hello, World!'", conf=conf) 15 | ssc = StreamingContext(sc, PERIOD) 16 | 17 | stream = KafkaUtils.createDirectStream( 18 | ssc, 19 | [TOPIC], 20 | { 21 | "metadata.broker.list": BROKERS, 22 | "group.id": "0", 23 | } 24 | ) 25 | 26 | # Parse the JSON message and print the resulting object 27 | object_stream = stream.map(lambda x: json.loads(x[1])) 28 | object_stream.pprint() 29 | 30 | ssc.start() 31 | ssc.awaitTermination() # Run until interrupted 32 | -------------------------------------------------------------------------------- /ch08/test_airflow.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Compute today's date: 4 | export ISO_DATE=`date "+%Y-%m-%d"` 5 | 6 | # List DAGs 7 | airflow list_dags 8 | 9 | # List tasks in each DAG 10 | airflow list_tasks agile_data_science_batch_prediction_model_training 11 | airflow list_tasks agile_data_science_batch_predictions_daily 12 | 13 | # Test each task in each DAG 14 | airflow test agile_data_science_batch_prediction_model_training pyspark_extract_features $ISO_DATE 15 | airflow test agile_data_science_batch_prediction_model_training pyspark_train_classifier_model $ISO_DATE 16 | 17 | airflow test agile_data_science_batch_predictions_daily pyspark_fetch_prediction_requests $ISO_DATE 18 | airflow test agile_data_science_batch_predictions_daily pyspark_make_predictions $ISO_DATE 19 | airflow test agile_data_science_batch_predictions_daily pyspark_load_prediction_results $ISO_DATE 20 | 21 | # Test the training and persistence of the models 22 | airflow backfill -s $ISO_DATE -e $ISO_DATE agile_data_science_batch_prediction_model_training 23 | 24 | # Test the daily operation of the model 25 | airflow backfill -s $ISO_DATE -e $ISO_DATE agile_data_science_batch_predictions_daily 26 | --------------------------------------------------------------------------------
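Each `airflow test` call in test_airflow.sh exercises one PySpark task script end to end, and each of those scripts exposes main(iso_date, base_path) driven by sys.argv. A hedged sketch of the same smoke test without the scheduler, assuming it runs from the project root with the ch08 dependencies importable (the module import style and base_path value are assumptions for illustration):

import datetime

import fetch_prediction_requests  # ch08/fetch_prediction_requests.py

# Mirrors ISO_DATE in test_airflow.sh
iso_date = datetime.date.today().isoformat()

# Runs the same code path as:
#   airflow test agile_data_science_batch_predictions_daily \
#     pyspark_fetch_prediction_requests $ISO_DATE
fetch_prediction_requests.main(iso_date, base_path=".")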
/ch08/test_classification_api.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Fetch the delay prediction for a hypothetical flight 4 | curl -XPOST 'http://localhost:5000/flights/delays/predict/classify' \ 5 | -F 'DepDelay=5.0' \ 6 | -F 'Carrier=AA' \ 7 | -F 'FlightDate=2016-12-23' \ 8 | -F 'Dest=ATL' \ 9 | -F 'FlightNum=1519' \ 10 | -F 'Origin=SFO' \ 11 | | json_pp 12 | -------------------------------------------------------------------------------- /ch08/test_regression_api.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Fetch the delay prediction for a hypothetical flight 4 | curl -XPOST 'http://localhost:5000/flights/delays/predict/regress' \ 5 | -F 'DepDelay=5.0' \ 6 | -F 'Carrier=AA' \ 7 | -F 'Date=2016-12-23' \ 8 | -F 'Dest=ATL' \ 9 | -F 'FlightNum=1519' \ 10 | -F 'Origin=SFO' \ 11 | | json_pp 12 | -------------------------------------------------------------------------------- /ch08/web/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch08/web/__init__.py -------------------------------------------------------------------------------- /ch08/web/config.py: -------------------------------------------------------------------------------- 1 | # config.py, a configuration file for index.py 2 | RECORDS_PER_PAGE = 15 3 | AIRPLANE_RECORDS_PER_PAGE = 5 4 | ELASTIC_URL = "http://elastic:9200" 5 | -------------------------------------------------------------------------------- /ch08/web/predict_utils.py: -------------------------------------------------------------------------------- 1 | import sys, os, re 2 | import pymongo 3 | import datetime, iso8601 4 | 5 | 6 | def process_search(results): 7 | """Process elasticsearch hits and return flights records""" 8 | records = [] 9 | total = 0 10 | if results["hits"] and results["hits"]["hits"]: 11 | total = results["hits"]["total"] 12 | hits = results["hits"]["hits"] 13 | for hit in hits: 14 | record = hit["_source"] 15 | records.append(record) 16 | return records, total 17 | 18 | 19 | def get_navigation_offsets(offset1, offset2, increment): 20 | """Calculate offsets for fetching lists of flights from MongoDB""" 21 | offsets = {} 22 | offsets["Next"] = { 23 | "top_offset": offset2 + increment, 24 | "bottom_offset": offset1 + increment, 25 | } 26 | offsets["Previous"] = { 27 | "top_offset": max(offset2 - increment, 0), 28 | "bottom_offset": max(offset1 - increment, 0), 29 | } # Don't go < 0 30 | return offsets 31 | 32 | 33 | def strip_place(url): 34 | """Strip the existing start and end parameters from the query string""" 35 | try: 36 | p = re.match("(.+)\?start=.+&end=.+", url).group(1) 37 | except AttributeError as e: 38 | return url 39 | return p 40 | 41 | 42 | def get_flight_distance(client, origin, dest): 43 | """Get the distance between a pair of airport codes""" 44 | query = { 45 | "Origin": origin, 46 | "Dest": dest, 47 | } 48 | record = client.agile_data_science.origin_dest_distances.find_one(query) 49 | return record["Distance"] 50 | 51 | 52 | def get_regression_date_args(iso_date): 53 | """Given an ISO Date, return the day of year, day of month, day of week as the API expects them.""" 54 | dt = iso8601.parse_date(iso_date) 55 | day_of_year = dt.timetuple().tm_yday 56 | day_of_month = dt.day 57 | day_of_week = dt.weekday() 58 | return { 59 | "DayOfYear": 
day_of_year, 60 | "DayOfMonth": day_of_month, 61 | "DayOfWeek": day_of_week, 62 | } 63 | 64 | 65 | def get_current_timestamp(): 66 | iso_now = datetime.datetime.now().isoformat() 67 | return iso_now 68 | -------------------------------------------------------------------------------- /ch08/web/static/airplanes.js: -------------------------------------------------------------------------------- 1 | var margin = {top: 20, right: 30, bottom: 30, left: 40}, 2 | width = 900 - margin.left - margin.right, 3 | height = 300 - margin.top - margin.bottom; 4 | 5 | var x = d3.scale.ordinal() 6 | .rangeRoundBands([0, width], .1); 7 | var y = d3.scale.linear() 8 | .range([height, 0]); 9 | 10 | var xAxis = d3.svg.axis() 11 | .scale(x) 12 | .orient("bottom") 13 | .tickFormat(function(d) { 14 | return truncate(d, 14); 15 | }); 16 | var yAxis = d3.svg.axis() 17 | .scale(y) 18 | .orient("left"); 19 | 20 | var chart = d3.select(".chart") 21 | .attr("width", width + margin.left + margin.right) 22 | .attr("height", height + margin.top + margin.bottom) 23 | .append("g") 24 | .attr("transform", "translate(" + margin.left + "," + margin.top + ")"); 25 | 26 | d3.json("/airplanes/chart/manufacturers.json", function(error, data) { 27 | var data = data.data; 28 | 29 | x.domain(data.map(function(d) { return d.Manufacturer; })); 30 | y.domain([0, d3.max(data, function(d) { return d.Total; })]); 31 | 32 | chart.append("g") 33 | .attr("class", "x axis") 34 | .attr("transform", "translate(0," + height + ")") 35 | .call(xAxis); 36 | 37 | chart.append("g") 38 | .attr("class", "y axis") 39 | .call(yAxis); 40 | 41 | chart.selectAll(".bar") 42 | .data(data) 43 | .enter().append("rect") 44 | .attr("class", "bar") 45 | .attr("x", function(d) { return x(d.Manufacturer); }) 46 | .attr("y", function(d) { return y(d.Total); }) 47 | .attr("height", function(d) { return height - y(d.Total); }) 48 | .attr("width", x.rangeBand()); 49 | }); 50 | 51 | function truncate(d, l) { 52 | if(d.length > l) 53 | return d.substring(0,l)+'...'; 54 | else 55 | return d; 56 | } 57 | -------------------------------------------------------------------------------- /ch08/web/static/app.js: -------------------------------------------------------------------------------- 1 | var width = 960, 2 | height = 500; 3 | 4 | var y = d3.scale.linear() 5 | .range([height, 0]); 6 | // We define the domain once we get our data in d3.json, below 7 | 8 | var chart = d3.select(".chart") 9 | .attr("width", width) 10 | .attr("height", height); 11 | 12 | d3.json("/total_flights.json", function(data) { 13 | y.domain([0, d3.max(data, function(d) { return d.total_flights; })]); 14 | 15 | var barWidth = width / data.length; 16 | 17 | var bar = chart.selectAll("g") 18 | .data(data) 19 | .enter() 20 | .append("g") 21 | .attr("transform", function(d, i) { return "translate(" + i * barWidth + ",0)"; }); 22 | 23 | bar.append("rect") 24 | .attr("y", function(d) { return y(d.total_flights); }) 25 | .attr("height", function(d) { return height - y(d.total_flights); }) 26 | .attr("width", barWidth - 1); 27 | 28 | bar.append("text") 29 | .attr("x", barWidth / 2) 30 | .attr("y", function(d) { return y(d.total_flights) + 3; }) 31 | .attr("dy", ".75em") 32 | .text(function(d) { return d.total_flights; }); 33 | }); -------------------------------------------------------------------------------- /ch08/web/static/bar.css: -------------------------------------------------------------------------------- 1 | 2 | .axis text { 3 | font: 8px sans-serif; 4 | } 5 | 6 | .axis path, 7 | .axis line { 8 | 
fill: none; 9 | stroke: #000; 10 | shape-rendering: crispEdges; 11 | } 12 | 13 | .bar { 14 | fill: #ff6600; 15 | } 16 | -------------------------------------------------------------------------------- /ch08/web/static/barchart.js: -------------------------------------------------------------------------------- 1 | class BarChart { 2 | 3 | constructor(url, labelName, valueName, chartClassName) { 4 | this.url = url; 5 | this.labelName = (typeof labelName !== 'undefined') ? labelName : 'label'; 6 | this.valueName = (typeof valueName !== 'undefined') ? valueName : 'value'; 7 | this.chartClassName = (typeof chartClassName !== 'undefined') ? chartClassName : 'chart'; 8 | } 9 | 10 | render() { 11 | var margin = {top: 20, right: 30, bottom: 30, left: 40}, 12 | width = 900 - margin.left - margin.right, 13 | height = 300 - margin.top - margin.bottom; 14 | 15 | var x = d3.scale.ordinal() 16 | .rangeRoundBands([0, width], .1); 17 | var y = d3.scale.linear() 18 | .range([height, 0]); 19 | 20 | var xAxis = d3.svg.axis() 21 | .scale(x) 22 | .orient("bottom") 23 | .tickFormat(function(d) { 24 | return truncate(d, 14); 25 | }); 26 | var yAxis = d3.svg.axis() 27 | .scale(y) 28 | .orient("left"); 29 | 30 | var chart = d3.select('.' + this.chartClassName) 31 | .attr("width", width + margin.left + margin.right) 32 | .attr("height", height + margin.top + margin.bottom) 33 | .append("g") 34 | .attr("transform", "translate(" + margin.left + "," + margin.top + ")"); 35 | 36 | var labelName = this.labelName; 37 | var valueName = this.valueName; 38 | d3.json(this.url, function(error, data) { 39 | var data = data.data; 40 | 41 | x.domain(data.map(function(d) { return d[labelName]; })); 42 | y.domain([0, d3.max(data, function(d) { return d[valueName]; })]); 43 | 44 | chart.append("g") 45 | .attr("class", "x axis") 46 | .attr("transform", "translate(0," + height + ")") 47 | .call(xAxis); 48 | 49 | chart.append("g") 50 | .attr("class", "y axis") 51 | .call(yAxis); 52 | 53 | chart.selectAll(".bar") 54 | .data(data) 55 | .enter().append("rect") 56 | .attr("class", "bar") 57 | .attr("x", function(d) { return x(d[labelName]); }) 58 | .attr("y", function(d) { return y(d[valueName]); }) 59 | .attr("height", function(d) { return height - y(d[valueName]); }) 60 | .attr("width", x.rangeBand()); 61 | }); 62 | 63 | function truncate(d, l) { 64 | if(d.length > l) 65 | return d.substring(0,l)+'...'; 66 | else 67 | return d; 68 | } 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /ch08/web/static/flight_delay_predict_polling.js: -------------------------------------------------------------------------------- 1 | // Attach a submit handler to the form 2 | $( "#flight_delay_classification" ).submit(function( event ) { 3 | 4 | // Stop form from submitting normally 5 | event.preventDefault(); 6 | 7 | // Get some values from elements on the page: 8 | var $form = $( this ), 9 | term = $form.find( "input[name='s']" ).val(), 10 | url = $form.attr( "action" ); 11 | 12 | // Send the data using post 13 | var posting = $.post( 14 | url, 15 | $( "#flight_delay_classification" ).serialize() 16 | ); 17 | 18 | // Submit the form and parse the response 19 | posting.done(function( data ) { 20 | var response = JSON.parse(data); 21 | 22 | // If the response is ok, print a message to wait and start polling 23 | if(response.status == "OK") { 24 | $( "#result" ).empty().append( "Processing..." 
); 25 | 26 | // Every 1 second, poll the response url until we get a response 27 | poll(response.id); 28 | } 29 | }); 30 | }); 31 | 32 | // Poll the prediction URL 33 | function poll(id) { 34 | var responseUrlBase = "/flights/delays/predict/classify_realtime/response/"; 35 | console.log("Polling for request id " + id + "..."); 36 | 37 | // Append the uuid to the URL as a slug argument 38 | var predictionUrl = responseUrlBase + id; 39 | 40 | $.ajax( 41 | { 42 | url: predictionUrl, 43 | type: "GET", 44 | complete: conditionalPoll 45 | }); 46 | } 47 | 48 | // Decide whether to poll based on the response status 49 | function conditionalPoll(data) { 50 | var response = JSON.parse(data.responseText); 51 | 52 | if(response.status == "OK") { 53 | renderPage(response.prediction); 54 | } 55 | else if(response.status == "WAIT") { 56 | setTimeout(function() {poll(response.id)}, 1000); 57 | } 58 | } 59 | 60 | // Render the response on the page for splits: 61 | // [-float("inf"), -15.0, 0, 30.0, float("inf")] 62 | function renderPage(response) { 63 | 64 | console.log(response); 65 | 66 | var displayMessage; 67 | 68 | if(response.Prediction == 0 || response.Prediction == '0') { 69 | displayMessage = "Early (15+ Minutes Early)"; 70 | } 71 | else if(response.Prediction == 1 || response.Prediction == '1') { 72 | displayMessage = "Slightly Early (0-15 Minute Early)"; 73 | } 74 | else if(response.Prediction == 2 || response.Prediction == '2') { 75 | displayMessage = "Slightly Late (0-30 Minute Delay)"; 76 | } 77 | else if(response.Prediction == 3 || response.Prediction == '3') { 78 | displayMessage = "Very Late (30+ Minutes Late)"; 79 | } 80 | 81 | console.log(displayMessage) 82 | 83 | $( "#result" ).empty().append( displayMessage ); 84 | } 85 | -------------------------------------------------------------------------------- /ch08/web/templates/airlines.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Airlines / {{carrier_code}} 5 | 6 | 7 | 8 | 9 |

10 | 11 | {{airline_summary.Name}} 12 | / {{airline_summary.domain}} 13 |

14 | 15 | 16 |

{{airline_summary.summary}}

17 |

Fleet: {{airline_airplanes.FleetCount}} Planes

18 | 25 | {% endblock %} 26 | -------------------------------------------------------------------------------- /ch08/web/templates/all_airlines.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Airlines 5 | 6 |

US Domestic Airlines

7 | 14 | {% endblock %} 15 | -------------------------------------------------------------------------------- /ch08/web/templates/all_airplanes.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Airplanes 5 | 6 |

7 | 8 | US Commercial Fleet 9 |

10 | 11 | 12 |
13 |

Airplanes by Manufacturer

14 |
15 |
16 | 17 | 18 | 19 |
20 | {% for item in search_config %} 21 | {% if 'label' in item %} 22 | 23 | {% else %} 24 | 25 | {% endif %} 26 | 27 | {% endfor %} 28 | 29 |
30 | 31 | 32 | 33 | 34 | {% for item in search_config %} 35 | {% if 'label' in item %} 36 | 37 | {% else %} 38 | 39 | {% endif %} 40 | {% endfor %} 41 | 42 | 43 | 44 | 45 | {% for airplane in airplanes %} 46 | 47 | {% for item in search_config %} 48 | 49 | {% endfor %} 50 | 51 | {% endfor %} 52 | 53 |
{{item['label']}}{{item['field']}}
{{airplane[item['field']]}}
54 | 55 | 56 | 72 | 73 | {% import "macros.jnj" as common %} 74 | {% if nav_offsets and nav_path -%} 75 | {{ common.display_nav(nav_offsets, nav_path, airplane_count)|safe }} 76 | {% endif -%} 77 | {% endblock %} 78 | -------------------------------------------------------------------------------- /ch08/web/templates/delays.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | 5 | / Delays 6 | 7 |

8 | 9 | Summary of Flight Delays 10 |

11 | 12 |
13 | 14 | 15 | 16 | 20 |
21 | {% endblock %} 22 | -------------------------------------------------------------------------------- /ch08/web/templates/flight.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | 5 | / Flights / {{flight.TailNum}} 6 | 7 |

Flight {{flight.FlightNum}}

8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 |
AirlineOriginDestinationTail NumberDateAir TimeDistance
{{flight.Carrier}}{{flight.Origin}}{{flight.Dest}}{{flight.TailNum}}{{flight.FlightDate}}{{flight.AirTime}}{{flight.Distance}}
30 |
31 | {% endblock %} 32 | -------------------------------------------------------------------------------- /ch08/web/templates/flight_delays_predict.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Flight Delay Prediction 5 | 6 |

7 | 8 | Predicting Flight Delays 9 |

10 | 11 | 12 |
13 | {% for item in form_config %} 14 | {% if 'label' in item %} 15 | 16 | {% else %} 17 | 18 | {% endif %} 19 | 20 | {% endfor %} 21 | 22 |
23 | 24 |
25 |

Delay:

26 |
27 | 28 | 50 | {% endblock %} 51 | -------------------------------------------------------------------------------- /ch08/web/templates/flight_delays_predict_batch.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Flight Delay Prediction via Spark in Batch 5 | 6 |

7 | 8 | Predicting Flight Delays via Spark in Batch 9 |

10 | 11 | 12 |
13 | {% for item in form_config %} 14 | {% if 'label' in item %} 15 | 16 | {% else %} 17 | 18 | {% endif %} 19 | 20 | {% endfor %} 21 | 22 |
23 | 24 |
25 |

Prediction Request Successful:

26 |
27 | 28 | 49 | {% endblock %} 50 | -------------------------------------------------------------------------------- /ch08/web/templates/flight_delays_predict_batch_results.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Flight Delay Prediction Results via Spark in Batch 5 | 6 |

7 | 8 | Presenting Flight Delay Predictions via Spark in Batch 9 |

10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | {% for item in predictions %} 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 46 | 47 | {% endfor %} 48 | 49 |
Request TimestampCarrierFlight DateOriginDestinationDistanceDeparture DelayPredicted Arrival Delay
{{ item['Timestamp'] }}{{ item['Carrier'] }}{{ item['FlightDate'] }}{{ item['Origin'] }}{{ item['Dest'] }}{{ item['Distance'] }}{{ item['DepDelay'] }} 36 | 37 | {% if item['Prediction'] == 0.0 %} 38 | On Time (0-15 Minute Delay) 39 | {% elif item['Prediction'] == 1.0 %} 40 | Slightly Late (15-60 Minute Delay) 41 | {% elif item['Prediction'] == 2.0 %} 42 | Very Late (60+ Minute Delay) 43 | {% endif %} 44 | 45 |
50 | 51 | {% endblock %} 52 | -------------------------------------------------------------------------------- /ch08/web/templates/flight_delays_predict_kafka.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Flight Delay Prediction with Kafka 5 | 6 |

7 | 8 | Predicting Flight Delays with Kafka 9 |

10 | 11 | 12 |
13 | {% for item in form_config %} 14 | {% if 'label' in item %} 15 | 16 | {% else %} 17 | 18 | {% endif %} 19 | 20 | {% endfor %} 21 | 22 |
23 | 24 |
25 |

Delay:

26 |
27 | 28 | 29 | 30 | {% endblock %} 31 | -------------------------------------------------------------------------------- /ch08/web/templates/flights.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | 5 | / Flights 6 | 7 |

{{flight_count}} Flights on {{flight_date}}

8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | {% for flight in flights %} 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | {% endfor %} 32 | 33 |
AirlineFlight NumberOriginDestinationDeparture TimeTail NumberAir TimeDistance
{{flight.Carrier}}{{flight.FlightNum}}{{flight.Origin}}{{flight.Dest}}{{flight.DepTime}}{{flight.TailNum}}{{flight.AirTime}}{{flight.Distance}}
34 |
35 | {% endblock %} 36 | -------------------------------------------------------------------------------- /ch08/web/templates/flights_per_airplane.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | 5 | / Airplanes / {{tail_number}} 6 | 7 |

Flights by Tail Number {{tail_number}}

8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | {% for flight in flights['Flights'] %} 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | {% endfor %} 26 | 27 |
CarrierDateFlight NumberOriginDestination
{{flight[0]}}{{flight[1]}}{{flight[2]}}{{flight[3]}}{{flight[4]}}
28 |
29 | {% endblock %} 30 | -------------------------------------------------------------------------------- /ch08/web/templates/layout.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Agile Data Science 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
18 | 19 | 20 |
21 | 24 | {% block body %}{% endblock %} 25 |
26 | 27 |
28 |
29 | 30 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /ch08/web/templates/macros.jnj: -------------------------------------------------------------------------------- 1 | 2 | {% macro display_nav(offsets, path, count) -%} 3 |
4 | {% for key, values in offsets.items() -%} 5 | {%- if values['bottom_offset'] >= 0 and values['top_offset'] > 0 and count > values['bottom_offset'] -%} 6 | {{ key }} 9 | {% else -%} 10 | {{ key }} 11 | {% endif %} 12 | {% endfor -%} 13 |
14 | {% endmacro -%} 15 | -------------------------------------------------------------------------------- /ch08/web/templates/total_flights.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 |

Total Flights by Month

5 | 6 | 7 | 8 | 9 | 10 | 11 | {% for month in total_flights %} 12 | 13 | 14 | 15 | 16 | {% endfor %} 17 | 18 |
MonthTotal Flights
{{month.Month}}{{month.total_flights}}
19 |
20 | {% endblock %} -------------------------------------------------------------------------------- /ch08/web/templates/total_flights_chart.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 16 | 17 |
18 |

Total Flights by Month

19 |
20 |
21 | 22 | 25 | {% endblock %} -------------------------------------------------------------------------------- /ch09/explore_delays.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys, os, re 4 | import json 5 | import datetime, iso8601 6 | 7 | base_path = "." 8 | 9 | from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType 10 | from pyspark.sql.types import StructType, StructField 11 | from pyspark.sql.functions import udf 12 | 13 | schema = StructType([ 14 | StructField("ArrDelay", DoubleType(), True), # "ArrDelay":5.0 15 | StructField("CRSArrTime", TimestampType(), True), # "CRSArrTime":"2015-12-31T03:20:00.000-08:00" 16 | StructField("CRSDepTime", TimestampType(), True), # "CRSDepTime":"2015-12-31T03:05:00.000-08:00" 17 | StructField("Carrier", StringType(), True), # "Carrier":"WN" 18 | StructField("DayOfMonth", IntegerType(), True), # "DayOfMonth":31 19 | StructField("DayOfWeek", IntegerType(), True), # "DayOfWeek":4 20 | StructField("DayOfYear", IntegerType(), True), # "DayOfYear":365 21 | StructField("DepDelay", DoubleType(), True), # "DepDelay":14.0 22 | StructField("Dest", StringType(), True), # "Dest":"SAN" 23 | StructField("Distance", DoubleType(), True), # "Distance":368.0 24 | StructField("FlightDate", DateType(), True), # "FlightDate":"2015-12-30T16:00:00.000-08:00" 25 | StructField("FlightNum", StringType(), True), # "FlightNum":"6109" 26 | StructField("Origin", StringType(), True), # "Origin":"TUS" 27 | ]) 28 | 29 | features = spark.read.json( 30 | "data/simple_flight_delay_features.json", 31 | schema=schema 32 | ) 33 | features.registerTempTable("features") 34 | features.show() 35 | 36 | # 37 | # Check whether lateness varies much by hour of scheduled departure/arrival 38 | # 39 | 40 | spark.sql(""" 41 | SELECT 42 | HOUR(CRSDepTime) + 1 AS Hour, 43 | AVG(ArrDelay), 44 | STD(ArrDelay) 45 | FROM features 46 | GROUP BY HOUR(CRSDepTime) 47 | ORDER BY HOUR(CRSDepTime) 48 | """).show(24) 49 | 50 | spark.sql(""" 51 | SELECT 52 | HOUR(CRSArrTime) + 1 AS Hour, 53 | AVG(ArrDelay), 54 | STD(ArrDelay) 55 | FROM features 56 | GROUP BY HOUR(CRSArrTime) 57 | ORDER BY HOUR(CRSArrTime) 58 | """).show(24) 59 | 60 | 61 | from pyspark.sql.functions import hour 62 | 63 | features = features.withColumn('CRSDepHourOfDay', hour(features.CRSDepTime)) 64 | features = features.withColumn('CRSArrHourOfDay', hour(features.CRSArrTime)) 65 | 66 | departure_cov = features.stat.cov('CRSDepHourOfDay', 'ArrDelay') 67 | arrival_cov = features.stat.cov('CRSArrHourOfDay', 'ArrDelay') 68 | 69 | print("Departure delay covariance: {:,}".format(departure_cov)) 70 | print("Arrival delay covariance: {:,}".format(arrival_cov)) 71 | -------------------------------------------------------------------------------- /ch09/make_predictions_final.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch09/make_predictions_final.py -------------------------------------------------------------------------------- /ch09/make_predictions_streaming_final.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch09/make_predictions_streaming_final.py --------------------------------------------------------------------------------
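explore_delays.py above profiles arrival delay by scheduled hour in SQL and then checks the relationship with column covariance. The same hourly aggregates can also be computed with the DataFrame API; a minimal sketch, assuming the `features` DataFrame from that script is in scope (the results should match the HOUR(CRSDepTime) query):

from pyspark.sql.functions import avg, hour, stddev

# Group arrival delays by scheduled departure hour, as in the SQL version
features \
    .withColumn("Hour", hour(features.CRSDepTime) + 1) \
    .groupBy("Hour") \
    .agg(avg("ArrDelay"), stddev("ArrDelay")) \
    .orderBy("Hour") \
    .show(24)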
/ch10/spark_model_with_weather.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch10/spark_model_with_weather.py -------------------------------------------------------------------------------- /ch10/web/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch10/web/__init__.py -------------------------------------------------------------------------------- /ch10/web/config.py: -------------------------------------------------------------------------------- 1 | # config.py, a configuration file for index.py 2 | RECORDS_PER_PAGE=15 3 | AIRPLANE_RECORDS_PER_PAGE=5 4 | ELASTIC_URL='http://localhost:9200/agile_data_science' 5 | -------------------------------------------------------------------------------- /ch10/web/predict_utils.py: -------------------------------------------------------------------------------- 1 | import sys, os, re 2 | import pymongo 3 | import datetime, iso8601 4 | 5 | def process_search(results): 6 | """Process elasticsearch hits and return flights records""" 7 | records = [] 8 | total = 0 9 | if results['hits'] and results['hits']['hits']: 10 | total = results['hits']['total'] 11 | hits = results['hits']['hits'] 12 | for hit in hits: 13 | record = hit['_source'] 14 | records.append(record) 15 | return records, total 16 | 17 | def get_navigation_offsets(offset1, offset2, increment): 18 | """Calculate offsets for fetching lists of flights from MongoDB""" 19 | offsets = {} 20 | offsets['Next'] = {'top_offset': offset2 + increment, 'bottom_offset': 21 | offset1 + increment} 22 | offsets['Previous'] = {'top_offset': max(offset2 - increment, 0), 23 | 'bottom_offset': max(offset1 - increment, 0)} # Don't go < 0 24 | return offsets 25 | 26 | def strip_place(url): 27 | """Strip the existing start and end parameters from the query string""" 28 | try: 29 | p = re.match('(.+)\?start=.+&end=.+', url).group(1) 30 | except AttributeError as e: 31 | return url 32 | return p 33 | 34 | def get_flight_distance(client, origin, dest): 35 | """Get the distance between a pair of airport codes""" 36 | query = { 37 | "Origin": origin, 38 | "Dest": dest, 39 | } 40 | record = client.agile_data_science.origin_dest_distances.find_one(query) 41 | return record["Distance"] 42 | 43 | def get_regression_date_args(iso_date): 44 | """Given an ISO Date, return the day of year, day of month, day of week as the API expects them.""" 45 | dt = iso8601.parse_date(iso_date) 46 | day_of_year = dt.timetuple().tm_yday 47 | day_of_month = dt.day 48 | day_of_week = dt.weekday() 49 | return { 50 | "DayOfYear": day_of_year, 51 | "DayOfMonth": day_of_month, 52 | "DayOfWeek": day_of_week, 53 | } 54 | 55 | def get_current_timestamp(): 56 | iso_now = datetime.datetime.now().isoformat() 57 | return iso_now 58 | -------------------------------------------------------------------------------- /ch10/web/static/airplanes.js: -------------------------------------------------------------------------------- 1 | var margin = {top: 20, right: 30, bottom: 30, left: 40}, 2 | width = 900 - margin.left - margin.right, 3 | height = 300 - margin.top - margin.bottom; 4 | 5 | var x = d3.scale.ordinal() 6 | .rangeRoundBands([0, width], .1); 7 | var y = d3.scale.linear() 8 | .range([height, 0]); 9 | 10 | var xAxis = d3.svg.axis() 11 | .scale(x) 12 | .orient("bottom") 
13 | .tickFormat(function(d) { 14 | return truncate(d, 14); 15 | }); 16 | var yAxis = d3.svg.axis() 17 | .scale(y) 18 | .orient("left"); 19 | 20 | var chart = d3.select(".chart") 21 | .attr("width", width + margin.left + margin.right) 22 | .attr("height", height + margin.top + margin.bottom) 23 | .append("g") 24 | .attr("transform", "translate(" + margin.left + "," + margin.top + ")"); 25 | 26 | d3.json("/airplanes/chart/manufacturers.json", function(error, data) { 27 | var data = data.data; 28 | 29 | x.domain(data.map(function(d) { return d.Manufacturer; })); 30 | y.domain([0, d3.max(data, function(d) { return d.Total; })]); 31 | 32 | chart.append("g") 33 | .attr("class", "x axis") 34 | .attr("transform", "translate(0," + height + ")") 35 | .call(xAxis); 36 | 37 | chart.append("g") 38 | .attr("class", "y axis") 39 | .call(yAxis); 40 | 41 | chart.selectAll(".bar") 42 | .data(data) 43 | .enter().append("rect") 44 | .attr("class", "bar") 45 | .attr("x", function(d) { return x(d.Manufacturer); }) 46 | .attr("y", function(d) { return y(d.Total); }) 47 | .attr("height", function(d) { return height - y(d.Total); }) 48 | .attr("width", x.rangeBand()); 49 | }); 50 | 51 | function truncate(d, l) { 52 | if(d.length > l) 53 | return d.substring(0,l)+'...'; 54 | else 55 | return d; 56 | } 57 | -------------------------------------------------------------------------------- /ch10/web/static/app.js: -------------------------------------------------------------------------------- 1 | var width = 960, 2 | height = 500; 3 | 4 | var y = d3.scale.linear() 5 | .range([height, 0]); 6 | // We define the domain once we get our data in d3.json, below 7 | 8 | var chart = d3.select(".chart") 9 | .attr("width", width) 10 | .attr("height", height); 11 | 12 | d3.json("/total_flights.json", function(data) { 13 | y.domain([0, d3.max(data, function(d) { return d.total_flights; })]); 14 | 15 | var barWidth = width / data.length; 16 | 17 | var bar = chart.selectAll("g") 18 | .data(data) 19 | .enter() 20 | .append("g") 21 | .attr("transform", function(d, i) { return "translate(" + i * barWidth + ",0)"; }); 22 | 23 | bar.append("rect") 24 | .attr("y", function(d) { return y(d.total_flights); }) 25 | .attr("height", function(d) { return height - y(d.total_flights); }) 26 | .attr("width", barWidth - 1); 27 | 28 | bar.append("text") 29 | .attr("x", barWidth / 2) 30 | .attr("y", function(d) { return y(d.total_flights) + 3; }) 31 | .attr("dy", ".75em") 32 | .text(function(d) { return d.total_flights; }); 33 | }); -------------------------------------------------------------------------------- /ch10/web/static/bar.css: -------------------------------------------------------------------------------- 1 | 2 | .axis text { 3 | font: 8px sans-serif; 4 | } 5 | 6 | .axis path, 7 | .axis line { 8 | fill: none; 9 | stroke: #000; 10 | shape-rendering: crispEdges; 11 | } 12 | 13 | .bar { 14 | fill: #ff6600; 15 | } 16 | -------------------------------------------------------------------------------- /ch10/web/static/flight_delay_predict_polling.js: -------------------------------------------------------------------------------- 1 | // Attach a submit handler to the form 2 | $( "#flight_delay_classification" ).submit(function( event ) { 3 | 4 | // Stop form from submitting normally 5 | event.preventDefault(); 6 | 7 | // Get some values from elements on the page: 8 | var $form = $( this ), 9 | term = $form.find( "input[name='s']" ).val(), 10 | url = $form.attr( "action" ); 11 | 12 | // Send the data using post 13 | var posting = $.post( 14 | 
/ch10/web/static/flight_delay_predict_polling.js:
--------------------------------------------------------------------------------
1 | // Attach a submit handler to the form
2 | $( "#flight_delay_classification" ).submit(function( event ) {
3 |
4 |     // Stop form from submitting normally
5 |     event.preventDefault();
6 |
7 |     // Get some values from elements on the page:
8 |     var $form = $( this ),
9 |         term = $form.find( "input[name='s']" ).val(),
10 |         url = $form.attr( "action" );
11 |
12 |     // Send the data using post
13 |     var posting = $.post(
14 |         url,
15 |         $( "#flight_delay_classification" ).serialize()
16 |     );
17 |
18 |     // Submit the form and parse the response
19 |     posting.done(function( data ) {
20 |         var response = JSON.parse(data);
21 |
22 |         // If the response is ok, print a message to wait and start polling
23 |         if(response.status == "OK") {
24 |             $( "#result" ).empty().append( "Processing..." );
25 |
26 |             // Every 1 second, poll the response url until we get a response
27 |             poll(response.id);
28 |         }
29 |     });
30 | });
31 |
32 | // Poll the prediction URL
33 | function poll(id) {
34 |     var responseUrlBase = "/flights/delays/predict/classify_realtime/response/";
35 |     console.log("Polling for request id " + id + "...");
36 |
37 |     // Append the uuid to the URL as a slug argument
38 |     var predictionUrl = responseUrlBase + id;
39 |
40 |     $.ajax(
41 |         {
42 |             url: predictionUrl,
43 |             type: "GET",
44 |             complete: conditionalPoll
45 |         });
46 | }
47 |
48 | // Decide whether to poll based on the response status
49 | function conditionalPoll(data) {
50 |     var response = JSON.parse(data.responseText);
51 |
52 |     if(response.status == "OK") {
53 |         renderPage(response.prediction);
54 |     }
55 |     else if(response.status == "WAIT") {
56 |         setTimeout(function() {poll(response.id)}, 1000);
57 |     }
58 | }
59 |
60 | // Render the response on the page for splits:
61 | // [-float("inf"), -15.0, 0, 30.0, float("inf")]
62 | function renderPage(prediction) {
63 |
64 |     var displayMessage;
65 |
66 |     if(prediction.Prediction == 0) {
67 |         displayMessage = "Early (15+ Minutes Early)";
68 |     }
69 |     else if(prediction.Prediction == 1) {
70 |         displayMessage = "Slightly Early (0-15 Minutes Early)";
71 |     }
72 |     else if(prediction.Prediction == 2) {
73 |         displayMessage = "Slightly Late (0-30 Minute Delay)";
74 |     }
75 |     else if(prediction.Prediction == 3) {
76 |         displayMessage = "Very Late (30+ Minutes Late)";
77 |     }
78 |
79 |     $( "#result" ).empty().append( displayMessage );
80 | }
81 |
--------------------------------------------------------------------------------
/ch10/web/templates/airlines.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 |
4 | / Airlines / {{carrier_code}}
5 |
6 |
7 |
8 |
9 |

10 | 11 | {{airline_summary.Name}} 12 | / {{airline_summary.domain}} 13 |

14 | 15 | 16 |

{{airline_summary.summary}}

17 |

Fleet: {{airline_airplanes.FleetCount}} Planes

18 | 25 | {% endblock %} 26 | -------------------------------------------------------------------------------- /ch10/web/templates/all_airlines.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Airlines 5 | 6 |

US Domestic Airlines

7 | 14 | {% endblock %} 15 | -------------------------------------------------------------------------------- /ch10/web/templates/all_airplanes.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Airplanes 5 | 6 |

7 | 8 | US Commercial Fleet 9 |

10 | 11 | 12 |
13 |

Airplanes by Manufacturer

14 |
15 |
16 | 17 | 18 | 19 |
20 | {% for item in search_config %} 21 | {% if 'label' in item %} 22 | 23 | {% else %} 24 | 25 | {% endif %} 26 | 27 | {% endfor %} 28 | 29 |
30 | 31 | 32 | 33 | 34 | {% for item in search_config %} 35 | {% if 'label' in item %} 36 | 37 | {% else %} 38 | 39 | {% endif %} 40 | {% endfor %} 41 | 42 | 43 | 44 | 45 | {% for airplane in airplanes %} 46 | 47 | {% for item in search_config %} 48 | 49 | {% endfor %} 50 | 51 | {% endfor %} 52 | 53 |
{{item['label']}}{{item['field']}}
{{airplane[item['field']]}}
54 | 55 | 56 | 72 | 73 | {% import "macros.jnj" as common %} 74 | {% if nav_offsets and nav_path -%} 75 | {{ common.display_nav(nav_offsets, nav_path, airplane_count)|safe }} 76 | {% endif -%} 77 | {% endblock %} 78 | -------------------------------------------------------------------------------- /ch10/web/templates/delays.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | 5 | / Delays 6 | 7 |

8 | 9 | Summary of Flight Delays 10 |

11 | 12 |
13 | 14 | 15 | 16 | 20 |
21 | {% endblock %} 22 | -------------------------------------------------------------------------------- /ch10/web/templates/flight.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | 5 | / Flights / {{flight.TailNum}} 6 | 7 |

Flight {{flight.FlightNum}}

8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 |
AirlineOriginDestinationTail NumberDateAir TimeDistance
{{flight.Carrier}}{{flight.Origin}}{{flight.Dest}}{{flight.TailNum}}{{flight.FlightDate}}{{flight.AirTime}}{{flight.Distance}}
30 |
31 | {% endblock %} 32 | -------------------------------------------------------------------------------- /ch10/web/templates/flight_delays_predict.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Flight Delay Prediction 5 | 6 |

7 | 8 | Predicting Flight Delays 9 |

10 | 11 | 12 |
13 | {% for item in form_config %} 14 | {% if 'label' in item %} 15 | 16 | {% else %} 17 | 18 | {% endif %} 19 | 20 | {% endfor %} 21 | 22 |
23 | 24 |
25 |

Delay:

26 |
27 | 28 | 50 | {% endblock %} 51 | -------------------------------------------------------------------------------- /ch10/web/templates/flight_delays_predict_batch.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Flight Delay Prediction via Spark in Batch 5 | 6 |

7 | 8 | Predicting Flight Delays via Spark in Batch 9 |

10 | 11 | 12 |
13 | {% for item in form_config %} 14 | {% if 'label' in item %} 15 | 16 | {% else %} 17 | 18 | {% endif %} 19 | 20 | {% endfor %} 21 | 22 |
23 | 24 |
25 |

Prediction Request Successful:

26 |
27 | 28 | 49 | {% endblock %} 50 | -------------------------------------------------------------------------------- /ch10/web/templates/flight_delays_predict_batch_results.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Flight Delay Prediction Results via Spark in Batch 5 | 6 |

7 | 8 | Presenting Flight Delay Predictions via Spark in Batch 9 |

10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | {% for item in predictions %} 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 46 | 47 | {% endfor %} 48 | 49 |
Request TimestampCarrierFlight DateOriginDestinationDistanceDeparture DelayPredicted Arrival Delay
{{ item['Timestamp'] }}{{ item['Carrier'] }}{{ item['FlightDate'] }}{{ item['Origin'] }}{{ item['Dest'] }}{{ item['Distance'] }}{{ item['DepDelay'] }} 36 | 37 | {% if item['Prediction'] == 0.0 %} 38 | On Time (0-15 Minute Delay) 39 | {% elif item['Prediction'] == 1.0 %} 40 | Slightly Late (15-60 Minute Delay) 41 | {% elif item['Prediction'] == 2.0 %} 42 | Very Late (60+ Minute Delay) 43 | {% endif %} 44 | 45 |
50 | 51 | {% endblock %} 52 | -------------------------------------------------------------------------------- /ch10/web/templates/flight_delays_predict_kafka.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Flight Delay Prediction with Kafka 5 | 6 |

7 | 8 | Predicting Flight Delays with Kafka 9 |

10 | 11 | 12 |
13 | {% for item in form_config %} 14 | {% if 'label' in item %} 15 | 16 | {% else %} 17 | 18 | {% endif %} 19 | 20 | {% endfor %} 21 | 22 |
23 | 24 |
25 |

Delay:

26 |
27 | 28 | 29 | 30 | {% endblock %} 31 | -------------------------------------------------------------------------------- /ch10/web/templates/flights.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | 5 | / Flights 6 | 7 |

{{flight_count}} Flights on {{flight_date}}

8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | {% for flight in flights %} 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | {% endfor %} 32 | 33 |
AirlineFlight NumberOriginDestinationDeparture TimeTail NumberAir TimeDistance
{{flight.Carrier}}{{flight.FlightNum}}{{flight.Origin}}{{flight.Dest}}{{flight.DepTime}}{{flight.TailNum}}{{flight.AirTime}}{{flight.Distance}}
34 |
35 | {% endblock %} 36 | -------------------------------------------------------------------------------- /ch10/web/templates/flights_per_airplane.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | 5 | / Airplanes / {{tail_number}} 6 | 7 |

Flights by Tail Number {{tail_number}}

8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | {% for flight in flights['Flights'] %} 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | {% endfor %} 26 | 27 |
CarrierDateFlight NumberOriginDestination
{{flight[0]}}{{flight[1]}}{{flight[2]}}{{flight[3]}}{{flight[4]}}
28 |
29 | {% endblock %} 30 | -------------------------------------------------------------------------------- /ch10/web/templates/layout.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Agile Data Science 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
18 | 19 | 20 |
21 | 24 | {% block body %}{% endblock %} 25 |
26 | 27 |
28 |
29 | 30 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /ch10/web/templates/macros.jnj: -------------------------------------------------------------------------------- 1 | 2 | {% macro display_nav(offsets, path, count) -%} 3 |
4 | {% for key, values in offsets.items() -%} 5 | {%- if values['bottom_offset'] >= 0 and values['top_offset'] > 0 and count > values['bottom_offset'] -%} 6 | {{ key }} 9 | {% else -%} 10 | {{ key }} 11 | {% endif %} 12 | {% endfor -%} 13 |
14 | {% endmacro -%} 15 | -------------------------------------------------------------------------------- /ch10/web/templates/total_flights.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 |

Total Flights by Month

5 | 6 | 7 | 8 | 9 | 10 | 11 | {% for month in total_flights %} 12 | 13 | 14 | 15 | 16 | {% endfor %} 17 | 18 |
MonthTotal Flights
{{month.Month}}{{month.total_flights}}
19 |
20 | {% endblock %} -------------------------------------------------------------------------------- /ch10/web/templates/total_flights_chart.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 16 | 17 |
18 |

Total Flights by Month

19 |
20 |
21 | 22 | 25 | {% endblock %} -------------------------------------------------------------------------------- /ch10/web/templates/weather_station.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | 5 | 10 | 11 |

Weather Station {{profile_obserations.Profile.STATION_NAME}}

12 | 13 |
14 | {% endblock %}
15 |
--------------------------------------------------------------------------------
/dags/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/dags/.exists
--------------------------------------------------------------------------------
/download_weather.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #
4 | # Get weather data
5 | #
6 |
7 | cd data
8 |
9 | # Get the station master list as pipe-separated values
10 | curl -Lko /tmp/wbanmasterlist.psv.zip http://www.ncdc.noaa.gov/homr/file/wbanmasterlist.psv.zip
11 | unzip -o /tmp/wbanmasterlist.psv.zip
12 | gzip wbanmasterlist.psv
13 | rm -f /tmp/wbanmasterlist.psv.zip
14 |
15 | # Get monthly files of daily summaries for all stations
16 | # curl -Lko /tmp/ http://www.ncdc.noaa.gov/orders/qclcd/QCLCD201501.zip
17 | for i in $(seq -w 1 12)
18 | do
19 |   curl -Lko /tmp/QCLCD2015${i}.zip http://www.ncdc.noaa.gov/orders/qclcd/QCLCD2015${i}.zip
20 |   unzip -o /tmp/QCLCD2015${i}.zip
21 |   gzip 2015${i}*.txt
22 |   rm -f /tmp/QCLCD2015${i}.zip
23 | done
24 |
--------------------------------------------------------------------------------
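For orientation, a small Python sketch (an editorial addition, not a repo file) that peeks at the gzipped station master list the script above leaves in data/. The latin-1 encoding is an assumption about the NOAA file; the sketch prints the column names rather than assuming them:

# Peek at data/wbanmasterlist.psv.gz, written by download_weather.sh above.
import csv
import gzip

with gzip.open("data/wbanmasterlist.psv.gz", "rt", encoding="latin-1") as f:
    reader = csv.DictReader(f, delimiter="|")  # pipe-separated with a header row
    first_station = next(reader)
    print(sorted(first_station.keys()))  # inspect the column names
    print(first_station)                 # and the first station record
--------------------------------------------------------------------------------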
/elastic_scripts/create.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | curl -XPUT 'http://localhost:9200/agile_data_science/' -H 'Content-Type: application/json' -d '{
4 |     "settings" : {
5 |         "index" : {
6 |             "number_of_shards" : 1,
7 |             "number_of_replicas" : 1
8 |         }
9 |     }
10 | }'
11 |
12 | curl -XPUT 'http://localhost:9200/agile_data_science_airplanes/' -H 'Content-Type: application/json' -d '{
13 |     "settings" : {
14 |         "index" : {
15 |             "number_of_shards" : 1,
16 |             "number_of_replicas" : 1
17 |         }
18 |     }
19 | }'
20 |
--------------------------------------------------------------------------------
/elastic_scripts/drop.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | curl -XDELETE 'http://localhost:9200/agile_data_science/'
4 |
5 | curl -XDELETE 'http://localhost:9200/agile_data_science_airplanes/'
6 |
--------------------------------------------------------------------------------
/elastic_scripts/query.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | curl -X GET "localhost:9200/agile_data_science/_search?q="
4 |
5 | curl -X GET "localhost:9200/agile_data_science_airplanes/_search?q="
--------------------------------------------------------------------------------
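Once create.sh has run, the index can also be queried from Python. A minimal sketch using the elasticsearch7 client pinned in pyproject.toml below; the "Origin" match field is an illustrative assumption about what has been loaded:

# Query the index created above; the result has the shape that
# predict_utils.process_search() consumes.
from elasticsearch7 import Elasticsearch

es = Elasticsearch("http://localhost:9200")
results = es.search(
    index="agile_data_science",
    body={"query": {"match": {"Origin": "ATL"}}, "size": 10},
)
print(results["hits"]["total"])  # in ES 7 this is a dict: {'value': N, ...}
for hit in results["hits"]["hits"]:
    print(hit["_source"])
--------------------------------------------------------------------------------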
/images/DeepDiscoveryTechnicalLogo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/images/DeepDiscoveryTechnicalLogo.png
--------------------------------------------------------------------------------
/images/airline_page_enriched_wikipedia.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/images/airline_page_enriched_wikipedia.png
--------------------------------------------------------------------------------
/images/airplanes_page_chart_v1_v2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/images/airplanes_page_chart_v1_v2.png
--------------------------------------------------------------------------------
/images/back_end_realtime_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/images/back_end_realtime_architecture.png
--------------------------------------------------------------------------------
/images/climbing_the_pyramid_chapter_intro.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/images/climbing_the_pyramid_chapter_intro.png
--------------------------------------------------------------------------------
/images/data_syndrome_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/images/data_syndrome_logo.png
--------------------------------------------------------------------------------
/images/flight_delay_chart_2.0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/images/flight_delay_chart_2.0.png
--------------------------------------------------------------------------------
/images/front_end_realtime_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/images/front_end_realtime_architecture.png
--------------------------------------------------------------------------------
/images/predicting_flight_kafka_waiting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/images/predicting_flight_kafka_waiting.png
--------------------------------------------------------------------------------
/images/ubuntu_images.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/images/ubuntu_images.png
--------------------------------------------------------------------------------
/images/video_course_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/images/video_course_cover.png
--------------------------------------------------------------------------------
/install/phantomjs.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | sudo apt-get -y update
3 | sudo apt-get -y install build-essential chrpath libssl-dev libxft-dev
4 | sudo apt-get -y install libfreetype6 libfreetype6-dev
5 | sudo apt-get -y install libfontconfig1 libfontconfig1-dev
6 |
7 | cd /home/ubuntu
8 |
9 | export PHANTOM_JS="phantomjs-2.1.1-linux-x86_64"
10 | curl -Lko /tmp/$PHANTOM_JS.tar.bz2 https://github.com/Medium/phantomjs/releases/download/v2.1.1/$PHANTOM_JS.tar.bz2
11 | sudo tar -xvjf /tmp/$PHANTOM_JS.tar.bz2
12 | sudo mv $PHANTOM_JS /usr/local/share
13 | sudo ln -sf /usr/local/share/$PHANTOM_JS/bin/phantomjs /usr/local/bin
14 | phantomjs --version
15 |
--------------------------------------------------------------------------------
/intro_download.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | curl -Lko data/on_time_performance.parquet.tgz https://s3.amazonaws.com/agile_data_science/on_time_performance.parquet.tgz
4 | tar -xvzf data/on_time_performance.parquet.tgz -C data
5 |
--------------------------------------------------------------------------------
/jupyter_notebook_config.py:
--------------------------------------------------------------------------------
1 | c = get_config()
2 |
3 | # Notebook config: this is where you saved your PEM cert
4 | # c.NotebookApp.certfile = u'/home/ubuntu/certs/mycert.pem'
5 | # Run on all IP addresses of your instance
6 | c.NotebookApp.ip = '0.0.0.0'
7 | # Don't open a browser by default
8 | c.NotebookApp.open_browser = False
9 | # Fix the port to 8888
10 | c.NotebookApp.port = 8888
11 | # Disable token authentication
12 | c.NotebookApp.token = ""
13 |
--------------------------------------------------------------------------------
/lib/data/example.csv:
--------------------------------------------------------------------------------
1 | Russell Jurney,Relato,CEO
2 | Florian Liebert,Mesosphere,CEO
3 | Don Brown,Rocana,CIO
4 | Steve Jobs,Apple,CEO
5 | Donald Trump,The Trump Organization,CEO
6 | Russell Jurney,Data Syndrome,Principal Consultant
7 |
--------------------------------------------------------------------------------
/lib/setup_spark.py:
--------------------------------------------------------------------------------
1 | # If there is no SparkSession, create the environment...
2 | # Note that this must be pasted INTO your script; you can't import it, because sc and spark must be created in your script's own namespace.
3 | try:
4 |     sc and spark
5 | except (NameError, UnboundLocalError):
6 |
7 |     import findspark
8 |
9 |     findspark.init()
10 |     import pyspark
11 |     import pyspark.sql
12 |
13 |     sc = pyspark.SparkContext()
14 |     spark = pyspark.sql.SparkSession.builder.appName("Agile Data Science").getOrCreate()
15 |
16 | # continue...
17 |
--------------------------------------------------------------------------------
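One way to try the block above without copying it by hand: exec() the file at module level, which defines sc and spark in the running script's globals, the same effect as pasting. A sketch, assuming it is run from the repository root after intro_download.sh above has fetched the data:

# Paste-in equivalent for a quick test of lib/setup_spark.py
exec(open("lib/setup_spark.py").read())

# The parquet path is the one intro_download.sh unpacks into data/
on_time = spark.read.parquet("data/on_time_performance.parquet")
print(on_time.count())
--------------------------------------------------------------------------------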
/lib/utils.py:
--------------------------------------------------------------------------------
1 | #
2 | # Utility functions to read and write json and jsonl files
3 | #
4 | import bz2
5 | import codecs
6 | import json
7 | import os
8 |
9 | from frozendict import frozendict
10 |
11 |
12 | def write_json_file(obj, path):
13 |     '''Dump an object and write it out as json to a file.'''
14 |     f = codecs.open(path, 'w', 'utf-8')
15 |     f.write(json.dumps(obj, ensure_ascii=False))
16 |     f.close()
17 |
18 |
19 | def write_json_lines_file(ary_of_objects, path):
20 |     '''Dump a list of objects out as a json lines file.'''
21 |     f = codecs.open(path, 'w', 'utf-8')
22 |     for row_object in ary_of_objects:
23 |         json_record = json.dumps(row_object, ensure_ascii=False)
24 |         f.write(json_record + "\n")
25 |     f.close()
26 |
27 |
28 | def read_json_file(path):
29 |     '''Turn a whole-file json document (one object, not one record per line) into an object.'''
30 |     text = codecs.open(path, 'r', 'utf-8').read()
31 |     return json.loads(text)
32 |
33 |
34 | def read_json_lines_bz(path):
35 |     '''Read a JSON Lines bzip2-compressed file'''
36 |     ary = []
37 |     with bz2.open(path, "rt") as bz_file:
38 |         for line in bz_file:
39 |             record = json.loads(line.rstrip("\r\n"))
40 |             ary.append(record)
41 |     return ary
42 |
43 |
44 | def read_json_lines(path):
45 |     '''Read a JSON Lines file'''
46 |     ary = []
47 |     with codecs.open(path, "r", "utf-8") as f:
48 |         for line in f:
49 |             record = json.loads(line.rstrip("\r\n"))
50 |             ary.append(record)
51 |     return ary
52 |
53 |
54 | def read_json_lines_file(path):
55 |     '''Turn a JSON Lines file, or a directory of them, into an array of objects.'''
56 |     ary = []
57 |
58 |     if os.path.isdir(path):
59 |         for (dirpath, dirnames, filenames) in os.walk(path):
60 |             for filename in filenames:
61 |                 full_path = f'{dirpath}/{filename}'
62 |                 if full_path.endswith('json') or full_path.endswith('jsonl'):
63 |                     ary.extend(
64 |                         read_json_lines(full_path)
65 |                     )
66 |                 elif full_path.endswith('bz2'):
67 |                     ary.extend(
68 |                         read_json_lines_bz(full_path)
69 |                     )
70 |     else:
71 |         if path.endswith('bz2'):
72 |             ary.extend(
73 |                 read_json_lines_bz(path)
74 |             )
75 |         else:
76 |             ary.extend(
77 |                 read_json_lines(path)
78 |             )
79 |     return ary
80 |
81 |
82 | class FrozenEncoder(json.JSONEncoder):
83 |     def default(self, obj):
84 |         if isinstance(obj, frozendict):
85 |             return dict(obj)
86 |         if isinstance(obj, frozenset):
87 |             return list(obj)
88 |         # Let the base class default method raise the TypeError
89 |         return json.JSONEncoder.default(self, obj)
--------------------------------------------------------------------------------
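A short round-trip sketch for the helpers above (an editorial addition; the /tmp path is illustrative):

# Write records out as JSON Lines, read them back, and use FrozenEncoder
import json
from frozendict import frozendict
import utils  # lib/utils.py

records = [
    {"name": "Russell Jurney", "company": "Data Syndrome"},
    {"name": "Don Brown", "company": "Rocana"},
]
utils.write_json_lines_file(records, "/tmp/example.jsonl")
assert utils.read_json_lines_file("/tmp/example.jsonl") == records

# FrozenEncoder teaches json.dumps about frozendict and frozenset values
print(json.dumps({"row": frozendict(name="x"), "tags": frozenset(["a"])},
                 cls=utils.FrozenEncoder))
--------------------------------------------------------------------------------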
/logs/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/logs/.exists
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "agile_data_science"
3 | version = "0.1.0"
4 | description = "Code for Agile Data Science 2.0"
5 | authors = ["Russell Jurney"]
6 | license = "MIT"
7 |
8 | [tool.poetry.dependencies]
9 | python = ">=3.7,<3.10"
10 | python-dateutil = "^2.8.2"
11 | Jinja2 = "^3.1.3"
12 | requests = "^2.26.0"
13 | Flask = "^2.0.2"
14 | elasticsearch7 = ">=7.14.0,<7.15.0"
15 | beautifulsoup4 = "^4.10.0"
16 | frozendict = "^2.0.7"
17 | geopy = "^2.2.0"
18 | ipython = "^7.28.0"
19 | confluent-kafka = {extras = ["avro", "json", "protobuf"], version = "^1.7.0"}
20 | matplotlib = "^3.4.3"
21 | seaborn = "^0.11.2"
22 | pymongo = "^3.12.1"
23 | scipy = "^1.7.1"
24 | numpy = "^1.21.3"
25 | selenium = "^4.0.0"
26 | tabulate = "^0.8.9"
27 | tldextract = "^3.1.2"
28 | wikipedia = "^1.4.0"
29 | iso8601 = "^0.1.16"
30 | notebook = "^6.4.5"
31 | WTForms = "^2.3.3"
32 | scikit-learn = "^1.0"
33 | avro = ">= 1.0"
34 |
35 | [tool.poetry.dev-dependencies]
36 |
37 | [build-system]
38 | requires = ["poetry-core>=1.0.0"]
39 | build-backend = "poetry.core.masonry.api"
40 |
--------------------------------------------------------------------------------
/scripts/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/scripts/.exists
--------------------------------------------------------------------------------