├── .bashrc ├── .dockerignore ├── .gitignore ├── Dockerfile ├── LICENSE ├── Part_II_-_Climbing_the_Pyramid.ipynb ├── README.md ├── Welcome.ipynb ├── airflow.env ├── aws ├── block.device.mappings.json └── ec2_bootstrap.sh ├── bin ├── download_list.txt ├── get_files_from_ec2.sh ├── get_student_work.sh ├── send_files_to_ec2.sh ├── start_notebook.sh ├── stop_flask.sh ├── stop_notebook.sh └── upgrade.sh ├── ch02 ├── Agile_Tools.ipynb ├── Introduction_to_PySpark.ipynb ├── airflow_test.py ├── data │ └── example_name_titles_daily.json │ │ └── 2016-12-01 │ │ └── test.jsonl ├── elasticsearch.sh ├── flatmap.py ├── groupby.py ├── histogram.py ├── images │ ├── ads2_0201.png │ ├── ads2_0202.png │ ├── ads2_0209.png │ ├── ads2_0211.png │ ├── ads2_0212.png │ ├── ads2_0215.png │ ├── ads2_0217.png │ ├── ads2_0219.png │ ├── ads2_0220.png │ ├── ads2_0401.png │ ├── ads2_0402.png │ ├── ads2_0403.png │ ├── ads2_0405.png │ ├── ads2_0406.png │ ├── ads2_0408.png │ ├── ads_bootstrap.png │ ├── faa_table.png │ ├── flask_terminal.png │ └── json.png ├── load_on_time_performance.py ├── mongo.js ├── pyspark_elasticsearch.py ├── pyspark_mongodb.py ├── pyspark_streaming.py ├── pyspark_task_one.py ├── pyspark_task_two.py ├── python_kafka.py ├── setup_airflow_test.sh ├── spark.py ├── sql.py ├── test_elasticsearch.py ├── test_elasticsearch.sh ├── test_json.py ├── test_pymongo.py ├── test_pymongo_2.py └── web │ ├── flask_pymongo.py │ ├── static │ ├── bootstrap-theme.min.css │ ├── bootstrap.min.css │ ├── bootstrap.min.js │ └── d3.v3.min.js │ ├── templates │ └── table.html │ ├── test_flask.py │ ├── test_flask_bootstrap.py │ └── test_flask_pymongo.py ├── ch04 ├── Collecting_and_Displaying_Records.ipynb ├── convert_data.py ├── download.sh ├── images │ ├── ads2_0201.png │ ├── ads2_0202.png │ ├── ads2_0209.png │ ├── ads2_0211.png │ ├── ads2_0212.png │ ├── ads2_0215.png │ ├── ads2_0217.png │ ├── ads2_0219.png │ ├── ads2_0220.png │ ├── ads2_0401.png │ ├── ads2_0402.png │ ├── ads2_0403.png │ ├── ads2_0405.png │ ├── ads2_0406.png │ ├── ads2_0408.png │ ├── ads2_0409.png │ ├── ads_bootstrap.png │ ├── ags2_0402.png │ ├── airline_data_fields.png │ ├── faa_table.png │ ├── flask_terminal.png │ ├── json.png │ ├── parquet_logo.jpg │ └── row_format_column_format.png ├── load_on_time_pyspark.py ├── mongo.js ├── pyspark_to_elasticsearch.py ├── pyspark_to_mongo.py └── web │ ├── config.py │ ├── on_time_flask.py │ ├── on_time_flask_template.py │ ├── static │ ├── bootstrap-theme.min.css │ ├── bootstrap.min.css │ ├── bootstrap.min.js │ └── d3.v3.min.js │ └── templates │ ├── flight.html │ ├── flights.html │ ├── layout.html │ ├── macros.jnj │ └── search.html ├── ch05 ├── Visualizing_Data_with_Charts_and_Tables.ipynb ├── assess_airplanes.py ├── assess_faa.py ├── extract_airplanes.py ├── images │ ├── ads2_0501.png │ ├── ads2_0502.png │ ├── ads2_0503.png │ ├── ads2_0504.png │ ├── ads2_0505.png │ ├── ads2_0507.png │ ├── ads2_0508.png │ ├── ads2_0509.png │ ├── ads2_0510.png │ ├── ads2_0511.png │ ├── ads2_0512.png │ ├── first_order_form.png │ ├── flight_search_with_tail_num_link.png │ ├── mapreduce.png │ └── total_flights_2.png ├── install.sh ├── mongo.js ├── save_tail_numbers.py ├── total_flights.py └── web │ ├── chart_flask.py │ ├── config.py │ ├── flights_per_airplane.html │ ├── static │ ├── app.js │ ├── app2.js │ ├── app3.js │ ├── bootstrap-theme.min.css │ ├── bootstrap.min.css │ ├── bootstrap.min.js │ ├── d3.v3.min.js │ ├── images │ │ └── .exist │ ├── jquery-1.12.2.min.js │ ├── nv.d3.css │ └── nv.d3.min.js │ └── templates │ ├── flight.html │ ├── 
flights.html │ ├── flights_per_airplane.html │ ├── flights_per_airplane_2.html │ ├── layout.html │ ├── macros.jnj │ ├── search.html │ ├── top_routes.html │ ├── top_routes_chart.html │ ├── total_flights.html │ ├── total_flights_chart.html │ └── total_flights_chart_2.html ├── ch06 ├── Exploring_Data_with_Reports.ipynb ├── add_name_to_airlines.py ├── airplanes_mapping.json ├── airplanes_to_elasticsearch.py ├── analyze_airplanes.py ├── analyze_airplanes_again.py ├── create_airplanes_index.sh ├── enrich_airlines_wikipedia.py ├── extract_airlines.py ├── extract_airports.py ├── images │ ├── ads2_0601.png │ ├── ads2_0602.png │ ├── ads2_0603.png │ ├── ads2_0604.png │ ├── ads2_0605.png │ ├── ads2_0606.png │ ├── ads2_0607.png │ ├── ads2_0608.png │ └── ads2_0609.png ├── import_airlines.sh ├── prepare_airplanes.py ├── resolve_airplane_manufacturers.py ├── scrape_faa.py ├── test_elastic_airplanes.sh └── web │ ├── __init__.py │ ├── config.py │ ├── report_flask.py │ ├── search_helpers.py │ ├── static │ ├── airplanes.js │ ├── app.js │ ├── bootstrap-theme.min.css │ ├── bootstrap.min.css │ ├── bootstrap.min.js │ ├── d3.v3.min.js │ ├── jquery-1.12.2.min.js │ ├── nv.d3.css │ └── nv.d3.min.js │ └── templates │ ├── airlines.html │ ├── airlines2.html │ ├── airport.html │ ├── all_airlines.html │ ├── all_airplanes.html │ ├── flight.html │ ├── flights.html │ ├── flights_per_airplane.html │ ├── layout.html │ ├── macros.jnj │ ├── search.html │ ├── total_flights.html │ └── total_flights_chart.html ├── ch07 ├── Making_Predictions.ipynb ├── Predicting flight delays with sklearn.ipynb ├── explore_delays.py ├── extract_features.py ├── images │ ├── ads2_0701.png │ └── ads2_0702.png ├── train_sklearn_model.py └── train_spark_mllib_model.py ├── ch08 ├── Deploying_Predictive_Systems.ipynb ├── airflow │ └── setup.py ├── download_data.sh ├── extract_features.py ├── fetch_prediction_requests.py ├── images │ ├── ads2_0807.png │ ├── ads2_0808.png │ ├── ads2_0809.png │ ├── ads2_0810.png │ └── ads2_0811.png ├── import_distances.sh ├── kafka_test.py ├── links.txt ├── load_prediction_results.py ├── make_predictions.py ├── make_predictions_streaming.py ├── origin_dest_distances.py ├── python_kafka_consumer.py ├── python_kafka_producer.py ├── streaming_test.py ├── test_airflow.sh ├── test_classification_api.sh ├── test_regression_api.sh ├── train_spark_mllib_model.py └── web │ ├── __init__.py │ ├── config.py │ ├── predict_flask.py │ ├── predict_utils.py │ ├── static │ ├── airplanes.js │ ├── app.js │ ├── bar.css │ ├── barchart.js │ ├── bootstrap-theme.min.css │ ├── bootstrap.min.css │ ├── bootstrap.min.js │ ├── d3.v3.min.js │ ├── flight_delay_predict_polling.js │ ├── jquery-1.12.2.min.js │ ├── nv.d3.css │ └── nv.d3.min.js │ └── templates │ ├── airlines.html │ ├── all_airlines.html │ ├── all_airplanes.html │ ├── delays.html │ ├── flight.html │ ├── flight_delays_predict.html │ ├── flight_delays_predict_batch.html │ ├── flight_delays_predict_batch_results.html │ ├── flight_delays_predict_kafka.html │ ├── flights.html │ ├── flights_per_airplane.html │ ├── layout.html │ ├── macros.jnj │ ├── search.html │ ├── total_flights.html │ └── total_flights_chart.html ├── ch09 ├── Debugging Prediction Problems.ipynb ├── Improving flight delay predictions with sklearn.ipynb ├── Improving_Predictions.ipynb ├── baseline_spark_mllib_model.py ├── explore_delays.py ├── extract_features.py ├── extract_features_with_airplanes.py ├── extract_features_with_flight_time.py ├── improve_sklearn_model.py ├── improved_spark_mllib_model.py ├── make_predictions_final.py 
├── make_predictions_streaming_final.py ├── spark_model_with_airplanes.py ├── spark_model_with_flight_time.py └── train_spark_mllib_model.py ├── ch10 ├── convert_observations.py ├── explore_weather.py ├── load_weather.py ├── match_airport_with_weather_station.py ├── match_reports_with_flights.py ├── spark_model_with_weather.py └── web │ ├── __init__.py │ ├── config.py │ ├── predict_flask.py │ ├── predict_utils.py │ ├── static │ ├── airplanes.js │ ├── app.js │ ├── bar.css │ ├── barchart.js │ ├── bootstrap-theme.min.css │ ├── bootstrap.min.css │ ├── bootstrap.min.js │ ├── calendar.js │ ├── calendar.min.css │ ├── d3.v3.min.js │ ├── flight_delay_predict_polling.js │ ├── jquery-1.12.2.min.js │ ├── nv.d3.css │ └── nv.d3.min.js │ └── templates │ ├── airlines.html │ ├── all_airlines.html │ ├── all_airplanes.html │ ├── daily_weather_station.html │ ├── delays.html │ ├── flight.html │ ├── flight_delays_predict.html │ ├── flight_delays_predict_batch.html │ ├── flight_delays_predict_batch_results.html │ ├── flight_delays_predict_kafka.html │ ├── flights.html │ ├── flights_per_airplane.html │ ├── layout.html │ ├── macros.jnj │ ├── search.html │ ├── total_flights.html │ ├── total_flights_chart.html │ └── weather_station.html ├── dags └── .exists ├── docker-compose.yml ├── download.sh ├── download_weather.sh ├── elastic_scripts ├── create.sh ├── drop.sh └── query.sh ├── images ├── DeepDiscoveryTechnicalLogo.png ├── airline_page_enriched_wikipedia.png ├── airplanes_page_chart_v1_v2.png ├── back_end_realtime_architecture.png ├── climbing_the_pyramid_chapter_intro.png ├── data_syndrome_logo.png ├── flight_delay_chart_2.0.png ├── front_end_realtime_architecture.png ├── predicting_flight_kafka_waiting.png ├── ubuntu_images.png └── video_course_cover.png ├── install └── phantomjs.sh ├── intro_download.sh ├── jupyter_notebook_config.py ├── lib ├── data │ ├── example.csv │ └── faa_tail_number_inquiry.jsonl ├── pyspark_csv.py ├── setup_spark.py └── utils.py ├── logs └── .exists ├── old.Dockerfile ├── poetry.lock ├── pyproject.toml ├── requirements.txt └── scripts └── .exists /.bashrc: -------------------------------------------------------------------------------- 1 | 2 | # >>> conda initialize >>> 3 | # !! Contents within this block are managed by 'conda init' !! 4 | __conda_setup="$('/opt/conda/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" 5 | if [ $? -eq 0 ]; then 6 | eval "$__conda_setup" 7 | else 8 | if [ -f "/opt/conda/etc/profile.d/conda.sh" ]; then 9 | . 
"/opt/conda/etc/profile.d/conda.sh" 10 | else 11 | export PATH="/opt/conda/bin:$PATH" 12 | fi 13 | fi 14 | unset __conda_setup 15 | # <<< conda initialize <<< 16 | 17 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | data 2 | .git 3 | spark 4 | hadoop 5 | elasticsearch 6 | kafka 7 | mongodb 8 | mongo-hadoop 9 | spark-warehouse 10 | tmp 11 | zeppelin 12 | elasticsearch-hadoop 13 | models 14 | lib 15 | .ivy2 16 | .bash_history 17 | .dbshell 18 | .mongodb 19 | .mongorc.js 20 | .cache 21 | logs 22 | .wget-hsts 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .ipynb_checkpoints 3 | mongodb 4 | spark 5 | hadoop 6 | data 7 | mongo-hadoop 8 | lib/mongo* 9 | lib/pymongo_spark.py 10 | elasticsearch 11 | elasticsearch-hadoop 12 | lib 13 | ch03/static 14 | ch06/web/static 15 | *.pyc 16 | tmp 17 | .idea 18 | kafka 19 | zeppelin 20 | ch05/scrape_faa.py 21 | models 22 | .vagrant 23 | *.pem 24 | .reservation_id 25 | .ec2_hostname 26 | .ec2_deep_hostname 27 | .deep_reservation_id 28 | deep/data 29 | .vscode 30 | ghostdriver.log 31 | cassandra 32 | janusgraph 33 | deep 34 | ch05/web/static/images/ 35 | .local 36 | .ipython 37 | .jupyter 38 | .ivy2 39 | .bash_history 40 | .dbshell 41 | .mongodb 42 | .mongorc.js 43 | .cache 44 | logs 45 | .wget-hsts 46 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG OWNER=jupyter 2 | ARG BASE_CONTAINER=$OWNER/pyspark-notebook:spark-3.2.0 3 | FROM $BASE_CONTAINER 4 | 5 | LABEL maintainer="Russell Jurney " 6 | 7 | # Fix DL4006 8 | SHELL ["/bin/bash", "-o", "pipefail", "-c"] 9 | 10 | USER root 11 | 12 | # Install the MongoDB Client CLI 13 | RUN apt-get update --yes && \ 14 | sudo apt-get install -y iputils-ping gnupg curl jq && \ 15 | wget -qO - https://www.mongodb.org/static/pgp/server-5.0.asc | sudo apt-key add - && \ 16 | echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu focal/mongodb-org/5.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-5.0.list && \ 17 | sudo apt-get update && \ 18 | sudo apt-get install -y mongodb-mongosh mongodb-org-tools && \ 19 | echo "mongodb-mongosh hold" | sudo dpkg --set-selections && \ 20 | echo "mongodb-org-tools hold" | sudo dpkg --set-selections && \ 21 | apt-get clean 22 | 23 | RUN pip install poetry 24 | 25 | COPY pyproject.toml /home/jovyan/pyproject.toml 26 | COPY poetry.lock /home/jovyan/poetry.lock 27 | COPY requirements.txt /home/jovyan/requirements.txt 28 | 29 | RUN poetry install && pip install -r requirements.txt 30 | 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission 
notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /airflow.env: -------------------------------------------------------------------------------- 1 | AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow 2 | AIRFLOW__CORE__EXECUTOR=LocalExecutor -------------------------------------------------------------------------------- /aws/block.device.mappings.json: -------------------------------------------------------------------------------- 1 | { 2 | "DeviceName": "/dev/sda1", 3 | "Ebs": { 4 | "Status": "attached", 5 | "DeleteOnTermination": true, 6 | "VolumeSize": 1024 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /bin/get_files_from_ec2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copy all Python code and model files from the EC2 instance to the local filesystem 4 | rsync -ruv -e "ssh -i ./agile_data_science.pem" \ 5 | --exclude=cassandra \ 6 | --exclude=data \ 7 | --exclude=janusgraph \ 8 | --exclude=hadoop \ 9 | --exclude=spark \ 10 | --exclude=kafka \ 11 | --exclude=lib \ 12 | --exclude=elasticsearch-hadoop \ 13 | --exclude=elasticsearch \ 14 | --exclude=mongo-hadoop \ 15 | --exclude=mongodb \ 16 | --exclude=tmp \ 17 | --exclude=zeppelin \ 18 | ubuntu@`cat .ec2_hostname`:Agile_Data_Code_2/* . 
19 | -------------------------------------------------------------------------------- /bin/get_student_work.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | scp -i ./agile_data_science.pem bin/download_list.txt ubuntu@$(cat .ec2_hostname):Agile_Data_Code_2/ 4 | 5 | ssh -i ./agile_data_science.pem ubuntu@$(cat .ec2_hostname) << SSH_COMMANDS 6 | 7 | cd Agile_Data_Code_2 8 | tar -cvzf agile_data_science_student_code.tar.gz -T download_list.txt 9 | 10 | SSH_COMMANDS 11 | 12 | scp -i ./agile_data_science.pem ubuntu@$(cat .ec2_hostname):Agile_Data_Code_2/agile_data_science_student_code.tar.gz ./ads_student_$(cat .ec2_hostname).tar.gz 13 | -------------------------------------------------------------------------------- /bin/send_files_to_ec2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copy all files from this directory on localhost to EC2 host Agile_Data_Code_2 directory 4 | rsync -ruv -e "ssh -i ./agile_data_science.pem" \ 5 | --exclude=cassandra \ 6 | --exclude=data \ 7 | --exclude=janusgraph \ 8 | --exclude=hadoop \ 9 | --exclude=spark \ 10 | --exclude=kafka \ 11 | --exclude=lib \ 12 | --exclude=elasticsearch-hadoop \ 13 | --exclude=elasticsearch \ 14 | --exclude=mongo-hadoop \ 15 | --exclude=mongodb \ 16 | --exclude=tmp \ 17 | --exclude=zeppelin \ 18 | * ubuntu@`cat .ec2_hostname`:Agile_Data_Code_2/ 19 | -------------------------------------------------------------------------------- /bin/start_notebook.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd $PROJECT_HOME 4 | 5 | # Stop all existing jupyter notebooks 6 | ps aux | grep -i jupyter | grep -v grep | tr -s ' ' | cut -d ' ' -f2 | xargs -I {} sudo kill -9 {} 7 | 8 | # Start a new Jupyter Notebook 9 | nohup jupyter notebook --ip=0.0.0.0 --NotebookApp.token= --allow-root --no-browser & 10 | 11 | echo "Jupyter notebook started!" 12 | -------------------------------------------------------------------------------- /bin/stop_flask.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ps aux|grep -i flask|tr -s ' '|cut -d ' ' -f2|xargs kill -9 4 | sudo netstat -ap|grep 5000|tr -s ' '|cut -d ' ' -f7|cut -d '/' -f1|xargs sudo kill -9 5 | -------------------------------------------------------------------------------- /bin/stop_notebook.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ps aux| grep jupyter | grep -v grep | tr -s ' '| cut -d ' ' -f2 | xargs -I {} kill -9 {} 4 | 5 | echo "Killed Jupyter Notebook!" 6 | -------------------------------------------------------------------------------- /bin/upgrade.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export LOG_FILE="/home/vagrant/upgrade.sh.log" 4 | 5 | echo "Removing Spark 2.2.1 ..." | tee -a $LOG_FILE 6 | rm -rf /home/vagrant/spark 7 | 8 | echo "Downloading and installing Spark 2.4.4 ..." 
| tee -a $LOG_FILE 9 | curl -Lko /tmp/spark-2.4.4-bin-hadoop2.7.tgz https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz 10 | mkdir -p /home/vagrant/spark 11 | cd /home/vagrant 12 | tar -xvf /tmp/spark-2.4.4-bin-hadoop2.7.tgz -C spark --strip-components=1 13 | 14 | # Have to set spark.io.compression.codec in Spark local mode 15 | cp /home/vagrant/spark/conf/spark-defaults.conf.template /home/vagrant/spark/conf/spark-defaults.conf 16 | echo 'spark.io.compression.codec org.apache.spark.io.SnappyCompressionCodec' | sudo tee -a /home/vagrant/spark/conf/spark-defaults.conf 17 | 18 | # Give Spark 8GB of RAM, use Python3 19 | echo "spark.driver.memory 8g" | sudo tee -a $SPARK_HOME/conf/spark-defaults.conf 20 | echo "spark.executor.cores 2" | sudo tee -a $SPARK_HOME/conf/spark-defaults.conf 21 | echo "PYSPARK_PYTHON=python3" | sudo tee -a $SPARK_HOME/conf/spark-env.sh 22 | echo "PYSPARK_DRIVER_PYTHON=python3" | sudo tee -a $SPARK_HOME/conf/spark-env.sh 23 | 24 | # Setup log4j config to reduce logging output 25 | cp $SPARK_HOME/conf/log4j.properties.template $SPARK_HOME/conf/log4j.properties 26 | sed -i 's/INFO/ERROR/g' $SPARK_HOME/conf/log4j.properties 27 | 28 | # Give to vagrant 29 | sudo chown -R vagrant /home/vagrant/spark 30 | sudo chgrp -R vagrant /home/vagrant/spark 31 | 32 | echo "spark.speculation false" | sudo tee -a /home/vagrant/spark/conf/spark-defaults.conf 33 | 34 | echo "spark.jars /home/vagrant/Agile_Data_Code_2/lib/mongo-hadoop-spark-2.0.2.jar,/home/vagrant/Agile_Data_Code_2/lib/mongo-java-driver-3.6.1.jar,/home/vagrant/Agile_Data_Code_2/lib/mongo-hadoop-2.0.2.jar,/home/vagrant/Agile_Data_Code_2/lib/elasticsearch-spark-20_2.11-6.1.2.jar,/home/vagrant/Agile_Data_Code_2/lib/snappy-java-1.1.7.1.jar,/home/vagrant/Agile_Data_Code_2/lib/lzo-hadoop-1.0.5.jar,/home/vagrant/Agile_Data_Code_2/lib/commons-httpclient-3.1.jar" | sudo tee -a /home/vagrant/spark/conf/spark-defaults.conf 35 | 36 | -------------------------------------------------------------------------------- /ch02/airflow_test.py: -------------------------------------------------------------------------------- 1 | import sys, os, re 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash_operator import BashOperator 5 | 6 | from datetime import datetime, timedelta 7 | import iso8601 8 | 9 | project_home = os.environ["PROJECT_HOME"] 10 | 11 | default_args = { 12 | 'owner': 'airflow', 13 | 'depends_on_past': False, 14 | 'start_date': iso8601.parse_date("2016-12-01"), 15 | 'email': ['russell.jurney@gmail.com'], 16 | 'email_on_failure': True, 17 | 'email_on_retry': True, 18 | 'retries': 3, 19 | 'retry_delay': timedelta(minutes=5), 20 | } 21 | 22 | # Timedelta 1 is 'run daily' 23 | dag = DAG( 24 | 'agile_data_science_airflow_test', 25 | default_args=default_args, 26 | schedule_interval=timedelta(1) 27 | ) 28 | 29 | # Run a simple PySpark Script 30 | pyspark_local_task_one = BashOperator( 31 | task_id = "pyspark_local_task_one", 32 | bash_command = """spark-submit \ 33 | --master {{ params.master }} \ 34 | {{ params.base_path }}/{{ params.filename }} {{ ts }} {{ params.base_path }}""", 35 | params = { 36 | "master": "local[8]", 37 | "filename": "ch02/pyspark_task_one.py", 38 | "base_path": "{}/".format(project_home) 39 | }, 40 | dag=dag 41 | ) 42 | 43 | # Run another simple PySpark Script that depends on the previous one 44 | pyspark_local_task_two = BashOperator( 45 | task_id = "pyspark_local_task_two", 46 | bash_command = """spark-submit \ 47 | --master {{ params.master }} \ 48 | {{ 
params.base_path }}/{{ params.filename }} {{ ts }} {{ params.base_path }}""", 49 | params = { 50 | "master": "local[8]", 51 | "filename": "ch02/pyspark_task_two.py", 52 | "base_path": "{}/".format(project_home) 53 | }, 54 | dag=dag 55 | ) 56 | 57 | # Add the dependency from the second to the first task 58 | pyspark_local_task_two.set_upstream(pyspark_local_task_one) 59 | -------------------------------------------------------------------------------- /ch02/data/example_name_titles_daily.json/2016-12-01/test.jsonl: -------------------------------------------------------------------------------- 1 | {"name": "Russell Jurney", "title": "Data Scientist"} 2 | {"name": "Russell Jurney", "title": "Author"} 3 | {"name": "Russell Jurney", "title": "Dog Lover"} 4 | {"name": "Bob Jones", "title": "CEO"} 5 | {"name": "Susan Shu", "title": "Attorney"} 6 | -------------------------------------------------------------------------------- /ch02/elasticsearch.sh: -------------------------------------------------------------------------------- 1 | curl -XPUT 'localhost:9200/customer/external/1?pretty' -d ' 2 | { 3 | "name": "Russell Jurney" 4 | }' 5 | 6 | curl -XGET 'localhost:9200/customer/external/1?pretty' 7 | -------------------------------------------------------------------------------- /ch02/flatmap.py: -------------------------------------------------------------------------------- 1 | csv_lines = sc.textFile("data/example.csv") 2 | 3 | # Compute a relation of words by line 4 | words_by_line = csv_lines\ 5 | .map(lambda line: line.split(",")) 6 | 7 | words_by_line.collect() 8 | 9 | # Compute a relation of words 10 | flattened_words = csv_lines\ 11 | .map(lambda line: line.split(","))\ 12 | .flatMap(lambda x: x) 13 | 14 | flattened_words.collect() 15 | -------------------------------------------------------------------------------- /ch02/groupby.py: -------------------------------------------------------------------------------- 1 | csv_lines = sc.textFile("data/example.csv") 2 | 3 | # Turn the CSV lines into objects 4 | def csv_to_record(line): 5 | parts = line.split(",") 6 | record = { 7 | "name": parts[0], 8 | "company": parts[1], 9 | "title": parts[2] 10 | } 11 | return record 12 | 13 | # Apply the function to every record 14 | records = csv_lines.map(csv_to_record) 15 | 16 | # Inspect the first item in the dataset 17 | records.first() 18 | 19 | # Group the records by the name of the person 20 | grouped_records = records.groupBy(lambda x: x["name"]) 21 | 22 | # Show the first group 23 | grouped_records.first() 24 | 25 | # Count the groups 26 | job_counts = grouped_records.map( 27 | lambda x: { 28 | "name": x[0], 29 | "job_count": len(x[1]) 30 | } 31 | ) 32 | 33 | job_counts.first() 34 | 35 | job_counts.collect() 36 | -------------------------------------------------------------------------------- /ch02/histogram.py: -------------------------------------------------------------------------------- 1 | # Load the parquet file containing flight delay records 2 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet') 3 | 4 | # Register the data for Spark SQL 5 | on_time_dataframe.registerTempTable("on_time_performance") 6 | 7 | # Compute a histogram of departure delays 8 | on_time_dataframe\ 9 | .select("DepDelay")\ 10 | .rdd\ 11 | .flatMap(lambda x: x)\ 12 | .histogram(10) 13 | 14 | import numpy as np 15 | import matplotlib.mlab as mlab 16 | import matplotlib.pyplot as plt 17 | 18 | # Function to plot a histogram using pyplot 19 | def create_hist(rdd_histogram_data): 20 | """Given 
an RDD.histogram, plot a pyplot histogram""" 21 | heights = np.array(rdd_histogram_data[1]) 22 | full_bins = rdd_histogram_data[0] 23 | mid_point_bins = full_bins[:-1] 24 | widths = [abs(i - j) for i, j in zip(full_bins[:-1], full_bins[1:])] 25 | bar = plt.bar(mid_point_bins, heights, width=widths, color='b') 26 | return bar 27 | 28 | # Compute a histogram of departure delays, this time with explicit bucket boundaries 29 | departure_delay_histogram = on_time_dataframe\ 30 | .select("DepDelay")\ 31 | .rdd\ 32 | .flatMap(lambda x: x)\ 33 | .histogram([-60,-30,-15,-10,-5,0,5,10,15,30,60,90,120,180]) 34 | 35 | create_hist(departure_delay_histogram) 36 | -------------------------------------------------------------------------------- /ch02/images/ads2_0201.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0201.png -------------------------------------------------------------------------------- /ch02/images/ads2_0202.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0202.png -------------------------------------------------------------------------------- /ch02/images/ads2_0209.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0209.png -------------------------------------------------------------------------------- /ch02/images/ads2_0211.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0211.png -------------------------------------------------------------------------------- /ch02/images/ads2_0212.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0212.png -------------------------------------------------------------------------------- /ch02/images/ads2_0215.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0215.png -------------------------------------------------------------------------------- /ch02/images/ads2_0217.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0217.png -------------------------------------------------------------------------------- /ch02/images/ads2_0219.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0219.png -------------------------------------------------------------------------------- /ch02/images/ads2_0220.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0220.png -------------------------------------------------------------------------------- 
/ch02/images/ads2_0401.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0401.png -------------------------------------------------------------------------------- /ch02/images/ads2_0402.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0402.png -------------------------------------------------------------------------------- /ch02/images/ads2_0403.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0403.png -------------------------------------------------------------------------------- /ch02/images/ads2_0405.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0405.png -------------------------------------------------------------------------------- /ch02/images/ads2_0406.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0406.png -------------------------------------------------------------------------------- /ch02/images/ads2_0408.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads2_0408.png -------------------------------------------------------------------------------- /ch02/images/ads_bootstrap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/ads_bootstrap.png -------------------------------------------------------------------------------- /ch02/images/faa_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/faa_table.png -------------------------------------------------------------------------------- /ch02/images/flask_terminal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/flask_terminal.png -------------------------------------------------------------------------------- /ch02/images/json.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch02/images/json.png -------------------------------------------------------------------------------- /ch02/load_on_time_performance.py: -------------------------------------------------------------------------------- 1 | # Load the parquet file containing flight delay records 2 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet') 3 | 4 | # Register the data for Spark SQL 5 | on_time_dataframe.registerTempTable("on_time_performance") 6 | 7 | # Check 
out the columns 8 | on_time_dataframe.columns 9 | 10 | # Check out some data 11 | on_time_dataframe\ 12 | .select("FlightDate", "TailNum", "Origin", "Dest", "Carrier", "DepDelay", "ArrDelay")\ 13 | .show() 14 | 15 | # Trim the fields and keep the result 16 | trimmed_on_time = on_time_dataframe\ 17 | .select( 18 | "FlightDate", 19 | "TailNum", 20 | "Origin", 21 | "Dest", 22 | "Carrier", 23 | "DepDelay", 24 | "ArrDelay" 25 | ) 26 | 27 | # Sample 0.01% of the data and show 28 | trimmed_on_time.sample(False, 0.0001).show() 29 | -------------------------------------------------------------------------------- /ch02/mongo.js: -------------------------------------------------------------------------------- 1 | db.test_collection.insert({'name': 'Russell Jurney', 'email': 'russell.jurney@gmail.com'}) 2 | db.test_collection.findOne({'name': 'Russell Jurney'}) 3 | -------------------------------------------------------------------------------- /ch02/pyspark_elasticsearch.py: -------------------------------------------------------------------------------- 1 | csv_lines = sc.textFile("data/example.csv") 2 | data = csv_lines.map(lambda line: line.split(",")) 3 | schema_data = data.map(lambda x: ('key', {'name': x[0], 'company': x[1], 'title': x[2]})) 4 | 5 | schema_data.saveAsNewAPIHadoopFile( 6 | path='-', 7 | outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat", 8 | keyClass="org.apache.hadoop.io.NullWritable", 9 | valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable", 10 | conf={ "es.resource" : "agile_data_science/executives" }) 11 | -------------------------------------------------------------------------------- /ch02/pyspark_mongodb.py: -------------------------------------------------------------------------------- 1 | # This code sample is meant to be executed line-by-line in a 2 | # pyspark session. 3 | # 4 | # Prior to launching pyspark, run the following line in the 5 | # shell where pyspark will be launched. 6 | # 7 | # export PYSPARK_DRIVER_PYTHON=ipython 8 | # 9 | # The pyspark launch command needs to have additional command line 10 | # arguments passed to ensure that Java classes used to connect to 11 | # MongoDB are found. 12 | # 13 | # The Java classes reside in JAR files that were 14 | # preinstalled via the bootstrap.sh script and placed in the 15 | # lib directory. You will need to note the version of the 16 | # libraries by inspecting the JAR filenames. For example, 17 | # if running the following shell command: 18 | # 19 | # $ ls Agile_Data_Code_2/lib/mongo*.jar 20 | # 21 | # yields the following listing: 22 | # 23 | # Agile_Data_Code_2/lib/mongo-hadoop-2.0.2.jar 24 | # Agile_Data_Code_2/lib/mongo-hadoop-spark-2.0.2.jar 25 | # Agile_Data_Code_2/lib/mongo-java-driver-3.6.1.jar 26 | # 27 | # then the mongo-hadoop version would be 2.0.2, and the 28 | # Mongo-Java version would be 3.6.1. 29 | # 30 | # Choosing to set these versions as environment variables 31 | # will make the invocation of the command much less error 32 | # prone. 33 | # 34 | # MONGOHADOOP_VERSION=2.0.2 35 | # MONGOJAVA_VERSION=3.6.1 36 | # 37 | # The names of the JAR files can then be pieced together 38 | # from the version strings. 
39 | # 40 | # MONGOHADOOPSPARK_JAR=./lib/mongo-hadoop-spark-$MONGOHADOOP_VERSION.jar 41 | # MONGOJAVADRIVER_JAR=./lib/mongo-java-driver-$MONGOJAVA_VERSION.jar 42 | # MONGOHADOOP_JAR=./lib/mongo-hadoop-$MONGOHADOOP_VERSION.jar 43 | # 44 | # You can then launch the pyspark session using the following 45 | # shell command from the Agile_Data_Code_2 directory: 46 | # 47 | # pyspark \ 48 | # --jars $MONGOHADOOPSPARK_JAR,$MONGOJAVADRIVER_JAR,$MONGOHADOOP_JAR \ 49 | # --driver-class-path $MONGOHADOOPSPARK_JAR:$MONGOJAVADRIVER_JAR:$MONGOHADOOP_JAR 50 | 51 | import pymongo_spark 52 | # Important: activate pymongo_spark. 53 | pymongo_spark.activate() 54 | 55 | csv_lines = sc.textFile("data/example.csv") 56 | data = csv_lines.map(lambda line: line.split(",")) 57 | schema_data = data.map(lambda x: {'name': x[0], 'company': x[1], 'title': x[2]}) 58 | schema_data.saveToMongoDB('mongodb://localhost:27017/agile_data_science.executives') 59 | 60 | -------------------------------------------------------------------------------- /ch02/pyspark_streaming.py: -------------------------------------------------------------------------------- 1 | import sys, os, re 2 | import json 3 | 4 | from pyspark import SparkContext, SparkConf 5 | from pyspark.streaming import StreamingContext 6 | from pyspark.streaming.kafka import KafkaUtils, OffsetRange, TopicAndPartition 7 | 8 | 9 | # Process data every 10 seconds 10 | PERIOD = 10 11 | BROKERS = 'localhost:9092' 12 | TOPIC = 'test' 13 | 14 | conf = SparkConf().set("spark.default.parallelism", 1) 15 | 16 | # Stop the default SparkContext before creating a new one. 17 | sc.stop() 18 | sc = SparkContext(appName="Agile Data Science: PySpark Streaming 'Hello, World!'", conf = conf) 19 | ssc = StreamingContext(sc, PERIOD) 20 | 21 | stream = KafkaUtils.createDirectStream( 22 | ssc, 23 | [TOPIC], 24 | { 25 | "metadata.broker.list": BROKERS, 26 | "group.id": "0", 27 | } 28 | ) 29 | object_stream = stream.map(lambda x: json.loads(x[1])) 30 | object_stream.pprint() 31 | 32 | ssc.start() 33 | -------------------------------------------------------------------------------- /ch02/pyspark_task_one.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys, os, re 4 | import json 5 | import datetime, iso8601 6 | 7 | # Pass date and base path to main() from airflow 8 | def main(iso_date, base_path): 9 | APP_NAME = "pyspark_task_one.py" 10 | 11 | # If there is no SparkSession, create the environment 12 | try: 13 | sc and spark 14 | except NameError as e: 15 | import findspark 16 | findspark.init() 17 | import pyspark 18 | import pyspark.sql 19 | 20 | sc = pyspark.SparkContext() 21 | spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate() 22 | 23 | # Get today's date 24 | today_dt = iso8601.parse_date(iso_date) 25 | rounded_today = today_dt.date() 26 | 27 | # Load today's data 28 | today_input_path = "{}/ch02/data/example_name_titles_daily.json/{}".format( 29 | base_path, 30 | rounded_today.isoformat() 31 | ) 32 | 33 | # Otherwise load the data and proceed... 34 | people_titles = spark.read.json(today_input_path) 35 | people_titles.show() 36 | 37 | # Group by as an RDD 38 | titles_by_name = people_titles.rdd.groupBy(lambda x: x["name"]) 39 | 40 | # Accept the group key/grouped data and concatenate the various titles... 
41 | # into a master title 42 | def concatenate_titles(people_titles): 43 | name = people_titles[0] 44 | title_records = people_titles[1] 45 | master_title = "" 46 | for title_record in sorted(title_records): 47 | title = title_record["title"] 48 | master_title += "{}, ".format(title) 49 | master_title = master_title[:-2] 50 | record = {"name": name, "master_title": master_title} 51 | return record 52 | 53 | people_with_concatenated_titles = titles_by_name.map(concatenate_titles) 54 | people_output_json = people_with_concatenated_titles.map(json.dumps) 55 | 56 | # Get today's output path 57 | today_output_path = "{}/ch02/data/example_master_titles_daily.json/{}".format( 58 | base_path, 59 | rounded_today.isoformat() 60 | ) 61 | 62 | # Write/replace today's output path 63 | os.system("rm -rf {}".format(today_output_path)) 64 | people_output_json.saveAsTextFile(today_output_path) 65 | 66 | if __name__ == "__main__": 67 | main(sys.argv[1], sys.argv[2]) 68 | -------------------------------------------------------------------------------- /ch02/pyspark_task_two.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys, os, re 4 | import json 5 | import datetime, iso8601 6 | 7 | # Pass date and base path to main() from airflow 8 | def main(iso_date, base_path): 9 | APP_NAME = "pyspark_task_two.py" 10 | 11 | # If there is no SparkSession, create the environment 12 | try: 13 | sc and spark 14 | except NameError as e: 15 | import findspark 16 | findspark.init() 17 | import pyspark 18 | import pyspark.sql 19 | 20 | sc = pyspark.SparkContext() 21 | spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate() 22 | 23 | import pymongo 24 | import pymongo_spark 25 | # Important: activate pymongo_spark. 26 | pymongo_spark.activate() 27 | 28 | # Get today's date 29 | today_dt = iso8601.parse_date(iso_date) 30 | rounded_today = today_dt.date() 31 | 32 | # Load today's data 33 | today_input_path = "{}/ch02/data/example_master_titles_daily.json/{}".format( 34 | base_path, 35 | rounded_today.isoformat() 36 | ) 37 | 38 | # Otherwise load the data and proceed... 
39 | people_master_titles_raw = sc.textFile(today_input_path) 40 | people_master_titles = people_master_titles_raw.map(json.loads) 41 | print(people_master_titles.first()) 42 | 43 | people_master_titles.saveToMongoDB('mongodb://localhost:27017/agile_data_science.people_master_titles') 44 | 45 | if __name__ == "__main__": 46 | main(sys.argv[1], sys.argv[2]) 47 | -------------------------------------------------------------------------------- /ch02/python_kafka.py: -------------------------------------------------------------------------------- 1 | import sys, os, re 2 | import json 3 | 4 | from kafka import KafkaConsumer, TopicPartition 5 | consumer = KafkaConsumer() 6 | consumer.assign([TopicPartition('test', 0)]) 7 | consumer.seek_to_beginning() 8 | 9 | for message in consumer: 10 | message_bytes = message.value 11 | message_string = message_bytes.decode() 12 | message_object = json.loads(message_string) 13 | print(message_object) 14 | 15 | -------------------------------------------------------------------------------- /ch02/setup_airflow_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | sudo ln -s $PROJECT_HOME/ch02/airflow_test.py ~/airflow/dags/ 4 | -------------------------------------------------------------------------------- /ch02/spark.py: -------------------------------------------------------------------------------- 1 | # Load the text file using the SparkContext 2 | csv_lines = sc.textFile("data/example.csv") 3 | 4 | # Map the data to split the lines into a list 5 | data = csv_lines.map(lambda line: line.split(",")) 6 | 7 | # Collect the dataset into local RAM 8 | data.collect() 9 | -------------------------------------------------------------------------------- /ch02/sql.py: -------------------------------------------------------------------------------- 1 | csv_lines = sc.textFile("data/example.csv") 2 | 3 | from pyspark.sql import Row 4 | 5 | # Convert the CSV into a pyspark.sql.Row 6 | def csv_to_row(line): 7 | parts = line.split(",") 8 | row = Row( 9 | name=parts[0], 10 | company=parts[1], 11 | title=parts[2] 12 | ) 13 | return row 14 | 15 | # Apply the function to get rows in an RDD 16 | rows = csv_lines.map(csv_to_row) 17 | 18 | # Convert to a pyspark.sql.DataFrame 19 | rows_df = rows.toDF() 20 | 21 | # Register the DataFrame for Spark SQL 22 | rows_df.registerTempTable("executives") 23 | 24 | # Generate a new DataFrame with SQL using the SparkSession 25 | job_counts = spark.sql(""" 26 | SELECT 27 | name, 28 | COUNT(*) AS total 29 | FROM executives 30 | GROUP BY name 31 | """) 32 | job_counts.show() 33 | 34 | # Go back to an RDD 35 | job_counts.rdd.collect() 36 | -------------------------------------------------------------------------------- /ch02/test_elasticsearch.py: -------------------------------------------------------------------------------- 1 | from pyelasticsearch import ElasticSearch 2 | es = ElasticSearch('http://localhost:9200/') 3 | es.search('name:Russell', index='agile_data_science') -------------------------------------------------------------------------------- /ch02/test_elasticsearch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -XPOST 'localhost:9200/agile_data_science/_search?pretty' -d ' 4 | { 5 | "query": { "match_all": {} } 6 | } 7 | ' 8 | -------------------------------------------------------------------------------- /ch02/test_json.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # How to read and write JSON and JSON Lines files using Python 4 | # 5 | import sys, os, re 6 | import json 7 | import codecs 8 | 9 | ary_of_objects = [ 10 | {'name': 'Russell Jurney', 'title': 'CEO'}, 11 | {'name': 'Muhammad Imran', 'title': 'VP of Marketing'}, 12 | {'name': 'Fe Mata', 'title': 'Chief Marketing Officer'}, 13 | ] 14 | 15 | path = "/tmp/test.jsonl" 16 | 17 | # 18 | # Write our objects to jsonl 19 | # 20 | f = codecs.open(path, 'w', 'utf-8') 21 | for row_object in ary_of_objects: 22 | # ensure_ascii=False is essential or errors/corruption will occur 23 | json_record = json.dumps(row_object, ensure_ascii=False) 24 | f.write(json_record + "\n") 25 | f.close() 26 | 27 | print("Wrote JSON Lines file /tmp/test.jsonl") 28 | 29 | # 30 | # Read this jsonl file back into objects 31 | # 32 | ary_of_objects = [] 33 | f = codecs.open(path, "r", "utf-8") 34 | for line in f: 35 | record = json.loads(line.rstrip("\n|\r")) 36 | ary_of_objects.append(record) 37 | print(ary_of_objects) 38 | print("Read JSON Lines file /tmp/test.jsonl") 39 | 40 | # 41 | # Utility functions to read and write json and jsonl files 42 | # 43 | def write_json_file(obj, path): 44 | '''Dump an object and write it out as json to a file.''' 45 | f = codecs.open(path, 'w', 'utf-8') 46 | f.write(json.dumps(obj, ensure_ascii=False)) 47 | f.close() 48 | 49 | def write_json_lines_file(ary_of_objects, path): 50 | '''Dump a list of objects out as a json lines file.''' 51 | f = codecs.open(path, 'w', 'utf-8') 52 | for row_object in ary_of_objects: 53 | json_record = json.dumps(row_object, ensure_ascii=False) 54 | f.write(json_record + "\n") 55 | f.close() 56 | 57 | def read_json_file(path): 58 | '''Turn a normal json file (no CRs per record) into an object.''' 59 | text = codecs.open(path, 'r', 'utf-8').read() 60 | return json.loads(text) 61 | 62 | def read_json_lines_file(path): 63 | '''Turn a json cr file (CRs per record) into an array of objects''' 64 | ary = [] 65 | f = codecs.open(path, "r", "utf-8") 66 | for line in f: 67 | record = json.loads(line.rstrip("\n|\r")) 68 | ary.append(record) 69 | return ary 70 | -------------------------------------------------------------------------------- /ch02/test_pymongo.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | client = MongoClient() 3 | db = client.agile_data_science 4 | list(db.executives.find({"name": "Russell Jurney"})) 5 | -------------------------------------------------------------------------------- /ch02/test_pymongo_2.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | 3 | client = MongoClient() 4 | 5 | record = {"foo": "bar"} 6 | 7 | client.agile_data_science.collection_two.insert_one(record) 8 | 9 | # Read the record back from the same collection we wrote to 10 | record2 = client.agile_data_science.collection_two.find_one( 11 | { 12 | "foo": "bar" 13 | } 14 | ) 15 | -------------------------------------------------------------------------------- /ch02/web/flask_pymongo.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request 2 | from pymongo import MongoClient 3 | import bson.json_util 4 | 5 | # Set up Flask 6 | app = Flask(__name__) 7 | 8 | # Set up Mongo 9 | client = MongoClient() # defaults to localhost 10 | db = client.agile_data_science 11 | 12 | # Fetch an executive's records, given their name 13 | 
@app.route("/executive/") 14 | def executive(name): 15 | executive = db.executives.find({"name": name}) 16 | return bson.json_util.dumps(list(executive)) 17 | 18 | 19 | if __name__ == "__main__": 20 | app.run(debug=True, host="0.0.0.0") 21 | -------------------------------------------------------------------------------- /ch02/web/templates/table.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Agile Data Science 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 | 16 | 17 |
18 | 21 |

Executives

22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | {% for executive in executives -%} 30 | 31 | 32 | 33 | 34 | 35 | {% endfor -%} 36 | 37 |
NameCompanyTitle
{{executive.name}}{{executive.company}}{{executive.title}}
38 |
39 | 40 |
41 |
42 | 43 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /ch02/web/test_flask.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request 2 | 3 | 4 | app = Flask(__name__) 5 | 6 | 7 | @app.route("/") 8 | def hello(input): 9 | return input 10 | 11 | 12 | if __name__ == "__main__": 13 | app.run(debug=True, host="0.0.0.0") 14 | -------------------------------------------------------------------------------- /ch02/web/test_flask_bootstrap.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, request 2 | from pymongo import MongoClient 3 | import bson.json_util 4 | 5 | # Set up Flask 6 | app = Flask(__name__) 7 | 8 | # Set up Mongo 9 | client = MongoClient("mongo") # defaults to localhost 10 | db = client.agile_data_science 11 | 12 | # Fetch from/to totals, given a pair of email addresses 13 | @app.route("/executive/") 14 | def executive(name): 15 | executives = db.executives.find({"name": name}) 16 | return render_template("table.html", executives=list(executives)) 17 | 18 | 19 | if __name__ == "__main__": 20 | app.run(debug=True, host="0.0.0.0") 21 | -------------------------------------------------------------------------------- /ch02/web/test_flask_pymongo.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request 2 | from pymongo import MongoClient 3 | import bson.json_util 4 | 5 | # Set up Flask 6 | app = Flask(__name__) 7 | 8 | # Set up Mongo 9 | client = MongoClient("mongo") # defaults to localhost 10 | db = client.agile_data_science 11 | 12 | # Fetch from/to totals, given a pair of email addresses 13 | @app.route("/executive/") 14 | def executive(name): 15 | executive = db.executives.find({"name": name}) 16 | return bson.json_util.dumps(list(executive)) 17 | 18 | 19 | if __name__ == "__main__": 20 | app.run(debug=True, host="0.0.0.0") 21 | -------------------------------------------------------------------------------- /ch04/convert_data.py: -------------------------------------------------------------------------------- 1 | # Loads CSV with header parsing and type inference, in one line! 
2 | on_time_dataframe = spark.read.format('com.databricks.spark.csv')\ 3 | .options( 4 | header='true', 5 | treatEmptyValuesAsNulls='true', 6 | )\ 7 | .load('data/On_Time_On_Time_Performance_2015.csv.bz2') 8 | on_time_dataframe.registerTempTable("on_time_performance") 9 | 10 | trimmed_cast_performance = spark.sql(""" 11 | SELECT 12 | Year, Quarter, Month, DayofMonth, DayOfWeek, FlightDate, 13 | Carrier, TailNum, FlightNum, 14 | Origin, OriginCityName, OriginState, 15 | Dest, DestCityName, DestState, 16 | DepTime, cast(DepDelay as float), cast(DepDelayMinutes as int), 17 | cast(TaxiOut as float), cast(TaxiIn as float), 18 | WheelsOff, WheelsOn, 19 | ArrTime, cast(ArrDelay as float), cast(ArrDelayMinutes as float), 20 | cast(Cancelled as int), cast(Diverted as int), 21 | cast(ActualElapsedTime as float), cast(AirTime as float), 22 | cast(Flights as int), cast(Distance as float), 23 | cast(CarrierDelay as float), cast(WeatherDelay as float), cast(NASDelay as float), 24 | cast(SecurityDelay as float), cast(LateAircraftDelay as float), 25 | CRSDepTime, CRSArrTime 26 | FROM 27 | on_time_performance 28 | """) 29 | 30 | # Replace on_time_performance table with our new, trimmed table and show its contents 31 | trimmed_cast_performance.registerTempTable("on_time_performance") 32 | trimmed_cast_performance.show() 33 | 34 | # Verify we can sum numeric columns 35 | spark.sql("""SELECT 36 | SUM(WeatherDelay), SUM(CarrierDelay), SUM(NASDelay), 37 | SUM(SecurityDelay), SUM(LateAircraftDelay) 38 | FROM on_time_performance 39 | """).show() 40 | 41 | # Save records as gzipped json lines 42 | trimmed_cast_performance.toJSON()\ 43 | .saveAsTextFile( 44 | 'data/on_time_performance.jsonl.gz', 45 | 'org.apache.hadoop.io.compress.GzipCodec' 46 | ) 47 | 48 | # View records on filesystem 49 | # gunzip -c data/on_time_performance.jsonl.gz/part-00000.gz | head 50 | 51 | # Save records using Parquet 52 | trimmed_cast_performance.write.mode("overwrite").parquet("data/on_time_performance.parquet") 53 | 54 | # Load JSON records back 55 | on_time_dataframe = spark.read.json('data/on_time_performance.jsonl.gz') 56 | on_time_dataframe.show() 57 | 58 | # Load the parquet file back 59 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet') 60 | on_time_dataframe.show() 61 | -------------------------------------------------------------------------------- /ch04/download.sh: -------------------------------------------------------------------------------- 1 | # 2 | # Script to download data for book 3 | # 4 | mkdir ../data 5 | 6 | # Get on-time records for all flights in 2015 - 273MB 7 | # wget -P ../data/ http://s3.amazonaws.com/agile_data_science/On_Time_On_Time_Performance_2015.csv.gz 8 | 9 | # Get openflights data 10 | wget -P /tmp/ https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat 11 | mv /tmp/airports.dat ../data/airports.csv 12 | 13 | wget -P /tmp/ https://raw.githubusercontent.com/jpatokal/openflights/master/data/airlines.dat 14 | mv /tmp/airlines.dat ../data/airlines.csv 15 | 16 | wget -P /tmp/ https://raw.githubusercontent.com/jpatokal/openflights/master/data/routes.dat 17 | mv /tmp/routes.dat ../data/routes.csv 18 | 19 | wget -P /tmp/ https://raw.githubusercontent.com/jpatokal/openflights/master/data/countries.dat 20 | mv /tmp/countries.dat ../data/countries.csv 21 | 22 | # Get FAA data 23 | wget -P ../data/ http://av-info.faa.gov/data/ACRef/tab/aircraft.txt 24 | wget -P ../data/ http://av-info.faa.gov/data/ACRef/tab/ata.txt 25 | wget -P ../data/ 
http://av-info.faa.gov/data/ACRef/tab/compt.txt 26 | wget -P ../data/ http://av-info.faa.gov/data/ACRef/tab/engine.txt 27 | wget -P ../data/ http://av-info.faa.gov/data/ACRef/tab/prop.txt 28 | 29 | # Get Aircraft database 30 | # wget -P /tmp/ http://registry.faa.gov/database/AR042016.zip 31 | # unzip -d ../data/ /tmp/AR042016.zip 32 | 33 | # Get FAA Registration data 34 | # wget -P /tmp/ http://registry.faa.gov/database/AR042016.zip 35 | # unzip -d ../data/ /tmp/AR042016.zip 36 | -------------------------------------------------------------------------------- /ch04/images/ads2_0201.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0201.png -------------------------------------------------------------------------------- /ch04/images/ads2_0202.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0202.png -------------------------------------------------------------------------------- /ch04/images/ads2_0209.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0209.png -------------------------------------------------------------------------------- /ch04/images/ads2_0211.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0211.png -------------------------------------------------------------------------------- /ch04/images/ads2_0212.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0212.png -------------------------------------------------------------------------------- /ch04/images/ads2_0215.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0215.png -------------------------------------------------------------------------------- /ch04/images/ads2_0217.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0217.png -------------------------------------------------------------------------------- /ch04/images/ads2_0219.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0219.png -------------------------------------------------------------------------------- /ch04/images/ads2_0220.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0220.png -------------------------------------------------------------------------------- /ch04/images/ads2_0401.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0401.png -------------------------------------------------------------------------------- /ch04/images/ads2_0402.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0402.png -------------------------------------------------------------------------------- /ch04/images/ads2_0403.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0403.png -------------------------------------------------------------------------------- /ch04/images/ads2_0405.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0405.png -------------------------------------------------------------------------------- /ch04/images/ads2_0406.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0406.png -------------------------------------------------------------------------------- /ch04/images/ads2_0408.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0408.png -------------------------------------------------------------------------------- /ch04/images/ads2_0409.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads2_0409.png -------------------------------------------------------------------------------- /ch04/images/ads_bootstrap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ads_bootstrap.png -------------------------------------------------------------------------------- /ch04/images/ags2_0402.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/ags2_0402.png -------------------------------------------------------------------------------- /ch04/images/airline_data_fields.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/airline_data_fields.png -------------------------------------------------------------------------------- /ch04/images/faa_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/faa_table.png -------------------------------------------------------------------------------- /ch04/images/flask_terminal.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/flask_terminal.png -------------------------------------------------------------------------------- /ch04/images/json.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/json.png -------------------------------------------------------------------------------- /ch04/images/parquet_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/parquet_logo.jpg -------------------------------------------------------------------------------- /ch04/images/row_format_column_format.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch04/images/row_format_column_format.png -------------------------------------------------------------------------------- /ch04/load_on_time_pyspark.py: -------------------------------------------------------------------------------- 1 | # Loads CSV with header parsing and type inference, in one line! 2 | # Must use 'pyspark --packages com.databricks:spark-csv_2.10:1.4.0' for this to work 3 | on_time_dataframe = spark.read.format('com.databricks.spark.csv')\ 4 | .options(header='true', inferschema='true')\ 5 | .load('data/On_Time_On_Time_Performance_2015.csv.bz2') 6 | 7 | # Check out the data - very wide so hard to see 8 | on_time_dataframe.show() 9 | 10 | # Use SQL to query data - what airport pairs have the most flights? 11 | on_time_dataframe.registerTempTable("on_time_dataframe") 12 | airport_pair_totals = spark.sql("""SELECT 13 | Origin, Dest, COUNT(*) AS total 14 | FROM on_time_dataframe 15 | GROUP BY Origin, Dest 16 | ORDER BY total DESC""" 17 | ) 18 | 19 | # Use dataflows 20 | airport_pair_totals.limit(10).show() 21 | 22 | # We can go back and forth as we see fit! 
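# For example -- an illustrative sketch, not part of the original script -- we can
# drop from the SQL result to the RDD API for a quick reshaping, then come right
# back to a DataFrame for display:
top_routes = airport_pair_totals.limit(10)
route_labels = top_routes.rdd.map(
    lambda row: {'Route': '{} -> {}'.format(row.Origin, row.Dest), 'Total': row.total}
)
route_labels.toDF().show()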
23 | 
24 | 
--------------------------------------------------------------------------------
/ch04/mongo.js:
--------------------------------------------------------------------------------
1 | db.on_time_performance.findOne({Carrier: 'DL', FlightDate: '2015-01-01', FlightNum: 478}) // Slow
2 | db.on_time_performance.ensureIndex({Carrier: 1, FlightDate: 1, FlightNum: 1})
3 | db.on_time_performance.findOne({Carrier: 'DL', FlightDate: '2015-01-01', FlightNum: 478}) // Fast
4 | 
5 | db.on_time_performance.find({Origin: 'ATL', Dest: 'SFO', FlightDate: '2015-01-01'}).sort({DepTime: 1, ArrTime: 1}) // Slow or broken
6 | db.on_time_performance.ensureIndex({Origin: 1, Dest: 1, FlightDate: 1}) // Build a compound index to support the query
7 | db.on_time_performance.find({Origin: 'ATL', Dest: 'SFO', FlightDate: '2015-01-01'}).sort({DepTime: 1, ArrTime: 1}) // Fast
8 | 
--------------------------------------------------------------------------------
/ch04/pyspark_to_elasticsearch.py:
--------------------------------------------------------------------------------
1 | # Load the parquet file
2 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet')
3 | 
4 | # Save the DataFrame to Elasticsearch
5 | on_time_dataframe.write.format("org.elasticsearch.spark.sql")\
6 |   .option("es.resource","agile_data_science/on_time_performance")\
7 |   .option("es.batch.size.entries","100")\
8 |   .mode("overwrite")\
9 |   .save()
10 | 
11 | # Older alternative: format data for Elasticsearch as a tuple with a dummy key in the first field
12 | # on_time_performance = on_time_dataframe.rdd.map(lambda x: ('ignored_key', x.asDict()))
13 | #
14 | # on_time_performance.saveAsNewAPIHadoopFile(
15 | #   path='-',
16 | #   outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
17 | #   keyClass="org.apache.hadoop.io.NullWritable",
18 | #   valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
19 | #   conf={ "es.resource" : "agile_data_science/on_time_performance" })
--------------------------------------------------------------------------------
/ch04/pyspark_to_mongo.py:
--------------------------------------------------------------------------------
1 | import pymongo
2 | import pymongo_spark
3 | # Important: activate pymongo_spark.
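# (activate() monkey-patches Spark's RDD class so that the saveToMongoDB()
#  method used below becomes available on plain RDDs.)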
4 | pymongo_spark.activate()
5 | 
6 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet')
7 | 
8 | # Note we have to convert the row to a dict to avoid https://jira.mongodb.org/browse/HADOOP-276
9 | as_dict = on_time_dataframe.rdd.map(lambda row: row.asDict())
10 | as_dict.saveToMongoDB('mongodb://localhost:27017/agile_data_science.on_time_performance')
11 | 
--------------------------------------------------------------------------------
/ch04/web/config.py:
--------------------------------------------------------------------------------
1 | # config.py, a configuration file for the ch04 web app
2 | RECORDS_PER_PAGE = 15
3 | ELASTIC_URL = "http://elastic:9200"
4 | 
--------------------------------------------------------------------------------
/ch04/web/on_time_flask.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, render_template, request
2 | from pymongo import MongoClient
3 | from bson import json_util
4 | 
5 | 
6 | # Set up Flask and Mongo
7 | app = Flask(__name__)
8 | client = MongoClient("mongo")
9 | 
10 | 
11 | # Controller: fetch a flight record and display it
12 | @app.route("/on_time_performance")
13 | def on_time_performance():
14 | 
15 |   carrier = request.args.get("Carrier")
16 |   flight_date = request.args.get("FlightDate")
17 |   flight_num = request.args.get("FlightNum")  # cast with int() if FlightNum was stored as a number
18 | 
19 |   flight = client.agile_data_science.on_time_performance.find_one(
20 |     {"Carrier": carrier, "FlightDate": flight_date, "FlightNum": flight_num}
21 |   )
22 | 
23 |   print(flight)
24 | 
25 |   return json_util.dumps(flight)
26 | 
27 | 
28 | if __name__ == "__main__":
29 | 
30 |   app.run(debug=True, host="0.0.0.0")
31 | 
--------------------------------------------------------------------------------
/ch04/web/templates/flight.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <div>
4 |   <p class="lead">Flight {{flight.FlightNum}}</p>
5 |   <table class="table">
6 |     <thead>
7 |       <tr>
8 |         <th>Airline</th>
9 |         <th>Origin</th>
10 |         <th>Destination</th>
11 |         <th>Tail Number</th>
12 |         <th>Date</th>
13 |         <th>Air Time</th>
14 |         <th>Distance</th>
15 |       </tr>
16 |     </thead>
17 |     <tbody>
18 |       <tr>
19 |         <td>{{flight.Carrier}}</td>
20 |         <td>{{flight.Origin}}</td>
21 |         <td>{{flight.Dest}}</td>
22 |         <td>{{flight.TailNum}}</td>
23 |         <td>{{flight.FlightDate}}</td>
24 |         <td>{{flight.AirTime}}</td>
25 |         <td>{{flight.Distance}}</td>
26 |       </tr>
27 |     </tbody> </table> </div>
28 | {% endblock %}
--------------------------------------------------------------------------------
/ch04/web/templates/flights.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <div>
4 |   <p class="lead">{{flight_count}} Flights on {{flight_date}}</p>
5 |   <table class="table">
6 |     <thead>
7 |       <tr>
8 |         <th>Airline</th>
9 |         <th>Flight Number</th>
10 |         <th>Origin</th>
11 |         <th>Destination</th>
12 |         <th>Departure Time</th>
13 |         <th>Tail Number</th>
14 |         <th>Air Time</th>
15 |         <th>Distance</th>
16 |       </tr>
17 |     </thead>
18 |     <tbody>
19 |     {% for flight in flights %}
20 |       <tr>
21 |         <td>{{flight.Carrier}}</td>
22 |         <td>{{flight.FlightNum}}</td>
23 |         <td>{{flight.Origin}}</td>
24 |         <td>{{flight.Dest}}</td>
25 |         <td>{{flight.DepTime}}</td>
26 |         <td>{{flight.TailNum}}</td>
27 |         <td>{{flight.AirTime}}</td>
28 |         <td>{{flight.Distance}}</td>
29 |       </tr>
30 |     {% endfor %}
31 |     </tbody> </table> </div>
32 | {% import "macros.jnj" as common %}
33 | {% if nav_offsets and nav_path -%}
34 |   {{ common.display_nav(nav_offsets, nav_path, flight_count, query)|safe }}
35 | {% endif -%}
36 | {% endblock %}
--------------------------------------------------------------------------------
/ch04/web/templates/layout.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html lang="en">
3 | <head>
4 |   <meta charset="utf-8">
5 |   <title>Agile Data Science</title>
6 |   <meta name="viewport" content="width=device-width, initial-scale=1">
7 |   <!-- Bootstrap styles served from /static -->
8 |   <link href="/static/bootstrap.min.css" rel="stylesheet">
9 |   <link href="/static/bootstrap-theme.min.css" rel="stylesheet">
10 |   <style>
11 |     body { padding-top: 20px; }
12 |   </style>
13 | </head>
14 | <body>
15 |   <div class="container">
16 |     <div class="row">
17 |       <div class="col-md-12">
18 | 
19 | 
20 | 
21 |         {% block body %}{% endblock %}
22 |       </div>
23 |     </div>
24 |   </div>
25 | 
26 |   <!-- JavaScript served from /static: d3 and Bootstrap -->
27 |   <script src="/static/d3.v3.min.js"></script>
28 |   <script src="/static/bootstrap.min.js"></script>
29 | 
30 | 
31 | 
32 | 
33 | </body>
34 | </html>
--------------------------------------------------------------------------------
/ch04/web/templates/macros.jnj:
--------------------------------------------------------------------------------
1 | ;
2 | {% macro display_nav(offsets, path, count, query) -%}
3 | <div>
4 |   {% for key, values in offsets.items() -%}
5 |     {%- if values['bottom_offset'] >= 0 and values['top_offset'] > 0 and count > values['bottom_offset'] -%}
6 |       <a href="{{ path }}?start={{ values['bottom_offset'] }}&end={{ values['top_offset'] }}">{{ key }}</a>
7 |     {% else -%}
8 |       {{ key }}
9 |     {% endif %}
10 |   {% endfor -%}
11 | </div>
12 | {%- endmacro %}
13 | 
--------------------------------------------------------------------------------
/ch04/web/templates/search.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <div>
4 |   <p class="lead">{{flight_count['value']}} Flights</p>
5 |   <form method="get" action="/flights/search">
6 |     <input type="text" name="Carrier" placeholder="Carrier" />
7 | 
8 |     <input type="text" name="FlightDate" placeholder="FlightDate" />
9 | 
10 |     <input type="text" name="Origin" placeholder="Origin" />
11 | 
12 |     <input type="text" name="Dest" placeholder="Dest" />
13 | 
14 |     <input type="text" name="TailNum" placeholder="TailNum" />
15 | 
16 |     <input type="text" name="FlightNum" placeholder="FlightNum" />
17 | 
18 |     <input type="submit" value="Search" class="btn btn-default" />
19 |   </form>
20 | 
21 | 
22 | 
23 | 
24 | 
25 |   <table class="table">
26 |     <thead>
27 |       <tr>
28 |         <th>Airline</th>
29 |         <th>Flight Number</th>
30 |         <th>Origin</th>
31 |         <th>Destination</th>
32 |         <th>Date</th>
33 |         <th>Departure Time</th>
34 |         <th>Tail Number</th>
35 |         <th>Air Time</th>
36 |         <th>Distance</th>
37 |       </tr>
38 |     </thead>
39 |     <tbody>
40 |     {% for flight in flights %}
41 |       <tr>
42 |         <td>{{flight.Carrier}}</td>
43 |         <td><a href="/on_time_performance?Carrier={{flight.Carrier}}&FlightDate={{flight.FlightDate}}&FlightNum={{flight.FlightNum}}">{{flight.FlightNum}}</a></td>
44 |         <td>{{flight.Origin}}</td>
45 |         <td>{{flight.Dest}}</td>
46 |         <td>{{flight.FlightDate}}</td>
47 |         <td>{{flight.DepTime}}</td>
48 |         <td>{{flight.TailNum}}</td>
49 |         <td>{{flight.AirTime}}</td>
50 |         <td>{{flight.Distance}}</td>
51 |       </tr>
52 |     {% endfor %}
53 |     </tbody>
54 |   </table>
55 | </div>
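{# Paging: display_nav() from macros.jnj renders the Previous/Next links below,
   driven by the nav_offsets and nav_path values the controller passes in. #}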
56 | 
57 | {% import "macros.jnj" as common %}
58 | {% if nav_offsets and nav_path -%}
59 |   {{ common.display_nav(nav_offsets, nav_path, flight_count)|safe }}
60 | {% endif -%}
61 | 
62 | {% endblock %}
--------------------------------------------------------------------------------
/ch05/assess_airplanes.py:
--------------------------------------------------------------------------------
1 | # Load the parquet file
2 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet')
3 | on_time_dataframe.registerTempTable("on_time_performance")
4 | 
5 | # Dump the unneeded fields and drop empty tail numbers
6 | tail_numbers = on_time_dataframe.rdd.map(lambda x: x.TailNum)
7 | tail_numbers = tail_numbers.filter(lambda x: x != '')
8 | 
9 | # distinct() gets us unique tail numbers
10 | unique_tail_numbers = tail_numbers.distinct()
11 | 
12 | # now we need a count() of unique tail numbers
13 | airplane_count = unique_tail_numbers.count()
14 | print("Total airplanes: {}".format(airplane_count))
15 | 
--------------------------------------------------------------------------------
/ch05/assess_faa.py:
--------------------------------------------------------------------------------
1 | # Load the FAA N-Number Inquiry Records
2 | faa_tail_number_inquiry = spark.read.json('data/faa_tail_number_inquiry.jsonl')
3 | faa_tail_number_inquiry.show()
4 | 
5 | # Count the records
6 | faa_tail_number_inquiry.count()
7 | 
8 | # Load our unique tail numbers
9 | unique_tail_numbers = spark.read.json('data/tail_numbers.jsonl')
10 | unique_tail_numbers.show()
11 | 
12 | # Left outer join tail numbers to our inquiries to see how many came through
13 | tail_num_plus_inquiry = unique_tail_numbers.join(
14 |   faa_tail_number_inquiry,
15 |   unique_tail_numbers.TailNum == faa_tail_number_inquiry.TailNum,
16 |   'left_outer'
17 | )
18 | tail_num_plus_inquiry.show()
19 | 
20 | # Now compute the total records and the successfully joined records
21 | total_records = tail_num_plus_inquiry.count()
22 | join_hits = tail_num_plus_inquiry.filter(
23 |   tail_num_plus_inquiry.owner.isNotNull()
24 | ).count()
25 | 
26 | # This being Python, we can now compute and print a join percent...
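# (Aside, an equivalent not used by the original: COUNT(column) skips NULLs in
#  SQL, so after registering tail_num_plus_inquiry as a temp table the same
#  figure could come from
#  SELECT ROUND(100.0 * COUNT(owner) / COUNT(*), 2) FROM tail_num_plus_inquiry.)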
27 | hit_ratio = float(join_hits)/float(total_records) 28 | hit_pct = hit_ratio * 100 29 | print("Successful joins: {:.2f}%".format(hit_pct)) 30 | -------------------------------------------------------------------------------- /ch05/extract_airplanes.py: -------------------------------------------------------------------------------- 1 | # Load the parquet file 2 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet') 3 | on_time_dataframe.registerTempTable("on_time_performance") 4 | 5 | # Filter down to the fields we need to identify and link to a flight 6 | flights = on_time_dataframe.rdd.map( 7 | lambda x: 8 | { 9 | 'Carrier': x.Carrier, 10 | 'FlightDate': x.FlightDate, 11 | 'FlightNum': x.FlightNum, 12 | 'Origin': x.Origin, 13 | 'Dest': x.Dest, 14 | 'TailNum': x.TailNum 15 | } 16 | ) 17 | flights.first() 18 | 19 | # Group flights by tail number, sorted by flight number, date, then origin/dest 20 | flights_per_airplane = flights\ 21 | .map(lambda record: (record['TailNum'], [record]))\ 22 | .reduceByKey(lambda a, b: a + b)\ 23 | .map(lambda tuple: 24 | { 25 | 'TailNum': tuple[0], 26 | 'Flights': sorted(tuple[1], key=lambda x: (x['FlightNum'], x['FlightDate'], x['Origin'], x['Dest'])) 27 | } 28 | ) 29 | flights_per_airplane.first() 30 | 31 | # Save to Mongo 32 | import pymongo_spark 33 | pymongo_spark.activate() 34 | flights_per_airplane.saveToMongoDB('mongodb://localhost:27017/agile_data_science.flights_per_airplane') 35 | -------------------------------------------------------------------------------- /ch05/images/ads2_0501.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/ads2_0501.png -------------------------------------------------------------------------------- /ch05/images/ads2_0502.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/ads2_0502.png -------------------------------------------------------------------------------- /ch05/images/ads2_0503.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/ads2_0503.png -------------------------------------------------------------------------------- /ch05/images/ads2_0504.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/ads2_0504.png -------------------------------------------------------------------------------- /ch05/images/ads2_0505.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/ads2_0505.png -------------------------------------------------------------------------------- /ch05/images/ads2_0507.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/ads2_0507.png -------------------------------------------------------------------------------- /ch05/images/ads2_0508.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/ads2_0508.png -------------------------------------------------------------------------------- /ch05/images/ads2_0509.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/ads2_0509.png -------------------------------------------------------------------------------- /ch05/images/ads2_0510.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/ads2_0510.png -------------------------------------------------------------------------------- /ch05/images/ads2_0511.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/ads2_0511.png -------------------------------------------------------------------------------- /ch05/images/ads2_0512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/ads2_0512.png -------------------------------------------------------------------------------- /ch05/images/first_order_form.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/first_order_form.png -------------------------------------------------------------------------------- /ch05/images/flight_search_with_tail_num_link.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/flight_search_with_tail_num_link.png -------------------------------------------------------------------------------- /ch05/images/mapreduce.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/mapreduce.png -------------------------------------------------------------------------------- /ch05/images/total_flights_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/images/total_flights_2.png -------------------------------------------------------------------------------- /ch05/install.sh: -------------------------------------------------------------------------------- 1 | # Get bootstrap 2 | mkdir web/static 3 | cd web/static 4 | wget 'https://code.jquery.com/jquery-1.12.2.min.js' 5 | wget 'https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css' 6 | wget 'https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap-theme.min.css' 7 | wget 'https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js' 8 | wget 'http://d3js.org/d3.v3.min.js' 9 | wget 'https://cdn.rawgit.com/novus/nvd3/v1.8.1/build/nv.d3.min.js' 10 | wget 'https://cdn.rawgit.com/novus/nvd3/v1.8.1/build/nv.d3.css' 11 | cd ../.. 
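# Optional sanity check (not part of the original script): confirm the assets landed
ls -lh web/static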
12 | -------------------------------------------------------------------------------- /ch05/mongo.js: -------------------------------------------------------------------------------- 1 | db.flights_per_airplane.findOne({TailNum: 'N249AU'}) 2 | 3 | db.flights_per_airplane.ensureIndex({TailNum: 1}) 4 | 5 | db.flights_per_airplane.findOne({TailNum: 'N249AU'}) 6 | -------------------------------------------------------------------------------- /ch05/save_tail_numbers.py: -------------------------------------------------------------------------------- 1 | # Load the parquet file 2 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet') 3 | on_time_dataframe.registerTempTable("on_time_performance") 4 | 5 | # Dump the unneeded fields and filter nulls 6 | tail_numbers = on_time_dataframe.rdd.map(lambda x: x.TailNum) 7 | tail_numbers = tail_numbers.filter(lambda x: x != '') 8 | 9 | # distinct() gets us unique tail numbers 10 | unique_tail_numbers = tail_numbers.distinct() 11 | 12 | # Store as JSON objects via a dataframe. Repartition to 1 to get 1 json file. 13 | unique_records = unique_tail_numbers.map(lambda x: {'TailNum': x}).toDF() 14 | unique_records.repartition(1).write.mode("overwrite").json("data/tail_numbers.json") 15 | 16 | # Now from bash: ls data/tail_numbers.json/part* 17 | -------------------------------------------------------------------------------- /ch05/total_flights.py: -------------------------------------------------------------------------------- 1 | # Load the parquet file 2 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet') 3 | 4 | # Use SQL to look at the total flights by month across 2015 5 | on_time_dataframe.registerTempTable("on_time_dataframe") 6 | total_flights_by_month = spark.sql( 7 | """SELECT Month, Year, COUNT(*) AS total_flights 8 | FROM on_time_dataframe 9 | GROUP BY Year, Month 10 | ORDER BY Year, Month""" 11 | ) 12 | 13 | # This map/asDict trick makes the rows print a little prettier. It is optional. 14 | flights_chart_data = total_flights_by_month.rdd.map(lambda row: row.asDict()) 15 | flights_chart_data.collect() 16 | 17 | # Save chart to MongoDB 18 | import pymongo_spark 19 | pymongo_spark.activate() 20 | flights_chart_data.saveToMongoDB( 21 | 'mongodb://localhost:27017/agile_data_science.flights_by_month' 22 | ) 23 | 24 | -------------------------------------------------------------------------------- /ch05/web/config.py: -------------------------------------------------------------------------------- 1 | # config.py, a configuration file for index.py 2 | RECORDS_PER_PAGE = 15 3 | ELASTIC_URL = "http://elastic:9200" 4 | -------------------------------------------------------------------------------- /ch05/web/flights_per_airplane.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | <p class="lead">Flights by Tail Number {{tail_number}}</p>
5 | <table class="table">
6 |   <thead>
7 |     <tr>
8 |       <th>Images</th>
9 |     </tr>
10 |   </thead>
11 |   <tbody>
12 |     {% for image in images['Images'] %}
13 |       <tr><td><img src="{{image}}" /></td></tr>
14 |     {% endfor %}
15 |   </tbody>
16 | </table>
17 | 
18 | <table class="table">
19 |   <thead>
20 |     <tr>
21 |       <th>Serial Number</th>
22 |       <th>Manufacturer</th>
23 |       <th>Model</th>
24 |       <th>MFR Year</th>
25 |       <th>Owner</th>
26 |       <th>Owner State</th>
27 |       <th>Engine Manufacturer</th>
28 |       <th>Engine Model</th>
29 |     </tr>
30 |   </thead>
31 |   <tbody>
32 |     <tr>
33 |       <td>{{descriptions['serial_number']}}</td>
34 |       <td>{{descriptions['manufacturer']}}</td>
35 |       <td>{{descriptions['model']}}</td>
36 |       <td>{{descriptions['mfr_year']}}</td>
37 |       <td>{{descriptions['owner']}}</td>
38 |       <td>{{descriptions['owner_state']}}</td>
39 |       <td>{{descriptions['engine_manufacturer']}}</td>
40 |       <td>{{descriptions['engine_model']}}</td>
41 |     </tr>
42 |   </tbody>
43 | </table>
44 | 
45 | <table class="table">
46 |   <thead>
47 |     <tr>
48 |       <th>Carrier</th>
49 |       <th>Date</th>
50 |       <th>Flight Number</th>
51 |       <th>Origin</th>
52 |       <th>Destination</th>
53 |     </tr>
54 |   </thead>
55 |   <tbody>
56 |   {% for flight in flights['Flights'] %}
57 |     <tr>
58 |       <td>{{flight['Carrier']}}</td> <td>{{flight['FlightDate']}}</td> <td>{{flight['FlightNum']}}</td>
59 |       <td>{{flight['Origin']}}</td> <td>{{flight['Dest']}}</td>
60 |     </tr>
61 |   {% endfor %}
62 |   </tbody>
63 | </table>
64 | </div>
65 | {% endblock %}
--------------------------------------------------------------------------------
/ch05/web/static/app.js:
--------------------------------------------------------------------------------
1 | // Define the width and height of our chart
2 | var width = 960,
3 |   height = 350;
4 | 
5 | // Define the y scale, which is linear and maps the data to the range from the chart height down to 0
6 | var y = d3.scale.linear()
7 |   .range([height, 0]);
8 | // We define the domain once we get our data in d3.json, below
9 | 
10 | // Our chart object is defined using the height and width
11 | var chart = d3.select(".chart")
12 |   .attr("width", width)
13 |   .attr("height", height);
14 | 
15 | // We fetch the JSON from our controller, then process the resulting data
16 | d3.json("/total_flights.json", function (data) {
17 | 
18 |   // We define a color for the bars
19 |   var barColor = 'steelblue';
20 | 
21 |   // We compute the maximum value for the bars, then set the domain for the y axis.
22 |   // This means that y will now map from [0 -> maxY] to [height -> 0].
23 |   var maxY = d3.max(data, function (d) { return d.total_flights; });
24 |   y.domain([0, maxY]);
25 | 
26 |   // Divide the width by the number of bars to get the bar width
27 |   var barWidth = width / data.length;
28 |   var bar = chart.selectAll("g")
29 |     .data(data)
30 |     .enter()
31 |     .append("g")
32 |     .attr("transform", function (d, i) { return "translate(" + i * barWidth + ",0)"; });
33 | 
34 |   // Now we define a rectangle for each group, with the height mapped from the total_flights
35 |   // data point to the y axis, and the width barWidth - 1 pixels. We fill it with the bar color.
36 |   bar.append("rect")
37 |     .attr("y", function (d) { return y(d.total_flights); })
38 |     .attr("height", function (d) { return height - y(d.total_flights); })
39 |     .attr("width", barWidth - 1)
40 |     .style("fill", barColor);
41 | 
42 |   // We then label each bar with the raw value in the top middle of the bar. The label is
43 |   // offset by 3px so it sits just inside the bar, where the template's CSS renders it in
44 |   // white to stand out against the blue.
45 |   bar.append("text")
46 |     .attr("x", barWidth / 2)
47 |     .attr("y", function (d) { return y(d.total_flights) + 3; })
48 |     .attr("dy", ".75em")
49 |     .text(function (d) { return d.total_flights; });
50 | });
51 | 
--------------------------------------------------------------------------------
/ch05/web/static/app3.js:
--------------------------------------------------------------------------------
1 | // Define the width and height of our chart
2 | var width = 960,
3 |   height = 350;
4 | 
5 | // Define the y scale, which is linear and maps the data to the range from the chart height down to 0
6 | var y = d3.scale.linear()
7 |   .range([height, 0]);
8 | // We define the domain once we get our data in d3.json, below
9 | 
10 | // Our chart object is defined using the height and width
11 | var chart = d3.select(".chart")
12 |   .attr("width", width)
13 |   .attr("height", height);
14 | 
15 | // We fetch the JSON from our controller, then process the resulting data
16 | d3.json("/top_routes.json", function (data) {
17 | 
18 |   // We define a color for the bars
19 |   var barColor = 'steelblue';
20 | 
21 |   // We compute the maximum value for the bars, then set the domain for the y axis.
22 |   // This means that y will now map from [0 -> maxY] to [height -> 0].
23 |   var maxY = d3.max(data, function (d) { return d.total; });
24 |   y.domain([0, maxY]);
25 | 
26 |   // Divide the width by the number of bars to get the bar width
27 |   var barWidth = width / data.length;
28 |   var bar = chart.selectAll("g")
29 |     .data(data)
30 |     .enter()
31 |     .append("g")
32 |     .attr("transform", function (d, i) { return "translate(" + i * barWidth + ",0)"; });
33 | 
34 |   // Now we define a rectangle for each group, with the height mapped from the total
35 |   // data point to the y axis, and the width barWidth - 1 pixels. We fill it with the bar color.
36 |   bar.append("rect")
37 |     .attr("y", function (d) { return y(d.total); })
38 |     .attr("height", function (d) { return height - y(d.total); })
39 |     .attr("width", barWidth - 1)
40 |     .style("fill", barColor);
41 | 
42 |   // We then label each bar with the raw value in the top middle of the bar. The label is
43 |   // offset by 3px so it sits just inside the bar, where the template's CSS renders it in
44 |   // white to stand out against the blue.
45 |   bar.append("text")
46 |     .attr("x", barWidth / 2)
47 |     .attr("y", function (d) { return y(d.total) + 3; })
48 |     .attr("dy", ".75em")
49 |     .text(function (d) { return d.total; });
50 | 
51 |   // A second label shows the route (origin and destination) just below the value.
52 |   bar.append("text")
53 |     .attr("x", barWidth / 2)
54 |     .attr("y", function (d) { return y(d.total) + 13; })
55 |     .attr("dy", ".75em")
56 |     .text(function (d) { return d.Origin + ' ' + d.Dest; });
57 | });
58 | 
--------------------------------------------------------------------------------
/ch05/web/static/images/.exist:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch05/web/static/images/.exist
--------------------------------------------------------------------------------
/ch05/web/templates/flight.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <div>
4 |   <p class="lead">Flight {{flight.FlightNum}}</p>
5 |   <table class="table">
6 |     <thead>
7 |       <tr>
8 |         <th>Airline</th>
9 |         <th>Origin</th>
10 |         <th>Destination</th>
11 |         <th>Tail Number</th>
12 |         <th>Date</th>
13 |         <th>Air Time</th>
14 |         <th>Distance</th>
15 |       </tr>
16 |     </thead>
17 |     <tbody>
18 |       <tr>
19 |         <td>{{flight.Carrier}}</td>
20 |         <td>{{flight.Origin}}</td>
21 |         <td>{{flight.Dest}}</td>
22 |         <td>{{flight.TailNum}}</td>
23 |         <td>{{flight.FlightDate}}</td>
24 |         <td>{{flight.AirTime}}</td>
25 |         <td>{{flight.Distance}}</td>
26 |       </tr>
27 |     </tbody> </table> </div>
28 | {% endblock %}
--------------------------------------------------------------------------------
/ch05/web/templates/flights.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <div>
4 |   <p class="lead">{{flight_count}} Flights on {{flight_date}}</p>
5 |   <table class="table">
6 |     <thead>
7 |       <tr>
8 |         <th>Airline</th>
9 |         <th>Flight Number</th>
10 |         <th>Origin</th>
11 |         <th>Destination</th>
12 |         <th>Departure Time</th>
13 |         <th>Tail Number</th>
14 |         <th>Air Time</th>
15 |         <th>Distance</th>
16 |       </tr>
17 |     </thead>
18 |     <tbody>
19 |     {% for flight in flights %}
20 |       <tr>
21 |         <td>{{flight.Carrier}}</td>
22 |         <td>{{flight.FlightNum}}</td>
23 |         <td>{{flight.Origin}}</td>
24 |         <td>{{flight.Dest}}</td>
25 |         <td>{{flight.DepTime}}</td>
26 |         <td>{{flight.TailNum}}</td>
27 |         <td>{{flight.AirTime}}</td>
28 |         <td>{{flight.Distance}}</td>
29 |       </tr>
30 |     {% endfor %}
31 |     </tbody> </table> </div>
32 | {% endblock %}
--------------------------------------------------------------------------------
/ch05/web/templates/flights_per_airplane.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <div>
4 |   <p class="lead">Flights by Tail Number {{tail_number}}</p>
5 |   <table class="table">
6 |     <thead>
7 |       <tr>
8 |         <th>Carrier</th>
9 |         <th>Date</th>
10 |         <th>Flight Number</th>
11 |         <th>Origin</th>
12 |         <th>Destination</th>
13 |       </tr>
14 |     </thead>
15 |     <tbody>
16 |     {% for flight in flights['Flights'] %}
17 |       <tr>
18 |         <td>{{flight['Carrier']}}</td>
19 |         <td>{{flight['FlightDate']}}</td>
20 |         <td>{{flight['FlightNum']}}</td>
21 |         <td>{{flight['Origin']}}</td>
22 |         <td>{{flight['Dest']}}</td>
23 |       </tr>
24 |     {% endfor %}
25 |     </tbody> </table> </div>
26 | {% endblock %}
--------------------------------------------------------------------------------
/ch05/web/templates/flights_per_airplane_2.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <div>
4 | <p class="lead">Flights by Tail Number {{tail_number}}</p>
5 | <table class="table">
6 |   <thead>
7 |     <tr>
8 |       <th>Images</th>
9 |     </tr>
10 |   </thead>
11 |   <tbody>
12 |     {% for image in images['Images'] %}
13 |       <tr><td><img src="{{image}}" /></td></tr>
14 |     {% endfor %}
15 |   </tbody>
16 | </table>
17 | 
18 | <table class="table">
19 |   <thead>
20 |     <tr>
21 |       <th>Serial Number</th>
22 |       <th>Manufacturer</th>
23 |       <th>Model</th>
24 |       <th>MFR Year</th>
25 |       <th>Owner</th>
26 |       <th>Owner State</th>
27 |       <th>Engine Manufacturer</th>
28 |       <th>Engine Model</th>
29 |     </tr>
30 |   </thead>
31 |   <tbody>
32 |     <tr>
33 |       <td>{{descriptions['serial_number']}}</td>
34 |       <td>{{descriptions['manufacturer']}}</td>
35 |       <td>{{descriptions['model']}}</td>
36 |       <td>{{descriptions['mfr_year']}}</td>
37 |       <td>{{descriptions['owner']}}</td>
38 |       <td>{{descriptions['owner_state']}}</td>
39 |       <td>{{descriptions['engine_manufacturer']}}</td>
40 |       <td>{{descriptions['engine_model']}}</td>
41 |     </tr>
42 |   </tbody>
43 | </table>
44 | 
45 | <table class="table">
46 |   <thead>
47 |     <tr>
48 |       <th>Carrier</th>
49 |       <th>Date</th>
50 |       <th>Flight Number</th>
51 |       <th>Origin</th>
52 |       <th>Destination</th>
53 |     </tr>
54 |   </thead>
55 |   <tbody>
56 |   {% for flight in flights['Flights'] %}
57 |     <tr>
58 |       <td>{{flight['Carrier']}}</td> <td>{{flight['FlightDate']}}</td> <td>{{flight['FlightNum']}}</td>
59 |       <td>{{flight['Origin']}}</td> <td>{{flight['Dest']}}</td>
60 |     </tr>
61 |   {% endfor %}
62 |   </tbody>
63 | </table>
64 | </div>
65 | {% endblock %}
--------------------------------------------------------------------------------
/ch05/web/templates/layout.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html lang="en">
3 | <head>
4 |   <meta charset="utf-8">
5 |   <title>Agile Data Science</title>
6 |   <meta name="viewport" content="width=device-width, initial-scale=1">
7 |   <!-- Bootstrap and NVD3 styles served from /static -->
8 |   <link href="/static/bootstrap.min.css" rel="stylesheet">
9 |   <link href="/static/bootstrap-theme.min.css" rel="stylesheet">
10 |   <link href="/static/nv.d3.css" rel="stylesheet">
11 |   <style>
12 |     body { padding-top: 20px; }
13 |   </style>
14 | </head>
15 | <body>
16 |   <div class="container">
17 |     <div class="row">
18 |       <div class="col-md-12">
19 | 
20 | 
21 |         {% block body %}{% endblock %}
22 |       </div>
23 |     </div>
24 |   </div>
25 | 
26 |   <!-- JavaScript served from /static: jQuery, Bootstrap, d3, NVD3 -->
27 |   <script src="/static/jquery-1.12.2.min.js"></script>
28 |   <script src="/static/bootstrap.min.js"></script>
29 |   <script src="/static/d3.v3.min.js"></script>
30 |   <script src="/static/nv.d3.min.js"></script>
31 | 
32 | 
33 | 
34 | 
35 | </body>
36 | </html>
--------------------------------------------------------------------------------
/ch05/web/templates/macros.jnj:
--------------------------------------------------------------------------------
1 | ;
2 | {% macro display_nav(offsets, path, count, query) -%}
3 | <div>
4 |   {% for key, values in offsets.items() -%}
5 |     {%- if values['bottom_offset'] >= 0 and values['top_offset'] > 0 and count['value'] > values['bottom_offset'] -%}
6 |       <a href="{{ path }}?start={{ values['bottom_offset'] }}&end={{ values['top_offset'] }}">{{ key }}</a>
7 |     {% else -%}
8 |       {{ key }}
9 |     {% endif %}
10 |   {% endfor -%}
11 | </div>
12 | {%- endmacro %}
13 | 
--------------------------------------------------------------------------------
/ch05/web/templates/search.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <div>
4 |   <p class="lead">{{flight_count['value']}} Flights</p>
5 |   <form method="get" action="/flights/search">
6 |     <input type="text" name="Carrier" placeholder="Carrier" />
7 | 
8 |     <input type="text" name="FlightDate" placeholder="FlightDate" />
9 | 
10 |     <input type="text" name="Origin" placeholder="Origin" />
11 | 
12 |     <input type="text" name="Dest" placeholder="Dest" />
13 | 
14 |     <input type="text" name="TailNum" placeholder="TailNum" />
15 | 
16 |     <input type="text" name="FlightNum" placeholder="FlightNum" />
17 | 
18 |     <input type="submit" value="Search" class="btn btn-default" />
19 |   </form>
20 | 
21 | 
22 | 
23 | 
24 | 
25 |   <table class="table">
26 |     <thead>
27 |       <tr>
28 |         <th>Airline</th>
29 |         <th>Flight Number</th>
30 |         <th>Origin</th>
31 |         <th>Destination</th>
32 |         <th>Date</th>
33 |         <th>Departure Time</th>
34 |         <th>Tail Number</th>
35 |         <th>Air Time</th>
36 |         <th>Distance</th>
37 |       </tr>
38 |     </thead>
39 |     <tbody>
40 |     {% for flight in flights %}
41 |       <tr>
42 |         <td>{{flight.Carrier}}</td>
43 |         <td><a href="/on_time_performance?Carrier={{flight.Carrier}}&FlightDate={{flight.FlightDate}}&FlightNum={{flight.FlightNum}}">{{flight.FlightNum}}</a></td>
44 |         <td>{{flight.Origin}}</td>
45 |         <td>{{flight.Dest}}</td>
46 |         <td>{{flight.FlightDate}}</td>
47 |         <td>{{flight.DepTime}}</td>
48 |         <td>{{flight.TailNum}}</td>
49 |         <td>{{flight.AirTime}}</td>
50 |         <td>{{flight.Distance}}</td>
51 |       </tr>
52 |     {% endfor %}
53 |     </tbody>
54 |   </table>
55 | </div>
56 | {% import "macros.jnj" as common %}
57 | {% if nav_offsets and nav_path -%}
58 |   {{ common.display_nav(nav_offsets, nav_path, flight_count)|safe }}
59 | {% endif -%}
60 | {% endblock %}
--------------------------------------------------------------------------------
/ch05/web/templates/top_routes.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <div>
4 |   <p class="lead">Total Flights by Month</p>
5 |   <table class="table">
6 |     <thead>
7 |       <tr>
8 |         <th>Month</th>
9 |         <th>Total Flights</th>
10 |       </tr>
11 |     </thead>
12 |     <tbody>
13 |     {% for month in total_flights %}
14 |       <tr>
15 |         <td>{{month.Month}}</td>
16 |         <td>{{month.total_flights}}</td>
17 |       </tr>
18 |     {% endfor %}
19 |     </tbody> </table> </div>
20 | {% endblock %}
--------------------------------------------------------------------------------
/ch05/web/templates/top_routes_chart.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <style>
4 |   .chart rect {
5 |     fill: steelblue;
6 |   }
7 | 
8 |   .chart text {
9 |     fill: white;
10 |     font: 10px sans-serif;
11 |     text-anchor: middle;
12 |   }
13 | 
14 | 
15 | 
16 | </style>
17 | <div>
18 |   <p class="lead">National Top Routes</p>
19 |   <svg class="chart"></svg>
20 | </div>
21 | <script src="/static/app3.js"></script>
22 | {% endblock %}
23 | 
--------------------------------------------------------------------------------
/ch05/web/templates/total_flights.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <div>
4 |   <p class="lead">Total Flights by Month</p>
5 |   <table class="table">
6 |     <thead>
7 |       <tr>
8 |         <th>Month</th>
9 |         <th>Total Flights</th>
10 |       </tr>
11 |     </thead>
12 |     <tbody>
13 |     {% for month in total_flights %}
14 |       <tr>
15 |         <td>{{month.Month}}</td>
16 |         <td>{{month.total_flights}}</td>
17 |       </tr>
18 |     {% endfor %}
19 |     </tbody> </table> </div>
20 | {% endblock %}
--------------------------------------------------------------------------------
/ch05/web/templates/total_flights_chart.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <style>
4 |   .chart rect {
5 |     fill: steelblue;
6 |   }
7 | 
8 |   .chart text {
9 |     fill: white;
10 |     font: 10px sans-serif;
11 |     text-anchor: middle;
12 |   }
13 | 
14 | 
15 | 
16 | </style>
17 | <div>
18 |   <p class="lead">Total Flights by Month</p>
19 |   <svg class="chart"></svg>
20 | </div>
21 | <script src="/static/app.js"></script>
22 | {% endblock %}
23 | 
--------------------------------------------------------------------------------
/ch05/web/templates/total_flights_chart_2.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 | <style>
4 |   .chart rect {
5 |     fill: steelblue;
6 |   }
7 | 
8 |   .chart text {
9 |     fill: white;
10 |     font: 10px sans-serif;
11 |     text-anchor: middle;
12 |   }
13 | 
14 | 
15 | 
16 | </style>
17 | <div>
18 |   <p class="lead">Total Flights by Month</p>
19 |   <svg class="chart"></svg>
20 | </div>
21 | <script src="/static/app2.js"></script>
22 | {% endblock %}
23 | 
--------------------------------------------------------------------------------
/ch06/add_name_to_airlines.py:
--------------------------------------------------------------------------------
1 | import sys, os, re
2 | 
3 | # Load the on-time parquet file
4 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet')
5 | 
6 | # The first step is easily expressed as SQL: get all unique carrier codes
7 | on_time_dataframe.registerTempTable("on_time_performance")
8 | carrier_codes = spark.sql(
9 |   "SELECT DISTINCT Carrier FROM on_time_performance"
10 | )
11 | carrier_codes.collect()
12 | 
13 | from pyspark.sql.types import StringType, IntegerType
14 | from pyspark.sql.types import StructType, StructField
15 | 
16 | schema = StructType([
17 |   StructField("ID", IntegerType(), True),
18 |   StructField("Name", StringType(), True),
19 |   StructField("Alias", StringType(), True),
20 |   StructField("IATA", StringType(), True),
21 |   StructField("ICAO", StringType(), True),
22 |   StructField("CallSign", StringType(), True),
23 |   StructField("Country", StringType(), True),
24 |   StructField("Active", StringType(), True),
25 | ])
26 | 
27 | airlines = spark.read.format('com.databricks.spark.csv')\
28 |   .options(header='false', nullValue='\\N')\
29 |   .schema(schema)\
30 |   .load('data/airlines.csv')
31 | airlines.show()
32 | 
33 | # Is Delta around?
34 | airlines.filter(airlines.IATA == 'DL').show()
35 | 
36 | # Drop all fields except Name and IATA (the carrier code)
37 | airlines.registerTempTable("airlines")
38 | airlines = spark.sql("SELECT Name, IATA AS CarrierCode from airlines")
39 | 
40 | # Join our 14 carrier codes to the airlines table to get our set of airlines
41 | our_airlines = carrier_codes.join(airlines, carrier_codes.Carrier == airlines.CarrierCode)
42 | our_airlines = our_airlines.select('Name', 'CarrierCode')
43 | our_airlines.show()
44 | 
45 | # Store as JSON objects via a dataframe. Repartition to 1 to get 1 json file.
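# (Note: repartition(1) funnels every record through a single task. That is fine
#  for a handful of airline records, but for a large DataFrame write without
#  repartitioning and merge the part files afterwards.)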
46 | our_airlines.repartition(1).write.mode("overwrite").json("data/our_airlines.json")
47 | 
48 | os.system("cp data/our_airlines.json/part* data/our_airlines.jsonl")
49 | 
50 | #wikidata = spark.read.json('data/wikidata-20160404-all.json.bz2')
--------------------------------------------------------------------------------
/ch06/airplanes_mapping.json:
--------------------------------------------------------------------------------
1 | {
2 |   "airplane" : {
3 |     "properties" : {
4 |       "Owner" : {
5 |         "type": "string",
6 |         "analyzer": "english",
7 |         "fields": {
8 |           "raw": {
9 |             "type": "string",
10 |             "index": "not_analyzed"
11 |           }
12 |         }
13 |       },
14 |       "TailNum": {
15 |         "type": "string",
16 |         "analyzer": "english"
17 |       },
18 |       "EngineManufacturer": {
19 |         "type": "string",
20 |         "analyzer": "english"
21 |       },
22 |       "EngineModel": {
23 |         "type": "string",
24 |         "analyzer": "english"
25 |       },
26 |       "Manufacturer": {
27 |         "type": "string",
28 |         "analyzer": "english"
29 |       },
30 |       "ManufacturerYear": {
31 |         "type": "string",
32 |         "analyzer": "english"
33 |       },
34 |       "Model": {
35 |         "type": "string",
36 |         "analyzer": "english"
37 |       },
38 |       "OwnerState": {
39 |         "type": "string",
40 |         "analyzer": "english"
41 |       },
42 |       "SerialNumber": {
43 |         "type": "string",
44 |         "analyzer": "english"
45 |       }
46 |     }
47 |   }
48 | }
--------------------------------------------------------------------------------
/ch06/airplanes_to_elasticsearch.py:
--------------------------------------------------------------------------------
1 | # Load our airplanes
2 | airplanes = spark.read.json("data/airplanes.json")
3 | airplanes.show()
4 | 
5 | airplanes.write.format("org.elasticsearch.spark.sql")\
6 |   .option("es.resource","agile_data_science/airplane")\
7 |   .mode("overwrite")\
8 |   .save()
9 | 
10 | # Older alternative: format data for Elasticsearch as a tuple with a dummy key in the first field
11 | # airplanes_dict = airplanes.rdd.map(lambda x: ('ignored_key', x.asDict()))
12 | #
13 | # airplanes_dict.saveAsNewAPIHadoopFile(
14 | #   path='-',
15 | #   outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
16 | #   keyClass="org.apache.hadoop.io.NullWritable",
17 | #   valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
18 | #   conf={ "es.resource" : "agile_data_science/airplanes" })
19 | 
--------------------------------------------------------------------------------
/ch06/analyze_airplanes.py:
--------------------------------------------------------------------------------
1 | airplanes = spark.read.json('data/airplanes.json')
2 | 
3 | #
4 | # Who makes the airplanes in the US commercial fleet, as a %
5 | #
6 | 
7 | # How many airplanes are made by each manufacturer?
8 | airplanes.registerTempTable("airplanes")
9 | manufacturer_counts = spark.sql("""SELECT
10 |   Manufacturer,
11 |   COUNT(*) AS Total
12 | FROM
13 |   airplanes
14 | GROUP BY
15 |   Manufacturer
16 | ORDER BY
17 |   Total DESC"""
18 | )
19 | manufacturer_counts.show(10)  # show top 10
20 | 
21 | # How many airplanes total?
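# (The DataFrame API equivalent -- not used here -- would be airplanes.count();
#  the SQL below keeps the whole analysis in one idiom.)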
22 | total_airplanes = spark.sql(
23 |   """SELECT
24 |   COUNT(*) AS OverallTotal
25 |   FROM airplanes"""
26 | )
27 | print("Total airplanes: {}".format(total_airplanes.collect()[0].OverallTotal))
28 | 
29 | mfr_with_totals = manufacturer_counts.crossJoin(total_airplanes)
30 | mfr_with_totals = mfr_with_totals.rdd.map(
31 |   lambda x: {
32 |     'Manufacturer': x.Manufacturer,
33 |     'Total': x.Total,
34 |     'Percentage': round(
35 |       (
36 |         float(x.Total)/float(x.OverallTotal)
37 |       ) * 100,
38 |       2
39 |     )
40 |   }
41 | )
42 | mfr_with_totals.toDF().show()
43 | 
44 | #
45 | # Same with sub-queries
46 | #
47 | relative_manufacturer_counts = spark.sql("""SELECT
48 |   Manufacturer,
49 |   COUNT(*) AS Total,
50 |   ROUND(
51 |     100 * (
52 |       COUNT(*)/(SELECT COUNT(*) FROM airplanes)
53 |     ),
54 |     2
55 |   ) AS PercentageTotal
56 | FROM
57 |   airplanes
58 | GROUP BY
59 |   Manufacturer
60 | ORDER BY
61 |   Total DESC, Manufacturer
62 | LIMIT 10"""
63 | )
64 | relative_manufacturer_counts.show(30)  # the query is LIMITed to 10 rows, so this shows them all
65 | 
66 | #
67 | # Now get these things on the web
68 | #
69 | relative_manufacturer_counts = relative_manufacturer_counts.rdd.map(lambda row: row.asDict())
70 | grouped_manufacturer_counts = relative_manufacturer_counts.groupBy(lambda x: 1)
71 | 
72 | # Save to Mongo in the airplane_manufacturer_totals collection
73 | import pymongo_spark
74 | pymongo_spark.activate()
75 | grouped_manufacturer_counts.saveToMongoDB(
76 |   'mongodb://localhost:27017/agile_data_science.airplane_manufacturer_totals'
77 | )
78 | 
--------------------------------------------------------------------------------
/ch06/analyze_airplanes_again.py:
--------------------------------------------------------------------------------
1 | airplanes = spark.read.json('data/resolved_airplanes.json')
2 | 
3 | #
4 | # Who makes the airplanes in the US commercial fleet, as a %
5 | #
6 | 
7 | # How many airplanes are made by each manufacturer?
8 | airplanes.registerTempTable("airplanes")
9 | manufacturer_counts = spark.sql("""SELECT
10 |   Manufacturer,
11 |   COUNT(*) AS Total
12 | FROM
13 |   airplanes
14 | GROUP BY
15 |   Manufacturer
16 | ORDER BY
17 |   Total DESC"""
18 | )
19 | manufacturer_counts.show(10)  # show top 10
20 | 
21 | # How many airplanes total?
22 | total_airplanes = spark.sql(
23 |   """SELECT
24 |   COUNT(*) AS OverallTotal
25 |   FROM airplanes"""
26 | )
27 | print("Total airplanes: {}".format(total_airplanes.collect()[0].OverallTotal))
28 | 
29 | mfr_with_totals = manufacturer_counts.crossJoin(total_airplanes)
30 | mfr_with_totals = mfr_with_totals.rdd.map(
31 |   lambda x: {
32 |     'Manufacturer': x.Manufacturer,
33 |     'Total': x.Total,
34 |     'Percentage': round(
35 |       (
36 |         float(x.Total)/float(x.OverallTotal)
37 |       ) * 100,
38 |       2
39 |     )
40 |   }
41 | )
42 | mfr_with_totals.toDF().show()
43 | 
44 | #
45 | # Same with sub-queries
46 | #
47 | relative_manufacturer_counts = spark.sql("""SELECT
48 |   Manufacturer,
49 |   COUNT(*) AS Total,
50 |   ROUND(
51 |     100 * (
52 |       COUNT(*)/(SELECT COUNT(*) FROM airplanes)
53 |     ),
54 |     2
55 |   ) AS PercentageTotal
56 | FROM
57 |   airplanes
58 | GROUP BY
59 |   Manufacturer
60 | ORDER BY
61 |   Total DESC, Manufacturer
62 | LIMIT 10"""
63 | )
64 | relative_manufacturer_counts.show(30)  # the query is LIMITed to 10 rows, so this shows them all
65 | 
66 | #
67 | # Now get these things on the web
68 | #
69 | relative_manufacturer_counts_dict = relative_manufacturer_counts.rdd.map(lambda row: row.asDict())
70 | grouped_manufacturer_counts = relative_manufacturer_counts_dict.groupBy(lambda x: 1)
71 | 
72 | # Save to Mongo in the airplane_manufacturer_totals collection
73 | import pymongo_spark
74 | pymongo_spark.activate()
75 | grouped_manufacturer_counts.saveToMongoDB(
76 |   'mongodb://localhost:27017/agile_data_science.airplane_manufacturer_totals'
77 | )
78 | 
--------------------------------------------------------------------------------
/ch06/create_airplanes_index.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Create the entire agile_data_science index
4 | curl -XPUT 'http://localhost:9200/agile_data_science/'
5 | 
6 | # Create the mapping to make search results sort right
7 | curl -XPUT 'http://localhost:9200/agile_data_science/_mapping/airplane' --data @airplanes_mapping.json
8 | 
9 | # Get the mapping we just put in
10 | curl -XGET 'http://localhost:9200/agile_data_science/_mapping/airplane'
11 | 
--------------------------------------------------------------------------------
/ch06/enrich_airlines_wikipedia.py:
--------------------------------------------------------------------------------
1 | import sys, os, re
2 | sys.path.append("lib")
3 | import utils
4 | 
5 | import wikipedia
6 | from bs4 import BeautifulSoup
7 | import tldextract
8 | 
9 | # Load our airlines...
10 | our_airlines = utils.read_json_lines_file('data/our_airlines.jsonl') 11 | 12 | # Build a new list that includes wikipedia data 13 | with_url = [] 14 | for airline in our_airlines: 15 | # Get the wikipedia page for the airline name 16 | wikipage = wikipedia.page(airline['Name']) 17 | 18 | # Get the summary 19 | summary = wikipage.summary 20 | airline['summary'] = summary 21 | 22 | # Get the HTML of the page 23 | page = BeautifulSoup(wikipage.html()) 24 | 25 | # Task: get the logo from the right 'vcard' column 26 | # 1) Get the vcard table 27 | vcard_table = page.find_all('table', class_='vcard')[0] 28 | # 2) The logo is always the first image inside this table 29 | first_image = vcard_table.find_all('img')[0] 30 | # 3) Set the url to the image 31 | logo_url = 'http:' + first_image.get('src') 32 | airline['logo_url'] = logo_url 33 | 34 | # Task: Get the company website 35 | # 1) Find the 'Website' table header 36 | th = page.find_all('th', text='Website')[0] 37 | # 2) find the parent tr element 38 | tr = th.parent 39 | # 3) find the a (link) tag within the tr 40 | a = tr.find_all('a')[0] 41 | # 4) finally get the href of the a tag 42 | url = a.get('href') 43 | airline['url'] = url 44 | 45 | # Get the domain to display with the url 46 | url_parts = tldextract.extract(url) 47 | airline['domain'] = url_parts.domain + '.' + url_parts.suffix 48 | 49 | with_url.append(airline) 50 | 51 | utils.write_json_lines_file(with_url, 'data/our_airlines_with_wiki.jsonl') 52 | 53 | -------------------------------------------------------------------------------- /ch06/extract_airlines.py: -------------------------------------------------------------------------------- 1 | # Load the on-time parquet file 2 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet') 3 | 4 | # The first step is easily expressed as SQL: get all unique tail numbers for each airline 5 | on_time_dataframe.registerTempTable("on_time_performance") 6 | carrier_airplane = spark.sql( 7 | "SELECT DISTINCT Carrier, TailNum FROM on_time_performance" 8 | ) 9 | 10 | # Now we need to store a sorted group for each Carrier, along with a fleet count 11 | airplanes_per_carrier = carrier_airplane.rdd\ 12 | .map(lambda nameTuple: (nameTuple[0], [nameTuple[1]]))\ 13 | .reduceByKey(lambda a, b: a + b)\ 14 | .map(lambda tuple: 15 | { 16 | 'Carrier': tuple[0], 17 | 'TailNumbers': sorted( 18 | filter( 19 | lambda x: x is not None and x != '', tuple[1] # empty string tail numbers were getting through 20 | ) 21 | ), 22 | 'FleetCount': len(tuple[1]) 23 | } 24 | ) 25 | airplanes_per_carrier.count() # 14 26 | 27 | # Save to Mongo in the airplanes_per_carrier relation 28 | import pymongo_spark 29 | pymongo_spark.activate() 30 | airplanes_per_carrier.saveToMongoDB( 31 | 'mongodb://localhost:27017/agile_data_science.airplanes_per_carrier' 32 | ) 33 | -------------------------------------------------------------------------------- /ch06/extract_airports.py: -------------------------------------------------------------------------------- 1 | # Load the on-time parquet file 2 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet') 3 | 4 | # The first step is easily expressed as SQL: get all unique tail numbers for each airline 5 | on_time_dataframe.registerTempTable("on_time_performance") 6 | carrier_airplane = spark.sql( 7 | "SELECT DISTINCT Carrier, TailNum FROM on_time_performance" 8 | ) 9 | 10 | -------------------------------------------------------------------------------- /ch06/images/ads2_0601.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch06/images/ads2_0601.png -------------------------------------------------------------------------------- /ch06/images/ads2_0602.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch06/images/ads2_0602.png -------------------------------------------------------------------------------- /ch06/images/ads2_0603.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch06/images/ads2_0603.png -------------------------------------------------------------------------------- /ch06/images/ads2_0604.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch06/images/ads2_0604.png -------------------------------------------------------------------------------- /ch06/images/ads2_0605.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch06/images/ads2_0605.png -------------------------------------------------------------------------------- /ch06/images/ads2_0606.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch06/images/ads2_0606.png -------------------------------------------------------------------------------- /ch06/images/ads2_0607.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch06/images/ads2_0607.png -------------------------------------------------------------------------------- /ch06/images/ads2_0608.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch06/images/ads2_0608.png -------------------------------------------------------------------------------- /ch06/images/ads2_0609.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch06/images/ads2_0609.png -------------------------------------------------------------------------------- /ch06/import_airlines.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Import our enriched airline data as the 'airlines' collection 4 | mongoimport -d agile_data_science -c airlines --file data/our_airlines_with_wiki.jsonl 5 | -------------------------------------------------------------------------------- /ch06/prepare_airplanes.py: -------------------------------------------------------------------------------- 1 | # Load the FAA N-Number Inquiry Records 2 | faa_tail_number_inquiry = spark.read.json('data/faa_tail_number_inquiry.jsonl') 3 | faa_tail_number_inquiry.show() 4 | 5 | # Count the records 6 | faa_tail_number_inquiry.count() 7 | 8 | # Load our 
unique tail numbers 9 | unique_tail_numbers = spark.read.json('data/tail_numbers.jsonl') 10 | unique_tail_numbers.show() 11 | 12 | # join tail numbers to our inquries 13 | tail_num_plus_inquiry = unique_tail_numbers.join( 14 | faa_tail_number_inquiry, 15 | unique_tail_numbers.TailNum == faa_tail_number_inquiry.TailNum, 16 | ) 17 | tail_num_plus_inquiry = tail_num_plus_inquiry.drop(unique_tail_numbers.TailNum) 18 | tail_num_plus_inquiry.show() 19 | 20 | # Dump extra field and store tail_numbers plus inquiry 21 | tail_num_plus_inquiry.registerTempTable("tail_num_plus_inquiry") 22 | airplanes = spark.sql("""SELECT 23 | TailNum AS TailNum, 24 | engine_manufacturer AS EngineManufacturer, 25 | engine_model AS EngineModel, 26 | manufacturer AS Manufacturer, 27 | mfr_year AS ManufacturerYear, 28 | model AS Model, 29 | owner AS Owner, 30 | owner_state AS OwnerState, 31 | serial_number AS SerialNumber 32 | FROM 33 | tail_num_plus_inquiry""") 34 | 35 | airplanes.repartition(1).write.mode("overwrite").json('data/airplanes.json') 36 | -------------------------------------------------------------------------------- /ch06/scrape_faa.py: -------------------------------------------------------------------------------- 1 | import sys, os, re 2 | import time 3 | 4 | sys.path.append("lib") 5 | import utils 6 | 7 | import requests 8 | from bs4 import BeautifulSoup 9 | 10 | tail_number_records = utils.read_json_lines_file('data/tail_numbers.jsonl') 11 | 12 | aircraft_records = [] 13 | # Loop through the tail numbers, fetching 14 | for tail_number_record in tail_number_records: 15 | time.sleep(0.1) # essential to sleep FIRST in loop or you will flood sites 16 | 17 | # Parameterize the URL with the tail number 18 | BASE_URL = 'http://registry.faa.gov/aircraftinquiry/NNum_Results.aspx?NNumbertxt={}' 19 | tail_number = tail_number_record['TailNum'] 20 | url = BASE_URL.format(tail_number) 21 | 22 | # Fetch the page, parse the HTML 23 | r = requests.get(url) 24 | 25 | html = r.text 26 | soup = BeautifulSoup(html) 27 | 28 | # The table structure is constant for all pages that contain data 29 | try: 30 | aircraft_description = soup.find_all('table')[4] 31 | craft_tds = aircraft_description.find_all('td') 32 | serial_number = craft_tds[1].text.strip() 33 | manufacturer = craft_tds[5].text.strip() 34 | model = craft_tds[9].text.strip() 35 | mfr_year = craft_tds[25].text.strip() 36 | 37 | registered_owner = soup.find_all('table')[5] 38 | reg_tds = registered_owner.find_all('td') 39 | owner = reg_tds[1].text.strip() 40 | owner_state = reg_tds[9].text.strip() 41 | 42 | airworthiness = soup.find_all('table')[6] 43 | worthy_tds = airworthiness.find_all('td') 44 | engine_manufacturer = worthy_tds[1].text.strip() 45 | engine_model = worthy_tds[5].text.strip() 46 | 47 | aircraft_record = { 48 | 'TailNum': tail_number, 49 | 'serial_number': serial_number, 50 | 'manufacturer': manufacturer, 51 | 'model': model, 52 | 'mfr_year': mfr_year, 53 | 'owner': owner, 54 | 'owner_state': owner_state, 55 | 'engine_manufacturer': engine_manufacturer, 56 | 'engine_model': engine_model, 57 | } 58 | aircraft_records.append( 59 | aircraft_record 60 | ) 61 | print(aircraft_record) 62 | 63 | except IndexError as e: 64 | print("Missing {} record: {}".format(tail_number, e)) 65 | 66 | utils.write_json_lines_file( 67 | aircraft_records, 'data/faa_tail_number_inquiry.jsonl' 68 | ) 69 | -------------------------------------------------------------------------------- /ch06/test_elastic_airplanes.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -XGET 'localhost:9200/agile_data_science/airplanes/_search?q=*' 4 | -------------------------------------------------------------------------------- /ch06/web/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch06/web/__init__.py -------------------------------------------------------------------------------- /ch06/web/config.py: -------------------------------------------------------------------------------- 1 | # config.py, a configuration file for index.py 2 | RECORDS_PER_PAGE = 15 3 | AIRPLANE_RECORDS_PER_PAGE = 5 4 | ELASTIC_URL = "http://elastic:9200" 5 | -------------------------------------------------------------------------------- /ch06/web/search_helpers.py: -------------------------------------------------------------------------------- 1 | import sys, os, re 2 | 3 | # Process elasticsearch hits and return flights records 4 | def process_search(results): 5 | records = [] 6 | total = 0 7 | if results['hits'] and results['hits']['hits']: 8 | total = results['hits']['total'] 9 | hits = results['hits']['hits'] 10 | for hit in hits: 11 | record = hit['_source'] 12 | records.append(record) 13 | return records, total 14 | 15 | # Calculate offsets for fetching lists of flights from MongoDB 16 | def get_navigation_offsets(offset1, offset2, increment): 17 | offsets = {} 18 | offsets['Next'] = {'top_offset': offset2 + increment, 'bottom_offset': 19 | offset1 + increment} 20 | offsets['Previous'] = {'top_offset': max(offset2 - increment, 0), 21 | 'bottom_offset': max(offset1 - increment, 0)} # Don't go < 0 22 | return offsets 23 | 24 | # Strip the existing start and end parameters from the query string 25 | def strip_place(url): 26 | try: 27 | p = re.match('(.+)\?start=.+&end=.+', url).group(1) 28 | except AttributeError as e: 29 | return url 30 | return p 31 | -------------------------------------------------------------------------------- /ch06/web/static/airplanes.js: -------------------------------------------------------------------------------- 1 | var margin = {top: 20, right: 30, bottom: 30, left: 40}, 2 | width = 900 - margin.left - margin.right, 3 | height = 300 - margin.top - margin.bottom; 4 | 5 | var x = d3.scale.ordinal() 6 | .rangeRoundBands([0, width], .1); 7 | var y = d3.scale.linear() 8 | .range([height, 0]); 9 | 10 | var xAxis = d3.svg.axis() 11 | .scale(x) 12 | .orient("bottom") 13 | .tickFormat(function(d) { 14 | return truncate(d, 14); 15 | }); 16 | var yAxis = d3.svg.axis() 17 | .scale(y) 18 | .orient("left"); 19 | 20 | var chart = d3.select(".chart") 21 | .attr("width", width + margin.left + margin.right) 22 | .attr("height", height + margin.top + margin.bottom) 23 | .append("g") 24 | .attr("transform", "translate(" + margin.left + "," + margin.top + ")"); 25 | 26 | d3.json("/airplanes/chart/manufacturers.json", function(error, data) { 27 | var data = data.data; 28 | 29 | x.domain(data.map(function(d) { return d.Manufacturer; })); 30 | y.domain([0, d3.max(data, function(d) { return d.Total; })]); 31 | 32 | chart.append("g") 33 | .attr("class", "x axis") 34 | .attr("transform", "translate(0," + height + ")") 35 | .call(xAxis); 36 | 37 | chart.append("g") 38 | .attr("class", "y axis") 39 | .call(yAxis); 40 | 41 | chart.selectAll(".bar") 42 | .data(data) 43 | .enter().append("rect") 44 | .attr("class", "bar") 45 | 
.attr("x", function(d) { return x(d.Manufacturer); }) 46 | .attr("y", function(d) { return y(d.Total); }) 47 | .attr("height", function(d) { return height - y(d.Total); }) 48 | .attr("width", x.rangeBand()); 49 | }); 50 | 51 | function truncate(d, l) { 52 | if(d.length > l) 53 | return d.substring(0,l)+'...'; 54 | else 55 | return d; 56 | } 57 | -------------------------------------------------------------------------------- /ch06/web/static/app.js: -------------------------------------------------------------------------------- 1 | var width = 960, 2 | height = 500; 3 | 4 | var y = d3.scale.linear() 5 | .range([height, 0]); 6 | // We define the domain once we get our data in d3.json, below 7 | 8 | var chart = d3.select(".chart") 9 | .attr("width", width) 10 | .attr("height", height); 11 | 12 | d3.json("/total_flights.json", function(data) { 13 | y.domain([0, d3.max(data, function(d) { return d.total_flights; })]); 14 | 15 | var barWidth = width / data.length; 16 | 17 | var bar = chart.selectAll("g") 18 | .data(data) 19 | .enter() 20 | .append("g") 21 | .attr("transform", function(d, i) { return "translate(" + i * barWidth + ",0)"; }); 22 | 23 | bar.append("rect") 24 | .attr("y", function(d) { return y(d.total_flights); }) 25 | .attr("height", function(d) { return height - y(d.total_flights); }) 26 | .attr("width", barWidth - 1); 27 | 28 | bar.append("text") 29 | .attr("x", barWidth / 2) 30 | .attr("y", function(d) { return y(d.total_flights) + 3; }) 31 | .attr("dy", ".75em") 32 | .text(function(d) { return d.total_flights; }); 33 | }); -------------------------------------------------------------------------------- /ch06/web/templates/airlines.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Airlines / {{carrier_code}} 5 | 6 | 7 |

Airline {{ carrier_code }}

8 | 9 |

Fleet: {{airline_airplanes.FleetCount}} Planes

10 | 17 | 18 |

Airports: {{airline_airports.length}}

19 | 26 | {% endblock %} -------------------------------------------------------------------------------- /ch06/web/templates/airlines2.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Airlines / {{carrier_code}} 5 | 6 | 7 | 8 | 9 |

10 | 11 | {{airline_summary.Name}} 12 | / {{airline_summary.domain}} 13 |

14 | 15 | 16 |

{{airline_summary.summary}}

17 | 18 | 19 |

Airports: {{airline_airports.length}}

20 | 27 | 28 | 29 |

Fleet: {{airline_airplanes.FleetCount}} Planes

30 | 37 | 38 | {% endblock %} -------------------------------------------------------------------------------- /ch06/web/templates/airport.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Airport / {{airport_code}} 5 | 6 | 7 |

{{airport_metadata.City}} / {{airport_metadata.Name}} / {{ airport_code }}

8 | 9 |

Coordinates: {{airport_metadata.Latitude}} lat {{airport_metadata.Longitude}} lon

10 |

Timezone: {{airport_metadata.TZ_DB}} / {{airport_metadata.Timezone}} from GMT

11 | 12 |

Carriers: {{carriers_per_airport.Carriers.length}}

13 | 20 | 21 | {% endblock %} -------------------------------------------------------------------------------- /ch06/web/templates/all_airlines.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Airlines 5 | 6 |

US Domestic Airlines

7 | 14 | {% endblock %} 15 | -------------------------------------------------------------------------------- /ch06/web/templates/all_airplanes.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Airplanes 5 | 6 |

7 | 8 | US Commercial Fleet 9 |

10 | 11 | 12 |
13 |

Airplanes by Manufacturer

14 |
15 |
16 | 17 | 18 | 19 |
20 | {% for item in search_config %} 21 | {% if 'label' in item %} 22 | 23 | {% else %} 24 | 25 | {% endif %} 26 | 27 | {% endfor %} 28 | 29 |
30 | 31 | 32 | 33 | 34 | {% for item in search_config %} 35 | {% if 'label' in item %} 36 | 37 | {% else %} 38 | 39 | {% endif %} 40 | {% endfor %} 41 | 42 | 43 | 44 | 45 | {% for airplane in airplanes %} 46 | 47 | {% for item in search_config %} 48 | 49 | {% endfor %} 50 | 51 | {% endfor %} 52 | 53 |
{{item['label']}}{{item['field']}}
{{airplane[item['field']]}}
54 | 55 | 56 | 72 | 73 | {% import "macros.jnj" as common %} 74 | {% if nav_offsets and nav_path -%} 75 | {{ common.display_nav(nav_offsets, nav_path, airplane_count)|safe }} 76 | {% endif -%} 77 | {% endblock %} 78 | -------------------------------------------------------------------------------- /ch06/web/templates/flight.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | 5 | / Flights / {{flight.TailNum}} 6 | 7 |

Flight {{flight.FlightNum}}

8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 |
AirlineOriginDestinationTail NumberDateAir TimeDistance
{{flight.Carrier}}{{flight.Origin}}{{flight.Dest}}{{flight.TailNum}}{{flight.FlightDate}}{{flight.AirTime}}{{flight.Distance}}
30 |
31 | {% endblock %} 32 | -------------------------------------------------------------------------------- /ch06/web/templates/flights.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | 5 | / Flights 6 | 7 |

{{flight_count}} Flights on {{flight_date}}

8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | {% for flight in flights %} 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | {% endfor %} 32 | 33 |
AirlineFlight NumberOriginDestinationDeparture TimeTail NumberAir TimeDistance
{{flight.Carrier}}{{flight.FlightNum}}{{flight.Origin}}{{flight.Dest}}{{flight.DepTime}}{{flight.TailNum}}{{flight.AirTime}}{{flight.Distance}}
34 |
35 | {% endblock %} 36 | -------------------------------------------------------------------------------- /ch06/web/templates/flights_per_airplane.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | 5 | / Airplanes / {{tail_number}} 6 | 7 |

Flights by Tail Number {{tail_number}}

8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | {% for flight in flights['Flights'] %} 18 | 19 | 20 | 21 | 24 | 25 | 26 | 27 | {% endfor %} 28 | 29 |
CarrierDateFlight NumberOriginDestination
{{flight['Carrier']}}{{flight['FlightDate']}}{{flight['FlightNum']}} 23 | {{flight['Origin']}}{{flight['Dest']}}
30 |
31 | {% endblock %} -------------------------------------------------------------------------------- /ch06/web/templates/layout.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Agile Data Science 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
18 | 19 | 20 |
21 | 24 | {% block body %}{% endblock %} 25 |
26 | 27 |
28 |
29 | 30 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /ch06/web/templates/macros.jnj: -------------------------------------------------------------------------------- 1 | 2 | {% macro display_nav(offsets, path, count) -%} 3 |
4 | {% for key, values in offsets.items() -%} 5 | {%- if values['bottom_offset'] >= 0 and values['top_offset'] > 0 and count > values['bottom_offset'] -%} 6 | {{ key }} 9 | {% else -%} 10 | {{ key }} 11 | {% endif %} 12 | {% endfor -%} 13 |
14 | {% endmacro -%} 15 | -------------------------------------------------------------------------------- /ch06/web/templates/total_flights.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 |

Total Flights by Month

5 | 6 | 7 | 8 | 9 | 10 | 11 | {% for month in total_flights %} 12 | 13 | 14 | 15 | 16 | {% endfor %} 17 | 18 |
MonthTotal Flights
{{month.Month}}{{month.total_flights}}
19 |
20 | {% endblock %} -------------------------------------------------------------------------------- /ch06/web/templates/total_flights_chart.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 16 | 17 |
18 |

Total Flights by Month

19 |
20 |
21 | 22 | 25 | {% endblock %} -------------------------------------------------------------------------------- /ch07/images/ads2_0701.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch07/images/ads2_0701.png -------------------------------------------------------------------------------- /ch07/images/ads2_0702.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch07/images/ads2_0702.png -------------------------------------------------------------------------------- /ch08/download_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Get the absolute path of this script, see http://bit.ly/find_path 4 | ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")" 5 | ABSOLUTE_DIR=$(dirname "${ABSOLUTE_PATH}") 6 | 7 | # Download the flight delay features to Agile_Data_Code_2/data/, wherever we are executed from 8 | cd $ABSOLUTE_DIR/../data/ 9 | curl -Lko ./simple_flight_delay_features.jsonl.bz2 http://s3.amazonaws.com/agile_data_science/simple_flight_delay_features.jsonl.bz2 10 | 11 | # Get the distances between pairs of airports 12 | curl -Lko ./origin_dest_distances.jsonl http://s3.amazonaws.com/agile_data_science/origin_dest_distances.jsonl 13 | 14 | # Get the models to make ch08/web/predict_flask.py go 15 | cd $ABSOLUTE_DIR/.. 16 | mkdir -p models 17 | curl -Lko ./models/sklearn_vectorizer.pkl http://s3.amazonaws.com/agile_data_science/sklearn_vectorizer.pkl 18 | curl -Lko ./models/sklearn_regressor.pkl http://s3.amazonaws.com/agile_data_science/sklearn_regressor.pkl 19 | -------------------------------------------------------------------------------- /ch08/fetch_prediction_requests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys, os, re 4 | import json 5 | import datetime, iso8601 6 | 7 | # Save to Mongo 8 | from bson import json_util 9 | import pymongo_spark 10 | pymongo_spark.activate() 11 | 12 | # Pass date and base path to main() from airflow 13 | def main(iso_date, base_path): 14 | 15 | APP_NAME = "fetch_prediction_requests.py" 16 | 17 | # If there is no SparkSession, create the environment 18 | try: 19 | sc and spark 20 | except NameError as e: 21 | import findspark 22 | findspark.init() 23 | import pyspark 24 | import pyspark.sql 25 | 26 | sc = pyspark.SparkContext() 27 | spark = pyspark.sql.SparkSession.builder.appName(APP_NAME).getOrCreate() 28 | 29 | # Get today and tomorrow's dates as iso strings to scope query 30 | today_dt = iso8601.parse_date(iso_date) 31 | rounded_today = today_dt.date() 32 | iso_today = rounded_today.isoformat() 33 | rounded_tomorrow_dt = rounded_today + datetime.timedelta(days=1) 34 | iso_tomorrow = rounded_tomorrow_dt.isoformat() 35 | 36 | # Create mongo query string for today's data 37 | mongo_query_string = """{{ 38 | "Timestamp": {{ 39 | "$gte": "{iso_today}", 40 | "$lte": "{iso_tomorrow}" 41 | }} 42 | }}""".format( 43 | iso_today=iso_today, 44 | iso_tomorrow=iso_tomorrow 45 | ) 46 | mongo_query_string = mongo_query_string.replace('\n', '') 47 | 48 | # Create the config object with the query string 49 | mongo_query_config = dict() 50 | mongo_query_config["mongo.input.query"] = mongo_query_string 51
| 52 | # Load the day's requests using pymongo_spark 53 | prediction_requests = sc.mongoRDD( 54 | 'mongodb://localhost:27017/agile_data_science.prediction_tasks', 55 | config=mongo_query_config 56 | ) 57 | 58 | # Build the day's output path: a date-based primary key directory structure 59 | today_output_path = "{}/data/prediction_tasks_daily.json/{}".format( 60 | base_path, 61 | iso_today 62 | ) 63 | 64 | # Generate json records 65 | prediction_requests_json = prediction_requests.map(json_util.dumps) 66 | 67 | # Write/replace today's output path 68 | os.system("rm -rf {}".format(today_output_path)) 69 | prediction_requests_json.saveAsTextFile(today_output_path) 70 | 71 | if __name__ == "__main__": 72 | main(sys.argv[1], sys.argv[2]) 73 | -------------------------------------------------------------------------------- /ch08/images/ads2_0807.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch08/images/ads2_0807.png -------------------------------------------------------------------------------- /ch08/images/ads2_0808.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch08/images/ads2_0808.png -------------------------------------------------------------------------------- /ch08/images/ads2_0809.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch08/images/ads2_0809.png -------------------------------------------------------------------------------- /ch08/images/ads2_0810.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch08/images/ads2_0810.png -------------------------------------------------------------------------------- /ch08/images/ads2_0811.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch08/images/ads2_0811.png -------------------------------------------------------------------------------- /ch08/import_distances.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Import the airport-pair distances as the 'origin_dest_distances' collection and index them by route 4 | mongoimport -d agile_data_science -c origin_dest_distances --file data/origin_dest_distances.jsonl 5 | mongo agile_data_science --eval 'db.origin_dest_distances.createIndex({Origin: 1, Dest: 1})' 6 | -------------------------------------------------------------------------------- /ch08/kafka_test.py: -------------------------------------------------------------------------------- 1 | from kafka import KafkaProducer 2 | producer = KafkaProducer() 3 | 4 | producer.send( 5 | 'flight_delay_classification_request', 6 | '{"Hello": "World!"}'.encode() 7 | ) 8 | 9 | # Block until the asynchronous send completes, or the script may exit first 10 | producer.flush() 11 | -------------------------------------------------------------------------------- /ch08/links.txt: -------------------------------------------------------------------------------- 1 | https://davidwalsh.name/curl-post-file 2 | http://localhost:5000/on_time_performance?Carrier=AA&FlightDate=2015-01-01&FlightNum=1519 3 | http://blog.luisrei.com/articles/flaskrest.html 4 |
http://stackoverflow.com/questions/7172784/how-to-post-json-data-with-curl-from-terminal-commandline-to-test-spring-rest 5 | https://airflow.incubator.apache.org/cli.html 6 | https://en.wikipedia.org/wiki/Scientific_method 7 | http://www.slideshare.net/xamat/agile-science 8 | https://www.tutorialspoint.com/python/dictionary_update.htm 9 | https://docs.python.org/3/library/string.html#formatstrings 10 | http://stackoverflow.com/questions/2943222/find-objects-between-two-dates-mongodb 11 | https://spark.apache.org/docs/2.0.2/api/java/org/apache/spark/sql/SparkSession.html 12 | http://stackoverflow.com/questions/3908156/grep-output-to-show-only-matching-file 13 | https://github.com/mongodb/mongo-hadoop/tree/master/spark/src/main/python 14 | http://stackoverflow.com/questions/2943222/find-objects-between-two-dates-mongodb 15 | http://stackoverflow.com/questions/19819870/date-query-with-isodate-in-mongodb-doesnt-seem-to-work 16 | http://stackoverflow.com/questions/27523337/how-to-query-to-mongo-using-spark 17 | https://github.com/mongodb/mongo-hadoop/blob/master/spark/src/main/python/pymongo_spark.py 18 | -------------------------------------------------------------------------------- /ch08/load_prediction_results.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys, os, re 4 | import json 5 | import datetime, iso8601 6 | 7 | # Save to Mongo 8 | from bson import json_util 9 | import pymongo_spark 10 | pymongo_spark.activate() 11 | 12 | # Pass date and base path to main() from airflow 13 | def main(iso_date, base_path): 14 | 15 | APP_NAME = "load_prediction_results.py" 16 | 17 | # If there is no SparkSession, create the environment 18 | try: 19 | sc and spark 20 | except NameError as e: 21 | import findspark 22 | findspark.init() 23 | import pyspark 24 | import pyspark.sql 25 | 26 | sc = pyspark.SparkContext() 27 | spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate() 28 | 29 | # Get today and tomorrow's dates as iso strings to scope query 30 | today_dt = iso8601.parse_date(iso_date) 31 | rounded_today = today_dt.date() 32 | iso_today = rounded_today.isoformat() 33 | 34 | input_path = "{}/data/prediction_results_daily.json/{}".format( 35 | base_path, 36 | iso_today 37 | ) 38 | 39 | # Load and JSONize text 40 | prediction_results_raw = sc.textFile(input_path) 41 | prediction_results = prediction_results_raw.map(json_util.loads) 42 | 43 | # Store to MongoDB 44 | prediction_results.saveToMongoDB( 45 | "mongodb://localhost:27017/agile_data_science.prediction_results" 46 | ) 47 | 48 | if __name__ == "__main__": 49 | main(sys.argv[1], sys.argv[2]) 50 | -------------------------------------------------------------------------------- /ch08/origin_dest_distances.py: -------------------------------------------------------------------------------- 1 | import sys, os, re 2 | import json 3 | 4 | # Load the on-time parquet file 5 | on_time_dataframe = spark.read.parquet('data/on_time_performance.parquet') 6 | on_time_dataframe.registerTempTable("on_time_performance") 7 | 8 | origin_dest_distances = spark.sql(""" 9 | SELECT Origin, Dest, AVG(Distance) AS Distance 10 | FROM on_time_performance 11 | GROUP BY Origin, Dest 12 | ORDER BY Distance 13 | """) 14 | origin_dest_distances.repartition(1).write.mode("overwrite").json("data/origin_dest_distances.json") 15 | os.system("cp data/origin_dest_distances.json/part* data/origin_dest_distances.jsonl") 16 | 
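The origin_dest_distances.jsonl export written above is what import_distances.sh loads into MongoDB and what predict_utils.get_flight_distance() later queries by airport pair. As a minimal sketch (not a repository file; the relative path and the sample SFO/ATL pair are illustrative assumptions), the same lookup can be read directly in Python to inspect the record shape:

import json

# Build an (Origin, Dest) -> average Distance lookup from the JSON Lines export
distances = {}
with open('data/origin_dest_distances.jsonl') as f:
    for line in f:
        record = json.loads(line)
        distances[(record['Origin'], record['Dest'])] = record['Distance']

# The same lookup get_flight_distance() performs against MongoDB, if the pair exists
print(distances.get(('SFO', 'ATL')))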
-------------------------------------------------------------------------------- /ch08/python_kafka_consumer.py: -------------------------------------------------------------------------------- 1 | import sys, os, re 2 | import json 3 | 4 | from kafka import KafkaConsumer, TopicPartition 5 | consumer = KafkaConsumer() 6 | consumer.assign([TopicPartition('flight_delay_classification_request', 0)]) 7 | consumer.seek_to_beginning() 8 | 9 | for message in consumer: 10 | message_bytes = message.value 11 | message_string = message_bytes.decode() 12 | message_object = json.loads(message_string) 13 | print(message_object) 14 | -------------------------------------------------------------------------------- /ch08/python_kafka_producer.py: -------------------------------------------------------------------------------- 1 | from kafka import KafkaProducer 2 | producer = KafkaProducer() 3 | 4 | producer.send( 5 | 'flight_delay_classification_request', 6 | '{"Hello": "Producer!"}'.encode() 7 | ) 8 | 9 | # Block until the asynchronous send completes, or the script may exit first 10 | producer.flush() 11 | -------------------------------------------------------------------------------- /ch08/streaming_test.py: -------------------------------------------------------------------------------- 1 | import sys, os, re 2 | import json 3 | 4 | from pyspark import SparkContext, SparkConf 5 | from pyspark.streaming import StreamingContext 6 | from pyspark.streaming.kafka import KafkaUtils, OffsetRange, TopicAndPartition 7 | 8 | # Process data every 10 seconds 9 | PERIOD=10 10 | BROKERS='localhost:9092' 11 | TOPIC='flight_delay_classification_request' 12 | 13 | conf = SparkConf().set("spark.default.parallelism", 1) 14 | sc = SparkContext(appName = "Agile Data Science: PySpark Streaming 'Hello, World!'", conf=conf) 15 | ssc = StreamingContext(sc, PERIOD) 16 | 17 | stream = KafkaUtils.createDirectStream( 18 | ssc, 19 | [TOPIC], 20 | { 21 | "metadata.broker.list": BROKERS, 22 | "group.id": "0", 23 | } 24 | ) 25 | 26 | # Parse the JSON message and print the resulting object 27 | object_stream = stream.map(lambda x: json.loads(x[1])) 28 | object_stream.pprint() 29 | 30 | ssc.start() 31 | ssc.awaitTermination() # Run until interrupted 32 | -------------------------------------------------------------------------------- /ch08/test_airflow.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Compute today's date: 4 | export ISO_DATE=`date "+%Y-%m-%d"` 5 | 6 | # List DAGs 7 | airflow list_dags 8 | 9 | # List tasks in each DAG 10 | airflow list_tasks agile_data_science_batch_prediction_model_training 11 | airflow list_tasks agile_data_science_batch_predictions_daily 12 | 13 | # Test each task in each DAG 14 | airflow test agile_data_science_batch_prediction_model_training pyspark_extract_features $ISO_DATE 15 | airflow test agile_data_science_batch_prediction_model_training pyspark_train_classifier_model $ISO_DATE 16 | 17 | airflow test agile_data_science_batch_predictions_daily pyspark_fetch_prediction_requests $ISO_DATE 18 | airflow test agile_data_science_batch_predictions_daily pyspark_make_predictions $ISO_DATE 19 | airflow test agile_data_science_batch_predictions_daily pyspark_load_prediction_results $ISO_DATE 20 | 21 | # Test the training and persistence of the models 22 | airflow backfill -s $ISO_DATE -e $ISO_DATE agile_data_science_batch_prediction_model_training 23 | 24 | # Test the daily operation of the model 25 | airflow backfill -s $ISO_DATE -e $ISO_DATE agile_data_science_batch_predictions_daily 26 | --------------------------------------------------------------------------------
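Each `airflow test` call in test_airflow.sh exercises one PySpark task script end to end, and each of those scripts exposes main(iso_date, base_path) driven by sys.argv. A hedged sketch of the same smoke test without the scheduler, assuming it runs from the project root with the ch08 dependencies importable (the module import style and base_path value are assumptions for illustration):

import datetime

import fetch_prediction_requests  # ch08/fetch_prediction_requests.py

# Mirrors ISO_DATE in test_airflow.sh
iso_date = datetime.date.today().isoformat()

# Runs the same code path as:
#   airflow test agile_data_science_batch_predictions_daily \
#     pyspark_fetch_prediction_requests $ISO_DATE
fetch_prediction_requests.main(iso_date, base_path=".")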
/ch08/test_classification_api.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Fetch the delay prediction for a hypothetical flight 4 | curl -XPOST 'http://localhost:5000/flights/delays/predict/classify' \ 5 | -F 'DepDelay=5.0' \ 6 | -F 'Carrier=AA' \ 7 | -F 'FlightDate=2016-12-23' \ 8 | -F 'Dest=ATL' \ 9 | -F 'FlightNum=1519' \ 10 | -F 'Origin=SFO' \ 11 | | json_pp 12 | -------------------------------------------------------------------------------- /ch08/test_regression_api.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Fetch the delay prediction for a hypothetical flight 4 | curl -XPOST 'http://localhost:5000/flights/delays/predict/regress' \ 5 | -F 'DepDelay=5.0' \ 6 | -F 'Carrier=AA' \ 7 | -F 'Date=2016-12-23' \ 8 | -F 'Dest=ATL' \ 9 | -F 'FlightNum=1519' \ 10 | -F 'Origin=SFO' \ 11 | | json_pp 12 | -------------------------------------------------------------------------------- /ch08/web/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch08/web/__init__.py -------------------------------------------------------------------------------- /ch08/web/config.py: -------------------------------------------------------------------------------- 1 | # config.py, a configuration file for index.py 2 | RECORDS_PER_PAGE = 15 3 | AIRPLANE_RECORDS_PER_PAGE = 5 4 | ELASTIC_URL = "http://elastic:9200" 5 | -------------------------------------------------------------------------------- /ch08/web/predict_utils.py: -------------------------------------------------------------------------------- 1 | import sys, os, re 2 | import pymongo 3 | import datetime, iso8601 4 | 5 | 6 | def process_search(results): 7 | """Process elasticsearch hits and return flights records""" 8 | records = [] 9 | total = 0 10 | if results["hits"] and results["hits"]["hits"]: 11 | total = results["hits"]["total"] 12 | hits = results["hits"]["hits"] 13 | for hit in hits: 14 | record = hit["_source"] 15 | records.append(record) 16 | return records, total 17 | 18 | 19 | def get_navigation_offsets(offset1, offset2, increment): 20 | """Calculate offsets for fetching lists of flights from MongoDB""" 21 | offsets = {} 22 | offsets["Next"] = { 23 | "top_offset": offset2 + increment, 24 | "bottom_offset": offset1 + increment, 25 | } 26 | offsets["Previous"] = { 27 | "top_offset": max(offset2 - increment, 0), 28 | "bottom_offset": max(offset1 - increment, 0), 29 | } # Don't go < 0 30 | return offsets 31 | 32 | 33 | def strip_place(url): 34 | """Strip the existing start and end parameters from the query string""" 35 | try: 36 | p = re.match("(.+)\?start=.+&end=.+", url).group(1) 37 | except AttributeError as e: 38 | return url 39 | return p 40 | 41 | 42 | def get_flight_distance(client, origin, dest): 43 | """Get the distance between a pair of airport codes""" 44 | query = { 45 | "Origin": origin, 46 | "Dest": dest, 47 | } 48 | record = client.agile_data_science.origin_dest_distances.find_one(query) 49 | return record["Distance"] 50 | 51 | 52 | def get_regression_date_args(iso_date): 53 | """Given an ISO Date, return the day of year, day of month, day of week as the API expects them.""" 54 | dt = iso8601.parse_date(iso_date) 55 | day_of_year = dt.timetuple().tm_yday 56 | day_of_month = dt.day 57 | day_of_week = dt.weekday() 58 | return { 59 | "DayOfYear": 
day_of_year, 60 | "DayOfMonth": day_of_month, 61 | "DayOfWeek": day_of_week, 62 | } 63 | 64 | 65 | def get_current_timestamp(): 66 | iso_now = datetime.datetime.now().isoformat() 67 | return iso_now 68 | -------------------------------------------------------------------------------- /ch08/web/static/airplanes.js: -------------------------------------------------------------------------------- 1 | var margin = {top: 20, right: 30, bottom: 30, left: 40}, 2 | width = 900 - margin.left - margin.right, 3 | height = 300 - margin.top - margin.bottom; 4 | 5 | var x = d3.scale.ordinal() 6 | .rangeRoundBands([0, width], .1); 7 | var y = d3.scale.linear() 8 | .range([height, 0]); 9 | 10 | var xAxis = d3.svg.axis() 11 | .scale(x) 12 | .orient("bottom") 13 | .tickFormat(function(d) { 14 | return truncate(d, 14); 15 | }); 16 | var yAxis = d3.svg.axis() 17 | .scale(y) 18 | .orient("left"); 19 | 20 | var chart = d3.select(".chart") 21 | .attr("width", width + margin.left + margin.right) 22 | .attr("height", height + margin.top + margin.bottom) 23 | .append("g") 24 | .attr("transform", "translate(" + margin.left + "," + margin.top + ")"); 25 | 26 | d3.json("/airplanes/chart/manufacturers.json", function(error, data) { 27 | var data = data.data; 28 | 29 | x.domain(data.map(function(d) { return d.Manufacturer; })); 30 | y.domain([0, d3.max(data, function(d) { return d.Total; })]); 31 | 32 | chart.append("g") 33 | .attr("class", "x axis") 34 | .attr("transform", "translate(0," + height + ")") 35 | .call(xAxis); 36 | 37 | chart.append("g") 38 | .attr("class", "y axis") 39 | .call(yAxis); 40 | 41 | chart.selectAll(".bar") 42 | .data(data) 43 | .enter().append("rect") 44 | .attr("class", "bar") 45 | .attr("x", function(d) { return x(d.Manufacturer); }) 46 | .attr("y", function(d) { return y(d.Total); }) 47 | .attr("height", function(d) { return height - y(d.Total); }) 48 | .attr("width", x.rangeBand()); 49 | }); 50 | 51 | function truncate(d, l) { 52 | if(d.length > l) 53 | return d.substring(0,l)+'...'; 54 | else 55 | return d; 56 | } 57 | -------------------------------------------------------------------------------- /ch08/web/static/app.js: -------------------------------------------------------------------------------- 1 | var width = 960, 2 | height = 500; 3 | 4 | var y = d3.scale.linear() 5 | .range([height, 0]); 6 | // We define the domain once we get our data in d3.json, below 7 | 8 | var chart = d3.select(".chart") 9 | .attr("width", width) 10 | .attr("height", height); 11 | 12 | d3.json("/total_flights.json", function(data) { 13 | y.domain([0, d3.max(data, function(d) { return d.total_flights; })]); 14 | 15 | var barWidth = width / data.length; 16 | 17 | var bar = chart.selectAll("g") 18 | .data(data) 19 | .enter() 20 | .append("g") 21 | .attr("transform", function(d, i) { return "translate(" + i * barWidth + ",0)"; }); 22 | 23 | bar.append("rect") 24 | .attr("y", function(d) { return y(d.total_flights); }) 25 | .attr("height", function(d) { return height - y(d.total_flights); }) 26 | .attr("width", barWidth - 1); 27 | 28 | bar.append("text") 29 | .attr("x", barWidth / 2) 30 | .attr("y", function(d) { return y(d.total_flights) + 3; }) 31 | .attr("dy", ".75em") 32 | .text(function(d) { return d.total_flights; }); 33 | }); -------------------------------------------------------------------------------- /ch08/web/static/bar.css: -------------------------------------------------------------------------------- 1 | 2 | .axis text { 3 | font: 8px sans-serif; 4 | } 5 | 6 | .axis path, 7 | .axis line { 8 | 
fill: none; 9 | stroke: #000; 10 | shape-rendering: crispEdges; 11 | } 12 | 13 | .bar { 14 | fill: #ff6600; 15 | } 16 | -------------------------------------------------------------------------------- /ch08/web/static/barchart.js: -------------------------------------------------------------------------------- 1 | class BarChart { 2 | 3 | constructor(url, labelName, valueName, chartClassName) { 4 | this.url = url; 5 | this.labelName = (typeof labelName !== 'undefined') ? labelName : 'label'; 6 | this.valueName = (typeof valueName !== 'undefined') ? valueName : 'value'; 7 | this.chartClassName = (typeof chartClassName !== 'undefined') ? chartClassName : 'chart'; 8 | } 9 | 10 | render() { 11 | var margin = {top: 20, right: 30, bottom: 30, left: 40}, 12 | width = 900 - margin.left - margin.right, 13 | height = 300 - margin.top - margin.bottom; 14 | 15 | var x = d3.scale.ordinal() 16 | .rangeRoundBands([0, width], .1); 17 | var y = d3.scale.linear() 18 | .range([height, 0]); 19 | 20 | var xAxis = d3.svg.axis() 21 | .scale(x) 22 | .orient("bottom") 23 | .tickFormat(function(d) { 24 | return truncate(d, 14); 25 | }); 26 | var yAxis = d3.svg.axis() 27 | .scale(y) 28 | .orient("left"); 29 | 30 | var chart = d3.select('.' + this.chartClassName) 31 | .attr("width", width + margin.left + margin.right) 32 | .attr("height", height + margin.top + margin.bottom) 33 | .append("g") 34 | .attr("transform", "translate(" + margin.left + "," + margin.top + ")"); 35 | 36 | var labelName = this.labelName; 37 | var valueName = this.valueName; 38 | d3.json(this.url, function(error, data) { 39 | var data = data.data; 40 | 41 | x.domain(data.map(function(d) { return d[labelName]; })); 42 | y.domain([0, d3.max(data, function(d) { return d[valueName]; })]); 43 | 44 | chart.append("g") 45 | .attr("class", "x axis") 46 | .attr("transform", "translate(0," + height + ")") 47 | .call(xAxis); 48 | 49 | chart.append("g") 50 | .attr("class", "y axis") 51 | .call(yAxis); 52 | 53 | chart.selectAll(".bar") 54 | .data(data) 55 | .enter().append("rect") 56 | .attr("class", "bar") 57 | .attr("x", function(d) { return x(d[labelName]); }) 58 | .attr("y", function(d) { return y(d[valueName]); }) 59 | .attr("height", function(d) { return height - y(d[valueName]); }) 60 | .attr("width", x.rangeBand()); 61 | }); 62 | 63 | function truncate(d, l) { 64 | if(d.length > l) 65 | return d.substring(0,l)+'...'; 66 | else 67 | return d; 68 | } 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /ch08/web/static/flight_delay_predict_polling.js: -------------------------------------------------------------------------------- 1 | // Attach a submit handler to the form 2 | $( "#flight_delay_classification" ).submit(function( event ) { 3 | 4 | // Stop form from submitting normally 5 | event.preventDefault(); 6 | 7 | // Get some values from elements on the page: 8 | var $form = $( this ), 9 | term = $form.find( "input[name='s']" ).val(), 10 | url = $form.attr( "action" ); 11 | 12 | // Send the data using post 13 | var posting = $.post( 14 | url, 15 | $( "#flight_delay_classification" ).serialize() 16 | ); 17 | 18 | // Submit the form and parse the response 19 | posting.done(function( data ) { 20 | var response = JSON.parse(data); 21 | 22 | // If the response is ok, print a message to wait and start polling 23 | if(response.status == "OK") { 24 | $( "#result" ).empty().append( "Processing..." 
); 25 | 26 | // Every 1 second, poll the response url until we get a response 27 | poll(response.id); 28 | } 29 | }); 30 | }); 31 | 32 | // Poll the prediction URL 33 | function poll(id) { 34 | var responseUrlBase = "/flights/delays/predict/classify_realtime/response/"; 35 | console.log("Polling for request id " + id + "..."); 36 | 37 | // Append the uuid to the URL as a slug argument 38 | var predictionUrl = responseUrlBase + id; 39 | 40 | $.ajax( 41 | { 42 | url: predictionUrl, 43 | type: "GET", 44 | complete: conditionalPoll 45 | }); 46 | } 47 | 48 | // Decide whether to poll based on the response status 49 | function conditionalPoll(data) { 50 | var response = JSON.parse(data.responseText); 51 | 52 | if(response.status == "OK") { 53 | renderPage(response.prediction); 54 | } 55 | else if(response.status == "WAIT") { 56 | setTimeout(function() {poll(response.id)}, 1000); 57 | } 58 | } 59 | 60 | // Render the response on the page for splits: 61 | // [-float("inf"), -15.0, 0, 30.0, float("inf")] 62 | function renderPage(response) { 63 | 64 | console.log(response); 65 | 66 | var displayMessage; 67 | 68 | if(response.Prediction == 0 || response.Prediction == '0') { 69 | displayMessage = "Early (15+ Minutes Early)"; 70 | } 71 | else if(response.Prediction == 1 || response.Prediction == '1') { 72 | displayMessage = "Slightly Early (0-15 Minute Early)"; 73 | } 74 | else if(response.Prediction == 2 || response.Prediction == '2') { 75 | displayMessage = "Slightly Late (0-30 Minute Delay)"; 76 | } 77 | else if(response.Prediction == 3 || response.Prediction == '3') { 78 | displayMessage = "Very Late (30+ Minutes Late)"; 79 | } 80 | 81 | console.log(displayMessage) 82 | 83 | $( "#result" ).empty().append( displayMessage ); 84 | } 85 | -------------------------------------------------------------------------------- /ch08/web/templates/airlines.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Airlines / {{carrier_code}} 5 | 6 | 7 | 8 | 9 |

10 | 11 | {{airline_summary.Name}} 12 | / {{airline_summary.domain}} 13 |

14 | 15 | 16 |

{{airline_summary.summary}}

17 |

Fleet: {{airline_airplanes.FleetCount}} Planes

18 | 25 | {% endblock %} 26 | -------------------------------------------------------------------------------- /ch08/web/templates/all_airlines.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Airlines 5 | 6 |

US Domestic Airlines

7 | 14 | {% endblock %} 15 | -------------------------------------------------------------------------------- /ch08/web/templates/all_airplanes.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Airplanes 5 | 6 |

7 | 8 | US Commercial Fleet 9 |

10 | 11 | 12 |
13 |

Airplanes by Manufacturer

14 |
15 |
16 | 17 | 18 | 19 |
20 | {% for item in search_config %} 21 | {% if 'label' in item %} 22 | 23 | {% else %} 24 | 25 | {% endif %} 26 | 27 | {% endfor %} 28 | 29 |
30 | 31 | 32 | 33 | 34 | {% for item in search_config %} 35 | {% if 'label' in item %} 36 | 37 | {% else %} 38 | 39 | {% endif %} 40 | {% endfor %} 41 | 42 | 43 | 44 | 45 | {% for airplane in airplanes %} 46 | 47 | {% for item in search_config %} 48 | 49 | {% endfor %} 50 | 51 | {% endfor %} 52 | 53 |
{{item['label']}}{{item['field']}}
{{airplane[item['field']]}}
54 | 55 | 56 | 72 | 73 | {% import "macros.jnj" as common %} 74 | {% if nav_offsets and nav_path -%} 75 | {{ common.display_nav(nav_offsets, nav_path, airplane_count)|safe }} 76 | {% endif -%} 77 | {% endblock %} 78 | -------------------------------------------------------------------------------- /ch08/web/templates/delays.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | 5 | / Delays 6 | 7 |

8 | 9 | Summary of Flight Delays 10 |

11 | 12 |
13 | 14 | 15 | 16 | 20 |
21 | {% endblock %} 22 | -------------------------------------------------------------------------------- /ch08/web/templates/flight.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | 5 | / Flights / {{flight.TailNum}} 6 | 7 |

Flight {{flight.FlightNum}}

8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 |
AirlineOriginDestinationTail NumberDateAir TimeDistance
{{flight.Carrier}}{{flight.Origin}}{{flight.Dest}}{{flight.TailNum}}{{flight.FlightDate}}{{flight.AirTime}}{{flight.Distance}}
30 |
31 | {% endblock %} 32 | -------------------------------------------------------------------------------- /ch08/web/templates/flight_delays_predict.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Flight Delay Prediction 5 | 6 |

7 | 8 | Predicting Flight Delays 9 |

10 | 11 | 12 |
13 | {% for item in form_config %} 14 | {% if 'label' in item %} 15 | 16 | {% else %} 17 | 18 | {% endif %} 19 | 20 | {% endfor %} 21 | 22 |
23 | 24 |
25 |

Delay:

26 |
27 | 28 | 50 | {% endblock %} 51 | -------------------------------------------------------------------------------- /ch08/web/templates/flight_delays_predict_batch.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Flight Delay Prediction via Spark in Batch 5 | 6 |

7 | 8 | Predicting Flight Delays via Spark in Batch 9 |

10 | 11 | 12 |
13 | {% for item in form_config %} 14 | {% if 'label' in item %} 15 | 16 | {% else %} 17 | 18 | {% endif %} 19 | 20 | {% endfor %} 21 | 22 |
23 | 24 |
25 |

Prediction Request Successful:

26 |
27 | 28 | 49 | {% endblock %} 50 | -------------------------------------------------------------------------------- /ch08/web/templates/flight_delays_predict_batch_results.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Flight Delay Prediction Results via Spark in Batch 5 | 6 |

7 | 8 | Presenting Flight Delay Predictions via Spark in Batch 9 |

10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | {% for item in predictions %} 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 46 | 47 | {% endfor %} 48 | 49 |
Request TimestampCarrierFlight DateOriginDestinationDistanceDeparture DelayPredicted Arrival Delay
{{ item['Timestamp'] }}{{ item['Carrier'] }}{{ item['FlightDate'] }}{{ item['Origin'] }}{{ item['Dest'] }}{{ item['Distance'] }}{{ item['DepDelay'] }} 36 | 37 | {% if item['Prediction'] == 0.0 %} 38 | On Time (0-15 Minute Delay) 39 | {% elif item['Prediction'] == 1.0 %} 40 | Slightly Late (15-60 Minute Delay) 41 | {% elif item['Prediction'] == 2.0 %} 42 | Very Late (60+ Minute Delay) 43 | {% endif %} 44 | 45 |
50 | 51 | {% endblock %} 52 | -------------------------------------------------------------------------------- /ch08/web/templates/flight_delays_predict_kafka.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Flight Delay Prediction with Kafka 5 | 6 |

7 | 8 | Predicting Flight Delays with Kafka 9 |

10 | 11 | 12 |
13 | {% for item in form_config %} 14 | {% if 'label' in item %} 15 | 16 | {% else %} 17 | 18 | {% endif %} 19 | 20 | {% endfor %} 21 | 22 |
23 | 24 |
25 |

Delay:

26 |
27 | 28 | 29 | 30 | {% endblock %} 31 | -------------------------------------------------------------------------------- /ch08/web/templates/flights.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | 5 | / Flights 6 | 7 |

{{flight_count}} Flights on {{flight_date}}

8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | {% for flight in flights %} 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | {% endfor %} 32 | 33 |
AirlineFlight NumberOriginDestinationDeparture TimeTail NumberAir TimeDistance
{{flight.Carrier}}{{flight.FlightNum}}{{flight.Origin}}{{flight.Dest}}{{flight.DepTime}}{{flight.TailNum}}{{flight.AirTime}}{{flight.Distance}}
34 |
35 | {% endblock %} 36 | -------------------------------------------------------------------------------- /ch08/web/templates/flights_per_airplane.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | 5 | / Airplanes / {{tail_number}} 6 | 7 |

Flights by Tail Number {{tail_number}}

8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | {% for flight in flights['Flights'] %} 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | {% endfor %} 26 | 27 |
CarrierDateFlight NumberOriginDestination
{{flight[0]}}{{flight[1]}}{{flight[2]}}{{flight[3]}}{{flight[4]}}
28 |
29 | {% endblock %} 30 | -------------------------------------------------------------------------------- /ch08/web/templates/layout.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Agile Data Science 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
18 | 19 | 20 |
21 | 24 | {% block body %}{% endblock %} 25 |
26 | 27 |
28 |
29 | 30 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /ch08/web/templates/macros.jnj: -------------------------------------------------------------------------------- 1 | 2 | {% macro display_nav(offsets, path, count) -%} 3 |
4 | {% for key, values in offsets.items() -%} 5 | {%- if values['bottom_offset'] >= 0 and values['top_offset'] > 0 and count > values['bottom_offset'] -%} 6 | {{ key }} 9 | {% else -%} 10 | {{ key }} 11 | {% endif %} 12 | {% endfor -%} 13 |
14 | {% endmacro -%} 15 | -------------------------------------------------------------------------------- /ch08/web/templates/total_flights.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 |

Total Flights by Month

5 | 6 | 7 | 8 | 9 | 10 | 11 | {% for month in total_flights %} 12 | 13 | 14 | 15 | 16 | {% endfor %} 17 | 18 |
MonthTotal Flights
{{month.Month}}{{month.total_flights}}
19 |
20 | {% endblock %} -------------------------------------------------------------------------------- /ch08/web/templates/total_flights_chart.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 16 | 17 |
18 |

Total Flights by Month

19 |
20 |
21 | 22 | 25 | {% endblock %} -------------------------------------------------------------------------------- /ch09/explore_delays.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys, os, re 4 | import json 5 | import datetime, iso8601 6 | 7 | base_path = "." 8 | 9 | from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType 10 | from pyspark.sql.types import StructType, StructField 11 | from pyspark.sql.functions import udf 12 | 13 | schema = StructType([ 14 | StructField("ArrDelay", DoubleType(), True), # "ArrDelay":5.0 15 | StructField("CRSArrTime", TimestampType(), True), # "CRSArrTime":"2015-12-31T03:20:00.000-08:00" 16 | StructField("CRSDepTime", TimestampType(), True), # "CRSDepTime":"2015-12-31T03:05:00.000-08:00" 17 | StructField("Carrier", StringType(), True), # "Carrier":"WN" 18 | StructField("DayOfMonth", IntegerType(), True), # "DayOfMonth":31 19 | StructField("DayOfWeek", IntegerType(), True), # "DayOfWeek":4 20 | StructField("DayOfYear", IntegerType(), True), # "DayOfYear":365 21 | StructField("DepDelay", DoubleType(), True), # "DepDelay":14.0 22 | StructField("Dest", StringType(), True), # "Dest":"SAN" 23 | StructField("Distance", DoubleType(), True), # "Distance":368.0 24 | StructField("FlightDate", DateType(), True), # "FlightDate":"2015-12-30T16:00:00.000-08:00" 25 | StructField("FlightNum", StringType(), True), # "FlightNum":"6109" 26 | StructField("Origin", StringType(), True), # "Origin":"TUS" 27 | ]) 28 | 29 | features = spark.read.json( 30 | "data/simple_flight_delay_features.json", 31 | schema=schema 32 | ) 33 | features.registerTempTable("features") 34 | features.show() 35 | 36 | # 37 | # Check whether lateness varies much by hour of scheduled departure/arrival 38 | # 39 | 40 | spark.sql(""" 41 | SELECT 42 | HOUR(CRSDepTime) + 1 AS Hour, 43 | AVG(ArrDelay), 44 | STD(ArrDelay) 45 | FROM features 46 | GROUP BY HOUR(CRSDepTime) 47 | ORDER BY HOUR(CRSDepTime) 48 | """).show(24) 49 | 50 | spark.sql(""" 51 | SELECT 52 | HOUR(CRSArrTime) + 1 AS Hour, 53 | AVG(ArrDelay), 54 | STD(ArrDelay) 55 | FROM features 56 | GROUP BY HOUR(CRSArrTime) 57 | ORDER BY HOUR(CRSArrTime) 58 | """).show(24) 59 | 60 | 61 | from pyspark.sql.functions import hour 62 | 63 | features = features.withColumn('CRSDepHourOfDay', hour(features.CRSDepTime)) 64 | features = features.withColumn('CRSArrHourOfDay', hour(features.CRSArrTime)) 65 | 66 | departure_cov = features.stat.cov('CRSDepHourOfDay', 'ArrDelay') 67 | arrival_cov = features.stat.cov('CRSArrHourOfDay', 'ArrDelay') 68 | 69 | print("Departure delay covariance: {:,}".format(departure_cov)) 70 | print("Arrival delay covariance: {:,}".format(arrival_cov)) 71 | -------------------------------------------------------------------------------- /ch09/make_predictions_final.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch09/make_predictions_final.py -------------------------------------------------------------------------------- /ch09/make_predictions_streaming_final.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch09/make_predictions_streaming_final.py --------------------------------------------------------------------------------
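explore_delays.py above profiles arrival delay by scheduled hour in SQL and then checks the relationship with column covariance. The same hourly aggregates can also be computed with the DataFrame API; a minimal sketch, assuming the `features` DataFrame from that script is in scope (the results should match the HOUR(CRSDepTime) query):

from pyspark.sql.functions import avg, hour, stddev

# Group arrival delays by scheduled departure hour, as in the SQL version
features \
    .withColumn("Hour", hour(features.CRSDepTime) + 1) \
    .groupBy("Hour") \
    .agg(avg("ArrDelay"), stddev("ArrDelay")) \
    .orderBy("Hour") \
    .show(24)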
/ch10/spark_model_with_weather.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch10/spark_model_with_weather.py -------------------------------------------------------------------------------- /ch10/web/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/ch10/web/__init__.py -------------------------------------------------------------------------------- /ch10/web/config.py: -------------------------------------------------------------------------------- 1 | # config.py, a configuration file for index.py 2 | RECORDS_PER_PAGE=15 3 | AIRPLANE_RECORDS_PER_PAGE=5 4 | ELASTIC_URL='http://localhost:9200/agile_data_science' 5 | -------------------------------------------------------------------------------- /ch10/web/predict_utils.py: -------------------------------------------------------------------------------- 1 | import sys, os, re 2 | import pymongo 3 | import datetime, iso8601 4 | 5 | def process_search(results): 6 | """Process elasticsearch hits and return flights records""" 7 | records = [] 8 | total = 0 9 | if results['hits'] and results['hits']['hits']: 10 | total = results['hits']['total'] 11 | hits = results['hits']['hits'] 12 | for hit in hits: 13 | record = hit['_source'] 14 | records.append(record) 15 | return records, total 16 | 17 | def get_navigation_offsets(offset1, offset2, increment): 18 | """Calculate offsets for fetching lists of flights from MongoDB""" 19 | offsets = {} 20 | offsets['Next'] = {'top_offset': offset2 + increment, 'bottom_offset': 21 | offset1 + increment} 22 | offsets['Previous'] = {'top_offset': max(offset2 - increment, 0), 23 | 'bottom_offset': max(offset1 - increment, 0)} # Don't go < 0 24 | return offsets 25 | 26 | def strip_place(url): 27 | """Strip the existing start and end parameters from the query string""" 28 | try: 29 | p = re.match('(.+)\?start=.+&end=.+', url).group(1) 30 | except AttributeError as e: 31 | return url 32 | return p 33 | 34 | def get_flight_distance(client, origin, dest): 35 | """Get the distance between a pair of airport codes""" 36 | query = { 37 | "Origin": origin, 38 | "Dest": dest, 39 | } 40 | record = client.agile_data_science.origin_dest_distances.find_one(query) 41 | return record["Distance"] 42 | 43 | def get_regression_date_args(iso_date): 44 | """Given an ISO Date, return the day of year, day of month, day of week as the API expects them.""" 45 | dt = iso8601.parse_date(iso_date) 46 | day_of_year = dt.timetuple().tm_yday 47 | day_of_month = dt.day 48 | day_of_week = dt.weekday() 49 | return { 50 | "DayOfYear": day_of_year, 51 | "DayOfMonth": day_of_month, 52 | "DayOfWeek": day_of_week, 53 | } 54 | 55 | def get_current_timestamp(): 56 | iso_now = datetime.datetime.now().isoformat() 57 | return iso_now 58 | -------------------------------------------------------------------------------- /ch10/web/static/airplanes.js: -------------------------------------------------------------------------------- 1 | var margin = {top: 20, right: 30, bottom: 30, left: 40}, 2 | width = 900 - margin.left - margin.right, 3 | height = 300 - margin.top - margin.bottom; 4 | 5 | var x = d3.scale.ordinal() 6 | .rangeRoundBands([0, width], .1); 7 | var y = d3.scale.linear() 8 | .range([height, 0]); 9 | 10 | var xAxis = d3.svg.axis() 11 | .scale(x) 12 | .orient("bottom") 
13 | .tickFormat(function(d) { 14 | return truncate(d, 14); 15 | }); 16 | var yAxis = d3.svg.axis() 17 | .scale(y) 18 | .orient("left"); 19 | 20 | var chart = d3.select(".chart") 21 | .attr("width", width + margin.left + margin.right) 22 | .attr("height", height + margin.top + margin.bottom) 23 | .append("g") 24 | .attr("transform", "translate(" + margin.left + "," + margin.top + ")"); 25 | 26 | d3.json("/airplanes/chart/manufacturers.json", function(error, data) { 27 | var data = data.data; 28 | 29 | x.domain(data.map(function(d) { return d.Manufacturer; })); 30 | y.domain([0, d3.max(data, function(d) { return d.Total; })]); 31 | 32 | chart.append("g") 33 | .attr("class", "x axis") 34 | .attr("transform", "translate(0," + height + ")") 35 | .call(xAxis); 36 | 37 | chart.append("g") 38 | .attr("class", "y axis") 39 | .call(yAxis); 40 | 41 | chart.selectAll(".bar") 42 | .data(data) 43 | .enter().append("rect") 44 | .attr("class", "bar") 45 | .attr("x", function(d) { return x(d.Manufacturer); }) 46 | .attr("y", function(d) { return y(d.Total); }) 47 | .attr("height", function(d) { return height - y(d.Total); }) 48 | .attr("width", x.rangeBand()); 49 | }); 50 | 51 | function truncate(d, l) { 52 | if(d.length > l) 53 | return d.substring(0,l)+'...'; 54 | else 55 | return d; 56 | } 57 | -------------------------------------------------------------------------------- /ch10/web/static/app.js: -------------------------------------------------------------------------------- 1 | var width = 960, 2 | height = 500; 3 | 4 | var y = d3.scale.linear() 5 | .range([height, 0]); 6 | // We define the domain once we get our data in d3.json, below 7 | 8 | var chart = d3.select(".chart") 9 | .attr("width", width) 10 | .attr("height", height); 11 | 12 | d3.json("/total_flights.json", function(data) { 13 | y.domain([0, d3.max(data, function(d) { return d.total_flights; })]); 14 | 15 | var barWidth = width / data.length; 16 | 17 | var bar = chart.selectAll("g") 18 | .data(data) 19 | .enter() 20 | .append("g") 21 | .attr("transform", function(d, i) { return "translate(" + i * barWidth + ",0)"; }); 22 | 23 | bar.append("rect") 24 | .attr("y", function(d) { return y(d.total_flights); }) 25 | .attr("height", function(d) { return height - y(d.total_flights); }) 26 | .attr("width", barWidth - 1); 27 | 28 | bar.append("text") 29 | .attr("x", barWidth / 2) 30 | .attr("y", function(d) { return y(d.total_flights) + 3; }) 31 | .attr("dy", ".75em") 32 | .text(function(d) { return d.total_flights; }); 33 | }); -------------------------------------------------------------------------------- /ch10/web/static/bar.css: -------------------------------------------------------------------------------- 1 | 2 | .axis text { 3 | font: 8px sans-serif; 4 | } 5 | 6 | .axis path, 7 | .axis line { 8 | fill: none; 9 | stroke: #000; 10 | shape-rendering: crispEdges; 11 | } 12 | 13 | .bar { 14 | fill: #ff6600; 15 | } 16 | -------------------------------------------------------------------------------- /ch10/web/static/flight_delay_predict_polling.js: -------------------------------------------------------------------------------- 1 | // Attach a submit handler to the form 2 | $( "#flight_delay_classification" ).submit(function( event ) { 3 | 4 | // Stop form from submitting normally 5 | event.preventDefault(); 6 | 7 | // Get some values from elements on the page: 8 | var $form = $( this ), 9 | term = $form.find( "input[name='s']" ).val(), 10 | url = $form.attr( "action" ); 11 | 12 | // Send the data using post 13 | var posting = $.post( 14 | 
/ch10/web/static/flight_delay_predict_polling.js:
--------------------------------------------------------------------------------
1 | // Attach a submit handler to the form
2 | $( "#flight_delay_classification" ).submit(function( event ) {
3 |
4 |     // Stop form from submitting normally
5 |     event.preventDefault();
6 |
7 |     // Get some values from elements on the page:
8 |     var $form = $( this ),
9 |         term = $form.find( "input[name='s']" ).val(),
10 |         url = $form.attr( "action" );
11 |
12 |     // Send the data using post
13 |     var posting = $.post(
14 |         url,
15 |         $( "#flight_delay_classification" ).serialize()
16 |     );
17 |
18 |     // Submit the form and parse the response
19 |     posting.done(function( data ) {
20 |         var response = JSON.parse(data);
21 |
22 |         // If the response is ok, print a message to wait and start polling
23 |         if(response.status == "OK") {
24 |             $( "#result" ).empty().append( "Processing..." );
25 |
26 |             // Every 1 second, poll the response url until we get a response
27 |             poll(response.id);
28 |         }
29 |     });
30 | });
31 |
32 | // Poll the prediction URL
33 | function poll(id) {
34 |     var responseUrlBase = "/flights/delays/predict/classify_realtime/response/";
35 |     console.log("Polling for request id " + id + "...");
36 |
37 |     // Append the uuid to the URL as a slug argument
38 |     var predictionUrl = responseUrlBase + id;
39 |
40 |     $.ajax(
41 |         {
42 |             url: predictionUrl,
43 |             type: "GET",
44 |             complete: conditionalPoll
45 |         });
46 | }
47 |
48 | // Decide whether to poll based on the response status
49 | function conditionalPoll(data) {
50 |     var response = JSON.parse(data.responseText);
51 |
52 |     if(response.status == "OK") {
53 |         renderPage(response.prediction);
54 |     }
55 |     else if(response.status == "WAIT") {
56 |         setTimeout(function() {poll(response.id)}, 1000);
57 |     }
58 | }
59 |
60 | // Render the response on the page for splits:
61 | // [-float("inf"), -15.0, 0, 30.0, float("inf")]
62 | function renderPage(prediction) {
63 |
64 |     var displayMessage;
65 |
66 |     if(prediction.Prediction == 0) {
67 |         displayMessage = "Early (15+ Minutes Early)";
68 |     }
69 |     else if(prediction.Prediction == 1) {
70 |         displayMessage = "Slightly Early (0-15 Minutes Early)";
71 |     }
72 |     else if(prediction.Prediction == 2) {
73 |         displayMessage = "Slightly Late (0-30 Minute Delay)";
74 |     }
75 |     else if(prediction.Prediction == 3) {
76 |         displayMessage = "Very Late (30+ Minutes Late)";
77 |     }
78 |
79 |     $( "#result" ).empty().append( displayMessage );
80 | }
81 |
--------------------------------------------------------------------------------
/ch10/web/templates/airlines.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block body %}
3 |
4 | / Airlines / {{carrier_code}}
5 |
6 |
7 |
8 |
9 |

10 | 11 | {{airline_summary.Name}} 12 | / {{airline_summary.domain}} 13 |

14 | 15 | 16 |

{{airline_summary.summary}}

17 |

Fleet: {{airline_airplanes.FleetCount}} Planes

18 | 25 | {% endblock %} 26 | -------------------------------------------------------------------------------- /ch10/web/templates/all_airlines.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Airlines 5 | 6 |

US Domestic Airlines

7 | 14 | {% endblock %} 15 | -------------------------------------------------------------------------------- /ch10/web/templates/all_airplanes.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Airplanes 5 | 6 |

7 | 8 | US Commercial Fleet 9 |

10 | 11 | 12 |
13 |

Airplanes by Manufacturer

14 |
15 |
16 | 17 | 18 | 19 |
20 | {% for item in search_config %} 21 | {% if 'label' in item %} 22 | 23 | {% else %} 24 | 25 | {% endif %} 26 | 27 | {% endfor %} 28 | 29 |
30 | 31 | 32 | 33 | 34 | {% for item in search_config %} 35 | {% if 'label' in item %} 36 | 37 | {% else %} 38 | 39 | {% endif %} 40 | {% endfor %} 41 | 42 | 43 | 44 | 45 | {% for airplane in airplanes %} 46 | 47 | {% for item in search_config %} 48 | 49 | {% endfor %} 50 | 51 | {% endfor %} 52 | 53 |
{{item['label']}}{{item['field']}}
{{airplane[item['field']]}}
54 | 55 | 56 | 72 | 73 | {% import "macros.jnj" as common %} 74 | {% if nav_offsets and nav_path -%} 75 | {{ common.display_nav(nav_offsets, nav_path, airplane_count)|safe }} 76 | {% endif -%} 77 | {% endblock %} 78 | -------------------------------------------------------------------------------- /ch10/web/templates/delays.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | 5 | / Delays 6 | 7 |

8 | 9 | Summary of Flight Delays 10 |

11 | 12 |
13 | 14 | 15 | 16 | 20 |
21 | {% endblock %} 22 | -------------------------------------------------------------------------------- /ch10/web/templates/flight.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | 5 | / Flights / {{flight.TailNum}} 6 | 7 |

Flight {{flight.FlightNum}}

8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 |
AirlineOriginDestinationTail NumberDateAir TimeDistance
{{flight.Carrier}}{{flight.Origin}}{{flight.Dest}}{{flight.TailNum}}{{flight.FlightDate}}{{flight.AirTime}}{{flight.Distance}}
30 |
31 | {% endblock %} 32 | -------------------------------------------------------------------------------- /ch10/web/templates/flight_delays_predict.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Flight Delay Prediction 5 | 6 |

7 | 8 | Predicting Flight Delays 9 |

10 | 11 | 12 |
13 | {% for item in form_config %} 14 | {% if 'label' in item %} 15 | 16 | {% else %} 17 | 18 | {% endif %} 19 | 20 | {% endfor %} 21 | 22 |
23 | 24 |
25 |

Delay:

26 |
27 | 28 | 50 | {% endblock %} 51 | -------------------------------------------------------------------------------- /ch10/web/templates/flight_delays_predict_batch.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Flight Delay Prediction via Spark in Batch 5 | 6 |

7 | 8 | Predicting Flight Delays via Spark in Batch 9 |

10 | 11 | 12 |
13 | {% for item in form_config %} 14 | {% if 'label' in item %} 15 | 16 | {% else %} 17 | 18 | {% endif %} 19 | 20 | {% endfor %} 21 | 22 |
23 | 24 |
25 |

Prediction Request Successful:

26 |
27 | 28 | 49 | {% endblock %} 50 | -------------------------------------------------------------------------------- /ch10/web/templates/flight_delays_predict_batch_results.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Flight Delay Prediction Results via Spark in Batch 5 | 6 |

7 | 8 | Presenting Flight Delay Predictions via Spark in Batch 9 |

10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | {% for item in predictions %} 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 46 | 47 | {% endfor %} 48 | 49 |
Request TimestampCarrierFlight DateOriginDestinationDistanceDeparture DelayPredicted Arrival Delay
{{ item['Timestamp'] }}{{ item['Carrier'] }}{{ item['FlightDate'] }}{{ item['Origin'] }}{{ item['Dest'] }}{{ item['Distance'] }}{{ item['DepDelay'] }} 36 | 37 | {% if item['Prediction'] == 0.0 %} 38 | On Time (0-15 Minute Delay) 39 | {% elif item['Prediction'] == 1.0 %} 40 | Slightly Late (15-60 Minute Delay) 41 | {% elif item['Prediction'] == 2.0 %} 42 | Very Late (60+ Minute Delay) 43 | {% endif %} 44 | 45 |
50 | 51 | {% endblock %} 52 | -------------------------------------------------------------------------------- /ch10/web/templates/flight_delays_predict_kafka.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 4 | / Flight Delay Prediction with Kafka 5 | 6 |

7 | 8 | Predicting Flight Delays with Kafka 9 |

10 | 11 | 12 |
13 | {% for item in form_config %} 14 | {% if 'label' in item %} 15 | 16 | {% else %} 17 | 18 | {% endif %} 19 | 20 | {% endfor %} 21 | 22 |
23 | 24 |
25 |

Delay:

26 |
27 | 28 | 29 | 30 | {% endblock %} 31 | -------------------------------------------------------------------------------- /ch10/web/templates/flights.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | 5 | / Flights 6 | 7 |

{{flight_count}} Flights on {{flight_date}}

8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | {% for flight in flights %} 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | {% endfor %} 32 | 33 |
AirlineFlight NumberOriginDestinationDeparture TimeTail NumberAir TimeDistance
{{flight.Carrier}}{{flight.FlightNum}}{{flight.Origin}}{{flight.Dest}}{{flight.DepTime}}{{flight.TailNum}}{{flight.AirTime}}{{flight.Distance}}
34 |
35 | {% endblock %} 36 | -------------------------------------------------------------------------------- /ch10/web/templates/flights_per_airplane.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | 5 | / Airplanes / {{tail_number}} 6 | 7 |

Flights by Tail Number {{tail_number}}

8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | {% for flight in flights['Flights'] %} 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | {% endfor %} 26 | 27 |
CarrierDateFlight NumberOriginDestination
{{flight[0]}}{{flight[1]}}{{flight[2]}}{{flight[3]}}{{flight[4]}}
28 |
29 | {% endblock %} 30 | -------------------------------------------------------------------------------- /ch10/web/templates/layout.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Agile Data Science 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
18 | 19 | 20 |
21 | 24 | {% block body %}{% endblock %} 25 |
26 | 27 |
28 |
29 | 30 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /ch10/web/templates/macros.jnj: -------------------------------------------------------------------------------- 1 | 2 | {% macro display_nav(offsets, path, count) -%} 3 |
4 | {% for key, values in offsets.items() -%} 5 | {%- if values['bottom_offset'] >= 0 and values['top_offset'] > 0 and count > values['bottom_offset'] -%} 6 | {{ key }} 9 | {% else -%} 10 | {{ key }} 11 | {% endif %} 12 | {% endfor -%} 13 |
14 | {% endmacro -%} 15 | -------------------------------------------------------------------------------- /ch10/web/templates/total_flights.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 |

Total Flights by Month

5 | 6 | 7 | 8 | 9 | 10 | 11 | {% for month in total_flights %} 12 | 13 | 14 | 15 | 16 | {% endfor %} 17 | 18 |
MonthTotal Flights
{{month.Month}}{{month.total_flights}}
19 |
20 | {% endblock %} -------------------------------------------------------------------------------- /ch10/web/templates/total_flights_chart.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 | 16 | 17 |
18 |

Total Flights by Month

19 |
20 |
21 | 22 | 25 | {% endblock %} -------------------------------------------------------------------------------- /ch10/web/templates/weather_station.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
4 | 5 | 10 | 11 |

Weather Station {{profile_obserations.Profile.STATION_NAME}}

12 | 13 |
14 | {% endblock %}
15 |
--------------------------------------------------------------------------------
/dags/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/dags/.exists
--------------------------------------------------------------------------------
/download_weather.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #
4 | # Get weather data
5 | #
6 |
7 | cd data
8 |
9 | # Get the station master list as pipe-separated values
10 | curl -Lko /tmp/wbanmasterlist.psv.zip http://www.ncdc.noaa.gov/homr/file/wbanmasterlist.psv.zip
11 | unzip -o /tmp/wbanmasterlist.psv.zip
12 | gzip wbanmasterlist.psv
13 | rm -f /tmp/wbanmasterlist.psv.zip
14 |
15 | # Get monthly files of daily summaries for all stations
16 | # curl -Lko /tmp/ http://www.ncdc.noaa.gov/orders/qclcd/QCLCD201501.zip
17 | for i in $(seq -w 1 12)
18 | do
19 |   curl -Lko /tmp/QCLCD2015${i}.zip http://www.ncdc.noaa.gov/orders/qclcd/QCLCD2015${i}.zip
20 |   unzip -o /tmp/QCLCD2015${i}.zip
21 |   gzip 2015${i}*.txt
22 |   rm -f /tmp/QCLCD2015${i}.zip
23 | done
24 |
--------------------------------------------------------------------------------
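For orientation, a small Python sketch (an editorial addition, not a repo file) that peeks at the gzipped station master list the script above leaves in data/. The latin-1 encoding is an assumption about the NOAA file; the sketch prints the column names rather than assuming them:

# Peek at data/wbanmasterlist.psv.gz, written by download_weather.sh above.
import csv
import gzip

with gzip.open("data/wbanmasterlist.psv.gz", "rt", encoding="latin-1") as f:
    reader = csv.DictReader(f, delimiter="|")  # pipe-separated with a header row
    first_station = next(reader)
    print(sorted(first_station.keys()))  # inspect the column names
    print(first_station)                 # and the first station record
--------------------------------------------------------------------------------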
/elastic_scripts/create.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | curl -XPUT 'http://localhost:9200/agile_data_science/' -H 'Content-Type: application/json' -d '{
4 |     "settings" : {
5 |         "index" : {
6 |             "number_of_shards" : 1,
7 |             "number_of_replicas" : 1
8 |         }
9 |     }
10 | }'
11 |
12 | curl -XPUT 'http://localhost:9200/agile_data_science_airplanes/' -H 'Content-Type: application/json' -d '{
13 |     "settings" : {
14 |         "index" : {
15 |             "number_of_shards" : 1,
16 |             "number_of_replicas" : 1
17 |         }
18 |     }
19 | }'
20 |
--------------------------------------------------------------------------------
/elastic_scripts/drop.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | curl -XDELETE 'http://localhost:9200/agile_data_science/'
4 |
5 | curl -XDELETE 'http://localhost:9200/agile_data_science_airplanes/'
6 |
--------------------------------------------------------------------------------
/elastic_scripts/query.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | curl -X GET "localhost:9200/agile_data_science/_search?q="
4 |
5 | curl -X GET "localhost:9200/agile_data_science_airplanes/_search?q="
--------------------------------------------------------------------------------
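Once create.sh has run, the index can also be queried from Python. A minimal sketch using the elasticsearch7 client pinned in pyproject.toml below; the "Origin" match field is an illustrative assumption about what has been loaded:

# Query the index created above; the result has the shape that
# predict_utils.process_search() consumes.
from elasticsearch7 import Elasticsearch

es = Elasticsearch("http://localhost:9200")
results = es.search(
    index="agile_data_science",
    body={"query": {"match": {"Origin": "ATL"}}, "size": 10},
)
print(results["hits"]["total"])  # in ES 7 this is a dict: {'value': N, ...}
for hit in results["hits"]["hits"]:
    print(hit["_source"])
--------------------------------------------------------------------------------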
/images/DeepDiscoveryTechnicalLogo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/images/DeepDiscoveryTechnicalLogo.png
--------------------------------------------------------------------------------
/images/airline_page_enriched_wikipedia.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/images/airline_page_enriched_wikipedia.png
--------------------------------------------------------------------------------
/images/airplanes_page_chart_v1_v2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/images/airplanes_page_chart_v1_v2.png
--------------------------------------------------------------------------------
/images/back_end_realtime_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/images/back_end_realtime_architecture.png
--------------------------------------------------------------------------------
/images/climbing_the_pyramid_chapter_intro.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/images/climbing_the_pyramid_chapter_intro.png
--------------------------------------------------------------------------------
/images/data_syndrome_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/images/data_syndrome_logo.png
--------------------------------------------------------------------------------
/images/flight_delay_chart_2.0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/images/flight_delay_chart_2.0.png
--------------------------------------------------------------------------------
/images/front_end_realtime_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/images/front_end_realtime_architecture.png
--------------------------------------------------------------------------------
/images/predicting_flight_kafka_waiting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/images/predicting_flight_kafka_waiting.png
--------------------------------------------------------------------------------
/images/ubuntu_images.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/images/ubuntu_images.png
--------------------------------------------------------------------------------
/images/video_course_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/images/video_course_cover.png
--------------------------------------------------------------------------------
/install/phantomjs.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | sudo apt-get -y update
3 | sudo apt-get -y install build-essential chrpath libssl-dev libxft-dev
4 | sudo apt-get -y install libfreetype6 libfreetype6-dev
5 | sudo apt-get -y install libfontconfig1 libfontconfig1-dev
6 |
7 | cd /home/ubuntu
8 |
9 | export PHANTOM_JS="phantomjs-2.1.1-linux-x86_64"
10 | curl -Lko /tmp/$PHANTOM_JS.tar.bz2 https://github.com/Medium/phantomjs/releases/download/v2.1.1/$PHANTOM_JS.tar.bz2
11 | sudo tar -xvjf /tmp/$PHANTOM_JS.tar.bz2
12 | sudo mv $PHANTOM_JS /usr/local/share
13 | sudo ln -sf /usr/local/share/$PHANTOM_JS/bin/phantomjs /usr/local/bin
14 | phantomjs --version
15 |
--------------------------------------------------------------------------------
/intro_download.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | curl -Lko data/on_time_performance.parquet.tgz https://s3.amazonaws.com/agile_data_science/on_time_performance.parquet.tgz
4 | tar -xvzf data/on_time_performance.parquet.tgz -C data
5 |
--------------------------------------------------------------------------------
/jupyter_notebook_config.py:
--------------------------------------------------------------------------------
1 | c = get_config()
2 |
3 | # Notebook config: this is where you saved your PEM cert
4 | # c.NotebookApp.certfile = u'/home/ubuntu/certs/mycert.pem'
5 | # Run on all IP addresses of your instance
6 | c.NotebookApp.ip = '0.0.0.0'
7 | # Don't open a browser by default
8 | c.NotebookApp.open_browser = False
9 | # Fix the port to 8888
10 | c.NotebookApp.port = 8888
11 | # Disable token authentication
12 | c.NotebookApp.token = ""
13 |
--------------------------------------------------------------------------------
/lib/data/example.csv:
--------------------------------------------------------------------------------
1 | Russell Jurney,Relato,CEO
2 | Florian Liebert,Mesosphere,CEO
3 | Don Brown,Rocana,CIO
4 | Steve Jobs,Apple,CEO
5 | Donald Trump,The Trump Organization,CEO
6 | Russell Jurney,Data Syndrome,Principal Consultant
7 |
--------------------------------------------------------------------------------
/lib/setup_spark.py:
--------------------------------------------------------------------------------
1 | # If there is no SparkSession, create the environment...
2 | # Note that this must be pasted INTO your script; you can't import it, because sc and spark must be created in your script's own namespace.
3 | try:
4 |     sc and spark
5 | except (NameError, UnboundLocalError):
6 |
7 |     import findspark
8 |
9 |     findspark.init()
10 |     import pyspark
11 |     import pyspark.sql
12 |
13 |     sc = pyspark.SparkContext()
14 |     spark = pyspark.sql.SparkSession.builder.appName("Agile Data Science").getOrCreate()
15 |
16 | # continue...
17 |
--------------------------------------------------------------------------------
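One way to try the block above without copying it by hand: exec() the file at module level, which defines sc and spark in the running script's globals, the same effect as pasting. A sketch, assuming it is run from the repository root after intro_download.sh above has fetched the data:

# Paste-in equivalent for a quick test of lib/setup_spark.py
exec(open("lib/setup_spark.py").read())

# The parquet path is the one intro_download.sh unpacks into data/
on_time = spark.read.parquet("data/on_time_performance.parquet")
print(on_time.count())
--------------------------------------------------------------------------------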
/lib/utils.py:
--------------------------------------------------------------------------------
1 | #
2 | # Utility functions to read and write json and jsonl files
3 | #
4 | import bz2
5 | import codecs
6 | import json
7 | import os
8 |
9 | from frozendict import frozendict
10 |
11 |
12 | def write_json_file(obj, path):
13 |     '''Dump an object and write it out as json to a file.'''
14 |     f = codecs.open(path, 'w', 'utf-8')
15 |     f.write(json.dumps(obj, ensure_ascii=False))
16 |     f.close()
17 |
18 |
19 | def write_json_lines_file(ary_of_objects, path):
20 |     '''Dump a list of objects out as a json lines file.'''
21 |     f = codecs.open(path, 'w', 'utf-8')
22 |     for row_object in ary_of_objects:
23 |         json_record = json.dumps(row_object, ensure_ascii=False)
24 |         f.write(json_record + "\n")
25 |     f.close()
26 |
27 |
28 | def read_json_file(path):
29 |     '''Turn a whole-file json document (one object, not one record per line) into an object.'''
30 |     text = codecs.open(path, 'r', 'utf-8').read()
31 |     return json.loads(text)
32 |
33 |
34 | def read_json_lines_bz(path):
35 |     '''Read a JSON Lines bzip2-compressed file'''
36 |     ary = []
37 |     with bz2.open(path, "rt") as bz_file:
38 |         for line in bz_file:
39 |             record = json.loads(line.rstrip("\r\n"))
40 |             ary.append(record)
41 |     return ary
42 |
43 |
44 | def read_json_lines(path):
45 |     '''Read a JSON Lines file'''
46 |     ary = []
47 |     with codecs.open(path, "r", "utf-8") as f:
48 |         for line in f:
49 |             record = json.loads(line.rstrip("\r\n"))
50 |             ary.append(record)
51 |     return ary
52 |
53 |
54 | def read_json_lines_file(path):
55 |     '''Turn a JSON Lines file, or a directory of them, into an array of objects.'''
56 |     ary = []
57 |
58 |     if os.path.isdir(path):
59 |         for (dirpath, dirnames, filenames) in os.walk(path):
60 |             for filename in filenames:
61 |                 full_path = f'{dirpath}/{filename}'
62 |                 if full_path.endswith('json') or full_path.endswith('jsonl'):
63 |                     ary.extend(
64 |                         read_json_lines(full_path)
65 |                     )
66 |                 elif full_path.endswith('bz2'):
67 |                     ary.extend(
68 |                         read_json_lines_bz(full_path)
69 |                     )
70 |     else:
71 |         if path.endswith('bz2'):
72 |             ary.extend(
73 |                 read_json_lines_bz(path)
74 |             )
75 |         else:
76 |             ary.extend(
77 |                 read_json_lines(path)
78 |             )
79 |     return ary
80 |
81 |
82 | class FrozenEncoder(json.JSONEncoder):
83 |     def default(self, obj):
84 |         if isinstance(obj, frozendict):
85 |             return dict(obj)
86 |         if isinstance(obj, frozenset):
87 |             return list(obj)
88 |         # Let the base class default method raise the TypeError
89 |         return json.JSONEncoder.default(self, obj)
--------------------------------------------------------------------------------
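A short round-trip sketch for the helpers above (an editorial addition; the /tmp path is illustrative):

# Write records out as JSON Lines, read them back, and use FrozenEncoder
import json
from frozendict import frozendict
import utils  # lib/utils.py

records = [
    {"name": "Russell Jurney", "company": "Data Syndrome"},
    {"name": "Don Brown", "company": "Rocana"},
]
utils.write_json_lines_file(records, "/tmp/example.jsonl")
assert utils.read_json_lines_file("/tmp/example.jsonl") == records

# FrozenEncoder teaches json.dumps about frozendict and frozenset values
print(json.dumps({"row": frozendict(name="x"), "tags": frozenset(["a"])},
                 cls=utils.FrozenEncoder))
--------------------------------------------------------------------------------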
/logs/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/logs/.exists
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "agile_data_science"
3 | version = "0.1.0"
4 | description = "Code for Agile Data Science 2.0"
5 | authors = ["Russell Jurney"]
6 | license = "MIT"
7 |
8 | [tool.poetry.dependencies]
9 | python = ">=3.7,<3.10"
10 | python-dateutil = "^2.8.2"
11 | Jinja2 = "^3.1.3"
12 | requests = "^2.26.0"
13 | Flask = "^2.0.2"
14 | elasticsearch7 = ">=7.14.0,<7.15.0"
15 | beautifulsoup4 = "^4.10.0"
16 | frozendict = "^2.0.7"
17 | geopy = "^2.2.0"
18 | ipython = "^7.28.0"
19 | confluent-kafka = {extras = ["avro", "json", "protobuf"], version = "^1.7.0"}
20 | matplotlib = "^3.4.3"
21 | seaborn = "^0.11.2"
22 | pymongo = "^3.12.1"
23 | scipy = "^1.7.1"
24 | numpy = "^1.21.3"
25 | selenium = "^4.0.0"
26 | tabulate = "^0.8.9"
27 | tldextract = "^3.1.2"
28 | wikipedia = "^1.4.0"
29 | iso8601 = "^0.1.16"
30 | notebook = "^6.4.5"
31 | WTForms = "^2.3.3"
32 | scikit-learn = "^1.0"
33 | avro = ">= 1.0"
34 |
35 | [tool.poetry.dev-dependencies]
36 |
37 | [build-system]
38 | requires = ["poetry-core>=1.0.0"]
39 | build-backend = "poetry.core.masonry.api"
40 |
--------------------------------------------------------------------------------
/scripts/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code_2/862b4959adeeb0d5d4e6f9bc24452a47b0b9c70a/scripts/.exists
--------------------------------------------------------------------------------