├── .dockerignore ├── stop_gcloud_example.sh ├── run_gcloud_example.sh ├── docs ├── img │ ├── airflow_connection.png │ ├── console_service_account.png │ └── create_service_account.png └── bigquery_github_trends.md ├── examples ├── gcloud-example │ └── dags │ │ ├── support │ │ └── keys │ │ │ └── .gitignore │ │ └── bigquery_github │ │ ├── config │ │ └── variables.json │ │ └── bigquery_github_trends.py └── intro-example │ └── dags │ ├── config │ └── example_variables.json │ ├── example_variables.py │ ├── tutorial.py │ └── example_twitter_dag.py ├── notebooks ├── Dockerfile ├── docker-compose.yml └── gcloud-example │ └── github-trend-analysis.ipynb ├── LICENSE ├── docker-compose.yml ├── docker-compose-gcloud.yml ├── .gitignore └── README.md /.dockerignore: -------------------------------------------------------------------------------- 1 | .git -------------------------------------------------------------------------------- /stop_gcloud_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker-compose -f docker-compose-gcloud.yml down -------------------------------------------------------------------------------- /run_gcloud_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker-compose -f docker-compose-gcloud.yml up --abort-on-container-exit -------------------------------------------------------------------------------- /docs/img/airflow_connection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuanavu/airflow-tutorial/HEAD/docs/img/airflow_connection.png -------------------------------------------------------------------------------- /docs/img/console_service_account.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuanavu/airflow-tutorial/HEAD/docs/img/console_service_account.png -------------------------------------------------------------------------------- /docs/img/create_service_account.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuanavu/airflow-tutorial/HEAD/docs/img/create_service_account.png -------------------------------------------------------------------------------- /examples/gcloud-example/dags/support/keys/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore -------------------------------------------------------------------------------- /examples/intro-example/dags/config/example_variables.json: -------------------------------------------------------------------------------- 1 | { 2 | "example_variables_config": { 3 | "var1": "value1", 4 | "var2": [1, 2, 3], 5 | "var3": { 6 | "k": "value3" 7 | } 8 | } 9 | } -------------------------------------------------------------------------------- /notebooks/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM jupyter/base-notebook 2 | 3 | USER jovyan 4 | 5 | # Install Tensorflow 6 | RUN conda install --quiet --yes \ 7 | 'pandas' \ 8 | 'pandas-gbq' --channel conda-forge 9 | -------------------------------------------------------------------------------- /examples/gcloud-example/dags/bigquery_github/config/variables.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"bigquery_github_trends_variables": { 3 | "bq_conn_id": "my_gcp_conn", 4 | "bq_project": "my_bq_project", 5 | "bq_dataset": "my_bq_dataset" 6 | } 7 | } -------------------------------------------------------------------------------- /notebooks/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | jupyter-notebook: 4 | build: . 5 | image: my-jupyter-notebook 6 | volumes: 7 | - ./gcloud-example:/home/jovyan/work 8 | ports: 9 | - "8889:8888" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Tuan Vu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | postgres: 4 | image: postgres:9.6 5 | environment: 6 | - POSTGRES_USER=airflow 7 | - POSTGRES_PASSWORD=airflow 8 | - POSTGRES_DB=airflow 9 | ports: 10 | - "5432:5432" 11 | 12 | webserver: 13 | image: puckel/docker-airflow:1.10.1 14 | build: 15 | context: https://github.com/puckel/docker-airflow.git#1.10.1 16 | dockerfile: Dockerfile 17 | args: 18 | AIRFLOW_DEPS: gcp_api,s3 19 | PYTHON_DEPS: sqlalchemy==1.2.0 20 | restart: always 21 | depends_on: 22 | - postgres 23 | environment: 24 | - LOAD_EX=n 25 | - EXECUTOR=Local 26 | - FERNET_KEY=jsDPRErfv8Z_eVTnGfF8ywd19j4pyqE3NpdUBA_oRTo= 27 | volumes: 28 | - ./examples/intro-example/dags:/usr/local/airflow/dags 29 | # Uncomment to include custom plugins 30 | # - ./plugins:/usr/local/airflow/plugins 31 | ports: 32 | - "8080:8080" 33 | command: webserver 34 | healthcheck: 35 | test: ["CMD-SHELL", "[ -f /usr/local/airflow/airflow-webserver.pid ]"] 36 | interval: 30s 37 | timeout: 30s 38 | retries: 3 39 | -------------------------------------------------------------------------------- /docker-compose-gcloud.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | postgres: 4 | image: postgres:9.6 5 | environment: 6 | - POSTGRES_USER=airflow 7 | - POSTGRES_PASSWORD=airflow 8 | - POSTGRES_DB=airflow 9 | ports: 10 | - "5432:5432" 11 | 12 | webserver: 13 | image: puckel/docker-airflow:1.10.1 14 | build: 15 | context: https://github.com/puckel/docker-airflow.git#1.10.1 16 | dockerfile: Dockerfile 17 | args: 18 | AIRFLOW_DEPS: gcp_api,s3 19 | PYTHON_DEPS: sqlalchemy==1.2.0 20 | restart: always 21 | depends_on: 22 | - postgres 23 | environment: 24 | - LOAD_EX=n 25 | - EXECUTOR=Local 26 | - FERNET_KEY=jsDPRErfv8Z_eVTnGfF8ywd19j4pyqE3NpdUBA_oRTo= 27 | volumes: 28 | - ./examples/gcloud-example/dags:/usr/local/airflow/dags 29 | # Uncomment to include custom plugins 30 | # - ./plugins:/usr/local/airflow/plugins 31 | ports: 32 | - "8080:8080" 33 | command: webserver 34 | healthcheck: 35 | test: ["CMD-SHELL", "[ -f /usr/local/airflow/airflow-webserver.pid ]"] 36 | interval: 30s 37 | timeout: 30s 38 | retries: 3 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # Mac 107 | .DS_Store 108 | -------------------------------------------------------------------------------- /examples/intro-example/dags/example_variables.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from datetime import datetime 4 | 5 | from airflow import DAG 6 | from airflow.models import Variable 7 | from airflow.operators.dummy_operator import DummyOperator 8 | from airflow.operators.bash_operator import BashOperator 9 | 10 | default_args = { 11 | 'owner': 'airflow', 12 | 'start_date': datetime(2019, 2, 15), 13 | 'end_date': datetime(2019, 2, 15) 14 | } 15 | 16 | dag = DAG('example_variables', 17 | schedule_interval="@once", 18 | default_args=default_args) 19 | 20 | 21 | # # Config variables 22 | # # Common 23 | # var1 = "value1" 24 | # var2 = [1, 2, 3] 25 | # var3 = {'k': 'value3'} 26 | 27 | # # 3 DB connections called 28 | # var1 = Variable.get("var1") 29 | # var2 = Variable.get("var2") 30 | # var3 = Variable.get("var3") 31 | 32 | # ## Recommended way 33 | # dag_config = Variable.get("example_variables_config", deserialize_json=True) 34 | # var1 = dag_config["var1"] 35 | # var2 = dag_config["var2"] 36 | # var3 = dag_config["var3"] 37 | 38 | # start = DummyOperator( 39 | # task_id="start", 40 | # dag=dag 41 | # ) 42 | 43 | # # To test this task, run this command: 44 | # # docker-compose run --rm webserver airflow test example_variables get_dag_config 2019-02-15 45 | # t1 = BashOperator( 46 | # task_id="get_dag_config", 47 | # bash_command='echo "{0}"'.format(dag_config), 48 | # dag=dag, 49 | # ) 50 | 51 | # # You can directly use a variable from a jinja template 52 | # ## {{ var.value. }} 53 | 54 | # t2 = BashOperator( 55 | # task_id="get_variable_value", 56 | # bash_command='echo {{ var.value.var3 }} ', 57 | # dag=dag, 58 | # ) 59 | 60 | # ## {{ var.json. }} 61 | # t3 = BashOperator( 62 | # task_id="get_variable_json", 63 | # bash_command='echo {{ var.json.example_variables_config.var3 }} ', 64 | # dag=dag, 65 | # ) 66 | 67 | # start >> [t1, t2, t3] -------------------------------------------------------------------------------- /docs/bigquery_github_trends.md: -------------------------------------------------------------------------------- 1 | Bigquery Github Trends 2 | --- 3 | 4 | Example for building a data pipeline using Google Cloud BigQuery and Airflow. 
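
The pipeline itself lives in `bigquery_github_trends.py` and chains `BigQueryCheckOperator` tasks (to verify the source data exists for the execution date) with `BigQueryOperator` tasks (to write daily aggregates into date-partitioned tables). Below is a minimal sketch of that pattern, not the full pipeline; the connection ID, project, and dataset are the placeholder values from `variables.json`, and the SQL is a trimmed-down aggregate.

```
from datetime import datetime

from airflow import DAG
from airflow.contrib.operators.bigquery_check_operator import BigQueryCheckOperator
from airflow.contrib.operators.bigquery_operator import BigQueryOperator

dag = DAG(
    'bigquery_pattern_sketch',
    start_date=datetime(2018, 12, 1),
    schedule_interval='@daily',
)

# Fail fast if the source table for the execution date does not exist yet
check_source = BigQueryCheckOperator(
    task_id='check_source',
    sql='''
    SELECT table_id
    FROM `githubarchive.day.__TABLES_SUMMARY__`
    WHERE table_id = "{{ yesterday_ds_nodash }}"
    ''',
    use_legacy_sql=False,
    bigquery_conn_id='my_gcp_conn',
    dag=dag,
)

# Write a daily aggregate into the matching date partition of the destination table
write_daily_metrics = BigQueryOperator(
    task_id='write_daily_metrics',
    sql='''
    SELECT
      FORMAT_TIMESTAMP("%Y%m%d", created_at) AS date,
      repo.name AS repo,
      COUNT(*) AS events
    FROM `githubarchive.day.{{ yesterday_ds_nodash }}`
    GROUP BY date, repo
    ''',
    destination_dataset_table='my_bq_project.my_bq_dataset.github_daily_metrics${{ yesterday_ds_nodash }}',
    write_disposition='WRITE_TRUNCATE',
    use_legacy_sql=False,
    bigquery_conn_id='my_gcp_conn',
    dag=dag,
)

check_source >> write_daily_metrics
```

The real DAG adds a second check task for the Hacker News data plus several aggregation and join steps on top of this skeleton.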
5 | 
6 | ## Setup
7 | 
8 | This example assumes you have an Airflow environment up and running. Here is
9 | a quick rundown of what you need:
10 | 
11 | * Running Airflow
12 | * Create a service account (Cloud Console)
13 | * Set up a Google Cloud connection in Airflow
14 | * Enter the config variables
15 | 
16 | ### Running Airflow
17 | 
18 | - Check out the master branch of this tutorial
19 | - Start the Airflow environment with Docker
20 | 
21 | ```
22 | bash run_gcloud_example.sh
23 | ```
24 | 
25 | - Stop the Airflow environment when you are finished
26 | 
27 | ```
28 | bash stop_gcloud_example.sh
29 | ```
30 | 
31 | ### Google Cloud Service Key
32 | 
33 | Go to the console:
34 | 
35 | ![console](img/console_service_account.png?raw=true)
36 | 
37 | Create the service account and make sure it has Editor rights, then download the JSON private key:
38 | 
39 | ![console](img/create_service_account.png?raw=true)
40 | 
41 | The service account also needs permission to access the GCS bucket and the BigQuery dataset.
42 | 
43 | ### Airflow Connection
44 | 
45 | Once you have the GCP key, create a connection in `Admin -> Connections` using that key.
46 | 
47 | In Airflow, define a connection named *my_gcp_conn* that points to your project:
48 | 
49 | ![console](img/airflow_connection.png?raw=true)
50 | 
51 | Supply the path to the downloaded private key, supply the *project_id*, and define the
52 | minimum scope of *https://www.googleapis.com/auth/cloud-platform*.
53 | 
54 | ### Enter the config variables
55 | 
56 | After the connection has been set up, go to the [bigquery_github_trends DAG](../../gcloud-example/bigquery_github/bigquery_github_trends.py) and enter the values of the config variables:
57 | - __BQ_PROJECT__: the BigQuery project you are working on
58 | - __BQ_DATASET__: the BigQuery dataset you are working on
59 | 
60 | ### Test the DAG
61 | 
62 | After the connection and config variables have been set up, you can test and run your DAG.
63 | 64 | - Using the command below to test specific task in the DAG: 65 | 66 | ``` 67 | docker-compose -f docker-compose-gcloud.yml run --rm webserver airflow test [DAG_ID] [TASK_ID] [EXECUTION_DATE] 68 | ``` 69 | 70 | - Examples: 71 | 72 | ``` 73 | # Task 1 74 | docker-compose -f docker-compose-gcloud.yml run --rm webserver airflow test bigquery_github_trends bq_check_githubarchive_day 2018-12-01 75 | 76 | # Task 2 77 | docker-compose -f docker-compose-gcloud.yml run --rm webserver airflow test bigquery_github_trends bq_check_hackernews_full 2018-12-01 78 | ``` 79 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Airflow tutorial 2 | --- 3 | 4 | This is the code for [Apache Airflow Tutorials](https://www.youtube.com/playlist?list=PLYizQ5FvN6pvIOcOd6dFZu3lQqc6zBGp2) playlist by Tuan Vu on Youtube 5 | 6 | ## Contents 7 | 8 | | Part | Title | Git Tag | 9 | |------|---------------------------|---------| 10 | | 1 | [Introduction to Apache Airflow](https://youtu.be/AHMm1wfGuHE) ([blog post](https://www.applydatascience.com/airflow/airflow-tutorial-introduction/)) | [v0.1](https://github.com/tuanavu/airflow-tutorial/tree/v0.1) | 11 | | 2 | [Set up airflow environment with docker](https://youtu.be/vvr_WNzEXBE) ([blog post](https://www.applydatascience.com/airflow/set-up-airflow-env-with-docker/)) | [v0.2](https://github.com/tuanavu/airflow-tutorial/tree/v0.2) | 12 | | 3 | [Set up airflow environment using Google Cloud Composer](https://youtu.be/ld6JO3MiuPQ) ([blog post](https://www.applydatascience.com/airflow/set-up-airflow-with-google-composer/)) | N/A | 13 | | 4 | [Writing your first pipeline](https://youtu.be/43wHwwZhJMo) ([blog post](https://www.applydatascience.com/airflow/writing-your-first-pipeline/)) | N/A | 14 | | 5 | [Airflow concept](https://youtu.be/4rQSa2zEWfw) ([blog post](https://www.applydatascience.com/airflow/airflow-concept/)) | N/A | 15 | | 6 | [Build a data pipeline using Google Cloud Bigquery](https://youtu.be/wAyu5BN3VpY) ([blog post](https://www.applydatascience.com/airflow/bigquery-pipeline-airflow/)) | [v0.6](https://github.com/tuanavu/airflow-tutorial/tree/v0.6) | 16 | | 7 | [Airflow variables](https://youtu.be/bHQ7nzn0j6k) ([blog post](https://www.applydatascience.com/airflow/airflow-variables/)) | [v0.7](https://github.com/tuanavu/airflow-tutorial/tree/v0.7) | 17 | 18 | 19 | ## Getting Started 20 | 21 | These instructions will get you a copy of the project up and running on your local machine for development and testing purposes. 22 | 23 | - Clone this repo 24 | - Install the prerequisites 25 | - Run the service 26 | - Check http://localhost:8080 27 | - Done! 
:tada:
28 | 
29 | ### Prerequisites
30 | 
31 | - Install [Docker](https://www.docker.com/)
32 | - Install [Docker Compose](https://docs.docker.com/compose/install/)
33 | - Follow the Airflow releases on the [Python Package Index](https://pypi.python.org/pypi/apache-airflow)
34 | 
35 | ### Usage
36 | 
37 | Run the web service with Docker
38 | 
39 | ```
40 | docker-compose up -d
41 | 
42 | # Build the image
43 | # docker-compose up -d --build
44 | ```
45 | 
46 | Check http://localhost:8080/
47 | 
48 | - `docker-compose logs` - Displays log output
49 | - `docker-compose ps` - List containers
50 | - `docker-compose down` - Stop containers
51 | 
52 | ## Other commands
53 | 
54 | If you want to run Airflow sub-commands, you can do so like this:
55 | 
56 | - `docker-compose run --rm webserver airflow list_dags` - List DAGs
57 | - `docker-compose run --rm webserver airflow test [DAG_ID] [TASK_ID] [EXECUTION_DATE]` - Test a specific task
58 | 
59 | If you want to run or test a Python script, you can do so like this:
60 | - `docker-compose run --rm webserver python /usr/local/airflow/dags/[PYTHON-FILE].py` - Test a Python script
61 | 
62 | ## Connect to database
63 | 
64 | If you want to use the Ad Hoc Query feature, make sure you've configured the connection:
65 | go to Admin -> Connections, edit "postgres_default", and set these values:
66 | - Host : postgres
67 | - Schema : airflow
68 | - Login : airflow
69 | - Password : airflow
70 | 
71 | 
72 | ## Credits
73 | 
74 | - [Apache Airflow](https://github.com/apache/incubator-airflow)
75 | - [docker-airflow](https://github.com/puckel/docker-airflow/tree/1.10.0-5)
76 | 
-------------------------------------------------------------------------------- /examples/intro-example/dags/tutorial.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements.  See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership.  The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License.  You may obtain a copy of the License at
10 | #
11 | #   http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing,
14 | # software distributed under the License is distributed on an
15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 | # KIND, either express or implied.  See the License for the
17 | # specific language governing permissions and limitations
18 | # under the License.
19 | 20 | """ 21 | ### Tutorial Documentation 22 | Documentation that goes along with the Airflow tutorial located 23 | [here](https://airflow.incubator.apache.org/tutorial.html) 24 | """ 25 | from datetime import timedelta 26 | 27 | import airflow 28 | from airflow import DAG 29 | from airflow.operators.bash_operator import BashOperator 30 | 31 | # These args will get passed on to each operator 32 | # You can override them on a per-task basis during operator initialization 33 | default_args = { 34 | 'owner': 'airflow', 35 | 'depends_on_past': False, 36 | 'start_date': airflow.utils.dates.days_ago(2), 37 | 'email': ['airflow@example.com'], 38 | 'email_on_failure': False, 39 | 'email_on_retry': False, 40 | 'retries': 1, 41 | 'retry_delay': timedelta(minutes=5), 42 | # 'queue': 'bash_queue', 43 | # 'pool': 'backfill', 44 | # 'priority_weight': 10, 45 | # 'end_date': datetime(2016, 1, 1), 46 | # 'wait_for_downstream': False, 47 | # 'dag': dag, 48 | # 'adhoc':False, 49 | # 'sla': timedelta(hours=2), 50 | # 'execution_timeout': timedelta(seconds=300), 51 | # 'on_failure_callback': some_function, 52 | # 'on_success_callback': some_other_function, 53 | # 'on_retry_callback': another_function, 54 | # 'trigger_rule': u'all_success' 55 | } 56 | 57 | dag = DAG( 58 | 'tutorial', 59 | default_args=default_args, 60 | description='A simple tutorial DAG', 61 | schedule_interval=timedelta(days=1), 62 | ) 63 | 64 | # t1, t2 and t3 are examples of tasks created by instantiating operators 65 | t1 = BashOperator( 66 | task_id='print_date', 67 | bash_command='date', 68 | dag=dag, 69 | ) 70 | 71 | t1.doc_md = """\ 72 | #### Task Documentation 73 | You can document your task using the attributes `doc_md` (markdown), 74 | `doc` (plain text), `doc_rst`, `doc_json`, `doc_yaml` which gets 75 | rendered in the UI's Task Instance Details page. 76 | ![img](http://montcs.bloomu.edu/~bobmon/Semesters/2012-01/491/import%20soul.png) 77 | """ 78 | 79 | dag.doc_md = __doc__ 80 | 81 | t2 = BashOperator( 82 | task_id='sleep', 83 | depends_on_past=False, 84 | bash_command='sleep 5', 85 | dag=dag, 86 | ) 87 | 88 | templated_command = """ 89 | {% for i in range(5) %} 90 | echo "{{ ds }}" 91 | echo "{{ macros.ds_add(ds, 7)}}" 92 | echo "{{ params.my_param }}" 93 | {% endfor %} 94 | """ 95 | 96 | t3 = BashOperator( 97 | task_id='templated', 98 | depends_on_past=False, 99 | bash_command=templated_command, 100 | params={'my_param': 'Parameter I passed in'}, 101 | dag=dag, 102 | ) 103 | 104 | t1 >> [t2, t3] -------------------------------------------------------------------------------- /examples/intro-example/dags/example_twitter_dag.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. 
See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # -------------------------------------------------------------------------------- 20 | # Written By: Ekhtiar Syed 21 | # Last Update: 8th April 2016 22 | # Caveat: This Dag will not run because of missing scripts. 23 | # The purpose of this is to give you a sample of a real world example DAG! 24 | # -------------------------------------------------------------------------------- 25 | 26 | # -------------------------------------------------------------------------------- 27 | # Load The Dependencies 28 | # -------------------------------------------------------------------------------- 29 | 30 | import airflow 31 | from airflow import DAG 32 | from airflow.operators.bash_operator import BashOperator 33 | from airflow.operators.python_operator import PythonOperator 34 | from airflow.operators.hive_operator import HiveOperator 35 | from datetime import date, timedelta 36 | 37 | # -------------------------------------------------------------------------------- 38 | # Create a few placeholder scripts. In practice these would be different python 39 | # script files, which are imported in this section with absolute or relative imports 40 | # -------------------------------------------------------------------------------- 41 | 42 | 43 | def fetchtweets(): 44 | return None 45 | 46 | 47 | def cleantweets(): 48 | return None 49 | 50 | 51 | def analyzetweets(): 52 | return None 53 | 54 | 55 | def transfertodb(): 56 | return None 57 | 58 | 59 | # -------------------------------------------------------------------------------- 60 | # set default arguments 61 | # -------------------------------------------------------------------------------- 62 | 63 | default_args = { 64 | 'owner': 'airflow', 65 | 'depends_on_past': False, 66 | 'start_date': airflow.utils.dates.days_ago(2), 67 | 'email': ['airflow@example.com'], 68 | 'email_on_failure': False, 69 | 'email_on_retry': False, 70 | 'retries': 1, 71 | 'retry_delay': timedelta(minutes=5), 72 | # 'queue': 'bash_queue', 73 | # 'pool': 'backfill', 74 | # 'priority_weight': 10, 75 | # 'end_date': datetime(2016, 1, 1), 76 | } 77 | 78 | dag = DAG( 79 | 'example_twitter_dag', default_args=default_args, 80 | schedule_interval="@daily") 81 | 82 | # -------------------------------------------------------------------------------- 83 | # This task should call Twitter API and retrieve tweets from yesterday from and to 84 | # for the four twitter users (Twitter_A,..,Twitter_D) There should be eight csv 85 | # output files generated by this task and naming convention 86 | # is direction(from or to)_twitterHandle_date.csv 87 | # -------------------------------------------------------------------------------- 88 | 89 | fetch_tweets = PythonOperator( 90 | task_id='fetch_tweets', 91 | python_callable=fetchtweets, 92 | dag=dag) 93 | 94 | # -------------------------------------------------------------------------------- 95 | # Clean the eight files. In this step you can get rid of or cherry pick columns 96 | # and different parts of the text 97 | # -------------------------------------------------------------------------------- 98 | 99 | clean_tweets = PythonOperator( 100 | task_id='clean_tweets', 101 | python_callable=cleantweets, 102 | dag=dag) 103 | 104 | clean_tweets.set_upstream(fetch_tweets) 105 | 106 | # -------------------------------------------------------------------------------- 107 | # In this section you can use a script to analyze the twitter data. 
Could simply 108 | # be a sentiment analysis through algorithms like bag of words or something more 109 | # complicated. You can also take a look at Web Services to do such tasks 110 | # -------------------------------------------------------------------------------- 111 | 112 | analyze_tweets = PythonOperator( 113 | task_id='analyze_tweets', 114 | python_callable=analyzetweets, 115 | dag=dag) 116 | 117 | analyze_tweets.set_upstream(clean_tweets) 118 | 119 | # -------------------------------------------------------------------------------- 120 | # Although this is the last task, we need to declare it before the next tasks as we 121 | # will use set_downstream This task will extract summary from Hive data and store 122 | # it to MySQL 123 | # -------------------------------------------------------------------------------- 124 | 125 | hive_to_mysql = PythonOperator( 126 | task_id='hive_to_mysql', 127 | python_callable=transfertodb, 128 | dag=dag) 129 | 130 | # -------------------------------------------------------------------------------- 131 | # The following tasks are generated using for loop. The first task puts the eight 132 | # csv files to HDFS. The second task loads these files from HDFS to respected Hive 133 | # tables. These two for loops could be combined into one loop. However, in most cases, 134 | # you will be running different analysis on your incoming incoming and outgoing tweets, 135 | # and hence they are kept separated in this example. 136 | # -------------------------------------------------------------------------------- 137 | 138 | from_channels = ['fromTwitter_A', 'fromTwitter_B', 'fromTwitter_C', 'fromTwitter_D'] 139 | to_channels = ['toTwitter_A', 'toTwitter_B', 'toTwitter_C', 'toTwitter_D'] 140 | yesterday = date.today() - timedelta(days=1) 141 | dt = yesterday.strftime("%Y-%m-%d") 142 | # define where you want to store the tweets csv file in your local directory 143 | local_dir = "/tmp/" 144 | # define the location where you want to store in HDFS 145 | hdfs_dir = " /tmp/" 146 | 147 | for channel in to_channels: 148 | 149 | file_name = "to_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv" 150 | 151 | load_to_hdfs = BashOperator( 152 | task_id="put_" + channel + "_to_hdfs", 153 | bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " + 154 | local_dir + file_name + 155 | hdfs_dir + channel + "/", 156 | dag=dag) 157 | 158 | load_to_hdfs.set_upstream(analyze_tweets) 159 | 160 | load_to_hive = HiveOperator( 161 | task_id="load_" + channel + "_to_hive", 162 | hql="LOAD DATA INPATH '" + 163 | hdfs_dir + channel + "/" + file_name + "' " 164 | "INTO TABLE " + channel + " " 165 | "PARTITION(dt='" + dt + "')", 166 | dag=dag) 167 | load_to_hive.set_upstream(load_to_hdfs) 168 | load_to_hive.set_downstream(hive_to_mysql) 169 | 170 | for channel in from_channels: 171 | file_name = "from_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv" 172 | load_to_hdfs = BashOperator( 173 | task_id="put_" + channel + "_to_hdfs", 174 | bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " + 175 | local_dir + file_name + 176 | hdfs_dir + channel + "/", 177 | dag=dag) 178 | 179 | load_to_hdfs.set_upstream(analyze_tweets) 180 | 181 | load_to_hive = HiveOperator( 182 | task_id="load_" + channel + "_to_hive", 183 | hql="LOAD DATA INPATH '" + 184 | hdfs_dir + channel + "/" + file_name + "' " 185 | "INTO TABLE " + channel + " " 186 | "PARTITION(dt='" + dt + "')", 187 | dag=dag) 188 | 189 | load_to_hive.set_upstream(load_to_hdfs) 190 | load_to_hive.set_downstream(hive_to_mysql) 
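# --------------------------------------------------------------------------------
# Note: as mentioned above, the two loops could be combined into a single loop
# over both channel lists. A sketch of that variant is kept as a comment so the
# tasks defined above are not registered twice:
#
# for channel in to_channels + from_channels:
#     prefix = "to_" if channel in to_channels else "from_"
#     file_name = prefix + channel + "_" + dt + ".csv"
#     ...  # build load_to_hdfs / load_to_hive exactly as above
# --------------------------------------------------------------------------------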
-------------------------------------------------------------------------------- /examples/gcloud-example/dags/bigquery_github/bigquery_github_trends.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import timedelta, datetime 3 | 4 | from airflow import DAG 5 | from airflow.models import Variable 6 | from airflow.contrib.operators.bigquery_operator import BigQueryOperator 7 | from airflow.contrib.operators.bigquery_check_operator import BigQueryCheckOperator 8 | 9 | 10 | # Config variables 11 | dag_config = Variable.get("bigquery_github_trends_variables", deserialize_json=True) 12 | BQ_CONN_ID = dag_config["bq_conn_id"] 13 | BQ_PROJECT = dag_config["bq_project"] 14 | BQ_DATASET = dag_config["bq_dataset"] 15 | 16 | default_args = { 17 | 'owner': 'airflow', 18 | 'depends_on_past': True, 19 | 'start_date': datetime(2018, 12, 1), 20 | 'end_date': datetime(2018, 12, 5), 21 | 'email': ['airflow@airflow.com'], 22 | 'email_on_failure': True, 23 | 'email_on_retry': False, 24 | 'retries': 2, 25 | 'retry_delay': timedelta(minutes=5), 26 | } 27 | 28 | # Set Schedule: Run pipeline once a day. 29 | # Use cron to define exact time. Eg. 8:15am would be "15 08 * * *" 30 | schedule_interval = "00 21 * * *" 31 | 32 | # Define DAG: Set ID and assign default args and schedule interval 33 | dag = DAG( 34 | 'bigquery_github_trends', 35 | default_args=default_args, 36 | schedule_interval=schedule_interval 37 | ) 38 | 39 | ## Task 1: check that the github archive data has a dated table created for that date 40 | # To test this task, run this command: 41 | # docker-compose -f docker-compose-gcloud.yml run --rm webserver airflow test bigquery_github_trends bq_check_githubarchive_day 2018-12-01 42 | t1 = BigQueryCheckOperator( 43 | task_id='bq_check_githubarchive_day', 44 | sql=''' 45 | #standardSQL 46 | SELECT 47 | table_id 48 | FROM 49 | `githubarchive.day.__TABLES_SUMMARY__` 50 | WHERE 51 | table_id = "{{ yesterday_ds_nodash }}" 52 | ''', 53 | use_legacy_sql=False, 54 | bigquery_conn_id=BQ_CONN_ID, 55 | dag=dag 56 | ) 57 | 58 | ## Task 2: check that the hacker news table contains data for that date. 
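# To test this task, run this command:
# docker-compose -f docker-compose-gcloud.yml run --rm webserver airflow test bigquery_github_trends bq_check_hackernews_full 2018-12-01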
59 | t2 = BigQueryCheckOperator( 60 | task_id='bq_check_hackernews_full', 61 | sql=''' 62 | #standardSQL 63 | SELECT 64 | FORMAT_TIMESTAMP("%Y%m%d", timestamp ) AS date 65 | FROM 66 | `bigquery-public-data.hacker_news.full` 67 | WHERE 68 | type = 'story' 69 | AND FORMAT_TIMESTAMP("%Y%m%d", timestamp ) = "{{ yesterday_ds_nodash }}" 70 | LIMIT 71 | 1 72 | ''', 73 | use_legacy_sql=False, 74 | bigquery_conn_id=BQ_CONN_ID, 75 | dag=dag 76 | ) 77 | 78 | ## Task 3: create a github daily metrics partition table 79 | t3 = BigQueryOperator( 80 | task_id='bq_write_to_github_daily_metrics', 81 | sql=''' 82 | #standardSQL 83 | SELECT 84 | date, 85 | repo, 86 | SUM(IF(type='WatchEvent', 1, NULL)) AS stars, 87 | SUM(IF(type='ForkEvent', 1, NULL)) AS forks 88 | FROM ( 89 | SELECT 90 | FORMAT_TIMESTAMP("%Y%m%d", created_at) AS date, 91 | actor.id as actor_id, 92 | repo.name as repo, 93 | type 94 | FROM 95 | `githubarchive.day.{{ yesterday_ds_nodash }}` 96 | WHERE type IN ('WatchEvent','ForkEvent') 97 | ) 98 | GROUP BY 99 | date, 100 | repo 101 | ''', 102 | destination_dataset_table='{0}.{1}.github_daily_metrics${2}'.format( 103 | BQ_PROJECT, BQ_DATASET, '{{ yesterday_ds_nodash }}' 104 | ), 105 | write_disposition='WRITE_TRUNCATE', 106 | allow_large_results=True, 107 | use_legacy_sql=False, 108 | bigquery_conn_id=BQ_CONN_ID, 109 | dag=dag 110 | ) 111 | 112 | ## Task 4: aggregate past github events to daily partition table 113 | t4 = BigQueryOperator( 114 | task_id='bq_write_to_github_agg', 115 | sql=''' 116 | #standardSQL 117 | SELECT 118 | "{2}" as date, 119 | repo, 120 | SUM(stars) as stars_last_28_days, 121 | SUM(IF(_PARTITIONTIME BETWEEN TIMESTAMP("{4}") 122 | AND TIMESTAMP("{3}") , 123 | stars, null)) as stars_last_7_days, 124 | SUM(IF(_PARTITIONTIME BETWEEN TIMESTAMP("{3}") 125 | AND TIMESTAMP("{3}") , 126 | stars, null)) as stars_last_1_day, 127 | SUM(forks) as forks_last_28_days, 128 | SUM(IF(_PARTITIONTIME BETWEEN TIMESTAMP("{4}") 129 | AND TIMESTAMP("{3}") , 130 | forks, null)) as forks_last_7_days, 131 | SUM(IF(_PARTITIONTIME BETWEEN TIMESTAMP("{3}") 132 | AND TIMESTAMP("{3}") , 133 | forks, null)) as forks_last_1_day 134 | FROM 135 | `{0}.{1}.github_daily_metrics` 136 | WHERE _PARTITIONTIME BETWEEN TIMESTAMP("{5}") 137 | AND TIMESTAMP("{3}") 138 | GROUP BY 139 | date, 140 | repo 141 | '''.format(BQ_PROJECT, BQ_DATASET, 142 | "{{ yesterday_ds_nodash }}", "{{ yesterday_ds }}", 143 | "{{ macros.ds_add(ds, -6) }}", 144 | "{{ macros.ds_add(ds, -27) }}" 145 | ) 146 | , 147 | destination_dataset_table='{0}.{1}.github_agg${2}'.format( 148 | BQ_PROJECT, BQ_DATASET, '{{ yesterday_ds_nodash }}' 149 | ), 150 | write_disposition='WRITE_TRUNCATE', 151 | allow_large_results=True, 152 | use_legacy_sql=False, 153 | bigquery_conn_id=BQ_CONN_ID, 154 | dag=dag 155 | ) 156 | 157 | # Task 5: aggregate hacker news data to a daily partition table 158 | t5 = BigQueryOperator( 159 | task_id='bq_write_to_hackernews_agg', 160 | sql=''' 161 | #standardSQL 162 | SELECT 163 | FORMAT_TIMESTAMP("%Y%m%d", timestamp) AS date, 164 | `by` AS submitter, 165 | id as story_id, 166 | REGEXP_EXTRACT(url, "(https?://github.com/[^/]*/[^/#?]*)") as url, 167 | SUM(score) as score 168 | FROM 169 | `bigquery-public-data.hacker_news.full` 170 | WHERE 171 | type = 'story' 172 | AND timestamp>'{{ yesterday_ds }}' 173 | AND timestamp<'{{ ds }}' 174 | AND url LIKE '%https://github.com%' 175 | AND url NOT LIKE '%github.com/blog/%' 176 | GROUP BY 177 | date, 178 | submitter, 179 | story_id, 180 | url 181 | ''', 182 | 
destination_dataset_table='{0}.{1}.hackernews_agg${2}'.format( 183 | BQ_PROJECT, BQ_DATASET, '{{ yesterday_ds_nodash }}' 184 | ), 185 | write_disposition='WRITE_TRUNCATE', 186 | allow_large_results=True, 187 | use_legacy_sql=False, 188 | bigquery_conn_id=BQ_CONN_ID, 189 | dag=dag 190 | ) 191 | 192 | # Task 6: join the aggregate tables 193 | t6 = BigQueryOperator( 194 | task_id='bq_write_to_hackernews_github_agg', 195 | sql=''' 196 | #standardSQL 197 | SELECT 198 | a.date as date, 199 | a.url as github_url, 200 | b.repo as github_repo, 201 | a.score as hn_score, 202 | a.story_id as hn_story_id, 203 | b.stars_last_28_days as stars_last_28_days, 204 | b.stars_last_7_days as stars_last_7_days, 205 | b.stars_last_1_day as stars_last_1_day, 206 | b.forks_last_28_days as forks_last_28_days, 207 | b.forks_last_7_days as forks_last_7_days, 208 | b.forks_last_1_day as forks_last_1_day 209 | FROM 210 | (SELECT 211 | * 212 | FROM 213 | `{0}.{1}.hackernews_agg` 214 | WHERE _PARTITIONTIME BETWEEN TIMESTAMP("{2}") AND TIMESTAMP("{2}") 215 | )as a 216 | LEFT JOIN 217 | ( 218 | SELECT 219 | repo, 220 | CONCAT('https://github.com/', repo) as url, 221 | stars_last_28_days, 222 | stars_last_7_days, 223 | stars_last_1_day, 224 | forks_last_28_days, 225 | forks_last_7_days, 226 | forks_last_1_day 227 | FROM 228 | `{0}.{1}.github_agg` 229 | WHERE _PARTITIONTIME BETWEEN TIMESTAMP("{2}") AND TIMESTAMP("{2}") 230 | ) as b 231 | ON a.url = b.url 232 | '''.format( 233 | BQ_PROJECT, BQ_DATASET, "{{ yesterday_ds }}" 234 | ), 235 | destination_dataset_table='{0}.{1}.hackernews_github_agg${2}'.format( 236 | BQ_PROJECT, BQ_DATASET, '{{ yesterday_ds_nodash }}' 237 | ), 238 | write_disposition='WRITE_TRUNCATE', 239 | allow_large_results=True, 240 | use_legacy_sql=False, 241 | bigquery_conn_id=BQ_CONN_ID, 242 | dag=dag 243 | ) 244 | 245 | # Task 7: Check if partition data is written successfully 246 | t7 = BigQueryCheckOperator( 247 | task_id='bq_check_hackernews_github_agg', 248 | sql=''' 249 | #standardSQL 250 | SELECT 251 | COUNT(*) AS rows_in_partition 252 | FROM `{0}.{1}.hackernews_github_agg` 253 | WHERE _PARTITIONDATE = "{2}" 254 | '''.format(BQ_PROJECT, BQ_DATASET, '{{ yesterday_ds }}' 255 | ), 256 | use_legacy_sql=False, 257 | bigquery_conn_id=BQ_CONN_ID, 258 | dag=dag) 259 | 260 | # Setting up Dependencies 261 | t3.set_upstream(t1) 262 | t4.set_upstream(t3) 263 | t5.set_upstream(t2) 264 | t6.set_upstream(t4) 265 | t6.set_upstream(t5) 266 | t7.set_upstream(t6) 267 | -------------------------------------------------------------------------------- /notebooks/gcloud-example/github-trend-analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# GitHub on Hacker News trends analysis" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 2, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from __future__ import print_function\n", 17 | "import pandas as pd" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Input parameters" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 9, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "project_id = \"your-project-id\"\n", 34 | "process_date = \"2018-12-01\"\n", 35 | "process_date_nodash = \"20181201\"" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## Exploratory Data Analysis" 43 | ] 
44 |   },
45 |   {
46 |    "cell_type": "markdown",
47 |    "metadata": {},
48 |    "source": [
49 |     "## GitHub activity data\n",
50 |     "- Link: [Data](https://bigquery.cloud.google.com/table/githubarchive:day.20181230) - [More info](https://blog.github.com/2017-01-19-github-data-ready-for-you-to-explore-with-bigquery/)"
51 |    ]
52 |   },
53 |   {
54 |    "cell_type": "markdown",
55 |    "metadata": {},
56 |    "source": [
57 |     "### Different event types in GitHub activity\n",
58 |     "- [Event Types & Payloads](https://developer.github.com/v3/activity/events/types/) explanation"
59 |    ]
60 |   },
61 |   {
62 |    "cell_type": "code",
63 |    "execution_count": 6,
64 |    "metadata": {},
65 |    "outputs": [
66 |     {
67 |      "name": "stdout",
68 |      "output_type": "stream",
69 |      "text": [
70 |       "\n",
71 |       "SELECT \n",
72 |       "  type,\n",
73 |       "  COUNT(*) AS cnt\n",
74 |       "FROM `githubarchive.day.20181201` \n",
75 |       "GROUP BY 1\n",
76 |       "ORDER BY 2 DESC\n",
77 |       "\n"
78 |      ]
79 |     },
80 |     {
81 |      "data": {
82 |       "text/html": [
83 |       "
[DataFrame rendered as an HTML table; identical data appears in the text/plain output below]
" 179 | ], 180 | "text/plain": [ 181 | " type cnt\n", 182 | "0 PushEvent 588724\n", 183 | "1 CreateEvent 155010\n", 184 | "2 WatchEvent 67607\n", 185 | "3 PullRequestEvent 56635\n", 186 | "4 IssueCommentEvent 46972\n", 187 | "5 IssuesEvent 27592\n", 188 | "6 ForkEvent 24331\n", 189 | "7 DeleteEvent 22590\n", 190 | "8 PullRequestReviewCommentEvent 9756\n", 191 | "9 MemberEvent 5201\n", 192 | "10 GollumEvent 4445\n", 193 | "11 ReleaseEvent 3527\n", 194 | "12 CommitCommentEvent 1759\n", 195 | "13 PublicEvent 1064" 196 | ] 197 | }, 198 | "execution_count": 6, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "query = \"\"\"\n", 205 | "SELECT \n", 206 | " type,\n", 207 | " COUNT(*) AS cnt\n", 208 | "FROM `githubarchive.day.{0}` \n", 209 | "GROUP BY 1\n", 210 | "ORDER BY 2 DESC\n", 211 | "\"\"\".format(process_date_nodash)\n", 212 | "\n", 213 | "print (query)\n", 214 | "\n", 215 | "df = pd.read_gbq(query, project_id=project_id, dialect='standard')\n", 216 | "df.head(20)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "### Top 10 repos with the most comments in their issues\n", 224 | "- __IssueCommentEvent__: Triggered when an issue comment is created, edited, or deleted." 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 18, 230 | "metadata": {}, 231 | "outputs": [ 232 | { 233 | "name": "stdout", 234 | "output_type": "stream", 235 | "text": [ 236 | "\n", 237 | "SELECT \n", 238 | " repo.name,\n", 239 | " COUNT(*) AS cnt\n", 240 | "FROM `githubarchive.day.20181201`\n", 241 | "WHERE type IN ( 'IssueCommentEvent')\n", 242 | "GROUP BY 1\n", 243 | "ORDER BY 2 DESC\n", 244 | "LIMIT 10\n", 245 | "\n" 246 | ] 247 | }, 248 | { 249 | "data": { 250 | "text/html": [ 251 | "
[DataFrame rendered as an HTML table; identical data appears in the text/plain output below]
" 327 | ], 328 | "text/plain": [ 329 | " name cnt\n", 330 | "0 google-test/signcla-probe-repo 327\n", 331 | "1 Azure/azure-rest-api-specs 287\n", 332 | "2 kubernetes/kubernetes 227\n", 333 | "3 rust-lang/rust 207\n", 334 | "4 apache/spark 204\n", 335 | "5 freeCodeCamp/freeCodeCamp 196\n", 336 | "6 everypolitician/everypolitician-data 192\n", 337 | "7 TeamNewPipe/NewPipe 158\n", 338 | "8 openshift/origin 140\n", 339 | "9 NixOS/nixpkgs 126" 340 | ] 341 | }, 342 | "execution_count": 18, 343 | "metadata": {}, 344 | "output_type": "execute_result" 345 | } 346 | ], 347 | "source": [ 348 | "query = \"\"\"\n", 349 | "SELECT \n", 350 | " repo.name,\n", 351 | " COUNT(*) AS cnt\n", 352 | "FROM `githubarchive.day.{0}`\n", 353 | "WHERE type IN ( 'IssueCommentEvent')\n", 354 | "GROUP BY 1\n", 355 | "ORDER BY 2 DESC\n", 356 | "LIMIT 10\n", 357 | "\"\"\".format(process_date_nodash)\n", 358 | "\n", 359 | "print (query)\n", 360 | "\n", 361 | "df = pd.read_gbq(query, project_id=project_id, dialect='standard')\n", 362 | "df.head(20)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "### Top 10 repos by stars and fork event" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 8, 375 | "metadata": {}, 376 | "outputs": [ 377 | { 378 | "name": "stdout", 379 | "output_type": "stream", 380 | "text": [ 381 | "\n", 382 | "SELECT \n", 383 | " repo.name,\n", 384 | " SUM(IF(type='WatchEvent', 1, NULL)) AS stars,\n", 385 | " SUM(IF(type='ForkEvent', 1, NULL)) AS forks,\n", 386 | " COUNT(*) AS cnt\n", 387 | "FROM `githubarchive.day.20181201`\n", 388 | "WHERE type IN ('WatchEvent','ForkEvent')\n", 389 | "GROUP BY 1\n", 390 | "ORDER BY 2 DESC\n", 391 | "LIMIT 10\n", 392 | "\n" 393 | ] 394 | }, 395 | { 396 | "data": { 397 | "text/html": [ 398 | "
[DataFrame rendered as an HTML table; identical data appears in the text/plain output below]
" 496 | ], 497 | "text/plain": [ 498 | " name stars forks cnt\n", 499 | "0 BcRikko/NES.css 386 35 421\n", 500 | "1 leisurelicht/wtfpython-cn 241 31 272\n", 501 | "2 satwikkansal/wtfpython 190 30 220\n", 502 | "3 cssanimation/css-animation-101 178 5 183\n", 503 | "4 firecracker-microvm/firecracker 150 13 163\n", 504 | "5 crazyandcoder/kindle_free_books 132 31 163\n", 505 | "6 withspectrum/spectrum 132 9 141\n", 506 | "7 afshinea/stanford-cs-230-deep-learning 120 17 137\n", 507 | "8 algorithm-visualizer/algorithm-visualizer 119 15 134\n", 508 | "9 olifolkerd/tabulator 114 3 117" 509 | ] 510 | }, 511 | "execution_count": 8, 512 | "metadata": {}, 513 | "output_type": "execute_result" 514 | } 515 | ], 516 | "source": [ 517 | "query = \"\"\"\n", 518 | "SELECT \n", 519 | " repo.name,\n", 520 | " SUM(IF(type='WatchEvent', 1, NULL)) AS stars,\n", 521 | " SUM(IF(type='ForkEvent', 1, NULL)) AS forks,\n", 522 | " COUNT(*) AS cnt\n", 523 | "FROM `githubarchive.day.{0}`\n", 524 | "WHERE type IN ('WatchEvent','ForkEvent')\n", 525 | "GROUP BY 1\n", 526 | "ORDER BY 2 DESC\n", 527 | "LIMIT 10\n", 528 | "\"\"\".format(process_date_nodash)\n", 529 | "\n", 530 | "print (query)\n", 531 | "\n", 532 | "df = pd.read_gbq(query, project_id=project_id, dialect='standard')\n", 533 | "df.head(20)" 534 | ] 535 | }, 536 | { 537 | "cell_type": "markdown", 538 | "metadata": {}, 539 | "source": [ 540 | "## Hacker News data\n", 541 | "- Link: [Data](https://bigquery.cloud.google.com/table/bigquery-public-data:hacker_news.full) - [More info](https://medium.com/@hoffa/hacker-news-on-bigquery-now-with-daily-updates-so-what-are-the-top-domains-963d3c68b2e2)" 542 | ] 543 | }, 544 | { 545 | "cell_type": "markdown", 546 | "metadata": {}, 547 | "source": [ 548 | "### Top domains shared in Hacker News\n", 549 | "- Domain with higher score are more likely to make it to the front page.\n", 550 | "- __nytimes__ has the highest average score." 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": 12, 556 | "metadata": {}, 557 | "outputs": [ 558 | { 559 | "name": "stdout", 560 | "output_type": "stream", 561 | "text": [ 562 | "\n", 563 | "SELECT \n", 564 | " REGEXP_EXTRACT(url, '//([^/]*)/?') as domain,\n", 565 | " AVG(score) as avg_score,\n", 566 | " COUNT(*) AS cnt\n", 567 | "FROM `bigquery-public-data.hacker_news.full`\n", 568 | "WHERE url!='' \n", 569 | "AND EXTRACT(DATE FROM timestamp)=\"2018-12-01\"\n", 570 | "GROUP BY 1\n", 571 | "ORDER BY 3 DESC \n", 572 | "LIMIT 10\n", 573 | "\n" 574 | ] 575 | }, 576 | { 577 | "data": { 578 | "text/html": [ 579 | "
[DataFrame rendered as an HTML table; identical data appears in the text/plain output below]
" 666 | ], 667 | "text/plain": [ 668 | " domain avg_score cnt\n", 669 | "0 github.com 14.966667 30\n", 670 | "1 medium.com 15.592593 27\n", 671 | "2 www.youtube.com 12.666667 24\n", 672 | "3 www.nytimes.com 41.263158 19\n", 673 | "4 venturebeat.com 2.100000 10\n", 674 | "5 www.reddit.com 21.428571 7\n", 675 | "6 www.theguardian.com 31.166667 6\n", 676 | "7 en.wikipedia.org 15.833333 6\n", 677 | "8 arstechnica.com 22.666667 6\n", 678 | "9 www.theverge.com 2.200000 5" 679 | ] 680 | }, 681 | "execution_count": 12, 682 | "metadata": {}, 683 | "output_type": "execute_result" 684 | } 685 | ], 686 | "source": [ 687 | "query = \"\"\"\n", 688 | "SELECT \n", 689 | " REGEXP_EXTRACT(url, '//([^/]*)/?') as domain,\n", 690 | " AVG(score) as avg_score,\n", 691 | " COUNT(*) AS cnt\n", 692 | "FROM `bigquery-public-data.hacker_news.full`\n", 693 | "WHERE url!='' \n", 694 | "AND EXTRACT(DATE FROM timestamp)=\"{0}\"\n", 695 | "GROUP BY 1\n", 696 | "ORDER BY 3 DESC \n", 697 | "LIMIT 10\n", 698 | "\"\"\".format(process_date)\n", 699 | "\n", 700 | "print (query)\n", 701 | "\n", 702 | "df = pd.read_gbq(query, project_id=project_id, dialect='standard')\n", 703 | "df.head(20)" 704 | ] 705 | }, 706 | { 707 | "cell_type": "markdown", 708 | "metadata": {}, 709 | "source": [ 710 | "### What domains have the best chance of getting more than 40 upvotes?\n", 711 | "- Certainly Hacker News likes content hosted on sites like github.com and the nytimes." 712 | ] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "execution_count": 11, 717 | "metadata": {}, 718 | "outputs": [ 719 | { 720 | "name": "stdout", 721 | "output_type": "stream", 722 | "text": [ 723 | "\n", 724 | "SELECT \n", 725 | " REGEXP_EXTRACT(url, '//([^/]*)/?') as domain,\n", 726 | " COUNTIF(score>40) as score_gt_40,\n", 727 | " COUNT(*) AS cnt\n", 728 | "FROM `bigquery-public-data.hacker_news.full`\n", 729 | "WHERE url!='' \n", 730 | "AND EXTRACT(DATE FROM timestamp)=\"2018-12-01\"\n", 731 | "GROUP BY 1\n", 732 | "ORDER BY 2 DESC \n", 733 | "LIMIT 10\n", 734 | "\n" 735 | ] 736 | }, 737 | { 738 | "data": { 739 | "text/html": [ 740 | "
[DataFrame rendered as an HTML table; identical data appears in the text/plain output below]
" 827 | ], 828 | "text/plain": [ 829 | " domain score_gt_40 cnt\n", 830 | "0 www.nytimes.com 4 19\n", 831 | "1 github.com 4 30\n", 832 | "2 medium.com 3 27\n", 833 | "3 www.wsj.com 2 4\n", 834 | "4 www.theatlantic.com 2 5\n", 835 | "5 www.youtube.com 2 24\n", 836 | "6 www.jamiefuller.com 1 1\n", 837 | "7 arstechnica.com 1 6\n", 838 | "8 www.vulture.com 1 2\n", 839 | "9 www.newsshooter.com 1 1" 840 | ] 841 | }, 842 | "execution_count": 11, 843 | "metadata": {}, 844 | "output_type": "execute_result" 845 | } 846 | ], 847 | "source": [ 848 | "query = \"\"\"\n", 849 | "SELECT \n", 850 | " REGEXP_EXTRACT(url, '//([^/]*)/?') as domain,\n", 851 | " COUNTIF(score>40) as score_gt_40,\n", 852 | " COUNT(*) AS cnt\n", 853 | "FROM `bigquery-public-data.hacker_news.full`\n", 854 | "WHERE url!='' \n", 855 | "AND EXTRACT(DATE FROM timestamp)=\"{0}\"\n", 856 | "GROUP BY 1\n", 857 | "ORDER BY 2 DESC \n", 858 | "LIMIT 10\n", 859 | "\"\"\".format(process_date)\n", 860 | "\n", 861 | "print (query)\n", 862 | "\n", 863 | "df = pd.read_gbq(query, project_id=project_id, dialect='standard')\n", 864 | "df.head(20)" 865 | ] 866 | }, 867 | { 868 | "cell_type": "markdown", 869 | "metadata": {}, 870 | "source": [ 871 | "### Top 10 Hacker news stories from Github by highest score" 872 | ] 873 | }, 874 | { 875 | "cell_type": "code", 876 | "execution_count": 17, 877 | "metadata": {}, 878 | "outputs": [ 879 | { 880 | "name": "stdout", 881 | "output_type": "stream", 882 | "text": [ 883 | "\n", 884 | "SELECT \n", 885 | " `by` AS submitter,\n", 886 | " id as story_id,\n", 887 | " REGEXP_EXTRACT(url, \"(https?://github.com/[^/]*/[^/#?]*)\") as url,\n", 888 | " SUM(score) as score\n", 889 | "FROM\n", 890 | " `bigquery-public-data.hacker_news.full`\n", 891 | "WHERE\n", 892 | " type = 'story'\n", 893 | " AND EXTRACT(DATE FROM timestamp)='2018-12-01' \n", 894 | " AND url LIKE '%https://github.com%'\n", 895 | " AND url NOT LIKE '%github.com/blog/%'\n", 896 | "GROUP BY \n", 897 | " submitter,\n", 898 | " story_id,\n", 899 | " url\n", 900 | "ORDER BY score DESC\n", 901 | "\n" 902 | ] 903 | }, 904 | { 905 | "data": { 906 | "text/html": [ 907 | "
\n", 908 | "\n", 921 | "\n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | "
submitterstory_idurlscore
0ithinco18574181https://github.com/ithinco/i-am-chinese-the-dr...129
1mountainview18576170https://github.com/YugaByte/yugabyte-db115
2oxplot18575094https://github.com/oxplot/pdftilecut64
3codeadict18574683https://github.com/alertlogic/erllambda64
4pjmlp18575802https://github.com/chocolatey/boxstarter9
5snek18577658https://github.com/devsnek/engine2628
6delvincasper18577036https://github.com/jerverless/jerverless4
7andrewchaa18574107https://github.com/andrewchaa/functional.pipe4
8anmonteiro9018578964https://github.com/anmonteiro/aws-lambda-ocaml...4
9KumarAbhirup18577887https://github.com/KumarAbhirup/bulk-mail-cli4
\n", 1004 | "
" 1005 | ], 1006 | "text/plain": [ 1007 | " submitter story_id url \\\n", 1008 | "0 ithinco 18574181 https://github.com/ithinco/i-am-chinese-the-dr... \n", 1009 | "1 mountainview 18576170 https://github.com/YugaByte/yugabyte-db \n", 1010 | "2 oxplot 18575094 https://github.com/oxplot/pdftilecut \n", 1011 | "3 codeadict 18574683 https://github.com/alertlogic/erllambda \n", 1012 | "4 pjmlp 18575802 https://github.com/chocolatey/boxstarter \n", 1013 | "5 snek 18577658 https://github.com/devsnek/engine262 \n", 1014 | "6 delvincasper 18577036 https://github.com/jerverless/jerverless \n", 1015 | "7 andrewchaa 18574107 https://github.com/andrewchaa/functional.pipe \n", 1016 | "8 anmonteiro90 18578964 https://github.com/anmonteiro/aws-lambda-ocaml... \n", 1017 | "9 KumarAbhirup 18577887 https://github.com/KumarAbhirup/bulk-mail-cli \n", 1018 | "\n", 1019 | " score \n", 1020 | "0 129 \n", 1021 | "1 115 \n", 1022 | "2 64 \n", 1023 | "3 64 \n", 1024 | "4 9 \n", 1025 | "5 8 \n", 1026 | "6 4 \n", 1027 | "7 4 \n", 1028 | "8 4 \n", 1029 | "9 4 " 1030 | ] 1031 | }, 1032 | "execution_count": 17, 1033 | "metadata": {}, 1034 | "output_type": "execute_result" 1035 | } 1036 | ], 1037 | "source": [ 1038 | "query = \"\"\"\n", 1039 | "SELECT \n", 1040 | " `by` AS submitter,\n", 1041 | " id as story_id,\n", 1042 | " REGEXP_EXTRACT(url, \"(https?://github.com/[^/]*/[^/#?]*)\") as url,\n", 1043 | " SUM(score) as score\n", 1044 | "FROM\n", 1045 | " `bigquery-public-data.hacker_news.full`\n", 1046 | "WHERE\n", 1047 | " type = 'story'\n", 1048 | " AND EXTRACT(DATE FROM timestamp)='{0}' \n", 1049 | " AND url LIKE '%https://github.com%'\n", 1050 | " AND url NOT LIKE '%github.com/blog/%'\n", 1051 | "GROUP BY \n", 1052 | " submitter,\n", 1053 | " story_id,\n", 1054 | " url\n", 1055 | "ORDER BY score DESC\n", 1056 | "\"\"\".format(process_date)\n", 1057 | "\n", 1058 | "print (query)\n", 1059 | "\n", 1060 | "df = pd.read_gbq(query, project_id=project_id, dialect='standard')\n", 1061 | "df.head(10)" 1062 | ] 1063 | }, 1064 | { 1065 | "cell_type": "markdown", 1066 | "metadata": {}, 1067 | "source": [ 1068 | "## Example Final table: GitHub on Hacker News Trends of 2018-12-01" 1069 | ] 1070 | }, 1071 | { 1072 | "cell_type": "code", 1073 | "execution_count": 20, 1074 | "metadata": {}, 1075 | "outputs": [ 1076 | { 1077 | "name": "stdout", 1078 | "output_type": "stream", 1079 | "text": [ 1080 | "\n", 1081 | "WITH github_activity AS (\n", 1082 | "SELECT \n", 1083 | " repo.name as repo,\n", 1084 | " CONCAT('https://github.com/', repo.name) as url,\n", 1085 | " SUM(IF(type='WatchEvent', 1, NULL)) AS stars,\n", 1086 | " SUM(IF(type='ForkEvent', 1, NULL)) AS forks,\n", 1087 | " COUNT(*) AS cnt\n", 1088 | "FROM `githubarchive.day.20181201`\n", 1089 | "WHERE type IN ('WatchEvent','ForkEvent')\n", 1090 | "GROUP BY 1,2\n", 1091 | "),\n", 1092 | "hacker_news AS (\n", 1093 | "SELECT\n", 1094 | " EXTRACT(DATE FROM timestamp) as date,\n", 1095 | " `by` AS submitter,\n", 1096 | " id as story_id,\n", 1097 | " REGEXP_EXTRACT(url, \"(https?://github.com/[^/]*/[^/#?]*)\") as url,\n", 1098 | " SUM(score) as score\n", 1099 | "FROM\n", 1100 | " `bigquery-public-data.hacker_news.full`\n", 1101 | "WHERE\n", 1102 | " type = 'story'\n", 1103 | " AND EXTRACT(DATE FROM timestamp)='2018-12-01' \n", 1104 | " AND url LIKE '%https://github.com%'\n", 1105 | " AND url NOT LIKE '%github.com/blog/%'\n", 1106 | "GROUP BY 1,2,3,4\n", 1107 | ")\n", 1108 | "\n", 1109 | "SELECT\n", 1110 | " a.date as date,\n", 1111 | " a.url as github_url,\n", 1112 | " b.repo as 
github_repo,\n", 1113 | " a.score as hn_score,\n", 1114 | " a.story_id as hn_story_id,\n", 1115 | " b.stars as stars,\n", 1116 | " b.forks as forks\n", 1117 | "FROM hacker_news as a\n", 1118 | "LEFT JOIN github_activity as b\n", 1119 | "ON a.url=b.url\n", 1120 | "ORDER BY hn_score DESC\n", 1121 | "LIMIT 10\n", 1122 | "\n" 1123 | ] 1124 | }, 1125 | { 1126 | "data": { 1127 | "text/html": [ 1128 | "
\n", 1129 | "\n", 1142 | "\n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | "
dategithub_urlgithub_repohn_scorehn_story_idstarsforks
02018-12-01https://github.com/ithinco/i-am-chinese-the-dr...ithinco/i-am-chinese-the-dragonfly-must-go-on1291857418160.01.0
12018-12-01https://github.com/YugaByte/yugabyte-dbYugaByte/yugabyte-db115185761702.0NaN
22018-12-01https://github.com/alertlogic/erllambdaalertlogic/erllambda641857468348.0NaN
32018-12-01https://github.com/oxplot/pdftilecutoxplot/pdftilecut641857509491.0NaN
42018-12-01https://github.com/chocolatey/boxstarterchocolatey/boxstarter9185758021.0NaN
52018-12-01https://github.com/devsnek/engine262devsnek/engine2628185776581.0NaN
62018-12-01https://github.com/andrewchaa/functional.pipeandrewchaa/functional.pipe4185741072.0NaN
72018-12-01https://github.com/anmonteiro/aws-lambda-ocaml...anmonteiro/aws-lambda-ocaml-runtime4185789645.0NaN
82018-12-01https://github.com/KumarAbhirup/bulk-mail-cliNone418577887NaNNaN
92018-12-01https://github.com/jerverless/jerverlessNone418577036NaNNaN
\n", 1258 | "
" 1259 | ], 1260 | "text/plain": [ 1261 | " date github_url \\\n", 1262 | "0 2018-12-01 https://github.com/ithinco/i-am-chinese-the-dr... \n", 1263 | "1 2018-12-01 https://github.com/YugaByte/yugabyte-db \n", 1264 | "2 2018-12-01 https://github.com/alertlogic/erllambda \n", 1265 | "3 2018-12-01 https://github.com/oxplot/pdftilecut \n", 1266 | "4 2018-12-01 https://github.com/chocolatey/boxstarter \n", 1267 | "5 2018-12-01 https://github.com/devsnek/engine262 \n", 1268 | "6 2018-12-01 https://github.com/andrewchaa/functional.pipe \n", 1269 | "7 2018-12-01 https://github.com/anmonteiro/aws-lambda-ocaml... \n", 1270 | "8 2018-12-01 https://github.com/KumarAbhirup/bulk-mail-cli \n", 1271 | "9 2018-12-01 https://github.com/jerverless/jerverless \n", 1272 | "\n", 1273 | " github_repo hn_score hn_story_id \\\n", 1274 | "0 ithinco/i-am-chinese-the-dragonfly-must-go-on 129 18574181 \n", 1275 | "1 YugaByte/yugabyte-db 115 18576170 \n", 1276 | "2 alertlogic/erllambda 64 18574683 \n", 1277 | "3 oxplot/pdftilecut 64 18575094 \n", 1278 | "4 chocolatey/boxstarter 9 18575802 \n", 1279 | "5 devsnek/engine262 8 18577658 \n", 1280 | "6 andrewchaa/functional.pipe 4 18574107 \n", 1281 | "7 anmonteiro/aws-lambda-ocaml-runtime 4 18578964 \n", 1282 | "8 None 4 18577887 \n", 1283 | "9 None 4 18577036 \n", 1284 | "\n", 1285 | " stars forks \n", 1286 | "0 60.0 1.0 \n", 1287 | "1 2.0 NaN \n", 1288 | "2 48.0 NaN \n", 1289 | "3 91.0 NaN \n", 1290 | "4 1.0 NaN \n", 1291 | "5 1.0 NaN \n", 1292 | "6 2.0 NaN \n", 1293 | "7 5.0 NaN \n", 1294 | "8 NaN NaN \n", 1295 | "9 NaN NaN " 1296 | ] 1297 | }, 1298 | "execution_count": 20, 1299 | "metadata": {}, 1300 | "output_type": "execute_result" 1301 | } 1302 | ], 1303 | "source": [ 1304 | "query = \"\"\"\n", 1305 | "WITH github_activity AS (\n", 1306 | "SELECT \n", 1307 | " repo.name as repo,\n", 1308 | " CONCAT('https://github.com/', repo.name) as url,\n", 1309 | " SUM(IF(type='WatchEvent', 1, NULL)) AS stars,\n", 1310 | " SUM(IF(type='ForkEvent', 1, NULL)) AS forks,\n", 1311 | " COUNT(*) AS cnt\n", 1312 | "FROM `githubarchive.day.{0}`\n", 1313 | "WHERE type IN ('WatchEvent','ForkEvent')\n", 1314 | "GROUP BY 1,2\n", 1315 | "),\n", 1316 | "hacker_news AS (\n", 1317 | "SELECT\n", 1318 | " EXTRACT(DATE FROM timestamp) as date,\n", 1319 | " `by` AS submitter,\n", 1320 | " id as story_id,\n", 1321 | " REGEXP_EXTRACT(url, \"(https?://github.com/[^/]*/[^/#?]*)\") as url,\n", 1322 | " SUM(score) as score\n", 1323 | "FROM\n", 1324 | " `bigquery-public-data.hacker_news.full`\n", 1325 | "WHERE\n", 1326 | " type = 'story'\n", 1327 | " AND EXTRACT(DATE FROM timestamp)='{1}' \n", 1328 | " AND url LIKE '%https://github.com%'\n", 1329 | " AND url NOT LIKE '%github.com/blog/%'\n", 1330 | "GROUP BY 1,2,3,4\n", 1331 | ")\n", 1332 | "\n", 1333 | "SELECT\n", 1334 | " a.date as date,\n", 1335 | " a.url as github_url,\n", 1336 | " b.repo as github_repo,\n", 1337 | " a.score as hn_score,\n", 1338 | " a.story_id as hn_story_id,\n", 1339 | " b.stars as stars,\n", 1340 | " b.forks as forks\n", 1341 | "FROM hacker_news as a\n", 1342 | "LEFT JOIN github_activity as b\n", 1343 | "ON a.url=b.url\n", 1344 | "ORDER BY hn_score DESC\n", 1345 | "LIMIT 10\n", 1346 | "\"\"\".format(process_date_nodash, process_date)\n", 1347 | "\n", 1348 | "print (query)\n", 1349 | "\n", 1350 | "df = pd.read_gbq(query, project_id=project_id, dialect='standard')\n", 1351 | "df.head(10)" 1352 | ] 1353 | }, 1354 | { 1355 | "cell_type": "markdown", 1356 | "metadata": {}, 1357 | "source": [ 1358 | "## Python PyPI stats\n", 1359 | "- The 
Python Software Foundation provides the raw logs of Python installation activities\n", 1360 | "- Link: [Data](https://bigquery.cloud.google.com/table/the-psf:pypi.downloads20181230) - [More info](https://packaging.python.org/guides/analyzing-pypi-package-downloads/)\n", 1361 | "\n", 1362 | "__Challenge__: \n", 1363 | "- Find the associated GitHub stars, fork events, and Hacker News stories for the top downloaded Python packages from pip" 1364 | ] 1365 | }, 1366 | { 1367 | "cell_type": "markdown", 1368 | "metadata": {}, 1369 | "source": [ 1370 | "### Top 10 downloaded packages from pip" 1371 | ] 1372 | }, 1373 | { 1374 | "cell_type": "code", 1375 | "execution_count": 15, 1376 | "metadata": {}, 1377 | "outputs": [ 1378 | { 1379 | "name": "stdout", 1380 | "output_type": "stream", 1381 | "text": [ 1382 | "\n", 1383 | "SELECT \n", 1384 | "  TIMESTAMP_TRUNC(timestamp, DAY) as day,\n", 1385 | "  file.project as project,\n", 1386 | "  COUNT(*) as downloads\n", 1387 | "FROM `the-psf.pypi.downloads20181201`\n", 1388 | "WHERE details.installer.name = 'pip'\n", 1389 | "GROUP BY 1,2\n", 1390 | "ORDER BY 3 desc\n", 1391 | "\n" 1392 | ] 1393 | }, 1394 | { 1395 | "data": { 1396 | "text/html": [ 1397 | "
\n", 1398 | "\n", 1411 | "\n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | "
dayprojectdownloads
02018-12-01pip1562226
12018-12-01urllib31271997
22018-12-01botocore1069194
32018-12-01six966172
42018-12-01python-dateutil946327
52018-12-01s3transfer877832
62018-12-01docutils813135
72018-12-01pyyaml796706
82018-12-01pyasn1782540
92018-12-01jmespath772065
\n", 1483 | "
" 1484 | ], 1485 | "text/plain": [ 1486 | " day project downloads\n", 1487 | "0 2018-12-01 pip 1562226\n", 1488 | "1 2018-12-01 urllib3 1271997\n", 1489 | "2 2018-12-01 botocore 1069194\n", 1490 | "3 2018-12-01 six 966172\n", 1491 | "4 2018-12-01 python-dateutil 946327\n", 1492 | "5 2018-12-01 s3transfer 877832\n", 1493 | "6 2018-12-01 docutils 813135\n", 1494 | "7 2018-12-01 pyyaml 796706\n", 1495 | "8 2018-12-01 pyasn1 782540\n", 1496 | "9 2018-12-01 jmespath 772065" 1497 | ] 1498 | }, 1499 | "execution_count": 15, 1500 | "metadata": {}, 1501 | "output_type": "execute_result" 1502 | } 1503 | ], 1504 | "source": [ 1505 | "query = \"\"\"\n", 1506 | "SELECT \n", 1507 | " TIMESTAMP_TRUNC(timestamp, DAY) as day,\n", 1508 | " file.project as project,\n", 1509 | " COUNT(*) as downloads\n", 1510 | "FROM `the-psf.pypi.downloads{0}`\n", 1511 | "WHERE details.installer.name = 'pip'\n", 1512 | "GROUP BY 1,2\n", 1513 | "ORDER BY 3 desc\n", 1514 | "\"\"\".format(process_date_nodash)\n", 1515 | "\n", 1516 | "print (query)\n", 1517 | "\n", 1518 | "df = pd.read_gbq(query, project_id=project_id, dialect='standard')\n", 1519 | "df.head(10)" 1520 | ] 1521 | }, 1522 | { 1523 | "cell_type": "markdown", 1524 | "metadata": {}, 1525 | "source": [ 1526 | "### What is the number stars and fork event for botocore?" 1527 | ] 1528 | }, 1529 | { 1530 | "cell_type": "code", 1531 | "execution_count": 19, 1532 | "metadata": {}, 1533 | "outputs": [ 1534 | { 1535 | "name": "stdout", 1536 | "output_type": "stream", 1537 | "text": [ 1538 | "\n", 1539 | "SELECT \n", 1540 | " repo.name,\n", 1541 | " SUM(IF(type='WatchEvent', 1, NULL)) AS stars,\n", 1542 | " SUM(IF(type='ForkEvent', 1, NULL)) AS forks,\n", 1543 | " COUNT(*) AS cnt\n", 1544 | "FROM `githubarchive.day.20181201`\n", 1545 | "WHERE type IN ('WatchEvent','ForkEvent')\n", 1546 | "AND repo.name LIKE \"%botocore%\"\n", 1547 | "GROUP BY 1\n", 1548 | "ORDER BY 2 DESC\n", 1549 | "LIMIT 10\n", 1550 | "\n" 1551 | ] 1552 | }, 1553 | { 1554 | "data": { 1555 | "text/html": [ 1556 | "
\n", 1557 | "\n", 1570 | "\n", 1571 | " \n", 1572 | " \n", 1573 | " \n", 1574 | " \n", 1575 | " \n", 1576 | " \n", 1577 | " \n", 1578 | " \n", 1579 | " \n", 1580 | " \n", 1581 | " \n", 1582 | " \n", 1583 | " \n", 1584 | " \n", 1585 | " \n", 1586 | " \n", 1587 | " \n", 1588 | " \n", 1589 | "
namestarsforkscnt
0boto/botocore213
\n", 1590 | "
" 1591 | ], 1592 | "text/plain": [ 1593 | " name stars forks cnt\n", 1594 | "0 boto/botocore 2 1 3" 1595 | ] 1596 | }, 1597 | "execution_count": 19, 1598 | "metadata": {}, 1599 | "output_type": "execute_result" 1600 | } 1601 | ], 1602 | "source": [ 1603 | "query = \"\"\"\n", 1604 | "SELECT \n", 1605 | " repo.name,\n", 1606 | " SUM(IF(type='WatchEvent', 1, NULL)) AS stars,\n", 1607 | " SUM(IF(type='ForkEvent', 1, NULL)) AS forks,\n", 1608 | " COUNT(*) AS cnt\n", 1609 | "FROM `githubarchive.day.{0}`\n", 1610 | "WHERE type IN ('WatchEvent','ForkEvent')\n", 1611 | "AND repo.name LIKE \"%botocore%\"\n", 1612 | "GROUP BY 1\n", 1613 | "ORDER BY 2 DESC\n", 1614 | "LIMIT 10\n", 1615 | "\"\"\".format(process_date_nodash)\n", 1616 | "\n", 1617 | "print (query)\n", 1618 | "\n", 1619 | "df = pd.read_gbq(query, project_id=project_id, dialect='standard')\n", 1620 | "df.head(20)" 1621 | ] 1622 | }, 1623 | { 1624 | "cell_type": "markdown", 1625 | "metadata": {}, 1626 | "source": [ 1627 | "## Resources\n", 1628 | "- [GitHub data, ready for you to explore with BigQuery](https://blog.github.com/2017-01-19-github-data-ready-for-you-to-explore-with-bigquery/)\n", 1629 | "- [Hacker News on BigQuery](https://medium.com/@hoffa/hacker-news-on-bigquery-now-with-daily-updates-so-what-are-the-top-domains-963d3c68b2e2)\n", 1630 | "- [Analyzing PyPI package downloads](https://packaging.python.org/guides/analyzing-pypi-package-downloads/)" 1631 | ] 1632 | } 1633 | ], 1634 | "metadata": { 1635 | "kernelspec": { 1636 | "display_name": "Python 3", 1637 | "language": "python", 1638 | "name": "python3" 1639 | }, 1640 | "language_info": { 1641 | "codemirror_mode": { 1642 | "name": "ipython", 1643 | "version": 3 1644 | }, 1645 | "file_extension": ".py", 1646 | "mimetype": "text/x-python", 1647 | "name": "python", 1648 | "nbconvert_exporter": "python", 1649 | "pygments_lexer": "ipython3", 1650 | "version": "3.6.7" 1651 | }, 1652 | "toc": { 1653 | "base_numbering": 1, 1654 | "nav_menu": {}, 1655 | "number_sections": false, 1656 | "sideBar": true, 1657 | "skip_h1_title": false, 1658 | "title_cell": "Table of Contents", 1659 | "title_sidebar": "Contents", 1660 | "toc_cell": false, 1661 | "toc_position": {}, 1662 | "toc_section_display": true, 1663 | "toc_window_display": true 1664 | } 1665 | }, 1666 | "nbformat": 4, 1667 | "nbformat_minor": 2 1668 | } 1669 | --------------------------------------------------------------------------------