├── .dockerignore ├── stop_gcloud_example.sh ├── run_gcloud_example.sh ├── docs ├── img │ ├── airflow_connection.png │ ├── console_service_account.png │ └── create_service_account.png └── bigquery_github_trends.md ├── examples ├── gcloud-example │ └── dags │ │ ├── support │ │ └── keys │ │ │ └── .gitignore │ │ └── bigquery_github │ │ ├── config │ │ └── variables.json │ │ └── bigquery_github_trends.py └── intro-example │ └── dags │ ├── config │ └── example_variables.json │ ├── example_variables.py │ ├── tutorial.py │ └── example_twitter_dag.py ├── notebooks ├── Dockerfile ├── docker-compose.yml └── gcloud-example │ └── github-trend-analysis.ipynb ├── LICENSE ├── docker-compose.yml ├── docker-compose-gcloud.yml ├── .gitignore └── README.md /.dockerignore: -------------------------------------------------------------------------------- 1 | .git -------------------------------------------------------------------------------- /stop_gcloud_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker-compose -f docker-compose-gcloud.yml down -------------------------------------------------------------------------------- /run_gcloud_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker-compose -f docker-compose-gcloud.yml up --abort-on-container-exit -------------------------------------------------------------------------------- /docs/img/airflow_connection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuanavu/airflow-tutorial/HEAD/docs/img/airflow_connection.png -------------------------------------------------------------------------------- /docs/img/console_service_account.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuanavu/airflow-tutorial/HEAD/docs/img/console_service_account.png -------------------------------------------------------------------------------- /docs/img/create_service_account.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuanavu/airflow-tutorial/HEAD/docs/img/create_service_account.png -------------------------------------------------------------------------------- /examples/gcloud-example/dags/support/keys/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore -------------------------------------------------------------------------------- /examples/intro-example/dags/config/example_variables.json: -------------------------------------------------------------------------------- 1 | { 2 | "example_variables_config": { 3 | "var1": "value1", 4 | "var2": [1, 2, 3], 5 | "var3": { 6 | "k": "value3" 7 | } 8 | } 9 | } -------------------------------------------------------------------------------- /notebooks/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM jupyter/base-notebook 2 | 3 | USER jovyan 4 | 5 | # Install Tensorflow 6 | RUN conda install --quiet --yes \ 7 | 'pandas' \ 8 | 'pandas-gbq' --channel conda-forge 9 | -------------------------------------------------------------------------------- /examples/gcloud-example/dags/bigquery_github/config/variables.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"bigquery_github_trends_variables": { 3 | "bq_conn_id": "my_gcp_conn", 4 | "bq_project": "my_bq_project", 5 | "bq_dataset": "my_bq_dataset" 6 | } 7 | } -------------------------------------------------------------------------------- /notebooks/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | jupyter-notebook: 4 | build: . 5 | image: my-jupyter-notebook 6 | volumes: 7 | - ./gcloud-example:/home/jovyan/work 8 | ports: 9 | - "8889:8888" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Tuan Vu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | postgres: 4 | image: postgres:9.6 5 | environment: 6 | - POSTGRES_USER=airflow 7 | - POSTGRES_PASSWORD=airflow 8 | - POSTGRES_DB=airflow 9 | ports: 10 | - "5432:5432" 11 | 12 | webserver: 13 | image: puckel/docker-airflow:1.10.1 14 | build: 15 | context: https://github.com/puckel/docker-airflow.git#1.10.1 16 | dockerfile: Dockerfile 17 | args: 18 | AIRFLOW_DEPS: gcp_api,s3 19 | PYTHON_DEPS: sqlalchemy==1.2.0 20 | restart: always 21 | depends_on: 22 | - postgres 23 | environment: 24 | - LOAD_EX=n 25 | - EXECUTOR=Local 26 | - FERNET_KEY=jsDPRErfv8Z_eVTnGfF8ywd19j4pyqE3NpdUBA_oRTo= 27 | volumes: 28 | - ./examples/intro-example/dags:/usr/local/airflow/dags 29 | # Uncomment to include custom plugins 30 | # - ./plugins:/usr/local/airflow/plugins 31 | ports: 32 | - "8080:8080" 33 | command: webserver 34 | healthcheck: 35 | test: ["CMD-SHELL", "[ -f /usr/local/airflow/airflow-webserver.pid ]"] 36 | interval: 30s 37 | timeout: 30s 38 | retries: 3 39 | -------------------------------------------------------------------------------- /docker-compose-gcloud.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | postgres: 4 | image: postgres:9.6 5 | environment: 6 | - POSTGRES_USER=airflow 7 | - POSTGRES_PASSWORD=airflow 8 | - POSTGRES_DB=airflow 9 | ports: 10 | - "5432:5432" 11 | 12 | webserver: 13 | image: puckel/docker-airflow:1.10.1 14 | build: 15 | context: https://github.com/puckel/docker-airflow.git#1.10.1 16 | dockerfile: Dockerfile 17 | args: 18 | AIRFLOW_DEPS: gcp_api,s3 19 | PYTHON_DEPS: sqlalchemy==1.2.0 20 | restart: always 21 | depends_on: 22 | - postgres 23 | environment: 24 | - LOAD_EX=n 25 | - EXECUTOR=Local 26 | - FERNET_KEY=jsDPRErfv8Z_eVTnGfF8ywd19j4pyqE3NpdUBA_oRTo= 27 | volumes: 28 | - ./examples/gcloud-example/dags:/usr/local/airflow/dags 29 | # Uncomment to include custom plugins 30 | # - ./plugins:/usr/local/airflow/plugins 31 | ports: 32 | - "8080:8080" 33 | command: webserver 34 | healthcheck: 35 | test: ["CMD-SHELL", "[ -f /usr/local/airflow/airflow-webserver.pid ]"] 36 | interval: 30s 37 | timeout: 30s 38 | retries: 3 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # Mac 107 | .DS_Store 108 | -------------------------------------------------------------------------------- /examples/intro-example/dags/example_variables.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from datetime import datetime 4 | 5 | from airflow import DAG 6 | from airflow.models import Variable 7 | from airflow.operators.dummy_operator import DummyOperator 8 | from airflow.operators.bash_operator import BashOperator 9 | 10 | default_args = { 11 | 'owner': 'airflow', 12 | 'start_date': datetime(2019, 2, 15), 13 | 'end_date': datetime(2019, 2, 15) 14 | } 15 | 16 | dag = DAG('example_variables', 17 | schedule_interval="@once", 18 | default_args=default_args) 19 | 20 | 21 | # # Config variables 22 | # # Common 23 | # var1 = "value1" 24 | # var2 = [1, 2, 3] 25 | # var3 = {'k': 'value3'} 26 | 27 | # # 3 DB connections called 28 | # var1 = Variable.get("var1") 29 | # var2 = Variable.get("var2") 30 | # var3 = Variable.get("var3") 31 | 32 | # ## Recommended way 33 | # dag_config = Variable.get("example_variables_config", deserialize_json=True) 34 | # var1 = dag_config["var1"] 35 | # var2 = dag_config["var2"] 36 | # var3 = dag_config["var3"] 37 | 38 | # start = DummyOperator( 39 | # task_id="start", 40 | # dag=dag 41 | # ) 42 | 43 | # # To test this task, run this command: 44 | # # docker-compose run --rm webserver airflow test example_variables get_dag_config 2019-02-15 45 | # t1 = BashOperator( 46 | # task_id="get_dag_config", 47 | # bash_command='echo "{0}"'.format(dag_config), 48 | # dag=dag, 49 | # ) 50 | 51 | # # You can directly use a variable from a jinja template 52 | # ## {{ var.value. }} 53 | 54 | # t2 = BashOperator( 55 | # task_id="get_variable_value", 56 | # bash_command='echo {{ var.value.var3 }} ', 57 | # dag=dag, 58 | # ) 59 | 60 | # ## {{ var.json. }} 61 | # t3 = BashOperator( 62 | # task_id="get_variable_json", 63 | # bash_command='echo {{ var.json.example_variables_config.var3 }} ', 64 | # dag=dag, 65 | # ) 66 | 67 | # start >> [t1, t2, t3] -------------------------------------------------------------------------------- /docs/bigquery_github_trends.md: -------------------------------------------------------------------------------- 1 | Bigquery Github Trends 2 | --- 3 | 4 | Example for building a data pipeline using Google Cloud BigQuery and Airflow. 
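
The pipeline itself lives in `bigquery_github_trends.py` and chains `BigQueryCheckOperator` tasks (to verify the source data exists for the execution date) with `BigQueryOperator` tasks (to write daily aggregates into date-partitioned tables). Below is a minimal sketch of that pattern, not the full pipeline; the connection ID, project, and dataset are the placeholder values from `variables.json`, and the SQL is a trimmed-down aggregate.

```
from datetime import datetime

from airflow import DAG
from airflow.contrib.operators.bigquery_check_operator import BigQueryCheckOperator
from airflow.contrib.operators.bigquery_operator import BigQueryOperator

dag = DAG(
    'bigquery_pattern_sketch',
    start_date=datetime(2018, 12, 1),
    schedule_interval='@daily',
)

# Fail fast if the source table for the execution date does not exist yet
check_source = BigQueryCheckOperator(
    task_id='check_source',
    sql='''
    SELECT table_id
    FROM `githubarchive.day.__TABLES_SUMMARY__`
    WHERE table_id = "{{ yesterday_ds_nodash }}"
    ''',
    use_legacy_sql=False,
    bigquery_conn_id='my_gcp_conn',
    dag=dag,
)

# Write a daily aggregate into the matching date partition of the destination table
write_daily_metrics = BigQueryOperator(
    task_id='write_daily_metrics',
    sql='''
    SELECT
      FORMAT_TIMESTAMP("%Y%m%d", created_at) AS date,
      repo.name AS repo,
      COUNT(*) AS events
    FROM `githubarchive.day.{{ yesterday_ds_nodash }}`
    GROUP BY date, repo
    ''',
    destination_dataset_table='my_bq_project.my_bq_dataset.github_daily_metrics${{ yesterday_ds_nodash }}',
    write_disposition='WRITE_TRUNCATE',
    use_legacy_sql=False,
    bigquery_conn_id='my_gcp_conn',
    dag=dag,
)

check_source >> write_daily_metrics
```

The real DAG adds a second check task for the Hacker News data plus several aggregation and join steps on top of this skeleton.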
5 | 
6 | ## Setup
7 | 
8 | This example assumes you have an Airflow environment up and running. Here is
9 | a quick rundown of what you need:
10 | 
11 | * Running Airflow
12 | * Create a service account (Cloud Console)
13 | * Set up a Google Cloud connection in Airflow
14 | * Enter the config variables
15 | 
16 | ### Running Airflow
17 | 
18 | - Check out the master branch of this tutorial
19 | - Start the Airflow environment with Docker
20 | 
21 | ```
22 | bash run_gcloud_example.sh
23 | ```
24 | 
25 | - Stop the Airflow environment when you are finished
26 | 
27 | ```
28 | bash stop_gcloud_example.sh
29 | ```
30 | 
31 | ### Google Cloud Service Key
32 | 
33 | Go to the console:
34 | 
35 | ![console](img/console_service_account.png?raw=true)
36 | 
37 | Create the service account and make sure it has Editor rights, then download the JSON private key:
38 | 
39 | ![console](img/create_service_account.png?raw=true)
40 | 
41 | The service account also needs permission to access the GCS bucket and the BigQuery dataset.
42 | 
43 | ### Airflow Connection
44 | 
45 | Once you have the GCP key, create a connection in `Admin -> Connections` using that key.
46 | 
47 | In Airflow, define a connection named *my_gcp_conn* that points to your project:
48 | 
49 | ![console](img/airflow_connection.png?raw=true)
50 | 
51 | Supply the path to the downloaded private key, supply the *project_id*, and define the
52 | minimum scope of *https://www.googleapis.com/auth/cloud-platform*.
53 | 
54 | ### Enter the config variables
55 | 
56 | After the connection has been set up, go to the [bigquery_github_trends DAG](../../gcloud-example/bigquery_github/bigquery_github_trends.py) and enter the values of the config variables:
57 | - __BQ_PROJECT__: the BigQuery project you are working on
58 | - __BQ_DATASET__: the BigQuery dataset you are working on
59 | 
60 | ### Test the DAG
61 | 
62 | After the connection and config variables have been set up, you can test and run your DAG.
63 | 64 | - Using the command below to test specific task in the DAG: 65 | 66 | ``` 67 | docker-compose -f docker-compose-gcloud.yml run --rm webserver airflow test [DAG_ID] [TASK_ID] [EXECUTION_DATE] 68 | ``` 69 | 70 | - Examples: 71 | 72 | ``` 73 | # Task 1 74 | docker-compose -f docker-compose-gcloud.yml run --rm webserver airflow test bigquery_github_trends bq_check_githubarchive_day 2018-12-01 75 | 76 | # Task 2 77 | docker-compose -f docker-compose-gcloud.yml run --rm webserver airflow test bigquery_github_trends bq_check_hackernews_full 2018-12-01 78 | ``` 79 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Airflow tutorial 2 | --- 3 | 4 | This is the code for [Apache Airflow Tutorials](https://www.youtube.com/playlist?list=PLYizQ5FvN6pvIOcOd6dFZu3lQqc6zBGp2) playlist by Tuan Vu on Youtube 5 | 6 | ## Contents 7 | 8 | | Part | Title | Git Tag | 9 | |------|---------------------------|---------| 10 | | 1 | [Introduction to Apache Airflow](https://youtu.be/AHMm1wfGuHE) ([blog post](https://www.applydatascience.com/airflow/airflow-tutorial-introduction/)) | [v0.1](https://github.com/tuanavu/airflow-tutorial/tree/v0.1) | 11 | | 2 | [Set up airflow environment with docker](https://youtu.be/vvr_WNzEXBE) ([blog post](https://www.applydatascience.com/airflow/set-up-airflow-env-with-docker/)) | [v0.2](https://github.com/tuanavu/airflow-tutorial/tree/v0.2) | 12 | | 3 | [Set up airflow environment using Google Cloud Composer](https://youtu.be/ld6JO3MiuPQ) ([blog post](https://www.applydatascience.com/airflow/set-up-airflow-with-google-composer/)) | N/A | 13 | | 4 | [Writing your first pipeline](https://youtu.be/43wHwwZhJMo) ([blog post](https://www.applydatascience.com/airflow/writing-your-first-pipeline/)) | N/A | 14 | | 5 | [Airflow concept](https://youtu.be/4rQSa2zEWfw) ([blog post](https://www.applydatascience.com/airflow/airflow-concept/)) | N/A | 15 | | 6 | [Build a data pipeline using Google Cloud Bigquery](https://youtu.be/wAyu5BN3VpY) ([blog post](https://www.applydatascience.com/airflow/bigquery-pipeline-airflow/)) | [v0.6](https://github.com/tuanavu/airflow-tutorial/tree/v0.6) | 16 | | 7 | [Airflow variables](https://youtu.be/bHQ7nzn0j6k) ([blog post](https://www.applydatascience.com/airflow/airflow-variables/)) | [v0.7](https://github.com/tuanavu/airflow-tutorial/tree/v0.7) | 17 | 18 | 19 | ## Getting Started 20 | 21 | These instructions will get you a copy of the project up and running on your local machine for development and testing purposes. 22 | 23 | - Clone this repo 24 | - Install the prerequisites 25 | - Run the service 26 | - Check http://localhost:8080 27 | - Done! 
:tada:
28 | 
29 | ### Prerequisites
30 | 
31 | - Install [Docker](https://www.docker.com/)
32 | - Install [Docker Compose](https://docs.docker.com/compose/install/)
33 | - Follow the Airflow releases on the [Python Package Index](https://pypi.python.org/pypi/apache-airflow)
34 | 
35 | ### Usage
36 | 
37 | Run the web service with Docker
38 | 
39 | ```
40 | docker-compose up -d
41 | 
42 | # Build the image
43 | # docker-compose up -d --build
44 | ```
45 | 
46 | Check http://localhost:8080/
47 | 
48 | - `docker-compose logs` - Displays log output
49 | - `docker-compose ps` - List containers
50 | - `docker-compose down` - Stop containers
51 | 
52 | ## Other commands
53 | 
54 | If you want to run Airflow sub-commands, you can do so like this:
55 | 
56 | - `docker-compose run --rm webserver airflow list_dags` - List DAGs
57 | - `docker-compose run --rm webserver airflow test [DAG_ID] [TASK_ID] [EXECUTION_DATE]` - Test a specific task
58 | 
59 | If you want to run or test a Python script, you can do so like this:
60 | - `docker-compose run --rm webserver python /usr/local/airflow/dags/[PYTHON-FILE].py` - Test a Python script
61 | 
62 | ## Connect to database
63 | 
64 | If you want to use the Ad Hoc Query feature, make sure you've configured the connection:
65 | go to Admin -> Connections, edit "postgres_default", and set these values:
66 | - Host : postgres
67 | - Schema : airflow
68 | - Login : airflow
69 | - Password : airflow
70 | 
71 | 
72 | ## Credits
73 | 
74 | - [Apache Airflow](https://github.com/apache/incubator-airflow)
75 | - [docker-airflow](https://github.com/puckel/docker-airflow/tree/1.10.0-5)
76 | 
-------------------------------------------------------------------------------- /examples/intro-example/dags/tutorial.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements.  See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership.  The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License.  You may obtain a copy of the License at
10 | #
11 | #   http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing,
14 | # software distributed under the License is distributed on an
15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 | # KIND, either express or implied.  See the License for the
17 | # specific language governing permissions and limitations
18 | # under the License.
19 | 20 | """ 21 | ### Tutorial Documentation 22 | Documentation that goes along with the Airflow tutorial located 23 | [here](https://airflow.incubator.apache.org/tutorial.html) 24 | """ 25 | from datetime import timedelta 26 | 27 | import airflow 28 | from airflow import DAG 29 | from airflow.operators.bash_operator import BashOperator 30 | 31 | # These args will get passed on to each operator 32 | # You can override them on a per-task basis during operator initialization 33 | default_args = { 34 | 'owner': 'airflow', 35 | 'depends_on_past': False, 36 | 'start_date': airflow.utils.dates.days_ago(2), 37 | 'email': ['airflow@example.com'], 38 | 'email_on_failure': False, 39 | 'email_on_retry': False, 40 | 'retries': 1, 41 | 'retry_delay': timedelta(minutes=5), 42 | # 'queue': 'bash_queue', 43 | # 'pool': 'backfill', 44 | # 'priority_weight': 10, 45 | # 'end_date': datetime(2016, 1, 1), 46 | # 'wait_for_downstream': False, 47 | # 'dag': dag, 48 | # 'adhoc':False, 49 | # 'sla': timedelta(hours=2), 50 | # 'execution_timeout': timedelta(seconds=300), 51 | # 'on_failure_callback': some_function, 52 | # 'on_success_callback': some_other_function, 53 | # 'on_retry_callback': another_function, 54 | # 'trigger_rule': u'all_success' 55 | } 56 | 57 | dag = DAG( 58 | 'tutorial', 59 | default_args=default_args, 60 | description='A simple tutorial DAG', 61 | schedule_interval=timedelta(days=1), 62 | ) 63 | 64 | # t1, t2 and t3 are examples of tasks created by instantiating operators 65 | t1 = BashOperator( 66 | task_id='print_date', 67 | bash_command='date', 68 | dag=dag, 69 | ) 70 | 71 | t1.doc_md = """\ 72 | #### Task Documentation 73 | You can document your task using the attributes `doc_md` (markdown), 74 | `doc` (plain text), `doc_rst`, `doc_json`, `doc_yaml` which gets 75 | rendered in the UI's Task Instance Details page. 76 | ![img](http://montcs.bloomu.edu/~bobmon/Semesters/2012-01/491/import%20soul.png) 77 | """ 78 | 79 | dag.doc_md = __doc__ 80 | 81 | t2 = BashOperator( 82 | task_id='sleep', 83 | depends_on_past=False, 84 | bash_command='sleep 5', 85 | dag=dag, 86 | ) 87 | 88 | templated_command = """ 89 | {% for i in range(5) %} 90 | echo "{{ ds }}" 91 | echo "{{ macros.ds_add(ds, 7)}}" 92 | echo "{{ params.my_param }}" 93 | {% endfor %} 94 | """ 95 | 96 | t3 = BashOperator( 97 | task_id='templated', 98 | depends_on_past=False, 99 | bash_command=templated_command, 100 | params={'my_param': 'Parameter I passed in'}, 101 | dag=dag, 102 | ) 103 | 104 | t1 >> [t2, t3] -------------------------------------------------------------------------------- /examples/intro-example/dags/example_twitter_dag.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. 
See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # -------------------------------------------------------------------------------- 20 | # Written By: Ekhtiar Syed 21 | # Last Update: 8th April 2016 22 | # Caveat: This Dag will not run because of missing scripts. 23 | # The purpose of this is to give you a sample of a real world example DAG! 24 | # -------------------------------------------------------------------------------- 25 | 26 | # -------------------------------------------------------------------------------- 27 | # Load The Dependencies 28 | # -------------------------------------------------------------------------------- 29 | 30 | import airflow 31 | from airflow import DAG 32 | from airflow.operators.bash_operator import BashOperator 33 | from airflow.operators.python_operator import PythonOperator 34 | from airflow.operators.hive_operator import HiveOperator 35 | from datetime import date, timedelta 36 | 37 | # -------------------------------------------------------------------------------- 38 | # Create a few placeholder scripts. In practice these would be different python 39 | # script files, which are imported in this section with absolute or relative imports 40 | # -------------------------------------------------------------------------------- 41 | 42 | 43 | def fetchtweets(): 44 | return None 45 | 46 | 47 | def cleantweets(): 48 | return None 49 | 50 | 51 | def analyzetweets(): 52 | return None 53 | 54 | 55 | def transfertodb(): 56 | return None 57 | 58 | 59 | # -------------------------------------------------------------------------------- 60 | # set default arguments 61 | # -------------------------------------------------------------------------------- 62 | 63 | default_args = { 64 | 'owner': 'airflow', 65 | 'depends_on_past': False, 66 | 'start_date': airflow.utils.dates.days_ago(2), 67 | 'email': ['airflow@example.com'], 68 | 'email_on_failure': False, 69 | 'email_on_retry': False, 70 | 'retries': 1, 71 | 'retry_delay': timedelta(minutes=5), 72 | # 'queue': 'bash_queue', 73 | # 'pool': 'backfill', 74 | # 'priority_weight': 10, 75 | # 'end_date': datetime(2016, 1, 1), 76 | } 77 | 78 | dag = DAG( 79 | 'example_twitter_dag', default_args=default_args, 80 | schedule_interval="@daily") 81 | 82 | # -------------------------------------------------------------------------------- 83 | # This task should call Twitter API and retrieve tweets from yesterday from and to 84 | # for the four twitter users (Twitter_A,..,Twitter_D) There should be eight csv 85 | # output files generated by this task and naming convention 86 | # is direction(from or to)_twitterHandle_date.csv 87 | # -------------------------------------------------------------------------------- 88 | 89 | fetch_tweets = PythonOperator( 90 | task_id='fetch_tweets', 91 | python_callable=fetchtweets, 92 | dag=dag) 93 | 94 | # -------------------------------------------------------------------------------- 95 | # Clean the eight files. In this step you can get rid of or cherry pick columns 96 | # and different parts of the text 97 | # -------------------------------------------------------------------------------- 98 | 99 | clean_tweets = PythonOperator( 100 | task_id='clean_tweets', 101 | python_callable=cleantweets, 102 | dag=dag) 103 | 104 | clean_tweets.set_upstream(fetch_tweets) 105 | 106 | # -------------------------------------------------------------------------------- 107 | # In this section you can use a script to analyze the twitter data. 
Could simply 108 | # be a sentiment analysis through algorithms like bag of words or something more 109 | # complicated. You can also take a look at Web Services to do such tasks 110 | # -------------------------------------------------------------------------------- 111 | 112 | analyze_tweets = PythonOperator( 113 | task_id='analyze_tweets', 114 | python_callable=analyzetweets, 115 | dag=dag) 116 | 117 | analyze_tweets.set_upstream(clean_tweets) 118 | 119 | # -------------------------------------------------------------------------------- 120 | # Although this is the last task, we need to declare it before the next tasks as we 121 | # will use set_downstream This task will extract summary from Hive data and store 122 | # it to MySQL 123 | # -------------------------------------------------------------------------------- 124 | 125 | hive_to_mysql = PythonOperator( 126 | task_id='hive_to_mysql', 127 | python_callable=transfertodb, 128 | dag=dag) 129 | 130 | # -------------------------------------------------------------------------------- 131 | # The following tasks are generated using for loop. The first task puts the eight 132 | # csv files to HDFS. The second task loads these files from HDFS to respected Hive 133 | # tables. These two for loops could be combined into one loop. However, in most cases, 134 | # you will be running different analysis on your incoming incoming and outgoing tweets, 135 | # and hence they are kept separated in this example. 136 | # -------------------------------------------------------------------------------- 137 | 138 | from_channels = ['fromTwitter_A', 'fromTwitter_B', 'fromTwitter_C', 'fromTwitter_D'] 139 | to_channels = ['toTwitter_A', 'toTwitter_B', 'toTwitter_C', 'toTwitter_D'] 140 | yesterday = date.today() - timedelta(days=1) 141 | dt = yesterday.strftime("%Y-%m-%d") 142 | # define where you want to store the tweets csv file in your local directory 143 | local_dir = "/tmp/" 144 | # define the location where you want to store in HDFS 145 | hdfs_dir = " /tmp/" 146 | 147 | for channel in to_channels: 148 | 149 | file_name = "to_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv" 150 | 151 | load_to_hdfs = BashOperator( 152 | task_id="put_" + channel + "_to_hdfs", 153 | bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " + 154 | local_dir + file_name + 155 | hdfs_dir + channel + "/", 156 | dag=dag) 157 | 158 | load_to_hdfs.set_upstream(analyze_tweets) 159 | 160 | load_to_hive = HiveOperator( 161 | task_id="load_" + channel + "_to_hive", 162 | hql="LOAD DATA INPATH '" + 163 | hdfs_dir + channel + "/" + file_name + "' " 164 | "INTO TABLE " + channel + " " 165 | "PARTITION(dt='" + dt + "')", 166 | dag=dag) 167 | load_to_hive.set_upstream(load_to_hdfs) 168 | load_to_hive.set_downstream(hive_to_mysql) 169 | 170 | for channel in from_channels: 171 | file_name = "from_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv" 172 | load_to_hdfs = BashOperator( 173 | task_id="put_" + channel + "_to_hdfs", 174 | bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " + 175 | local_dir + file_name + 176 | hdfs_dir + channel + "/", 177 | dag=dag) 178 | 179 | load_to_hdfs.set_upstream(analyze_tweets) 180 | 181 | load_to_hive = HiveOperator( 182 | task_id="load_" + channel + "_to_hive", 183 | hql="LOAD DATA INPATH '" + 184 | hdfs_dir + channel + "/" + file_name + "' " 185 | "INTO TABLE " + channel + " " 186 | "PARTITION(dt='" + dt + "')", 187 | dag=dag) 188 | 189 | load_to_hive.set_upstream(load_to_hdfs) 190 | load_to_hive.set_downstream(hive_to_mysql) 
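# --------------------------------------------------------------------------------
# Note: as mentioned above, the two loops could be combined into a single loop
# over both channel lists. A sketch of that variant is kept as a comment so the
# tasks defined above are not registered twice:
#
# for channel in to_channels + from_channels:
#     prefix = "to_" if channel in to_channels else "from_"
#     file_name = prefix + channel + "_" + dt + ".csv"
#     ...  # build load_to_hdfs / load_to_hive exactly as above
# --------------------------------------------------------------------------------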
-------------------------------------------------------------------------------- /examples/gcloud-example/dags/bigquery_github/bigquery_github_trends.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import timedelta, datetime 3 | 4 | from airflow import DAG 5 | from airflow.models import Variable 6 | from airflow.contrib.operators.bigquery_operator import BigQueryOperator 7 | from airflow.contrib.operators.bigquery_check_operator import BigQueryCheckOperator 8 | 9 | 10 | # Config variables 11 | dag_config = Variable.get("bigquery_github_trends_variables", deserialize_json=True) 12 | BQ_CONN_ID = dag_config["bq_conn_id"] 13 | BQ_PROJECT = dag_config["bq_project"] 14 | BQ_DATASET = dag_config["bq_dataset"] 15 | 16 | default_args = { 17 | 'owner': 'airflow', 18 | 'depends_on_past': True, 19 | 'start_date': datetime(2018, 12, 1), 20 | 'end_date': datetime(2018, 12, 5), 21 | 'email': ['airflow@airflow.com'], 22 | 'email_on_failure': True, 23 | 'email_on_retry': False, 24 | 'retries': 2, 25 | 'retry_delay': timedelta(minutes=5), 26 | } 27 | 28 | # Set Schedule: Run pipeline once a day. 29 | # Use cron to define exact time. Eg. 8:15am would be "15 08 * * *" 30 | schedule_interval = "00 21 * * *" 31 | 32 | # Define DAG: Set ID and assign default args and schedule interval 33 | dag = DAG( 34 | 'bigquery_github_trends', 35 | default_args=default_args, 36 | schedule_interval=schedule_interval 37 | ) 38 | 39 | ## Task 1: check that the github archive data has a dated table created for that date 40 | # To test this task, run this command: 41 | # docker-compose -f docker-compose-gcloud.yml run --rm webserver airflow test bigquery_github_trends bq_check_githubarchive_day 2018-12-01 42 | t1 = BigQueryCheckOperator( 43 | task_id='bq_check_githubarchive_day', 44 | sql=''' 45 | #standardSQL 46 | SELECT 47 | table_id 48 | FROM 49 | `githubarchive.day.__TABLES_SUMMARY__` 50 | WHERE 51 | table_id = "{{ yesterday_ds_nodash }}" 52 | ''', 53 | use_legacy_sql=False, 54 | bigquery_conn_id=BQ_CONN_ID, 55 | dag=dag 56 | ) 57 | 58 | ## Task 2: check that the hacker news table contains data for that date. 
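# To test this task, run this command:
# docker-compose -f docker-compose-gcloud.yml run --rm webserver airflow test bigquery_github_trends bq_check_hackernews_full 2018-12-01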
59 | t2 = BigQueryCheckOperator( 60 | task_id='bq_check_hackernews_full', 61 | sql=''' 62 | #standardSQL 63 | SELECT 64 | FORMAT_TIMESTAMP("%Y%m%d", timestamp ) AS date 65 | FROM 66 | `bigquery-public-data.hacker_news.full` 67 | WHERE 68 | type = 'story' 69 | AND FORMAT_TIMESTAMP("%Y%m%d", timestamp ) = "{{ yesterday_ds_nodash }}" 70 | LIMIT 71 | 1 72 | ''', 73 | use_legacy_sql=False, 74 | bigquery_conn_id=BQ_CONN_ID, 75 | dag=dag 76 | ) 77 | 78 | ## Task 3: create a github daily metrics partition table 79 | t3 = BigQueryOperator( 80 | task_id='bq_write_to_github_daily_metrics', 81 | sql=''' 82 | #standardSQL 83 | SELECT 84 | date, 85 | repo, 86 | SUM(IF(type='WatchEvent', 1, NULL)) AS stars, 87 | SUM(IF(type='ForkEvent', 1, NULL)) AS forks 88 | FROM ( 89 | SELECT 90 | FORMAT_TIMESTAMP("%Y%m%d", created_at) AS date, 91 | actor.id as actor_id, 92 | repo.name as repo, 93 | type 94 | FROM 95 | `githubarchive.day.{{ yesterday_ds_nodash }}` 96 | WHERE type IN ('WatchEvent','ForkEvent') 97 | ) 98 | GROUP BY 99 | date, 100 | repo 101 | ''', 102 | destination_dataset_table='{0}.{1}.github_daily_metrics${2}'.format( 103 | BQ_PROJECT, BQ_DATASET, '{{ yesterday_ds_nodash }}' 104 | ), 105 | write_disposition='WRITE_TRUNCATE', 106 | allow_large_results=True, 107 | use_legacy_sql=False, 108 | bigquery_conn_id=BQ_CONN_ID, 109 | dag=dag 110 | ) 111 | 112 | ## Task 4: aggregate past github events to daily partition table 113 | t4 = BigQueryOperator( 114 | task_id='bq_write_to_github_agg', 115 | sql=''' 116 | #standardSQL 117 | SELECT 118 | "{2}" as date, 119 | repo, 120 | SUM(stars) as stars_last_28_days, 121 | SUM(IF(_PARTITIONTIME BETWEEN TIMESTAMP("{4}") 122 | AND TIMESTAMP("{3}") , 123 | stars, null)) as stars_last_7_days, 124 | SUM(IF(_PARTITIONTIME BETWEEN TIMESTAMP("{3}") 125 | AND TIMESTAMP("{3}") , 126 | stars, null)) as stars_last_1_day, 127 | SUM(forks) as forks_last_28_days, 128 | SUM(IF(_PARTITIONTIME BETWEEN TIMESTAMP("{4}") 129 | AND TIMESTAMP("{3}") , 130 | forks, null)) as forks_last_7_days, 131 | SUM(IF(_PARTITIONTIME BETWEEN TIMESTAMP("{3}") 132 | AND TIMESTAMP("{3}") , 133 | forks, null)) as forks_last_1_day 134 | FROM 135 | `{0}.{1}.github_daily_metrics` 136 | WHERE _PARTITIONTIME BETWEEN TIMESTAMP("{5}") 137 | AND TIMESTAMP("{3}") 138 | GROUP BY 139 | date, 140 | repo 141 | '''.format(BQ_PROJECT, BQ_DATASET, 142 | "{{ yesterday_ds_nodash }}", "{{ yesterday_ds }}", 143 | "{{ macros.ds_add(ds, -6) }}", 144 | "{{ macros.ds_add(ds, -27) }}" 145 | ) 146 | , 147 | destination_dataset_table='{0}.{1}.github_agg${2}'.format( 148 | BQ_PROJECT, BQ_DATASET, '{{ yesterday_ds_nodash }}' 149 | ), 150 | write_disposition='WRITE_TRUNCATE', 151 | allow_large_results=True, 152 | use_legacy_sql=False, 153 | bigquery_conn_id=BQ_CONN_ID, 154 | dag=dag 155 | ) 156 | 157 | # Task 5: aggregate hacker news data to a daily partition table 158 | t5 = BigQueryOperator( 159 | task_id='bq_write_to_hackernews_agg', 160 | sql=''' 161 | #standardSQL 162 | SELECT 163 | FORMAT_TIMESTAMP("%Y%m%d", timestamp) AS date, 164 | `by` AS submitter, 165 | id as story_id, 166 | REGEXP_EXTRACT(url, "(https?://github.com/[^/]*/[^/#?]*)") as url, 167 | SUM(score) as score 168 | FROM 169 | `bigquery-public-data.hacker_news.full` 170 | WHERE 171 | type = 'story' 172 | AND timestamp>'{{ yesterday_ds }}' 173 | AND timestamp<'{{ ds }}' 174 | AND url LIKE '%https://github.com%' 175 | AND url NOT LIKE '%github.com/blog/%' 176 | GROUP BY 177 | date, 178 | submitter, 179 | story_id, 180 | url 181 | ''', 182 | 
destination_dataset_table='{0}.{1}.hackernews_agg${2}'.format( 183 | BQ_PROJECT, BQ_DATASET, '{{ yesterday_ds_nodash }}' 184 | ), 185 | write_disposition='WRITE_TRUNCATE', 186 | allow_large_results=True, 187 | use_legacy_sql=False, 188 | bigquery_conn_id=BQ_CONN_ID, 189 | dag=dag 190 | ) 191 | 192 | # Task 6: join the aggregate tables 193 | t6 = BigQueryOperator( 194 | task_id='bq_write_to_hackernews_github_agg', 195 | sql=''' 196 | #standardSQL 197 | SELECT 198 | a.date as date, 199 | a.url as github_url, 200 | b.repo as github_repo, 201 | a.score as hn_score, 202 | a.story_id as hn_story_id, 203 | b.stars_last_28_days as stars_last_28_days, 204 | b.stars_last_7_days as stars_last_7_days, 205 | b.stars_last_1_day as stars_last_1_day, 206 | b.forks_last_28_days as forks_last_28_days, 207 | b.forks_last_7_days as forks_last_7_days, 208 | b.forks_last_1_day as forks_last_1_day 209 | FROM 210 | (SELECT 211 | * 212 | FROM 213 | `{0}.{1}.hackernews_agg` 214 | WHERE _PARTITIONTIME BETWEEN TIMESTAMP("{2}") AND TIMESTAMP("{2}") 215 | )as a 216 | LEFT JOIN 217 | ( 218 | SELECT 219 | repo, 220 | CONCAT('https://github.com/', repo) as url, 221 | stars_last_28_days, 222 | stars_last_7_days, 223 | stars_last_1_day, 224 | forks_last_28_days, 225 | forks_last_7_days, 226 | forks_last_1_day 227 | FROM 228 | `{0}.{1}.github_agg` 229 | WHERE _PARTITIONTIME BETWEEN TIMESTAMP("{2}") AND TIMESTAMP("{2}") 230 | ) as b 231 | ON a.url = b.url 232 | '''.format( 233 | BQ_PROJECT, BQ_DATASET, "{{ yesterday_ds }}" 234 | ), 235 | destination_dataset_table='{0}.{1}.hackernews_github_agg${2}'.format( 236 | BQ_PROJECT, BQ_DATASET, '{{ yesterday_ds_nodash }}' 237 | ), 238 | write_disposition='WRITE_TRUNCATE', 239 | allow_large_results=True, 240 | use_legacy_sql=False, 241 | bigquery_conn_id=BQ_CONN_ID, 242 | dag=dag 243 | ) 244 | 245 | # Task 7: Check if partition data is written successfully 246 | t7 = BigQueryCheckOperator( 247 | task_id='bq_check_hackernews_github_agg', 248 | sql=''' 249 | #standardSQL 250 | SELECT 251 | COUNT(*) AS rows_in_partition 252 | FROM `{0}.{1}.hackernews_github_agg` 253 | WHERE _PARTITIONDATE = "{2}" 254 | '''.format(BQ_PROJECT, BQ_DATASET, '{{ yesterday_ds }}' 255 | ), 256 | use_legacy_sql=False, 257 | bigquery_conn_id=BQ_CONN_ID, 258 | dag=dag) 259 | 260 | # Setting up Dependencies 261 | t3.set_upstream(t1) 262 | t4.set_upstream(t3) 263 | t5.set_upstream(t2) 264 | t6.set_upstream(t4) 265 | t6.set_upstream(t5) 266 | t7.set_upstream(t6) 267 | -------------------------------------------------------------------------------- /notebooks/gcloud-example/github-trend-analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# GitHub on Hacker News trends analysis" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 2, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from __future__ import print_function\n", 17 | "import pandas as pd" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Input parameters" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 9, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "project_id = \"your-project-id\"\n", 34 | "process_date = \"2018-12-01\"\n", 35 | "process_date_nodash = \"20181201\"" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## Exploratory Data Analysis" 43 | ] 
44 |   },
45 |   {
46 |    "cell_type": "markdown",
47 |    "metadata": {},
48 |    "source": [
49 |     "## GitHub activity data\n",
50 |     "- Link: [Data](https://bigquery.cloud.google.com/table/githubarchive:day.20181230) - [More info](https://blog.github.com/2017-01-19-github-data-ready-for-you-to-explore-with-bigquery/)"
51 |    ]
52 |   },
53 |   {
54 |    "cell_type": "markdown",
55 |    "metadata": {},
56 |    "source": [
57 |     "### Different event types in GitHub activity\n",
58 |     "- [Event Types & Payloads](https://developer.github.com/v3/activity/events/types/) explanation"
59 |    ]
60 |   },
61 |   {
62 |    "cell_type": "code",
63 |    "execution_count": 6,
64 |    "metadata": {},
65 |    "outputs": [
66 |     {
67 |      "name": "stdout",
68 |      "output_type": "stream",
69 |      "text": [
70 |       "\n",
71 |       "SELECT \n",
72 |       "  type,\n",
73 |       "  COUNT(*) AS cnt\n",
74 |       "FROM `githubarchive.day.20181201` \n",
75 |       "GROUP BY 1\n",
76 |       "ORDER BY 2 DESC\n",
77 |       "\n"
78 |      ]
79 |     },
80 |     {
81 |      "data": {
82 |       "text/html": [
83 |       "
[DataFrame rendered as an HTML table; identical data appears in the text/plain output below]
" 179 | ], 180 | "text/plain": [ 181 | " type cnt\n", 182 | "0 PushEvent 588724\n", 183 | "1 CreateEvent 155010\n", 184 | "2 WatchEvent 67607\n", 185 | "3 PullRequestEvent 56635\n", 186 | "4 IssueCommentEvent 46972\n", 187 | "5 IssuesEvent 27592\n", 188 | "6 ForkEvent 24331\n", 189 | "7 DeleteEvent 22590\n", 190 | "8 PullRequestReviewCommentEvent 9756\n", 191 | "9 MemberEvent 5201\n", 192 | "10 GollumEvent 4445\n", 193 | "11 ReleaseEvent 3527\n", 194 | "12 CommitCommentEvent 1759\n", 195 | "13 PublicEvent 1064" 196 | ] 197 | }, 198 | "execution_count": 6, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "query = \"\"\"\n", 205 | "SELECT \n", 206 | " type,\n", 207 | " COUNT(*) AS cnt\n", 208 | "FROM `githubarchive.day.{0}` \n", 209 | "GROUP BY 1\n", 210 | "ORDER BY 2 DESC\n", 211 | "\"\"\".format(process_date_nodash)\n", 212 | "\n", 213 | "print (query)\n", 214 | "\n", 215 | "df = pd.read_gbq(query, project_id=project_id, dialect='standard')\n", 216 | "df.head(20)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "### Top 10 repos with the most comments in their issues\n", 224 | "- __IssueCommentEvent__: Triggered when an issue comment is created, edited, or deleted." 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 18, 230 | "metadata": {}, 231 | "outputs": [ 232 | { 233 | "name": "stdout", 234 | "output_type": "stream", 235 | "text": [ 236 | "\n", 237 | "SELECT \n", 238 | " repo.name,\n", 239 | " COUNT(*) AS cnt\n", 240 | "FROM `githubarchive.day.20181201`\n", 241 | "WHERE type IN ( 'IssueCommentEvent')\n", 242 | "GROUP BY 1\n", 243 | "ORDER BY 2 DESC\n", 244 | "LIMIT 10\n", 245 | "\n" 246 | ] 247 | }, 248 | { 249 | "data": { 250 | "text/html": [ 251 | "
[DataFrame rendered as an HTML table; identical data appears in the text/plain output below]
" 327 | ], 328 | "text/plain": [ 329 | " name cnt\n", 330 | "0 google-test/signcla-probe-repo 327\n", 331 | "1 Azure/azure-rest-api-specs 287\n", 332 | "2 kubernetes/kubernetes 227\n", 333 | "3 rust-lang/rust 207\n", 334 | "4 apache/spark 204\n", 335 | "5 freeCodeCamp/freeCodeCamp 196\n", 336 | "6 everypolitician/everypolitician-data 192\n", 337 | "7 TeamNewPipe/NewPipe 158\n", 338 | "8 openshift/origin 140\n", 339 | "9 NixOS/nixpkgs 126" 340 | ] 341 | }, 342 | "execution_count": 18, 343 | "metadata": {}, 344 | "output_type": "execute_result" 345 | } 346 | ], 347 | "source": [ 348 | "query = \"\"\"\n", 349 | "SELECT \n", 350 | " repo.name,\n", 351 | " COUNT(*) AS cnt\n", 352 | "FROM `githubarchive.day.{0}`\n", 353 | "WHERE type IN ( 'IssueCommentEvent')\n", 354 | "GROUP BY 1\n", 355 | "ORDER BY 2 DESC\n", 356 | "LIMIT 10\n", 357 | "\"\"\".format(process_date_nodash)\n", 358 | "\n", 359 | "print (query)\n", 360 | "\n", 361 | "df = pd.read_gbq(query, project_id=project_id, dialect='standard')\n", 362 | "df.head(20)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "### Top 10 repos by stars and fork event" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 8, 375 | "metadata": {}, 376 | "outputs": [ 377 | { 378 | "name": "stdout", 379 | "output_type": "stream", 380 | "text": [ 381 | "\n", 382 | "SELECT \n", 383 | " repo.name,\n", 384 | " SUM(IF(type='WatchEvent', 1, NULL)) AS stars,\n", 385 | " SUM(IF(type='ForkEvent', 1, NULL)) AS forks,\n", 386 | " COUNT(*) AS cnt\n", 387 | "FROM `githubarchive.day.20181201`\n", 388 | "WHERE type IN ('WatchEvent','ForkEvent')\n", 389 | "GROUP BY 1\n", 390 | "ORDER BY 2 DESC\n", 391 | "LIMIT 10\n", 392 | "\n" 393 | ] 394 | }, 395 | { 396 | "data": { 397 | "text/html": [ 398 | "
[DataFrame rendered as an HTML table; identical data appears in the text/plain output below]
" 496 | ], 497 | "text/plain": [ 498 | " name stars forks cnt\n", 499 | "0 BcRikko/NES.css 386 35 421\n", 500 | "1 leisurelicht/wtfpython-cn 241 31 272\n", 501 | "2 satwikkansal/wtfpython 190 30 220\n", 502 | "3 cssanimation/css-animation-101 178 5 183\n", 503 | "4 firecracker-microvm/firecracker 150 13 163\n", 504 | "5 crazyandcoder/kindle_free_books 132 31 163\n", 505 | "6 withspectrum/spectrum 132 9 141\n", 506 | "7 afshinea/stanford-cs-230-deep-learning 120 17 137\n", 507 | "8 algorithm-visualizer/algorithm-visualizer 119 15 134\n", 508 | "9 olifolkerd/tabulator 114 3 117" 509 | ] 510 | }, 511 | "execution_count": 8, 512 | "metadata": {}, 513 | "output_type": "execute_result" 514 | } 515 | ], 516 | "source": [ 517 | "query = \"\"\"\n", 518 | "SELECT \n", 519 | " repo.name,\n", 520 | " SUM(IF(type='WatchEvent', 1, NULL)) AS stars,\n", 521 | " SUM(IF(type='ForkEvent', 1, NULL)) AS forks,\n", 522 | " COUNT(*) AS cnt\n", 523 | "FROM `githubarchive.day.{0}`\n", 524 | "WHERE type IN ('WatchEvent','ForkEvent')\n", 525 | "GROUP BY 1\n", 526 | "ORDER BY 2 DESC\n", 527 | "LIMIT 10\n", 528 | "\"\"\".format(process_date_nodash)\n", 529 | "\n", 530 | "print (query)\n", 531 | "\n", 532 | "df = pd.read_gbq(query, project_id=project_id, dialect='standard')\n", 533 | "df.head(20)" 534 | ] 535 | }, 536 | { 537 | "cell_type": "markdown", 538 | "metadata": {}, 539 | "source": [ 540 | "## Hacker News data\n", 541 | "- Link: [Data](https://bigquery.cloud.google.com/table/bigquery-public-data:hacker_news.full) - [More info](https://medium.com/@hoffa/hacker-news-on-bigquery-now-with-daily-updates-so-what-are-the-top-domains-963d3c68b2e2)" 542 | ] 543 | }, 544 | { 545 | "cell_type": "markdown", 546 | "metadata": {}, 547 | "source": [ 548 | "### Top domains shared in Hacker News\n", 549 | "- Domain with higher score are more likely to make it to the front page.\n", 550 | "- __nytimes__ has the highest average score." 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": 12, 556 | "metadata": {}, 557 | "outputs": [ 558 | { 559 | "name": "stdout", 560 | "output_type": "stream", 561 | "text": [ 562 | "\n", 563 | "SELECT \n", 564 | " REGEXP_EXTRACT(url, '//([^/]*)/?') as domain,\n", 565 | " AVG(score) as avg_score,\n", 566 | " COUNT(*) AS cnt\n", 567 | "FROM `bigquery-public-data.hacker_news.full`\n", 568 | "WHERE url!='' \n", 569 | "AND EXTRACT(DATE FROM timestamp)=\"2018-12-01\"\n", 570 | "GROUP BY 1\n", 571 | "ORDER BY 3 DESC \n", 572 | "LIMIT 10\n", 573 | "\n" 574 | ] 575 | }, 576 | { 577 | "data": { 578 | "text/html": [ 579 | "
[DataFrame rendered as an HTML table; identical data appears in the text/plain output below]
" 666 | ], 667 | "text/plain": [ 668 | " domain avg_score cnt\n", 669 | "0 github.com 14.966667 30\n", 670 | "1 medium.com 15.592593 27\n", 671 | "2 www.youtube.com 12.666667 24\n", 672 | "3 www.nytimes.com 41.263158 19\n", 673 | "4 venturebeat.com 2.100000 10\n", 674 | "5 www.reddit.com 21.428571 7\n", 675 | "6 www.theguardian.com 31.166667 6\n", 676 | "7 en.wikipedia.org 15.833333 6\n", 677 | "8 arstechnica.com 22.666667 6\n", 678 | "9 www.theverge.com 2.200000 5" 679 | ] 680 | }, 681 | "execution_count": 12, 682 | "metadata": {}, 683 | "output_type": "execute_result" 684 | } 685 | ], 686 | "source": [ 687 | "query = \"\"\"\n", 688 | "SELECT \n", 689 | " REGEXP_EXTRACT(url, '//([^/]*)/?') as domain,\n", 690 | " AVG(score) as avg_score,\n", 691 | " COUNT(*) AS cnt\n", 692 | "FROM `bigquery-public-data.hacker_news.full`\n", 693 | "WHERE url!='' \n", 694 | "AND EXTRACT(DATE FROM timestamp)=\"{0}\"\n", 695 | "GROUP BY 1\n", 696 | "ORDER BY 3 DESC \n", 697 | "LIMIT 10\n", 698 | "\"\"\".format(process_date)\n", 699 | "\n", 700 | "print (query)\n", 701 | "\n", 702 | "df = pd.read_gbq(query, project_id=project_id, dialect='standard')\n", 703 | "df.head(20)" 704 | ] 705 | }, 706 | { 707 | "cell_type": "markdown", 708 | "metadata": {}, 709 | "source": [ 710 | "### What domains have the best chance of getting more than 40 upvotes?\n", 711 | "- Certainly Hacker News likes content hosted on sites like github.com and the nytimes." 712 | ] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "execution_count": 11, 717 | "metadata": {}, 718 | "outputs": [ 719 | { 720 | "name": "stdout", 721 | "output_type": "stream", 722 | "text": [ 723 | "\n", 724 | "SELECT \n", 725 | " REGEXP_EXTRACT(url, '//([^/]*)/?') as domain,\n", 726 | " COUNTIF(score>40) as score_gt_40,\n", 727 | " COUNT(*) AS cnt\n", 728 | "FROM `bigquery-public-data.hacker_news.full`\n", 729 | "WHERE url!='' \n", 730 | "AND EXTRACT(DATE FROM timestamp)=\"2018-12-01\"\n", 731 | "GROUP BY 1\n", 732 | "ORDER BY 2 DESC \n", 733 | "LIMIT 10\n", 734 | "\n" 735 | ] 736 | }, 737 | { 738 | "data": { 739 | "text/html": [ 740 | "
[DataFrame rendered as an HTML table; identical data appears in the text/plain output below]
" 827 | ], 828 | "text/plain": [ 829 | " domain score_gt_40 cnt\n", 830 | "0 www.nytimes.com 4 19\n", 831 | "1 github.com 4 30\n", 832 | "2 medium.com 3 27\n", 833 | "3 www.wsj.com 2 4\n", 834 | "4 www.theatlantic.com 2 5\n", 835 | "5 www.youtube.com 2 24\n", 836 | "6 www.jamiefuller.com 1 1\n", 837 | "7 arstechnica.com 1 6\n", 838 | "8 www.vulture.com 1 2\n", 839 | "9 www.newsshooter.com 1 1" 840 | ] 841 | }, 842 | "execution_count": 11, 843 | "metadata": {}, 844 | "output_type": "execute_result" 845 | } 846 | ], 847 | "source": [ 848 | "query = \"\"\"\n", 849 | "SELECT \n", 850 | " REGEXP_EXTRACT(url, '//([^/]*)/?') as domain,\n", 851 | " COUNTIF(score>40) as score_gt_40,\n", 852 | " COUNT(*) AS cnt\n", 853 | "FROM `bigquery-public-data.hacker_news.full`\n", 854 | "WHERE url!='' \n", 855 | "AND EXTRACT(DATE FROM timestamp)=\"{0}\"\n", 856 | "GROUP BY 1\n", 857 | "ORDER BY 2 DESC \n", 858 | "LIMIT 10\n", 859 | "\"\"\".format(process_date)\n", 860 | "\n", 861 | "print (query)\n", 862 | "\n", 863 | "df = pd.read_gbq(query, project_id=project_id, dialect='standard')\n", 864 | "df.head(20)" 865 | ] 866 | }, 867 | { 868 | "cell_type": "markdown", 869 | "metadata": {}, 870 | "source": [ 871 | "### Top 10 Hacker news stories from Github by highest score" 872 | ] 873 | }, 874 | { 875 | "cell_type": "code", 876 | "execution_count": 17, 877 | "metadata": {}, 878 | "outputs": [ 879 | { 880 | "name": "stdout", 881 | "output_type": "stream", 882 | "text": [ 883 | "\n", 884 | "SELECT \n", 885 | " `by` AS submitter,\n", 886 | " id as story_id,\n", 887 | " REGEXP_EXTRACT(url, \"(https?://github.com/[^/]*/[^/#?]*)\") as url,\n", 888 | " SUM(score) as score\n", 889 | "FROM\n", 890 | " `bigquery-public-data.hacker_news.full`\n", 891 | "WHERE\n", 892 | " type = 'story'\n", 893 | " AND EXTRACT(DATE FROM timestamp)='2018-12-01' \n", 894 | " AND url LIKE '%https://github.com%'\n", 895 | " AND url NOT LIKE '%github.com/blog/%'\n", 896 | "GROUP BY \n", 897 | " submitter,\n", 898 | " story_id,\n", 899 | " url\n", 900 | "ORDER BY score DESC\n", 901 | "\n" 902 | ] 903 | }, 904 | { 905 | "data": { 906 | "text/html": [ 907 | "
\n", 908 | "\n", 921 | "\n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | "
submitterstory_idurlscore
0ithinco18574181https://github.com/ithinco/i-am-chinese-the-dr...129
1mountainview18576170https://github.com/YugaByte/yugabyte-db115
2oxplot18575094https://github.com/oxplot/pdftilecut64
3codeadict18574683https://github.com/alertlogic/erllambda64
4pjmlp18575802https://github.com/chocolatey/boxstarter9
5snek18577658https://github.com/devsnek/engine2628
6delvincasper18577036https://github.com/jerverless/jerverless4
7andrewchaa18574107https://github.com/andrewchaa/functional.pipe4
8anmonteiro9018578964https://github.com/anmonteiro/aws-lambda-ocaml...4
9KumarAbhirup18577887https://github.com/KumarAbhirup/bulk-mail-cli4
\n", 1004 | "
" 1005 | ], 1006 | "text/plain": [ 1007 | " submitter story_id url \\\n", 1008 | "0 ithinco 18574181 https://github.com/ithinco/i-am-chinese-the-dr... \n", 1009 | "1 mountainview 18576170 https://github.com/YugaByte/yugabyte-db \n", 1010 | "2 oxplot 18575094 https://github.com/oxplot/pdftilecut \n", 1011 | "3 codeadict 18574683 https://github.com/alertlogic/erllambda \n", 1012 | "4 pjmlp 18575802 https://github.com/chocolatey/boxstarter \n", 1013 | "5 snek 18577658 https://github.com/devsnek/engine262 \n", 1014 | "6 delvincasper 18577036 https://github.com/jerverless/jerverless \n", 1015 | "7 andrewchaa 18574107 https://github.com/andrewchaa/functional.pipe \n", 1016 | "8 anmonteiro90 18578964 https://github.com/anmonteiro/aws-lambda-ocaml... \n", 1017 | "9 KumarAbhirup 18577887 https://github.com/KumarAbhirup/bulk-mail-cli \n", 1018 | "\n", 1019 | " score \n", 1020 | "0 129 \n", 1021 | "1 115 \n", 1022 | "2 64 \n", 1023 | "3 64 \n", 1024 | "4 9 \n", 1025 | "5 8 \n", 1026 | "6 4 \n", 1027 | "7 4 \n", 1028 | "8 4 \n", 1029 | "9 4 " 1030 | ] 1031 | }, 1032 | "execution_count": 17, 1033 | "metadata": {}, 1034 | "output_type": "execute_result" 1035 | } 1036 | ], 1037 | "source": [ 1038 | "query = \"\"\"\n", 1039 | "SELECT \n", 1040 | " `by` AS submitter,\n", 1041 | " id as story_id,\n", 1042 | " REGEXP_EXTRACT(url, \"(https?://github.com/[^/]*/[^/#?]*)\") as url,\n", 1043 | " SUM(score) as score\n", 1044 | "FROM\n", 1045 | " `bigquery-public-data.hacker_news.full`\n", 1046 | "WHERE\n", 1047 | " type = 'story'\n", 1048 | " AND EXTRACT(DATE FROM timestamp)='{0}' \n", 1049 | " AND url LIKE '%https://github.com%'\n", 1050 | " AND url NOT LIKE '%github.com/blog/%'\n", 1051 | "GROUP BY \n", 1052 | " submitter,\n", 1053 | " story_id,\n", 1054 | " url\n", 1055 | "ORDER BY score DESC\n", 1056 | "\"\"\".format(process_date)\n", 1057 | "\n", 1058 | "print (query)\n", 1059 | "\n", 1060 | "df = pd.read_gbq(query, project_id=project_id, dialect='standard')\n", 1061 | "df.head(10)" 1062 | ] 1063 | }, 1064 | { 1065 | "cell_type": "markdown", 1066 | "metadata": {}, 1067 | "source": [ 1068 | "## Example Final table: GitHub on Hacker News Trends of 2018-12-01" 1069 | ] 1070 | }, 1071 | { 1072 | "cell_type": "code", 1073 | "execution_count": 20, 1074 | "metadata": {}, 1075 | "outputs": [ 1076 | { 1077 | "name": "stdout", 1078 | "output_type": "stream", 1079 | "text": [ 1080 | "\n", 1081 | "WITH github_activity AS (\n", 1082 | "SELECT \n", 1083 | " repo.name as repo,\n", 1084 | " CONCAT('https://github.com/', repo.name) as url,\n", 1085 | " SUM(IF(type='WatchEvent', 1, NULL)) AS stars,\n", 1086 | " SUM(IF(type='ForkEvent', 1, NULL)) AS forks,\n", 1087 | " COUNT(*) AS cnt\n", 1088 | "FROM `githubarchive.day.20181201`\n", 1089 | "WHERE type IN ('WatchEvent','ForkEvent')\n", 1090 | "GROUP BY 1,2\n", 1091 | "),\n", 1092 | "hacker_news AS (\n", 1093 | "SELECT\n", 1094 | " EXTRACT(DATE FROM timestamp) as date,\n", 1095 | " `by` AS submitter,\n", 1096 | " id as story_id,\n", 1097 | " REGEXP_EXTRACT(url, \"(https?://github.com/[^/]*/[^/#?]*)\") as url,\n", 1098 | " SUM(score) as score\n", 1099 | "FROM\n", 1100 | " `bigquery-public-data.hacker_news.full`\n", 1101 | "WHERE\n", 1102 | " type = 'story'\n", 1103 | " AND EXTRACT(DATE FROM timestamp)='2018-12-01' \n", 1104 | " AND url LIKE '%https://github.com%'\n", 1105 | " AND url NOT LIKE '%github.com/blog/%'\n", 1106 | "GROUP BY 1,2,3,4\n", 1107 | ")\n", 1108 | "\n", 1109 | "SELECT\n", 1110 | " a.date as date,\n", 1111 | " a.url as github_url,\n", 1112 | " b.repo as 
github_repo,\n", 1113 | " a.score as hn_score,\n", 1114 | " a.story_id as hn_story_id,\n", 1115 | " b.stars as stars,\n", 1116 | " b.forks as forks\n", 1117 | "FROM hacker_news as a\n", 1118 | "LEFT JOIN github_activity as b\n", 1119 | "ON a.url=b.url\n", 1120 | "ORDER BY hn_score DESC\n", 1121 | "LIMIT 10\n", 1122 | "\n" 1123 | ] 1124 | }, 1125 | { 1126 | "data": { 1127 | "text/html": [ 1128 | "
\n", 1129 | "\n", 1142 | "\n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | "
dategithub_urlgithub_repohn_scorehn_story_idstarsforks
02018-12-01https://github.com/ithinco/i-am-chinese-the-dr...ithinco/i-am-chinese-the-dragonfly-must-go-on1291857418160.01.0
12018-12-01https://github.com/YugaByte/yugabyte-dbYugaByte/yugabyte-db115185761702.0NaN
22018-12-01https://github.com/alertlogic/erllambdaalertlogic/erllambda641857468348.0NaN
32018-12-01https://github.com/oxplot/pdftilecutoxplot/pdftilecut641857509491.0NaN
42018-12-01https://github.com/chocolatey/boxstarterchocolatey/boxstarter9185758021.0NaN
52018-12-01https://github.com/devsnek/engine262devsnek/engine2628185776581.0NaN
62018-12-01https://github.com/andrewchaa/functional.pipeandrewchaa/functional.pipe4185741072.0NaN
72018-12-01https://github.com/anmonteiro/aws-lambda-ocaml...anmonteiro/aws-lambda-ocaml-runtime4185789645.0NaN
82018-12-01https://github.com/KumarAbhirup/bulk-mail-cliNone418577887NaNNaN
92018-12-01https://github.com/jerverless/jerverlessNone418577036NaNNaN
\n", 1258 | "
" 1259 | ], 1260 | "text/plain": [ 1261 | " date github_url \\\n", 1262 | "0 2018-12-01 https://github.com/ithinco/i-am-chinese-the-dr... \n", 1263 | "1 2018-12-01 https://github.com/YugaByte/yugabyte-db \n", 1264 | "2 2018-12-01 https://github.com/alertlogic/erllambda \n", 1265 | "3 2018-12-01 https://github.com/oxplot/pdftilecut \n", 1266 | "4 2018-12-01 https://github.com/chocolatey/boxstarter \n", 1267 | "5 2018-12-01 https://github.com/devsnek/engine262 \n", 1268 | "6 2018-12-01 https://github.com/andrewchaa/functional.pipe \n", 1269 | "7 2018-12-01 https://github.com/anmonteiro/aws-lambda-ocaml... \n", 1270 | "8 2018-12-01 https://github.com/KumarAbhirup/bulk-mail-cli \n", 1271 | "9 2018-12-01 https://github.com/jerverless/jerverless \n", 1272 | "\n", 1273 | " github_repo hn_score hn_story_id \\\n", 1274 | "0 ithinco/i-am-chinese-the-dragonfly-must-go-on 129 18574181 \n", 1275 | "1 YugaByte/yugabyte-db 115 18576170 \n", 1276 | "2 alertlogic/erllambda 64 18574683 \n", 1277 | "3 oxplot/pdftilecut 64 18575094 \n", 1278 | "4 chocolatey/boxstarter 9 18575802 \n", 1279 | "5 devsnek/engine262 8 18577658 \n", 1280 | "6 andrewchaa/functional.pipe 4 18574107 \n", 1281 | "7 anmonteiro/aws-lambda-ocaml-runtime 4 18578964 \n", 1282 | "8 None 4 18577887 \n", 1283 | "9 None 4 18577036 \n", 1284 | "\n", 1285 | " stars forks \n", 1286 | "0 60.0 1.0 \n", 1287 | "1 2.0 NaN \n", 1288 | "2 48.0 NaN \n", 1289 | "3 91.0 NaN \n", 1290 | "4 1.0 NaN \n", 1291 | "5 1.0 NaN \n", 1292 | "6 2.0 NaN \n", 1293 | "7 5.0 NaN \n", 1294 | "8 NaN NaN \n", 1295 | "9 NaN NaN " 1296 | ] 1297 | }, 1298 | "execution_count": 20, 1299 | "metadata": {}, 1300 | "output_type": "execute_result" 1301 | } 1302 | ], 1303 | "source": [ 1304 | "query = \"\"\"\n", 1305 | "WITH github_activity AS (\n", 1306 | "SELECT \n", 1307 | " repo.name as repo,\n", 1308 | " CONCAT('https://github.com/', repo.name) as url,\n", 1309 | " SUM(IF(type='WatchEvent', 1, NULL)) AS stars,\n", 1310 | " SUM(IF(type='ForkEvent', 1, NULL)) AS forks,\n", 1311 | " COUNT(*) AS cnt\n", 1312 | "FROM `githubarchive.day.{0}`\n", 1313 | "WHERE type IN ('WatchEvent','ForkEvent')\n", 1314 | "GROUP BY 1,2\n", 1315 | "),\n", 1316 | "hacker_news AS (\n", 1317 | "SELECT\n", 1318 | " EXTRACT(DATE FROM timestamp) as date,\n", 1319 | " `by` AS submitter,\n", 1320 | " id as story_id,\n", 1321 | " REGEXP_EXTRACT(url, \"(https?://github.com/[^/]*/[^/#?]*)\") as url,\n", 1322 | " SUM(score) as score\n", 1323 | "FROM\n", 1324 | " `bigquery-public-data.hacker_news.full`\n", 1325 | "WHERE\n", 1326 | " type = 'story'\n", 1327 | " AND EXTRACT(DATE FROM timestamp)='{1}' \n", 1328 | " AND url LIKE '%https://github.com%'\n", 1329 | " AND url NOT LIKE '%github.com/blog/%'\n", 1330 | "GROUP BY 1,2,3,4\n", 1331 | ")\n", 1332 | "\n", 1333 | "SELECT\n", 1334 | " a.date as date,\n", 1335 | " a.url as github_url,\n", 1336 | " b.repo as github_repo,\n", 1337 | " a.score as hn_score,\n", 1338 | " a.story_id as hn_story_id,\n", 1339 | " b.stars as stars,\n", 1340 | " b.forks as forks\n", 1341 | "FROM hacker_news as a\n", 1342 | "LEFT JOIN github_activity as b\n", 1343 | "ON a.url=b.url\n", 1344 | "ORDER BY hn_score DESC\n", 1345 | "LIMIT 10\n", 1346 | "\"\"\".format(process_date_nodash, process_date)\n", 1347 | "\n", 1348 | "print (query)\n", 1349 | "\n", 1350 | "df = pd.read_gbq(query, project_id=project_id, dialect='standard')\n", 1351 | "df.head(10)" 1352 | ] 1353 | }, 1354 | { 1355 | "cell_type": "markdown", 1356 | "metadata": {}, 1357 | "source": [ 1358 | "## Python PyPI stats\n", 1359 | "- The 
Python Software Foundation provides the raw logs of Python installation activities\n", 1360 | "- Link: [Data](https://bigquery.cloud.google.com/table/the-psf:pypi.downloads20181230) - [More info](https://packaging.python.org/guides/analyzing-pypi-package-downloads/)\n", 1361 | "\n", 1362 | "__Challenge__: \n", 1363 | "- Find the associated GitHub stars, fork events, and Hacker News stories for the top downloaded Python packages from pip" 1364 | ] 1365 | }, 1366 | { 1367 | "cell_type": "markdown", 1368 | "metadata": {}, 1369 | "source": [ 1370 | "### Top 10 downloaded packages from pip" 1371 | ] 1372 | }, 1373 | { 1374 | "cell_type": "code", 1375 | "execution_count": 15, 1376 | "metadata": {}, 1377 | "outputs": [ 1378 | { 1379 | "name": "stdout", 1380 | "output_type": "stream", 1381 | "text": [ 1382 | "\n", 1383 | "SELECT \n", 1384 | "  TIMESTAMP_TRUNC(timestamp, DAY) as day,\n", 1385 | "  file.project as project,\n", 1386 | "  COUNT(*) as downloads\n", 1387 | "FROM `the-psf.pypi.downloads20181201`\n", 1388 | "WHERE details.installer.name = 'pip'\n", 1389 | "GROUP BY 1,2\n", 1390 | "ORDER BY 3 desc\n", 1391 | "\n" 1392 | ] 1393 | }, 1394 | { 1395 | "data": { 1396 | "text/html": [ 1397 | "
\n", 1398 | "\n", 1411 | "\n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | "
dayprojectdownloads
02018-12-01pip1562226
12018-12-01urllib31271997
22018-12-01botocore1069194
32018-12-01six966172
42018-12-01python-dateutil946327
52018-12-01s3transfer877832
62018-12-01docutils813135
72018-12-01pyyaml796706
82018-12-01pyasn1782540
92018-12-01jmespath772065
\n", 1483 | "
" 1484 | ], 1485 | "text/plain": [ 1486 | " day project downloads\n", 1487 | "0 2018-12-01 pip 1562226\n", 1488 | "1 2018-12-01 urllib3 1271997\n", 1489 | "2 2018-12-01 botocore 1069194\n", 1490 | "3 2018-12-01 six 966172\n", 1491 | "4 2018-12-01 python-dateutil 946327\n", 1492 | "5 2018-12-01 s3transfer 877832\n", 1493 | "6 2018-12-01 docutils 813135\n", 1494 | "7 2018-12-01 pyyaml 796706\n", 1495 | "8 2018-12-01 pyasn1 782540\n", 1496 | "9 2018-12-01 jmespath 772065" 1497 | ] 1498 | }, 1499 | "execution_count": 15, 1500 | "metadata": {}, 1501 | "output_type": "execute_result" 1502 | } 1503 | ], 1504 | "source": [ 1505 | "query = \"\"\"\n", 1506 | "SELECT \n", 1507 | " TIMESTAMP_TRUNC(timestamp, DAY) as day,\n", 1508 | " file.project as project,\n", 1509 | " COUNT(*) as downloads\n", 1510 | "FROM `the-psf.pypi.downloads{0}`\n", 1511 | "WHERE details.installer.name = 'pip'\n", 1512 | "GROUP BY 1,2\n", 1513 | "ORDER BY 3 desc\n", 1514 | "\"\"\".format(process_date_nodash)\n", 1515 | "\n", 1516 | "print (query)\n", 1517 | "\n", 1518 | "df = pd.read_gbq(query, project_id=project_id, dialect='standard')\n", 1519 | "df.head(10)" 1520 | ] 1521 | }, 1522 | { 1523 | "cell_type": "markdown", 1524 | "metadata": {}, 1525 | "source": [ 1526 | "### What is the number stars and fork event for botocore?" 1527 | ] 1528 | }, 1529 | { 1530 | "cell_type": "code", 1531 | "execution_count": 19, 1532 | "metadata": {}, 1533 | "outputs": [ 1534 | { 1535 | "name": "stdout", 1536 | "output_type": "stream", 1537 | "text": [ 1538 | "\n", 1539 | "SELECT \n", 1540 | " repo.name,\n", 1541 | " SUM(IF(type='WatchEvent', 1, NULL)) AS stars,\n", 1542 | " SUM(IF(type='ForkEvent', 1, NULL)) AS forks,\n", 1543 | " COUNT(*) AS cnt\n", 1544 | "FROM `githubarchive.day.20181201`\n", 1545 | "WHERE type IN ('WatchEvent','ForkEvent')\n", 1546 | "AND repo.name LIKE \"%botocore%\"\n", 1547 | "GROUP BY 1\n", 1548 | "ORDER BY 2 DESC\n", 1549 | "LIMIT 10\n", 1550 | "\n" 1551 | ] 1552 | }, 1553 | { 1554 | "data": { 1555 | "text/html": [ 1556 | "
\n", 1557 | "\n", 1570 | "\n", 1571 | " \n", 1572 | " \n", 1573 | " \n", 1574 | " \n", 1575 | " \n", 1576 | " \n", 1577 | " \n", 1578 | " \n", 1579 | " \n", 1580 | " \n", 1581 | " \n", 1582 | " \n", 1583 | " \n", 1584 | " \n", 1585 | " \n", 1586 | " \n", 1587 | " \n", 1588 | " \n", 1589 | "
namestarsforkscnt
0boto/botocore213
\n", 1590 | "
" 1591 | ], 1592 | "text/plain": [ 1593 | " name stars forks cnt\n", 1594 | "0 boto/botocore 2 1 3" 1595 | ] 1596 | }, 1597 | "execution_count": 19, 1598 | "metadata": {}, 1599 | "output_type": "execute_result" 1600 | } 1601 | ], 1602 | "source": [ 1603 | "query = \"\"\"\n", 1604 | "SELECT \n", 1605 | " repo.name,\n", 1606 | " SUM(IF(type='WatchEvent', 1, NULL)) AS stars,\n", 1607 | " SUM(IF(type='ForkEvent', 1, NULL)) AS forks,\n", 1608 | " COUNT(*) AS cnt\n", 1609 | "FROM `githubarchive.day.{0}`\n", 1610 | "WHERE type IN ('WatchEvent','ForkEvent')\n", 1611 | "AND repo.name LIKE \"%botocore%\"\n", 1612 | "GROUP BY 1\n", 1613 | "ORDER BY 2 DESC\n", 1614 | "LIMIT 10\n", 1615 | "\"\"\".format(process_date_nodash)\n", 1616 | "\n", 1617 | "print (query)\n", 1618 | "\n", 1619 | "df = pd.read_gbq(query, project_id=project_id, dialect='standard')\n", 1620 | "df.head(20)" 1621 | ] 1622 | }, 1623 | { 1624 | "cell_type": "markdown", 1625 | "metadata": {}, 1626 | "source": [ 1627 | "## Resources\n", 1628 | "- [GitHub data, ready for you to explore with BigQuery](https://blog.github.com/2017-01-19-github-data-ready-for-you-to-explore-with-bigquery/)\n", 1629 | "- [Hacker News on BigQuery](https://medium.com/@hoffa/hacker-news-on-bigquery-now-with-daily-updates-so-what-are-the-top-domains-963d3c68b2e2)\n", 1630 | "- [Analyzing PyPI package downloads](https://packaging.python.org/guides/analyzing-pypi-package-downloads/)" 1631 | ] 1632 | } 1633 | ], 1634 | "metadata": { 1635 | "kernelspec": { 1636 | "display_name": "Python 3", 1637 | "language": "python", 1638 | "name": "python3" 1639 | }, 1640 | "language_info": { 1641 | "codemirror_mode": { 1642 | "name": "ipython", 1643 | "version": 3 1644 | }, 1645 | "file_extension": ".py", 1646 | "mimetype": "text/x-python", 1647 | "name": "python", 1648 | "nbconvert_exporter": "python", 1649 | "pygments_lexer": "ipython3", 1650 | "version": "3.6.7" 1651 | }, 1652 | "toc": { 1653 | "base_numbering": 1, 1654 | "nav_menu": {}, 1655 | "number_sections": false, 1656 | "sideBar": true, 1657 | "skip_h1_title": false, 1658 | "title_cell": "Table of Contents", 1659 | "title_sidebar": "Contents", 1660 | "toc_cell": false, 1661 | "toc_position": {}, 1662 | "toc_section_display": true, 1663 | "toc_window_display": true 1664 | } 1665 | }, 1666 | "nbformat": 4, 1667 | "nbformat_minor": 2 1668 | } 1669 | --------------------------------------------------------------------------------