├── .editorconfig ├── README.md ├── appendix ├── dags │ ├── 01.airflow-example.py │ └── 02.simple-elt-bigquery.py └── data │ ├── bike_data_20191125.csv │ ├── bike_data_20191126.csv │ ├── bike_data_20191127.csv │ ├── bike_data_20191128.csv │ └── bike_schema.json ├── bigquery_cover.jpg ├── blogs ├── bigquery_backup │ ├── README.md │ ├── bigquery_backup.py │ ├── bigquery_restore.py │ └── helper_utils.py ├── bqml_arima │ ├── README.md │ └── bqml_arima.ipynb ├── bqml_model_export │ ├── README.md │ ├── deploy.sh │ ├── input.json │ └── queries.sql ├── bqml_recommendations │ ├── README.md │ ├── bqml_ga360.ipynb │ ├── create_table.sql │ ├── predict.sql │ └── train.sql ├── dbt_load │ ├── .gitignore │ ├── README.md │ ├── college-scorecard │ │ ├── .gitignore │ │ ├── README.md │ │ ├── dbt_project.yml │ │ ├── macros │ │ │ └── cleanup_numeric.sql │ │ └── models │ │ │ ├── college_scorecard.sql │ │ │ ├── selective_firstgen.sql │ │ │ └── selective_firstgen_top10.sql │ ├── load_external_gcs.sh │ ├── profiles.yml │ └── setup.sh ├── flex_slots │ └── run_query_on_flex_slots.sh ├── graphdb │ ├── README.md │ └── find_routes.scala └── xmlload │ ├── orders.xml │ ├── xmlload.ipynb │ └── xmlload.py ├── ch01 ├── 01.sql ├── 02.sql └── 03.sql ├── ch02 ├── 01.sql ├── 02.sql ├── 03.sql ├── 04.sql ├── 05.sql ├── 06.sql ├── 07.sql ├── 08.sql ├── 09.sql ├── 10.sql ├── 11.sql ├── 12.sql ├── 13.sql ├── 14.sql ├── 15.sql ├── 16.sql ├── 17.sql ├── 18.sql ├── 19.sql ├── 20.sql ├── 21.sql ├── 22.sql ├── 23.sql ├── 24.sql ├── 25.sql ├── 26.sql ├── 27.sql ├── 28.sql ├── 29.sql ├── 30.sql ├── 31.sql ├── 32.sql ├── 33.sql ├── 34.sql ├── 35.sql ├── 36.sql ├── 37.sql ├── 38.sql ├── 39.sql ├── 40.sql └── 41.sql ├── ch03 ├── 01.sql ├── 02.sql ├── 03.sql ├── 04.sql ├── 05.sql ├── 06.sql ├── 07.sql ├── 08.sql ├── 09.sql ├── 10.sql ├── 11.sql ├── 12.sql ├── 13.sql ├── 14.sql ├── 15.sql ├── 16.sql ├── 17.sql ├── 18.sql ├── 19.sql ├── 20.sql ├── 21.sql ├── 22.sql ├── 23.sql ├── 24.sql ├── 25.sql ├── 26.sql ├── 27.sql ├── 28.sql ├── 29.sql ├── 30.sql ├── 31.sql ├── 32.sql ├── 33.sql └── 34.sql ├── ch04 ├── 01.sql ├── 02.sh ├── 03.sh ├── 04.sh ├── 05.sh ├── 06.sh ├── 07.sql ├── 08.sql ├── 09.sh ├── 10.sh ├── 11.sh ├── 12.sql ├── 13.sql ├── 14.sh ├── 15.sql ├── 16.sql ├── 17.sh ├── 18.sql ├── 19.sql ├── 20.sql ├── 21.sql ├── 22.sql ├── 23.sh ├── 24.sh ├── 25.sh ├── 26.sh ├── 27.sh ├── 28.sh ├── 29.sh ├── 30.sh ├── 31.sh ├── 32.sh ├── 33.sh ├── 34.sh ├── 35.sql ├── 36.sql ├── 37.sql ├── 38.sql ├── 39.sql ├── 40.sh ├── 41.sql ├── 42.sql ├── 43.sql ├── 44.sql ├── 45.sql ├── 46.sql ├── 47.sql ├── 48.sql ├── 49.sql ├── 50.sql ├── 51.sh ├── 52.sql ├── 53.sql ├── 54.py ├── 55.py ├── 56.py ├── 57.py ├── 58.sh ├── bigtable │ ├── delete_instance.sh │ └── setup_data.sh ├── college_scorecard.csv.gz ├── dataflow.ipynb ├── load_external_gcs.sh ├── load_from.gcs.sh ├── load_from_gcs.sh ├── load_from_local.sh ├── queries.txt ├── query_temp_table.sh ├── schema.json ├── setup_data_transfer.sh ├── sheets_data.csv └── students.csv ├── ch05 ├── 01.sh ├── 02.sql ├── 03.sh ├── 04.sh ├── 05.json ├── 06.json ├── 07.sh ├── 08.py ├── 09.py ├── 10.py ├── 11.py ├── 12.py ├── 13.py ├── 14.py ├── 15.py ├── 16.py ├── 17.py ├── 18.py ├── 19.py ├── 20.py ├── 21.py ├── 22.py ├── 23.py ├── 24.sh ├── 25.py ├── 26.py ├── 27.py ├── 28.py ├── 29.py ├── 30.py ├── 31.py ├── 32.py ├── 33.py ├── 34.py ├── 35.py ├── 36.py ├── 37.py ├── 38.py ├── 39.py ├── 40.py ├── 41.py ├── 42.py ├── 43.py ├── 44.py ├── 45.py ├── 46.py ├── 47.py ├── 48.py ├── 49.py ├── 50.py ├── 51.py ├── 52.sh ├── 53.sh 
├── 54.sql ├── 55.sql ├── 56.sql ├── 57.py ├── 58.py ├── 59.py ├── 60.py ├── 61.py ├── 62.sh ├── 63.py ├── 64.r ├── 65.r ├── 66.ipynb ├── 67.sql ├── 68.ipynb ├── 69.py ├── 70.py ├── 71.py ├── 72.js ├── 73.js ├── 74.sh ├── 75.sh ├── 76.sh ├── 77.sh ├── 78.sh ├── 79.sh ├── 80.sh ├── 81.sh ├── 82.sh ├── 83.sh ├── 84.sh ├── 84.sql ├── 85.sh ├── 86.sh ├── 87.sh ├── 88.sh ├── 89.sh ├── 90.sh ├── 91.sh ├── 92.bigqueryrc ├── 93.sh ├── 94.sh ├── 95.sh ├── 96.sh ├── 97.sh ├── 98.sh ├── bigquery_cloud_client.ipynb ├── bq_query.sh ├── bq_to_slides.gs ├── find_url.sh ├── google_api_client.ipynb ├── launch_notebook.sh ├── magics.ipynb ├── pandas.ipynb ├── requirements.txt ├── rest_list.sh ├── rest_query.sh ├── rest_query_async.sh └── statfit.ipynb ├── ch06 ├── 01.sql ├── 02.sql ├── 03.sql ├── 04.sh ├── 05.sql ├── 06.sql ├── 07.sql ├── 08.sql ├── 09.sql └── 10.sql ├── ch07 ├── 01.sql ├── 02.sh ├── 03.sh ├── 04.sh ├── 05.sh ├── 06.sh ├── 07.sh ├── 08.sh ├── 09.sql ├── 10.sh ├── 11.sql ├── 12.sql ├── 13.sql ├── 14.sql ├── 15.sql ├── 16.sql ├── 17.sql ├── 18.sql ├── 19.sql ├── 20.sql ├── 21.sql ├── 22.sql ├── 23.sql ├── 24.sql ├── 25.sql ├── 26.sql ├── 27.sql ├── 28.sql ├── 29.sql ├── 30.sql ├── 31.sql ├── 32.sql ├── 33.sql ├── 34.sql ├── 35.sql ├── 36.sql ├── 37.sql ├── 38.sql ├── 39.sql ├── 40.sql ├── 41.sql ├── 42.sql ├── 43.sql ├── 44.sql ├── 45.sql ├── 46.sql ├── 47.sql ├── 48.sql ├── 49.sh ├── 50.sh ├── 51.py ├── 52.sql ├── 53.sql ├── 54.sql ├── 55.sql ├── 56.sql ├── 57.sql ├── 58.sql ├── 59.sql ├── 60.sql ├── 61.sql ├── 62.sql ├── 63.sql ├── 64.sql ├── 65.sh ├── 66.sql ├── 67.sql ├── 68.sql ├── 69.sql ├── 70.sql ├── 71.py ├── get_job_details.sh ├── get_job_details_compressed.sh ├── get_recent_jobs.sh ├── google_analytics.sql ├── hurricanes.sql ├── install_workload_tester.sh ├── time_bqwt.sh └── time_query.sh ├── ch08 ├── 01.py ├── 02.py ├── 03.py ├── 04.py ├── 05.py ├── 06.py ├── 07.py ├── 08.py ├── 09.py ├── 10.py ├── 11.py ├── 12.sql ├── 13.sql ├── 14.sql ├── 15.sql ├── 16.sql ├── 17.sql ├── 18.sql ├── 19.sql ├── 20.sql ├── 21.sql ├── 22.sql ├── 23.sql ├── 24.sql ├── 25.sql ├── 26.sql ├── 27.sql ├── 28.sql ├── 29.sql ├── 30.sql ├── 31.sql ├── 32.sql ├── 33.sql ├── 34.sql ├── 35.sql ├── 36.sql ├── 37.sql ├── 38.sql ├── 39.sql ├── 40.sql ├── 41.sql ├── 42.sql ├── 43.sql ├── 44.sh ├── 45.sh ├── 46.sh ├── 47.sql ├── 48.sql ├── 49.sql ├── 50.sql ├── 51.sql ├── 52.sql ├── 53.sql ├── 54.sql ├── 55.sql ├── 56.sql ├── 57.sql ├── 58.sql ├── 59.sql ├── 60.sql ├── 61.sql ├── 62.sql ├── 63.sql ├── 64.sql ├── 65.sql ├── 66.sql ├── 67.sql ├── 68.sql ├── 69.sql ├── 70.sql ├── 71.sql ├── 72.sql ├── 73.sql ├── 74.sql ├── 75.sql ├── 76.sql ├── 77.sql ├── 78.sql ├── 79.sql ├── 80.sql ├── 81.sql ├── 82.sql ├── 83.sql ├── 84.sql ├── 85.sql ├── 86.sql ├── 87.sql ├── 88.sql ├── 89.sql ├── 90.sql ├── 91.sql ├── 92.sql ├── 93.sql ├── 94.sql ├── 95.sql ├── 96.sql ├── 97.sql ├── 98.sql ├── 99.sql ├── param_array.py ├── param_named.py ├── param_positional.py ├── param_timestamp.py ├── script_loop.sql ├── script_seq.sql ├── script_temptbl.sql ├── script_var.sql ├── stored_procedure_def.sql └── stored_procedure_inout.sql ├── ch09 ├── 01.sql ├── 02.sql ├── 03.sql ├── 04.sql ├── 05.sql ├── 06.sql ├── 07.sql ├── 08.sql ├── 09.sql ├── 10.sql ├── 11.sql ├── 12.sql ├── 13.sql ├── 14.sql ├── 15.py ├── 16.sql ├── 17.sql ├── 18.sql ├── 19.py ├── 20.sql ├── 21.sql ├── 22.sql ├── 23.sql ├── 24.sql ├── 25.sql ├── 26.sql ├── 27.sql ├── 28.sql ├── 29.py ├── 30.sql ├── 31.sql ├── 32.sql ├── 33-.sql ├── 34-.sql ├── 35-.sql ├── 36-.sql ├── 
37-.sql ├── 38-.sql ├── 39-.sql ├── 40-.sql ├── 41-.sql ├── 42-.sql ├── 43-.sql ├── 44-.sql ├── 45.sh ├── 46-.sql ├── 47-.sql ├── 48-.sql ├── 49-.sql ├── 50-.sql ├── 51-.sql ├── 52-.sql ├── 53-.sql ├── 54-.sql ├── 55-.sql ├── 56-.sql ├── 57-.sql ├── 58-.sql ├── 59-.sql ├── 60-.sql ├── 61-.sql ├── 62-.sql ├── 63-.sql ├── 64-.sql ├── 65-.py ├── 66-.py ├── 67-.py ├── 68-.py ├── 69.sh ├── 70.py ├── 71.py ├── 72.py ├── 73.py ├── 74.py ├── 75.py ├── 76.sh ├── 77.sh ├── 78.sh ├── 79.sql ├── 80.sql ├── arr_to_input_16.sql ├── hybrid.sql ├── hyperparam.ipynb ├── hyperparam.yaml ├── image_embeddings.ipynb └── text_embeddings.ipynb ├── ch10 ├── 01.yaml ├── 02.sh ├── 03.sh ├── 04.sh ├── 05.sh ├── 06.sh ├── 07.sql ├── 08.sh ├── 09.sh ├── 10.py ├── 11.py ├── 12.sql ├── 13.sh ├── 14.sh ├── 15.sh ├── 16.sql ├── 17.sh ├── 18.sh ├── 19.sh ├── 20.sql ├── 21.sql ├── 22.sh ├── 23.sql ├── 24.sql ├── 25.sql ├── 26.sql ├── 27.sql ├── 28.sql ├── 29.sql ├── 30.sql ├── 31.sql ├── 32.sql ├── 33.sql ├── 34.sh ├── 35.sh ├── 36.sh ├── list_jobs.sh ├── recover_table.sh └── setup_cmek.sh └── set_env.md /.editorconfig: -------------------------------------------------------------------------------- 1 | root=true 2 | 3 | [*] 4 | end_of_line = lf 5 | charset = utf-8 6 | indent_size = 4 7 | indent_style = space -------------------------------------------------------------------------------- /bigquery_cover.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onlybooks/bigquery/9e5e4234720876404cb28058f8387a0f8154aee0/bigquery_cover.jpg -------------------------------------------------------------------------------- /blogs/bqml_arima/README.md: -------------------------------------------------------------------------------- 1 | ## Using an ARIMA model in BigQuery ML to do demand forecasting 2 | 3 | Accompanies the blog post: 4 | 5 | [How to do time series forecasting in BigQuery]( 6 | https://medium.com/@lakshmanok/how-to-do-time-series-forecasting-in-bigquery-af9eb6be8159 7 | ) 8 | -------------------------------------------------------------------------------- /blogs/bqml_model_export/README.md: -------------------------------------------------------------------------------- 1 | ## BigQuery ML model export 2 | 3 | Accompanies the blog post: 4 | 5 | [How to export a BigQuery ML model and deploy it for online prediction]( 6 | https://medium.com/@lakshmanok/how-to-export-a-bigquery-ml-model-and-deploy-it-for-online-prediction-a7e4d44c4c93 7 | ) 8 | -------------------------------------------------------------------------------- /blogs/bqml_model_export/input.json: -------------------------------------------------------------------------------- 1 | {"start_station_name": "Vauxhall Cross, Vauxhall", "dayofweek": "weekend", "hourofday": "17"} 2 | -------------------------------------------------------------------------------- /blogs/bqml_recommendations/README.md: -------------------------------------------------------------------------------- 1 | Training a recommendation model for Google Analytics data using BigQuery ML 2 | 3 | Accompanies the blog post: https://towardsdatascience.com/training-a-recommendation-model-for-google-analytics-data-using-bigquery-ml-2327f9a2e8e9 4 | -------------------------------------------------------------------------------- /blogs/bqml_recommendations/predict.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | visitorId, 3 | ARRAY_AGG(STRUCT(contentId, 
predicted_normalized_session_duration) 4 | ORDER BY predicted_normalized_session_duration DESC 5 | LIMIT 3) 6 | FROM ML.RECOMMEND(MODEL advdata.ga360_recommendations_model) 7 | WHERE predicted_normalized_session_duration < 1 8 | GROUP BY visitorId 9 | -------------------------------------------------------------------------------- /blogs/bqml_recommendations/train.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE MODEL advdata.ga360_recommendations_model 2 | OPTIONS(model_type='matrix_factorization', 3 | user_col='visitorId', item_col='contentId', 4 | rating_col='normalized_session_duration', 5 | l2_reg=10) 6 | AS 7 | SELECT * from advdata.ga360_recommendations_data 8 | -------------------------------------------------------------------------------- /blogs/dbt_load/.gitignore: -------------------------------------------------------------------------------- 1 | keyfile.json 2 | -------------------------------------------------------------------------------- /blogs/dbt_load/README.md: -------------------------------------------------------------------------------- 1 | 1. ./setup.sh # This installs dbt 2 | 2. ./load_external_gcs.sh # This creates the initial table definition where everything is a string 3 | 3. cd college-scorecard 4 | 4. dbt run # This creates a table and a view. 5 | -------------------------------------------------------------------------------- /blogs/dbt_load/college-scorecard/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_modules/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /blogs/dbt_load/college-scorecard/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 
2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [dbt community](http://community.getdbt.com/) to learn from other analytics engineers 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /blogs/dbt_load/college-scorecard/macros/cleanup_numeric.sql: -------------------------------------------------------------------------------- 1 | {% macro cleanup_numeric_macro() %} 2 | 3 | CREATE OR REPLACE FUNCTION {{target.schema}}.cleanup_numeric(x STRING) AS 4 | ( 5 | IF ( x != 'NULL' AND x != 'PrivacySuppressed', 6 | CAST(x as FLOAT64), 7 | NULL ) 8 | ); 9 | 10 | {% endmacro %} -------------------------------------------------------------------------------- /blogs/dbt_load/college-scorecard/models/college_scorecard.sql: -------------------------------------------------------------------------------- 1 | 2 | WITH etl_data AS ( 3 | SELECT 4 | * EXCEPT(ADM_RATE_ALL, FIRST_GEN, MD_FAMINC, SAT_AVG, MD_EARN_WNE_P10) 5 | , {{target.schema}}.cleanup_numeric(ADM_RATE_ALL) AS ADM_RATE_ALL 6 | , {{target.schema}}.cleanup_numeric(FIRST_GEN) AS FIRST_GEN 7 | , {{target.schema}}.cleanup_numeric(MD_FAMINC) AS MD_FAMINC 8 | , {{target.schema}}.cleanup_numeric(SAT_AVG) AS SAT_AVG 9 | , {{target.schema}}.cleanup_numeric(MD_EARN_WNE_P10) AS MD_EARN_WNE_P10 10 | FROM 11 | ch04.college_scorecard_gcs 12 | ) 13 | 14 | SELECT * FROM etl_data -------------------------------------------------------------------------------- /blogs/dbt_load/college-scorecard/models/selective_firstgen.sql: -------------------------------------------------------------------------------- 1 | 2 | SELECT 3 | INSTNM, ADM_RATE_ALL, FIRST_GEN, MD_FAMINC, SAT_AVG, MD_EARN_WNE_P10 4 | FROM 5 | {{ ref('college_scorecard') }} 6 | WHERE 7 | SAT_AVG > 1300 8 | AND ADM_RATE_ALL < 0.2 9 | AND FIRST_GEN > 0.1 10 | -------------------------------------------------------------------------------- /blogs/dbt_load/college-scorecard/models/selective_firstgen_top10.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='view') }} /* overrides the project definition */ 2 | 3 | SELECT 4 | INSTNM, ADM_RATE_ALL, FIRST_GEN, MD_FAMINC, SAT_AVG 5 | FROM 6 | {{ ref('selective_firstgen') }} 7 | ORDER BY 8 | MD_FAMINC ASC 9 | LIMIT 10 10 | -------------------------------------------------------------------------------- /blogs/dbt_load/load_external_gcs.sh: -------------------------------------------------------------------------------- 1 | ../../04_load/load_external_gcs.sh -------------------------------------------------------------------------------- /blogs/dbt_load/profiles.yml: -------------------------------------------------------------------------------- 1 | default: 2 | target: dev 3 | outputs: 4 | dev: 5 | type: bigquery 6 | method: service-account # or oauth 7 | project: ai-analytics-solutions # CHANGE 8 | dataset: ch04 9 | threads: 1 10 | keyfile: /home/jupyter/.dbt/keyfile.json 11 | timeout_seconds: 600 12 | location: US 13 | priority: interactive 14 | retries: 3 15 |
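For orientation, here is roughly what dbt renders the cleanup_numeric_macro above into on `dbt run` with this profile (a sketch, assuming {{target.schema}} resolves to the ch04 dataset configured in profiles.yml):

CREATE OR REPLACE FUNCTION ch04.cleanup_numeric(x STRING) AS
(
    IF ( x != 'NULL' AND x != 'PrivacySuppressed',
         CAST(x as FLOAT64),
         NULL )
);
-- likewise, {{ ref('college_scorecard') }} in the models compiles to ch04.college_scorecard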
-------------------------------------------------------------------------------- /blogs/graphdb/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | The code in this directory accompanies [this blog post](https://medium.com/@lakshmanok/graph-data-analysis-with-cypher-and-spark-sql-on-cloud-dataproc-861ba6b7b648) 4 | 5 | To try the Scala code interactively: 6 | * Create a Spark 3 Dataproc cluster 7 | * Switch to the VM instances tab and click the SSH button 8 | * In the SSH window, type: 9 | ```spark-shell --jars=gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar --packages=org.opencypher:morpheus-spark-cypher:0.4.2``` 10 | * Copy and paste the lines starting with `val BUCKET = ...` into the REPL 11 | 12 | Alternatively, simply submit the code (find_routes.scala); it will install the jars and packages itself 13 | -------------------------------------------------------------------------------- /ch01/01.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | EXTRACT(YEAR FROM starttime) AS year, 3 | EXTRACT(MONTH FROM starttime) AS month, 4 | COUNT(starttime) AS number_one_way 5 | FROM 6 | mydb.return_transactions 7 | WHERE 8 | start_station_name != end_station_name 9 | GROUP BY year, month 10 | ORDER BY year ASC, month ASC 11 | -------------------------------------------------------------------------------- /ch01/02.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | EXTRACT(YEAR FROM starttime) AS year, 3 | EXTRACT(MONTH FROM starttime) AS month, 4 | COUNT(starttime) AS number_one_way 5 | FROM 6 | `bigquery-public-data.new_york_citibike.citibike_trips` 7 | WHERE 8 | start_station_name != end_station_name 9 | GROUP BY year, month 10 | ORDER BY year ASC, month ASC -------------------------------------------------------------------------------- /ch01/03.sql: -------------------------------------------------------------------------------- 1 | WITH bicycle_rentals AS ( 2 | SELECT 3 | COUNT(starttime) as num_trips, 4 | EXTRACT(DATE from starttime) as trip_date 5 | FROM `bigquery-public-data.new_york_citibike.citibike_trips` 6 | GROUP BY trip_date 7 | ), 8 | 9 | rainy_days AS 10 | ( 11 | SELECT 12 | date, 13 | (MAX(prcp) > 5) AS rainy 14 | FROM ( 15 | SELECT 16 | wx.date AS date, 17 | IF (wx.element = 'PRCP', wx.value/10, NULL) AS prcp 18 | FROM 19 | `bigquery-public-data.ghcn_d.ghcnd_2016` AS wx 20 | WHERE 21 | wx.id = 'USW00094728' 22 | ) 23 | GROUP BY 24 | date 25 | ) 26 | 27 | SELECT 28 | ROUND(AVG(bk.num_trips)) AS num_trips, 29 | wx.rainy 30 | FROM bicycle_rentals AS bk 31 | JOIN rainy_days AS wx 32 | ON wx.date = bk.trip_date 33 | GROUP BY wx.rainy -------------------------------------------------------------------------------- /ch02/01.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | gender, tripduration 3 | FROM 4 | `bigquery-public-data`.new_york_citibike.citibike_trips 5 | LIMIT 5 -------------------------------------------------------------------------------- /ch02/02.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | gender, tripduration 3 | FROM 4 | `bigquery-public-data.new_york_citibike.citibike_trips` 5 | LIMIT 5 -------------------------------------------------------------------------------- /ch02/03.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | gender, tripduration AS rental_duration 3 | FROM 4 |
`bigquery-public-data`.new_york_citibike.citibike_trips 5 | LIMIT 5 -------------------------------------------------------------------------------- /ch02/04.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | gender, tripduration/60 3 | FROM 4 | `bigquery-public-data`.new_york_citibike.citibike_trips 5 | LIMIT 5 -------------------------------------------------------------------------------- /ch02/05.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | gender, tripduration/60 AS duration_minutes 3 | FROM 4 | `bigquery-public-data`.new_york_citibike.citibike_trips 5 | LIMIT 5 -------------------------------------------------------------------------------- /ch02/06.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | gender, tripduration 3 | FROM 4 | `bigquery-public-data`.new_york_citibike.citibike_trips 5 | WHERE tripduration < 600 6 | LIMIT 5 -------------------------------------------------------------------------------- /ch02/07.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | gender, tripduration 3 | FROM 4 | `bigquery-public-data`.new_york_citibike.citibike_trips 5 | WHERE tripduration >= 300 AND tripduration < 600 AND gender = 'female' 6 | LIMIT 5 -------------------------------------------------------------------------------- /ch02/08.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | gender, tripduration/60 AS minutes 3 | FROM 4 | `bigquery-public-data`.new_york_citibike.citibike_trips 5 | WHERE (tripduration/60) < 10 6 | LIMIT 5 -------------------------------------------------------------------------------- /ch02/09.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | * 3 | FROM 4 | `bigquery-public-data`.new_york_citibike.citibike_stations 5 | WHERE name LIKE '%Riverside%' 6 | -------------------------------------------------------------------------------- /ch02/10.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | * EXCEPT(short_name, last_reported) 3 | FROM 4 | `bigquery-public-data`.new_york_citibike.citibike_stations 5 | WHERE name LIKE '%Riverside%' -------------------------------------------------------------------------------- /ch02/11.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | * REPLACE(num_bikes_available + 5 AS num_bikes_available) 3 | FROM 4 | `bigquery-public-data`.new_york_citibike.citibike_stations -------------------------------------------------------------------------------- /ch02/12.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM ( 2 | SELECT 3 | gender, tripduration / 60 AS minutes 4 | FROM 5 | `bigquery-public-data`.new_york_citibike.citibike_trips 6 | ) 7 | WHERE minutes < 10 8 | LIMIT 5 -------------------------------------------------------------------------------- /ch02/13.sql: -------------------------------------------------------------------------------- 1 | WITH all_trips AS ( 2 | SELECT 3 | gender, tripduration / 60 AS minutes 4 | FROM 5 | `bigquery-public-data`.new_york_citibike.citibike_trips 6 | ) 7 | 8 | SELECT * FROM all_trips 9 | WHERE minutes < 10 10 | LIMIT 5 -------------------------------------------------------------------------------- /ch02/14.sql: 
-------------------------------------------------------------------------------- 1 | SELECT 2 | gender, tripduration/60 AS minutes 3 | FROM 4 | `bigquery-public-data`.new_york_citibike.citibike_trips 5 | WHERE gender = 'female' 6 | ORDER BY minutes DESC 7 | LIMIT 5 -------------------------------------------------------------------------------- /ch02/15.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | AVG(tripduration/60) AS avg_trip_duration 3 | FROM 4 | `bigquery-public-data`.new_york_citibike.citibike_trips 5 | WHERE 6 | gender = 'male' -------------------------------------------------------------------------------- /ch02/16.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | gender, AVG(tripduration/60) AS avg_trip_duration 3 | FROM 4 | `bigquery-public-data`.new_york_citibike.citibike_trips 5 | WHERE 6 | tripduration is not NULL 7 | GROUP BY 8 | gender 9 | ORDER BY 10 | avg_trip_duration -------------------------------------------------------------------------------- /ch02/17.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | gender, 3 | COUNT(*) AS rides, 4 | AVG(tripduration / 60) AS avg_trip_duration 5 | FROM 6 | `bigquery-public-data`.new_york_citibike.citibike_trips 7 | WHERE tripduration IS NOT NULL 8 | GROUP BY 9 | gender 10 | ORDER BY 11 | avg_trip_duration -------------------------------------------------------------------------------- /ch02/18.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | gender, AVG(tripduration / 60) AS avg_trip_duration 3 | FROM 4 | `bigquery-public-data`.new_york_citibike.citibike_trips 5 | WHERE tripduration IS NOT NULL 6 | GROUP BY 7 | gender 8 | HAVING avg_trip_duration > 14 9 | ORDER BY 10 | avg_trip_duration -------------------------------------------------------------------------------- /ch02/19.sql: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT 2 | gender 3 | FROM 4 | `bigquery-public-data`.new_york_citibike.citibike_trips -------------------------------------------------------------------------------- /ch02/20.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | bikeid, 3 | tripduration, 4 | gender 5 | FROM 6 | `bigquery-public-data`.new_york_citibike.citibike_trips 7 | WHERE gender = "" 8 | LIMIT 100 -------------------------------------------------------------------------------- /ch02/21.sql: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT 2 | gender, 3 | usertype 4 | FROM 5 | `bigquery-public-data`.new_york_citibike.citibike_trips 6 | WHERE gender != '' -------------------------------------------------------------------------------- /ch02/22.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | city, SPLIT(city, ' ') AS parts 3 | FROM ( 4 | SELECT * FROM UNNEST([ 5 | 'Seattle WA', 'New York', 'Singapore' 6 | ]) AS city 7 | ) -------------------------------------------------------------------------------- /ch02/23.sql: -------------------------------------------------------------------------------- 1 | WITH example AS ( 2 | SELECT 'Sat' AS day, 1451 AS numrides, 1018 AS oneways 3 | UNION ALL SELECT 'Sun', 2376, 936 4 | UNION ALL SELECT 'Mon', 1476, 736 5 | ) 6 | 7 | SELECT * FROM example 8 | WHERE numrides < 2000 
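Because the example table in ch02/23.sql is built from three literal rows, its result is fully determined; a worked check of the filter (illustrative, not a file in the repo):

WITH example AS (
  SELECT 'Sat' AS day, 1451 AS numrides, 1018 AS oneways
  UNION ALL SELECT 'Sun', 2376, 936
  UNION ALL SELECT 'Mon', 1476, 736
)
SELECT COUNT(*) AS rows_kept  -- 2: 'Sat' and 'Mon' pass; 'Sun' fails numrides < 2000
FROM example
WHERE numrides < 2000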
-------------------------------------------------------------------------------- /ch02/24.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | gender 3 | , EXTRACT(YEAR from starttime) AS year 4 | , COUNT(*) AS numtrips 5 | FROM 6 | `bigquery-public-data`.new_york_citibike.citibike_trips 7 | WHERE gender != 'unknown' and starttime IS NOT NULL 8 | GROUP BY gender, year 9 | HAVING year > 2016 -------------------------------------------------------------------------------- /ch02/25.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | gender 3 | , ARRAY_AGG(numtrips order by year) AS numtrips 4 | FROM ( 5 | SELECT 6 | gender 7 | , EXTRACT(YEAR FROM starttime) AS year 8 | , COUNT(1) AS numtrips 9 | FROM 10 | `bigquery-public-data`.new_york_citibike.citibike_trips 11 | WHERE gender != 'unknown' AND starttime IS NOT NULL 12 | GROUP BY gender, year 13 | HAVING year > 2016 14 | ) 15 | GROUP BY gender -------------------------------------------------------------------------------- /ch02/26.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | [ 3 | STRUCT('male' AS gender, [9306602, 3955871] AS numtrips) 4 | , STRUCT('female' AS gender, [3236735, 1260893] AS numtrips) 5 | ] AS bikerides -------------------------------------------------------------------------------- /ch02/27.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | [ 3 | ('male', [9306602, 3955871]) 4 | , ('female', [3236735, 1260893]) 5 | ] -------------------------------------------------------------------------------- /ch02/28.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | ARRAY_LENGTH(bikerides) AS num_items 3 | , bikerides[ OFFSET(0) ].gender AS first_gender 4 | FROM 5 | (SELECT 6 | [ 7 | STRUCT('male' AS gender, [9306602, 3955871] AS numtrips) 8 | , STRUCT('female' AS gender, [3236735, 1260893] AS numtrips) 9 | ] AS bikerides) -------------------------------------------------------------------------------- /ch02/29.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM UNNEST( 2 | [ 3 | STRUCT('male' AS gender, [9306602, 3955871] AS numtrips) 4 | , STRUCT('female' AS gender, [3236735, 1260893] AS numtrips) 5 | ]) 6 | -------------------------------------------------------------------------------- /ch02/30.sql: -------------------------------------------------------------------------------- 1 | SELECT numtrips FROM UNNEST( 2 | [ 3 | STRUCT('male' AS gender, [9306602, 3955871] AS numtrips) 4 | , STRUCT('female' AS gender, [3236735, 1260893] AS numtrips) 5 | ]) 6 | -------------------------------------------------------------------------------- /ch02/31.sql: -------------------------------------------------------------------------------- 1 | WITH bicycle_rentals AS ( 2 | SELECT 3 | COUNT(starttime) AS num_trips, 4 | EXTRACT(DATE FROM starttime) AS trip_date 5 | FROM `bigquery-public-data`.new_york_citibike.citibike_trips 6 | GROUP BY trip_date 7 | ), 8 | 9 | rainy_days AS 10 | ( 11 | SELECT 12 | date, 13 | (MAX(prcp) > 5) AS rainy 14 | FROM ( 15 | SELECT 16 | wx.date AS date, 17 | IF (wx.element = 'PRCP', wx.value/10, NULL) AS prcp 18 | FROM 19 | `bigquery-public-data.ghcn_d.ghcnd_2016` AS wx 20 | WHERE 21 | wx.id = 'USW00094728' 22 | ) 23 | GROUP BY 24 | date 25 | ) 26 | 27 | 28 | 29 | SELECT 30 | ROUND(AVG(bk.num_trips)) AS num_trips, 31 | 
wx.rainy 32 | FROM bicycle_rentals AS bk 33 | JOIN rainy_days AS wx 34 | ON wx.date = bk.trip_date 35 | GROUP BY wx.rainy -------------------------------------------------------------------------------- /ch02/32.sql: -------------------------------------------------------------------------------- 1 | WITH bicycle_rentals AS ( 2 | SELECT 3 | COUNT(starttime) AS num_trips, 4 | EXTRACT(DATE FROM starttime) AS trip_date 5 | FROM `bigquery-public-data`.new_york_citibike.citibike_trips 6 | GROUP BY trip_date 7 | ) 8 | SELECT * FROM bicycle_rentals LIMIT 5 -------------------------------------------------------------------------------- /ch02/33.sql: -------------------------------------------------------------------------------- 1 | WITH bicycle_rentals AS ( 2 | SELECT 3 | COUNT(starttime) AS num_trips, 4 | EXTRACT(DATE FROM starttime) AS trip_date 5 | FROM `bigquery-public-data`.new_york_citibike.citibike_trips 6 | GROUP BY trip_date 7 | ), 8 | 9 | rainy_days AS 10 | ( 11 | SELECT 12 | date, 13 | (MAX(prcp) > 5) AS rainy 14 | FROM ( 15 | SELECT 16 | wx.date AS date, 17 | IF (wx.element = 'PRCP', wx.value/10, NULL) AS prcp 18 | FROM 19 | `bigquery-public-data.ghcn_d.ghcnd_2016` AS wx 20 | WHERE 21 | wx.id = 'USW00094728' 22 | ) 23 | GROUP BY 24 | date 25 | ) 26 | 27 | SELECT 28 | bk.trip_date, 29 | bk.num_trips, 30 | wx.rainy 31 | FROM bicycle_rentals AS bk 32 | JOIN rainy_days AS wx 33 | ON wx.date = bk.trip_date 34 | LIMIT 5 -------------------------------------------------------------------------------- /ch02/34.sql: -------------------------------------------------------------------------------- 1 | WITH from_item_a AS ( 2 | SELECT 'Dalles' AS city, 'OR' AS state 3 | UNION ALL SELECT 'Tokyo', 'Tokyo' 4 | UNION ALL SELECT 'Mumbai', 'Maharashtra' 5 | ), 6 | 7 | from_item_b AS ( 8 | SELECT 'OR' AS state, 'USA' AS country 9 | UNION ALL SELECT 'Tokyo', 'Japan' 10 | UNION ALL SELECT 'Maharashtra', 'India' 11 | ) 12 | 13 | SELECT from_item_a.*, country 14 | FROM from_item_a 15 | JOIN from_item_b 16 | ON from_item_a.state = from_item_b.state -------------------------------------------------------------------------------- /ch02/35.sql: -------------------------------------------------------------------------------- 1 | WITH from_item_a AS ( 2 | SELECT 'Dalles' AS city, 'OR' AS state 3 | UNION ALL SELECT 'Tokyo', 'Tokyo' 4 | UNION ALL SELECT 'Mumbai', 'Maharashtra' 5 | ), 6 | 7 | from_item_b AS ( 8 | SELECT 'OR' AS state, 'USA' AS country 9 | UNION ALL SELECT 'Tokyo', 'Japan' 10 | UNION ALL SELECT 'Maharashtra', 'India' 11 | ) 12 | 13 | SELECT from_item_a.*, country AS surcharge 14 | FROM from_item_a 15 | JOIN from_item_b 16 | ON from_item_a.state != from_item_b.state -------------------------------------------------------------------------------- /ch02/36.sql: -------------------------------------------------------------------------------- 1 | WITH winners AS ( 2 | SELECT 'John' AS person, '100m' AS event 3 | UNION ALL SELECT 'Hiroshi', '200m' 4 | UNION ALL SELECT 'Sita', '400m' 5 | ), 6 | gifts AS ( 7 | SELECT 'Google Home' AS gift, '100m' AS event 8 | UNION ALL SELECT 'Google Hub', '200m' 9 | UNION ALL SELECT 'Pixel3', '400m' 10 | ) 11 | SELECT winners.*, gifts.gift 12 | FROM winners 13 | JOIN gifts 14 | USING (event) 15 | -------------------------------------------------------------------------------- /ch02/37.sql: -------------------------------------------------------------------------------- 1 | WITH winners AS ( 2 | SELECT 'John' AS person, '100m' AS event 3 | UNION ALL SELECT 'Hiroshi', '200m' 
4 | UNION ALL SELECT 'Sita', '400m' 5 | ), 6 | gifts AS ( 7 | SELECT 'Google Home' AS gift 8 | UNION ALL SELECT 'Google Hub' 9 | UNION ALL SELECT 'Pixel3' 10 | ) 11 | SELECT person, gift 12 | FROM winners 13 | CROSS JOIN gifts -------------------------------------------------------------------------------- /ch02/38.sql: -------------------------------------------------------------------------------- 1 | WITH winners AS ( 2 | SELECT 'John' AS person, '100m' AS event 3 | UNION ALL SELECT 'Hiroshi', '200m' 4 | UNION ALL SELECT 'Sita', '400m' 5 | UNION ALL SELECT 'Kwame', '50m' 6 | ), 7 | gifts AS ( 8 | SELECT 'Google Home' AS gift, '100m' AS event 9 | UNION ALL SELECT 'Google Hub', '200m' 10 | UNION ALL SELECT 'Pixel3', '400m' 11 | UNION ALL SELECT 'Google Mini', '5000m' 12 | ) 13 | 14 | SELECT person, gift 15 | FROM winners 16 | INNER JOIN gifts 17 | ON winners.event = gifts.event 18 | -------------------------------------------------------------------------------- /ch02/39.sql: -------------------------------------------------------------------------------- 1 | WITH winners AS ( 2 | SELECT 'John' AS person, '100m' AS event 3 | UNION ALL SELECT 'Hiroshi', '200m' 4 | UNION ALL SELECT 'Sita', '400m' 5 | UNION ALL SELECT 'Kwame', '50m' 6 | ), 7 | gifts AS ( 8 | SELECT 'Google Home' AS gift, '100m' AS event 9 | UNION ALL SELECT 'Google Hub', '200m' 10 | UNION ALL SELECT 'Pixel3', '400m' 11 | UNION ALL SELECT 'Google Mini', '5000m' 12 | ) 13 | 14 | SELECT person, gift 15 | FROM winners 16 | FULL OUTER JOIN gifts 17 | ON winners.event = gifts.event 18 | -------------------------------------------------------------------------------- /ch02/40.sql: -------------------------------------------------------------------------------- 1 | WITH winners AS ( 2 | SELECT 'John' AS person, '100m' AS event 3 | UNION ALL SELECT 'Hiroshi', '200m' 4 | UNION ALL SELECT 'Sita', '400m' 5 | UNION ALL SELECT 'Kwame', '50m' 6 | ), 7 | gifts AS ( 8 | SELECT 'Google Home' AS gift, '100m' AS event 9 | UNION ALL SELECT 'Google Hub', '200m' 10 | UNION ALL SELECT 'Pixel3', '400m' 11 | UNION ALL SELECT 'Google Mini', '5000m' 12 | ) 13 | 14 | SELECT person, gift 15 | FROM winners 16 | LEFT OUTER JOIN gifts 17 | ON winners.event = gifts.event 18 | -------------------------------------------------------------------------------- /ch02/41.sql: -------------------------------------------------------------------------------- 1 | WITH winners AS ( 2 | SELECT 'John' AS person, '100m' AS event 3 | UNION ALL SELECT 'Hiroshi', '200m' 4 | UNION ALL SELECT 'Sita', '400m' 5 | UNION ALL SELECT 'Kwame', '50m' 6 | ), 7 | gifts AS ( 8 | SELECT 'Google Home' AS gift, '100m' AS event 9 | UNION ALL SELECT 'Google Hub', '200m' 10 | UNION ALL SELECT 'Pixel3', '400m' 11 | UNION ALL SELECT 'Google Mini', '5000m' 12 | ) 13 | 14 | SELECT person, gift 15 | FROM winners 16 | RIGHT OUTER JOIN gifts 17 | ON winners.event = gifts.event -------------------------------------------------------------------------------- /ch03/01.sql: -------------------------------------------------------------------------------- 1 | WITH example AS ( 2 | SELECT 'Sat' AS day, 1451 AS numrides, 1018 AS oneways 3 | UNION ALL SELECT 'Sun', 2376, 936 4 | ) 5 | SELECT *, (oneways/numrides) AS frac_oneway FROM example 6 | -------------------------------------------------------------------------------- /ch03/02.sql: -------------------------------------------------------------------------------- 1 | WITH example AS ( 2 | SELECT 'Sat' AS day, 1451 AS numrides, 1018 AS oneways 3 | UNION ALL 
SELECT 'Sun', 2376, 936 4 | ) 5 | SELECT *, ROUND(oneways/numrides, 2) AS frac_oneway FROM example -------------------------------------------------------------------------------- /ch03/03.sql: -------------------------------------------------------------------------------- 1 | WITH example AS ( 2 | SELECT 'Sat' AS day, 1451 AS numrides, 1018 AS oneways 3 | UNION ALL SELECT 'Sun', 2376, 936 4 | UNION ALL SELECT 'Wed', 0, 0 5 | ) 6 | SELECT 7 | *, ROUND(IEEE_DIVIDE(oneways, numrides), 2) 8 | AS frac_oneway FROM example -------------------------------------------------------------------------------- /ch03/04.sql: -------------------------------------------------------------------------------- 1 | SELECT SAFE.LOG(10, -3), LOG(10, 3) -------------------------------------------------------------------------------- /ch03/05.sql: -------------------------------------------------------------------------------- 1 | WITH example AS ( 2 | SELECT 'Sat' AS day, 1451 AS numrides, 1018 AS oneways 3 | UNION ALL SELECT 'Sun', 2376, 936 4 | UNION ALL SELECT 'Mon', NULL, NULL 5 | UNION ALL SELECT 'Tue', IEEE_DIVIDE(-3,0), 0 6 | ) 7 | SELECT * FROM example 8 | ORDER BY numrides 9 | -------------------------------------------------------------------------------- /ch03/06.sql: -------------------------------------------------------------------------------- 1 | WITH example AS ( 2 | SELECT 1.23 AS payment 3 | UNION ALL SELECT 7.89 4 | UNION ALL SELECT 12.43 5 | ) 6 | SELECT 7 | SUM(payment) AS total_paid, 8 | AVG(payment) AS average_paid 9 | FROM example -------------------------------------------------------------------------------- /ch03/07.sql: -------------------------------------------------------------------------------- 1 | WITH example AS ( 2 | SELECT NUMERIC '1.23' AS payment 3 | UNION ALL SELECT NUMERIC '7.89' 4 | UNION ALL SELECT NUMERIC '12.43' 5 | ) 6 | SELECT 7 | SUM(payment) AS total_paid, 8 | AVG(payment) AS average_paid 9 | FROM example -------------------------------------------------------------------------------- /ch03/08.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | gender, tripduration 3 | FROM 4 | `bigquery-public-data`.new_york_citibike.citibike_trips 5 | WHERE (tripduration < 600 AND gender = 'female') OR gender = 'male' 6 | -------------------------------------------------------------------------------- /ch03/09.sql: -------------------------------------------------------------------------------- 1 | WITH example AS ( 2 | SELECT NULL AS is_vowel, NULL AS letter, -1 AS position 3 | UNION ALL SELECT true, 'a', 1 4 | UNION ALL SELECT false, 'b', 2 5 | UNION ALL SELECT false, 'c', 3 6 | ) 7 | SELECT * FROM example WHERE is_vowel != false -------------------------------------------------------------------------------- /ch03/10.sql: -------------------------------------------------------------------------------- 1 | WITH example AS ( 2 | SELECT NULL AS is_vowel, NULL AS letter, -1 AS position 3 | UNION ALL SELECT true, 'a', 1 4 | UNION ALL SELECT false, 'b', 2 5 | UNION ALL SELECT false, 'c', 3 6 | ) 7 | SELECT * FROM example WHERE is_vowel IS NOT false -------------------------------------------------------------------------------- /ch03/11.sql: -------------------------------------------------------------------------------- 1 | WITH catalog AS ( 2 | SELECT 30.0 AS costPrice, 0.15 AS markup, 0.1 AS taxRate 3 | UNION ALL SELECT NULL, 0.21, 0.15 4 | UNION ALL SELECT 30.0, NULL, 0.09 5 | UNION ALL SELECT 30.0, 0.30, NULL 6 | UNION ALL 
SELECT 30.0, NULL, NULL 7 | ) 8 | SELECT 9 | *, ROUND( 10 | costPrice * 11 | IF(markup IS NULL, 1.05, 1+markup) * 12 | IF(taxRate IS NULL, 1.10, 1+taxRate) 13 | , 2) AS salesPrice 14 | FROM catalog 15 | -------------------------------------------------------------------------------- /ch03/12.sql: -------------------------------------------------------------------------------- 1 | WITH catalog AS ( 2 | SELECT 30.0 AS costPrice, 0.15 AS markup, 0.1 AS taxRate 3 | UNION ALL SELECT NULL, 0.21, 0.15 4 | UNION ALL SELECT 30.0, NULL, 0.09 5 | UNION ALL SELECT 30.0, 0.30, NULL 6 | UNION ALL SELECT 30.0, NULL, NULL 7 | ) 8 | SELECT 9 | *, ROUND(COALESCE( 10 | costPrice * (1+markup) * (1+taxrate), 11 | costPrice * 1.05 * (1+taxrate), 12 | costPrice * (1+markup) * 1.10, 13 | NULL 14 | ), 2) AS salesPrice 15 | FROM catalog -------------------------------------------------------------------------------- /ch03/13.sql: -------------------------------------------------------------------------------- 1 | WITH catalog AS ( 2 | SELECT 30.0 AS costPrice, 0.15 AS markup, 0.1 AS taxRate 3 | UNION ALL SELECT NULL, 0.21, 0.15 4 | UNION ALL SELECT 30.0, NULL, 0.09 5 | UNION ALL SELECT 30.0, 0.30, NULL 6 | UNION ALL SELECT 30.0, NULL, NULL 7 | ) 8 | SELECT 9 | *, ROUND( 10 | costPrice * 11 | (1 + IFNULL(markup, 0.05)) * 12 | (1 + IFNULL(taxrate,0.10)) 13 | , 2) AS salesPrice 14 | FROM catalog -------------------------------------------------------------------------------- /ch03/14.sql: -------------------------------------------------------------------------------- 1 | WITH example AS ( 2 | SELECT 'John' AS employee, 'Paternity Leave' AS hours_worked 3 | UNION ALL SELECT 'Janaki', '35' 4 | UNION ALL SELECT 'Jian', 'Vacation' 5 | UNION ALL SELECT 'Jose', '40' 6 | ) 7 | SELECT SUM(hours_worked) FROM example -------------------------------------------------------------------------------- /ch03/15.sql: -------------------------------------------------------------------------------- 1 | WITH example AS ( 2 | SELECT 'John' AS employee, 'Paternity Leave' AS hours_worked 3 | UNION ALL SELECT 'Janaki', '35' 4 | UNION ALL SELECT 'Jian', 'Vacation' 5 | UNION ALL SELECT 'Jose', '40' 6 | ) 7 | SELECT SUM(SAFE_CAST(hours_worked AS INT64)) FROM example -------------------------------------------------------------------------------- /ch03/16.sql: -------------------------------------------------------------------------------- 1 | WITH example AS ( 2 | SELECT 'John' AS employee, '0' AS hours_worked 3 | UNION ALL SELECT 'Janaki', '35' 4 | UNION ALL SELECT 'Jian', '0' 5 | UNION ALL SELECT 'Jose', '40' 6 | ) 7 | SELECT SUM(CAST(hours_worked AS INT64)) FROM example -------------------------------------------------------------------------------- /ch03/17.sql: -------------------------------------------------------------------------------- 1 | WITH example AS ( 2 | SELECT true AS is_vowel, 'a' AS letter, 1 AS position 3 | UNION ALL SELECT false, 'b', 2 4 | UNION ALL SELECT false, 'c', 3 5 | ) 6 | SELECT SUM(IF(is_vowel, 1, 0)) AS num_vowels FROM example -------------------------------------------------------------------------------- /ch03/18.sql: -------------------------------------------------------------------------------- 1 | WITH example AS ( 2 | SELECT true AS is_vowel, 'a' AS letter, 1 AS position 3 | UNION ALL SELECT false, 'b', 2 4 | UNION ALL SELECT false, 'c', 3 5 | ) 6 | SELECT COUNTIF(is_vowel) AS num_vowels FROM example -------------------------------------------------------------------------------- /ch03/19.sql: 
-------------------------------------------------------------------------------- 1 | WITH example AS ( 2 | SELECT * FROM unnest([ 3 | 'Seattle', 'New York', 'Singapore' 4 | ]) AS city 5 | ) 6 | SELECT 7 | city 8 | , LENGTH(city) AS len 9 | , LOWER(city) AS lower 10 | , STRPOS(city, 'or') AS orpos 11 | FROM example -------------------------------------------------------------------------------- /ch03/20.sql: -------------------------------------------------------------------------------- 1 | WITH example AS ( 2 | SELECT 'armin@abc.com' AS email, 'Annapolis,, MD' AS city 3 | UNION ALL SELECT 'boyan@bca.com', 'Boulder, CA' 4 | UNION ALL SELECT 'carrie@cab.com', 'Chicago, IL' 5 | ) 6 | 7 | SELECT 8 | CONCAT( 9 | SUBSTR(email, 1, STRPOS(email, '@') - 1), -- username 10 | ' from ', city) AS callers 11 | FROM example -------------------------------------------------------------------------------- /ch03/21.sql: -------------------------------------------------------------------------------- 1 | WITH example AS ( 2 | SELECT * FROM unnest([ 3 | 'Seattle', 'New York', 'சிங்கப்பூர்', '東京' 4 | ]) AS city 5 | ) 6 | SELECT 7 | city 8 | , UPPER(city) AS allcaps 9 | , CAST(city AS BYTES) AS bytes 10 | FROM example -------------------------------------------------------------------------------- /ch03/22.sql: -------------------------------------------------------------------------------- 1 | WITH example AS ( 2 | SELECT * FROM unnest([ 3 | 'Seattle', 'New York', 'சிங்கப்பூர்', '東京' 4 | ]) AS city 5 | ) 6 | SELECT 7 | city 8 | , CHAR_LENGTH(city) AS char_len 9 | , TO_CODE_POINTS(city)[OFFSET(0)] AS first_codept 10 | , ARRAY_LENGTH(TO_CODE_POINTS(city)) AS num_codept 11 | , CAST (city AS BYTES) AS bytes 12 | , BYTE_LENGTH(city) AS byte_len 13 | FROM example -------------------------------------------------------------------------------- /ch03/23.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | CAST(42 AS STRING) 3 | , CAST('42' AS INT64) 4 | , FORMAT('%03d', 42) 5 | , FORMAT('%5.3f', 32.457842) 6 | , FORMAT('%5.3f', 32.4) 7 | , FORMAT('**%s**', 'H') 8 | , FORMAT('%s-%03d', 'Agent', 7) -------------------------------------------------------------------------------- /ch03/24.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | ENDS_WITH('Hello', 'o') -- true 3 | , ENDS_WITH('Hello', 'h') -- false 4 | , STARTS_WITH('Hello', 'h') -- false 5 | , STRPOS('Hello', 'e') -- 2 6 | , STRPOS('Hello', 'f') -- 0 for not-found 7 | , SUBSTR('Hello', 2, 4) -- 1-based 8 | , CONCAT('Hello', 'World') -------------------------------------------------------------------------------- /ch03/25.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | LPAD('Hello', 10, '*') -- pads with * on the left 3 | , RPAD('Hello', 10, '*') -- pads with * on the right 4 | , LPAD('Hello', 10) -- pads with spaces on the left 5 | , LTRIM(' Hello ') -- removes spaces on the left 6 | , RTRIM(' Hello ') -- removes spaces on the right 7 | , TRIM (' Hello ') -- removes spaces on both sides 8 | , TRIM ('***Hello***', '*') -- removes * on both sides 9 | , REVERSE('Hello') -- reverses the string -------------------------------------------------------------------------------- /ch03/26.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | column 3 | , REGEXP_CONTAINS(column, r'\d{5}(?:[-\s]\d{4})?') has_zipcode 4 | , REGEXP_CONTAINS(column, r'^\d{5}(?:[-\s]\d{4})?$') is_zipcode 5 | , REGEXP_EXTRACT(column, r'\d{5}(?:[-\s]\d{4})?') the_zipcode 6 | , REGEXP_EXTRACT_ALL(column,
r'\d{5}(?:[-\s]\d{4})?') all_zipcodes 7 | , REGEXP_REPLACE(column, r'\d{5}(?:[-\s]\d{4})?', '*****') masked 8 | FROM ( 9 | SELECT * FROM UNNEST([ 10 | '12345', '1234', '12345-9876', 11 | 'abc 12345 def', 'abcde-fghi', 12 | '12345 ab 34567', '12345 9876' 13 | ]) AS column 14 | ) -------------------------------------------------------------------------------- /ch03/27.sql: -------------------------------------------------------------------------------- 1 | SELECT t1, t2, TIMESTAMP_DIFF(t1, t2, MICROSECOND) 2 | FROM (SELECT 3 | TIMESTAMP "2017-09-27 12:30:00.45" AS t1, 4 | TIMESTAMP "2017-09-27 13:30:00.45+1" AS t2 5 | ) -------------------------------------------------------------------------------- /ch03/28.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | fmt, input, zone 3 | , PARSE_TIMESTAMP(fmt, input, zone) AS ts 4 | FROM ( 5 | SELECT '%Y%m%d-%H%M%S' AS fmt, '20181118-220800' AS input, '+0' AS zone 6 | UNION ALL SELECT '%c', 'Sat Nov 24 21:26:00 2018', 'America/Los_Angeles' 7 | UNION ALL SELECT '%x %X', '11/18/18 22:08:00', 'UTC' 8 | ) -------------------------------------------------------------------------------- /ch03/29.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | ts, fmt 3 | , FORMAT_TIMESTAMP(fmt, ts, '+6') AS ts_output 4 | FROM ( 5 | SELECT CURRENT_TIMESTAMP() AS ts, '%Y%m%d-%H%M%S' AS fmt 6 | UNION ALL SELECT CURRENT_TIMESTAMP() AS ts, '%c' AS fmt 7 | UNION ALL SELECT CURRENT_TIMESTAMP() AS ts, '%x %X' AS fmt 8 | ) 9 | -------------------------------------------------------------------------------- /ch03/30.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | ts 3 | , FORMAT_TIMESTAMP('%c', ts) AS repr 4 | , EXTRACT(DAYOFWEEK FROM ts) AS dayofweek 5 | , EXTRACT(YEAR FROM ts) AS year 6 | , EXTRACT(WEEK FROM ts) AS weekno 7 | FROM ( 8 | SELECT PARSE_TIMESTAMP('%Y%m%d-%H%M%S', '19181111-054500') AS ts 9 | ) -------------------------------------------------------------------------------- /ch03/31.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | UNIX_MILLIS(TIMESTAMP "2018-11-25 22:30:00 UTC") 3 | , UNIX_MILLIS(TIMESTAMP "1918-11-11 22:30:00 UTC") -- not valid 4 | , TIMESTAMP_MILLIS(1543185000000) 5 | -------------------------------------------------------------------------------- /ch03/32.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | EXTRACT(TIME FROM TIMESTAMP_ADD(t1, INTERVAL 1 HOUR)) AS plus_1h 3 | , EXTRACT(TIME FROM TIMESTAMP_SUB(t1, INTERVAL 10 MINUTE)) AS minus_10min 4 | , TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), 5 | TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 MINUTE), 6 | SECOND) AS plus_1min 7 | , TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), 8 | TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 1 MINUTE), 9 | SECOND) AS minus_1min 10 | FROM (SELECT 11 | TIMESTAMP "2017-09-27 12:30:00.45" AS t1 12 | ) -------------------------------------------------------------------------------- /ch03/33.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | EXTRACT(DATETIME FROM CURRENT_TIMESTAMP()) AS dt 3 | , CAST(CURRENT_DATETIME() AS TIMESTAMP) AS ts -------------------------------------------------------------------------------- /ch03/34.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | state_name 3 | FROM
`bigquery-public-data.utility_us.us_states_area` 4 | WHERE 5 | ST_CONTAINS( 6 | state_geom, 7 | ST_GeogPoint(-122.33, 47.61)) -------------------------------------------------------------------------------- /ch04/01.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | state_name 3 | FROM `bigquery-public-data`.utility_us.us_states_area 4 | WHERE 5 | ST_CONTAINS( 6 | state_geom, 7 | ST_GeogPoint(-122.33, 47.61)) 8 | -------------------------------------------------------------------------------- /ch04/02.sh: -------------------------------------------------------------------------------- 1 | zless college_scorecard.csv.gz -------------------------------------------------------------------------------- /ch04/03.sh: -------------------------------------------------------------------------------- 1 | bq --location=US mk ch04 -------------------------------------------------------------------------------- /ch04/04.sh: -------------------------------------------------------------------------------- 1 | bq --location=US \ 2 | load \ 3 | --source_format=CSV --autodetect \ 4 | ch04.college_scorecard \ 5 | ./college_scorecard.csv.gz -------------------------------------------------------------------------------- /ch04/05.sh: -------------------------------------------------------------------------------- 1 | bq --location=US \ 2 | load --null_marker=NULL \ 3 | --source_format=CSV --autodetect \ 4 | ch04.college_scorecard \ 5 | ./college_scorecard.csv.gz -------------------------------------------------------------------------------- /ch04/06.sh: -------------------------------------------------------------------------------- 1 | bq --location=US \ 2 | load --null_marker=NULL --replace \ 3 | --source_format=CSV --autodetect \ 4 | ch04.college_scorecard \ 5 | ./college_scorecard.csv.gz -------------------------------------------------------------------------------- /ch04/07.sql: -------------------------------------------------------------------------------- 1 | ALTER TABLE ch04.college_scorecard 2 | SET OPTIONS ( 3 | expiration_timestamp = 4 | TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 7 DAY), 5 | description="college_scorecard table that expires 7 days from now" 6 | ) -------------------------------------------------------------------------------- /ch04/08.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | INSTNM 3 | , ADM_RATE_ALL 4 | , FIRST_GEN 5 | , MD_FAMINC 6 | , MD_EARN_WNE_P10 , SAT_AVG 7 | FROM 8 | ch04.college_scorecard 9 | WHERE 10 | SAFE_CAST(SAT_AVG AS FLOAT64) > 1300 11 | AND SAFE_CAST(ADM_RATE_ALL AS FLOAT64) < 0.2 12 | AND SAFE_CAST(FIRST_GEN AS FLOAT64) > 0.1 13 | ORDER BY 14 | CAST(MD_FAMINC AS FLOAT64) ASC -------------------------------------------------------------------------------- /ch04/09.sh: -------------------------------------------------------------------------------- 1 | zless ./college_scorecard.csv.gz | \ 2 | sed 's/PrivacySuppressed/NULL/g' | \ 3 | gzip > /tmp/college_scorecard.csv.gz -------------------------------------------------------------------------------- /ch04/10.sh: -------------------------------------------------------------------------------- 1 | bq show --format prettyjson --schema ch04.college_scorecard -------------------------------------------------------------------------------- /ch04/11.sh: -------------------------------------------------------------------------------- 1 | bq show --format prettyjson --schema ch04.college_scorecard > schema.json
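The SAFE_CAST calls in ch04/08.sql above are what make the autodetected all-string table queryable: a plain CAST raises an error on sentinel strings such as 'PrivacySuppressed', whereas SAFE_CAST yields NULL. A minimal illustration (not a file in the repo):

SELECT
  SAFE_CAST('1401' AS FLOAT64) AS numeric_string            -- 1401.0
  , SAFE_CAST('PrivacySuppressed' AS FLOAT64) AS suppressed -- NULL rather than an error
  , SAFE_CAST('NULL' AS FLOAT64) AS null_marker             -- NULL: the text 'NULL' is not numeric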
-------------------------------------------------------------------------------- /ch04/12.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | table_name 3 | , column_name 4 | , ordinal_position 5 | , is_nullable 6 | , data_type 7 | FROM 8 | ch04.INFORMATION_SCHEMA.COLUMNS -------------------------------------------------------------------------------- /ch04/13.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | TO_JSON_STRING( 3 | ARRAY_AGG(STRUCT( 4 | IF(is_nullable = 'YES', 'NULLABLE', 'REQUIRED') AS mode, 5 | column_name AS name, 6 | data_type AS type) 7 | ORDER BY ordinal_position), TRUE) AS schema 8 | FROM 9 | ch04.INFORMATION_SCHEMA.COLUMNS 10 | WHERE 11 | table_name = 'college_scorecard' -------------------------------------------------------------------------------- /ch04/14.sh: -------------------------------------------------------------------------------- 1 | bq --location=US \ 2 | load --null_marker=NULL --replace \ 3 | --source_format=CSV \ 4 | --schema=schema.json --skip_leading_rows=1 \ 5 | ch04.college_scorecard \ 6 | ./college_scorecard.csv.gz -------------------------------------------------------------------------------- /ch04/15.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | INSTNM 3 | , ADM_RATE_ALL 4 | , FIRST_GEN 5 | , MD_FAMINC 6 | , MD_EARN_WNE_P10 , SAT_AVG 7 | FROM 8 | ch04.college_scorecard 9 | WHERE 10 | SAT_AVG > 1300 11 | AND ADM_RATE_ALL < 0.2 12 | AND FIRST_GEN > 0.1 13 | ORDER BY 14 | MD_FAMINC ASC -------------------------------------------------------------------------------- /ch04/16.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE TABLE ch04.college_scorecard_etl AS 2 | SELECT 3 | INSTNM 4 | , ADM_RATE_ALL 5 | , FIRST_GEN 6 | , MD_FAMINC 7 | , SAT_AVG 8 | , MD_EARN_WNE_P10 9 | FROM ch04.college_scorecard -------------------------------------------------------------------------------- /ch04/17.sh: -------------------------------------------------------------------------------- 1 | bq rm ch04.college_scorecard 2 | bq rm -r -f ch04 -------------------------------------------------------------------------------- /ch04/18.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS ch04.college_scorecard_gcs -------------------------------------------------------------------------------- /ch04/19.sql: -------------------------------------------------------------------------------- 1 | ALTER TABLE ch04.college_scorecard 2 | SET OPTIONS ( 3 | expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 7 DAY), 4 | description="college_scorecard table that expires 7 days from now" 5 | ) -------------------------------------------------------------------------------- /ch04/20.sql: -------------------------------------------------------------------------------- 1 | DELETE FROM ch04.college_scorecard 2 | WHERE SAT_AVG IS NULL -------------------------------------------------------------------------------- /ch04/21.sql: -------------------------------------------------------------------------------- 1 | INSERT ch04.college_scorecard 2 | (INSTNM 3 | , ADM_RATE_ALL , FIRST_GEN 4 | , MD_FAMINC 5 | , SAT_AVG 6 | , MD_EARN_WNE_P10 7 | ) 8 | VALUES ('abc', 0.1, 0.3, 12345, 1234, 23456), 9 | ('def', 0.2, 0.2, 23451, 1232, 32456) -------------------------------------------------------------------------------- /ch04/22.sql:
--------------------------------------------------------------------------------
1 | INSERT ch04.college_scorecard
2 | SELECT *
3 | FROM ch04.college_scorecard_etl
4 | WHERE SAT_AVG IS NULL
--------------------------------------------------------------------------------
/ch04/23.sh:
--------------------------------------------------------------------------------
1 | bq cp ch04.college_scorecard \
2 | someds.college_scorecard_copy
--------------------------------------------------------------------------------
/ch04/24.sh:
--------------------------------------------------------------------------------
1 | bq mkdef --source_format=CSV \
2 | --autodetect \
3 | gs://bigquery-oreilly-book/college_scorecard.csv
4 |
5 |
--------------------------------------------------------------------------------
/ch04/25.sh:
--------------------------------------------------------------------------------
1 | bq mkdef --source_format=CSV \
2 | --autodetect \
3 | gs://bigquery-oreilly-book/college_scorecard.csv \
4 | > /tmp/mytable.json
5 |
6 | bq mk --external_table_definition=/tmp/mytable.json \
7 | ch04.college_scorecard
--------------------------------------------------------------------------------
/ch04/26.sh:
--------------------------------------------------------------------------------
1 | bq mkdef --source_format=CSV \
2 | --autodetect \
3 | gs://bigquery-oreilly-book/college_* \
4 | > /tmp/mytable.json
--------------------------------------------------------------------------------
/ch04/27.sh:
--------------------------------------------------------------------------------
1 | LOC="--location US"
2 | INPUT=gs://bigquery-oreilly-book/college_scorecard.csv
3 |
4 | SCHEMA=$(gsutil cat $INPUT | head -1 | awk -F, '{ORS=","}{for (i=1; i <= NF; i++){ print $i":STRING"; }}' | sed 's/,$//g'| cut -b 4- )
5 |
6 | bq $LOC query \
7 | --external_table_definition=cstable::${SCHEMA}@CSV=${INPUT} \
8 | 'SELECT SUM(IF(SAT_AVG != "NULL", 1, 0))/COUNT(SAT_AVG) FROM cstable'
--------------------------------------------------------------------------------
/ch04/28.sh:
--------------------------------------------------------------------------------
1 | bq mkdef --source_format=PARQUET gs://bucket/dir/files* > table_def.json
2 | bq mk --external_table_definition=table_def.json <dataset>.<table>
3 |
--------------------------------------------------------------------------------
/ch04/29.sh:
--------------------------------------------------------------------------------
1 | bq load --source_format=ORC --autodetect \
2 | --hive_partitioning_mode=AUTO <dataset>.<table> <gcs_path>
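3 | # (Added note:) AUTO mode expects a Hive-style key=value directory layout,
4 | # e.g. a hypothetical gs://bucket/dir/dt=2019-11-25/lang=en/data.orc, and
5 | # infers a type for each partition key; STRINGS mode (30.sh) keeps every
6 | # partition key as a STRING column.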
--------------------------------------------------------------------------------
/ch04/30.sh:
--------------------------------------------------------------------------------
1 | bq load --source_format=ORC --autodetect \
2 | --hive_partitioning_mode=STRINGS <dataset>.<table> <gcs_path>
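3 | # (Added note:) with STRINGS every partition key arrives as a STRING column,
4 | # so queries cast as needed, e.g. (hypothetical key name):
5 | # SELECT * FROM mytable WHERE CAST(dt AS DATE) >= '2019-11-25'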
--------------------------------------------------------------------------------
/ch04/31.sh:
--------------------------------------------------------------------------------
1 | bq mkdef --source_format=ORC --autodetect \
2 | --hive_partitioning_mode=AUTO <gcs_path> > table_def.json
--------------------------------------------------------------------------------
/ch04/32.sh:
--------------------------------------------------------------------------------
1 | bq mkdef --source_format=NEWLINE_DELIMITED_JSON --autodetect \
2 | --hive_partitioning_mode=STRINGS <gcs_path> > table_def.json
--------------------------------------------------------------------------------
/ch04/33.sh:
--------------------------------------------------------------------------------
1 | INPUT=gs://bigquery-oreilly-book/college_scorecard.csv
2 | SCHEMA=$(gsutil cat $INPUT | head -1 | cut -b 4- )
--------------------------------------------------------------------------------
/ch04/34.sh:
--------------------------------------------------------------------------------
1 | LOC="--location US"
2 | OUTPUT=/tmp/college_scorecard_def.json
3 | bq $LOC \
4 | mkdef \
5 | --source_format=CSV \
6 | --noautodetect \
7 | $INPUT \
8 | $SCHEMA \
9 | | sed 's/"skipLeadingRows": 0/"skipLeadingRows": 1/g' \
10 | | sed 's/"allowJaggedRows": false/"allowJaggedRows": true/g' \
11 | > $OUTPUT
--------------------------------------------------------------------------------
/ch04/35.sql:
--------------------------------------------------------------------------------
1 | SELECT
2 | MAX(CAST(SAT_AVG AS FLOAT64)) AS MAX_SAT_AVG
3 | FROM
4 | `ch04.college_scorecard_gcs`
--------------------------------------------------------------------------------
/ch04/36.sql:
--------------------------------------------------------------------------------
1 | WITH etl_data AS (
2 | SELECT
3 | SAFE_CAST(SAT_AVG AS FLOAT64) AS SAT_AVG
4 | FROM
5 | `ch04.college_scorecard_gcs`
6 | )
7 | SELECT
8 | MAX(SAT_AVG) AS MAX_SAT_AVG
9 | FROM
10 | etl_data
--------------------------------------------------------------------------------
/ch04/37.sql:
--------------------------------------------------------------------------------
1 | CREATE TEMP FUNCTION cleanup_numeric(x STRING) AS
2 | (
3 | IF ( x != 'NULL' AND x != 'PrivacySuppressed',
4 | CAST(x as FLOAT64),
5 | NULL )
6 | );
7 |
8 | WITH etl_data AS (
9 | SELECT
10 | INSTNM
11 | , cleanup_numeric(ADM_RATE_ALL) AS ADM_RATE_ALL
12 | , cleanup_numeric(FIRST_GEN) AS FIRST_GEN
13 | , cleanup_numeric(MD_FAMINC) AS MD_FAMINC
14 | , cleanup_numeric(SAT_AVG) AS SAT_AVG
15 | , cleanup_numeric(MD_EARN_WNE_P10) AS MD_EARN_WNE_P10
16 | FROM
17 | `ch04.college_scorecard_gcs`
18 | )
19 |
20 | SELECT
21 | *
22 | FROM
23 | etl_data
24 | WHERE
25 | SAT_AVG > 1300
26 | AND ADM_RATE_ALL < 0.2
27 | AND FIRST_GEN > 0.1
28 | ORDER BY
29 | MD_FAMINC ASC
30 | LIMIT 10
--------------------------------------------------------------------------------
/ch04/38.sql:
--------------------------------------------------------------------------------
1 | CREATE TEMP FUNCTION cleanup_numeric(x STRING) AS
2 | (
3 | IF ( x != 'NULL' AND x != 'PrivacySuppressed',
4 | CAST(x as FLOAT64),
5 | NULL )
6 | );
7 |
8 | CREATE TABLE ch04.college_scorecard_etl
9 | OPTIONS(description="Cleaned-up college scorecard data") AS
10 |
11 | WITH etl_data AS (
12 | SELECT
13 | INSTNM
14 | , cleanup_numeric(ADM_RATE_ALL) AS ADM_RATE_ALL
15 | , cleanup_numeric(FIRST_GEN) AS FIRST_GEN
16 | , cleanup_numeric(MD_FAMINC) AS MD_FAMINC
17 | , cleanup_numeric(SAT_AVG) AS SAT_AVG
18 | ,
cleanup_numeric(MD_EARN_WNE_P10) AS MD_EARN_WNE_P10 19 | FROM 20 | `ch04.college_scorecard_gcs` 21 | ) 22 | 23 | SELECT * FROM etl_data -------------------------------------------------------------------------------- /ch04/39.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | REGEXP_EXTRACT(protopayload_auditlog.resourceName, '^projects/[^/]+/datasets/([^/]+)/tables') AS datasetRef, 3 | COUNTIF(JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.tableDataRead") IS NOT NULL) AS dataReadEvents 4 | FROM ch04.cloudaudit_googleapis_com_data_access_2019* 5 | WHERE 6 | JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.tableDataRead") IS NOT NULL 7 | GROUP BY datasetRef 8 | ORDER BY dataReadEvents DESC 9 | LIMIT 5 -------------------------------------------------------------------------------- /ch04/40.sh: -------------------------------------------------------------------------------- 1 | mysql somedb < select_data.sql | \ 2 | gsutil cp - gs://BUCKET/data_$(date -u "+%F-%T").tsv 3 | -------------------------------------------------------------------------------- /ch04/41.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM my_table 2 | WHERE transaction_date >= DATE_SUB(CURDATE(), INTERVAL 10 DAY) -------------------------------------------------------------------------------- /ch04/42.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM EXTERNAL_QUERY(connection_id, cloud_sql_query); -------------------------------------------------------------------------------- /ch04/43.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | c.customer_id 3 | , c.gift_card_balance 4 | , rq.latest_order_date 5 | FROM ch04.gift_cards AS c 6 | LEFT OUTER JOIN EXTERNAL_QUERY( 7 | 'connection_id', 8 | '''SELECT customer_id, MAX(order_date) AS latest_order_date 9 | FROM orders 10 | GROUP BY customer_id''') AS rq ON rq.customer_id = c.customer_id 11 | WHERE c.gift_card_balance > 100 12 | ORDER BY rq.latest_order_date ASC; -------------------------------------------------------------------------------- /ch04/44.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM advdata.students -------------------------------------------------------------------------------- /ch04/45.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | * 3 | FROM 4 | ch04.college_scorecard_etl -------------------------------------------------------------------------------- /ch04/46.sql: -------------------------------------------------------------------------------- 1 | SELECT INSTNM, COUNT(display_name) AS numusers 2 | FROM `bigquery-public-data`.stackoverflow.users, ch04.college_scorecard_gs 3 | WHERE REGEXP_CONTAINS(about_me, INSTNM) 4 | GROUP BY INSTNM 5 | ORDER BY numusers DESC 6 | LIMIT 5 -------------------------------------------------------------------------------- /ch04/47.sql: -------------------------------------------------------------------------------- 1 | SELECT SUM(sales.qty.cell.value) AS num_sold 2 | FROM ch04.logs 3 | WHERE sales.itemid.cell.value = '12345' 4 | -------------------------------------------------------------------------------- /ch04/48.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE TABLE 2 | ch04.college_scorecard_dts 3 | AS 4 | SELECT * FROM 
ch04.college_scorecard_gcs
5 | LIMIT 0
--------------------------------------------------------------------------------
/ch04/49.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE
2 | ch04.college_scorecard_valid_sat
3 | AS
4 | SELECT * FROM ch04.college_scorecard_gcs
5 | WHERE LENGTH(SAT_AVG) > 0
--------------------------------------------------------------------------------
/ch04/50.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE ch04.payment_transactions
2 | (
3 | PAYEE STRING OPTIONS(description="Payee ID"),
4 | AMOUNT NUMERIC OPTIONS(description="Payment amount")
5 | )
--------------------------------------------------------------------------------
/ch04/51.sh:
--------------------------------------------------------------------------------
1 | bq mk --transfer_config --data_source=google_cloud_storage \
2 | --target_dataset=ch04 --display_name ch04_college_scorecard \
3 | --params='{"data_path_template":"gs://bigquery-oreilly-book/college_*.csv", "destination_table_name_template":"college_scorecard_dts", "file_format":"CSV", "max_bad_records":"10", "skip_leading_rows":"1", "allow_jagged_rows":"true"}'
4 |
--------------------------------------------------------------------------------
/ch04/52.sql:
--------------------------------------------------------------------------------
1 | SELECT
2 | gender, AVG(tripduration / 60) AS avg_trip_duration
3 | FROM
4 | `bigquery-public-data`.new_york_citibike.citibike_trips
5 | GROUP BY
6 | gender
7 | HAVING avg_trip_duration > 14
8 | ORDER BY
9 | avg_trip_duration
--------------------------------------------------------------------------------
/ch04/53.sql:
--------------------------------------------------------------------------------
1 | SELECT protopayload_auditlog.status.message FROM
2 | ch04.cloudaudit_googleapis_com_data_access_20190128
--------------------------------------------------------------------------------
/ch04/54.py:
--------------------------------------------------------------------------------
1 | INPATTERNS = 'gs://bigquery-oreilly-book/college_*.csv'
2 | RUNNER = 'DataflowRunner'
3 | with beam.Pipeline(RUNNER, options = opts) as p:
4 | (p
5 | | 'read' >> beam.io.ReadFromText(INPATTERNS, skip_header_lines=1)
6 | | 'parse_csv' >> beam.FlatMap(parse_csv)
7 | | 'pull_fields' >> beam.FlatMap(pull_fields)
8 | | 'write_bq' >> beam.io.gcp.bigquery.WriteToBigQuery(bqtable, bqdataset, schema=get_output_schema())
9 | )
--------------------------------------------------------------------------------
/ch04/55.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | def parse_csv(line):
4 | try:
5 | values = line.split(',')
6 | rowdict = {}
7 | for colname, value in zip(COLNAMES, values):
8 | rowdict[colname] = value
9 | yield rowdict
10 | except Exception:
11 | logging.warning('Ignoring line ...')
12 |
--------------------------------------------------------------------------------
/ch04/56.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | def pull_fields(rowdict):
4 | result = {}
5 | # Required string fields
6 | for col in 'INSTNM'.split(','):
7 | if col in rowdict:
8 | result[col] = rowdict[col]
9 | else:
10 | logging.info('Ignoring line missing %s', col)
11 | return
12 |
13 | # Floating-point fields
14 | for col in 'ADM_RATE_ALL,FIRST_GEN,MD_FAMINC,SAT_AVG,MD_EARN_WNE_P10'.split(','):
15 | try:
16 | result[col] = float(rowdict[col])
17 | except (ValueError, TypeError, KeyError):
18 | result[col] = None
19 | yield result
20 |
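21 | # (Added, not in the original file:) a minimal local check of pull_fields;
22 | # the sample values are made up. 'PrivacySuppressed' and 'NULL' should come
23 | # back as None.
24 | if __name__ == '__main__':
25 | sample = {'INSTNM': 'Some College', 'ADM_RATE_ALL': '0.1', 'FIRST_GEN': '0.3',
26 | 'MD_FAMINC': 'PrivacySuppressed', 'SAT_AVG': '1234', 'MD_EARN_WNE_P10': 'NULL'}
27 | for out in pull_fields(sample):
28 | print(out)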
-------------------------------------------------------------------------------- /ch04/57.py: -------------------------------------------------------------------------------- 1 | # create an array of tuples and insert as data becomes available 2 | rows_to_insert = [ 3 | (u'U. Puerto Rico', 0.18, 0.46, 23000, 1134, 32000), 4 | (u'Guam U.', 0.43, 0.21, 28000, 1234, 33000) 5 | ] 6 | errors = client.insert_rows(table, rows_to_insert) # API request 7 | -------------------------------------------------------------------------------- /ch04/58.sh: -------------------------------------------------------------------------------- 1 | gsutil -m cp /some/dir/myfiles*.csv gs://bucket/some/dir -------------------------------------------------------------------------------- /ch04/bigtable/delete_instance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gcloud bigtable instances delete bqbook-instance 4 | #--cluster=bqbook-cluster --cluster-zone=us-central1-a --display-name=bqbook-instance --instance-type=DEVELOPMENT 5 | -------------------------------------------------------------------------------- /ch04/college_scorecard.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onlybooks/bigquery/9e5e4234720876404cb28058f8387a0f8154aee0/ch04/college_scorecard.csv.gz -------------------------------------------------------------------------------- /ch04/load_external_gcs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LOC="--location US" 4 | INPUT=gs://bigquery-oreilly-book/college_scorecard.csv 5 | 6 | bq $LOC mk ch04 # okay if it fails 7 | bq $LOC rm ch04.college_scorecard_gcs # replace 8 | 9 | DEF=/tmp/college_scorecard_def.json 10 | 11 | SCHEMA=$(gsutil cat $INPUT | head -1 | awk -F, '{ORS=","}{for (i=1; i <= NF; i++){ print $i":STRING"; }}' | sed 's/,$//g'| cut -b 4- ) 12 | echo $SCHEMA > /tmp/schema.txt 13 | 14 | bq $LOC \ 15 | mkdef \ 16 | --source_format=CSV \ 17 | --noautodetect \ 18 | $INPUT \ 19 | $SCHEMA \ 20 | | sed 's/"skipLeadingRows": 0/"skipLeadingRows": 1/g' \ 21 | | sed 's/"allowJaggedRows": false/"allowJaggedRows": true/g' \ 22 | > $DEF 23 | 24 | 25 | bq mk --external_table_definition=$DEF ch04.college_scorecard_gcs 26 | 27 | -------------------------------------------------------------------------------- /ch04/load_from.gcs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LOC="--location US" 4 | 5 | #bq $LOC rm -r ch04 6 | bq $LOC mk ch04 7 | 8 | bq $LOC \ 9 | load --null_marker=NULL --replace \ 10 | --source_format=CSV --autodetect \ 11 | ch04.college_scorecard \ 12 | gs://cloud-training-demos/tmp/college_scorecard.csv.gz 13 | 14 | # ./college_scorecard.csv.gz \ -------------------------------------------------------------------------------- /ch04/load_from_gcs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LOC="--location US" 4 | 5 | #bq $LOC rm -r ch04 6 | bq $LOC mk ch04 7 | 8 | bq $LOC \ 9 | load --null_marker=NULL --replace \ 10 | --source_format=CSV --autodetect \ 11 | ch04.college_scorecard \ 12 | gs://cloud-training-demos/tmp/college_scorecard.csv.gz 13 | 14 | # ./college_scorecard.csv.gz \ 15 | -------------------------------------------------------------------------------- /ch04/load_from_local.sh: 
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | LOC="--location US"
4 |
5 | bq $LOC mk ch03
6 |
7 | zless ./college_scorecard.csv.gz | sed 's/PrivacySuppressed/NULL/g' | gzip > /tmp/college_scorecard.csv.gz
8 |
9 | #SCHEMA="--autodetect"
10 | SCHEMA="--schema=schema.json --skip_leading_rows=1"
11 |
12 | bq $LOC \
13 | load --null_marker=NULL --replace \
14 | --source_format=CSV $SCHEMA \
15 | ch03.college_scorecard \
16 | /tmp/college_scorecard.csv.gz
17 |
18 | # ./college_scorecard.csv.gz \
19 |
--------------------------------------------------------------------------------
/ch04/query_temp_table.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | LOC="--location US"
4 | INPUT=gs://bigquery-oreilly-book/college_scorecard.csv
5 |
6 | SCHEMA=$(gsutil cat $INPUT | head -1 | awk -F, '{ORS=","}{for (i=1; i <= NF; i++){ print $i":STRING"; }}' | sed 's/,$//g'| cut -b 4- )
7 |
8 | bq $LOC query --external_table_definition=cstable::${SCHEMA}@CSV=${INPUT} \
9 | 'SELECT SUM(IF(SAT_AVG != "NULL", 1, 0))/COUNT(SAT_AVG) FROM cstable'
10 |
11 |
--------------------------------------------------------------------------------
/ch04/setup_data_transfer.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | bq mk --transfer_config --data_source=google_cloud_storage \
4 | --target_dataset=ch04 --display_name ch04_college_scorecard \
5 | --params='{"data_path_template":"gs://bigquery-oreilly-book/college_*.csv", "destination_table_name_template":"college_scorecard_dts", "file_format":"CSV", "max_bad_records":"10", "skip_leading_rows":"1", "allow_jagged_rows":"true"}'
6 |
7 |
--------------------------------------------------------------------------------
/ch04/sheets_data.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/onlybooks/bigquery/9e5e4234720876404cb28058f8387a0f8154aee0/ch04/sheets_data.csv
--------------------------------------------------------------------------------
/ch04/students.csv:
--------------------------------------------------------------------------------
1 | Student,Home state,SAT score
2 | Aarti,KS,1111
3 | Billy,LA,1222
4 | Cao,MT,1333
5 | Dalia,NE,1444
--------------------------------------------------------------------------------
/ch05/01.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | PROJECT=$(gcloud config get-value project)
3 | access_token=$(gcloud auth application-default print-access-token)
4 | curl -H "Authorization: Bearer $access_token" \
5 | -H "Content-Type: application/json" \
6 | -X GET "https://www.googleapis.com/bigquery/v2/projects/$PROJECT/datasets/ch04/tables"
7 |
--------------------------------------------------------------------------------
/ch05/02.sql:
--------------------------------------------------------------------------------
1 | SELECT
2 | table_name, creation_time
3 | FROM
4 | ch04.INFORMATION_SCHEMA.TABLES
5 |
--------------------------------------------------------------------------------
/ch05/03.sh:
--------------------------------------------------------------------------------
1 | read -d '' QUERY_TEXT << EOF
2 | SELECT
3 | start_station_name
4 | , AVG(duration) as duration
5 | , COUNT(duration) as num_trips
6 | FROM \`bigquery-public-data\`.london_bicycles.cycle_hire
7 | GROUP BY start_station_name
8 | ORDER BY num_trips DESC
9 | LIMIT 5
10 | EOF
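11 | # (Added note:) the query text is embedded in a JSON request body before
12 | # POSTing (see 04.sh and 05.json); raw newlines are not valid inside a JSON
13 | # string, so they must be stripped first, as ch07/02.sh does with:
14 | # request=$(echo "$request" | tr '\n' ' ')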
-------------------------------------------------------------------------------- /ch05/04.sh: -------------------------------------------------------------------------------- 1 | curl -H "Authorization: Bearer $access_token" \ 2 | -H "Content-Type: application/json" \ 3 | -X POST \ 4 | -d "$request" \ 5 | "https://www.googleapis.com/bigquery/v2/projects/$PROJECT/queries" -------------------------------------------------------------------------------- /ch05/05.json: -------------------------------------------------------------------------------- 1 | { 2 | "useLegacySql": false, 3 | "timeoutMs": 0, 4 | "useQueryCache": false, 5 | "query": \"${QUERY_TEXT}\" 6 | } -------------------------------------------------------------------------------- /ch05/06.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "bigquery#queryResponse", 3 | "jobReference": { 4 | "projectId": "cloud-training-demos", 5 | "jobId": "job_gv0Kq8nWzXIkuBwoxsKMcTJIVbX4", 6 | "location": "EU" 7 | }, 8 | "jobComplete": false 9 | } -------------------------------------------------------------------------------- /ch05/07.sh: -------------------------------------------------------------------------------- 1 | pip install google-cloud-bigquery -------------------------------------------------------------------------------- /ch05/08.py: -------------------------------------------------------------------------------- 1 | from google.cloud import bigquery 2 | bq = bigquery.Client(project=PROJECT) 3 | -------------------------------------------------------------------------------- /ch05/09.py: -------------------------------------------------------------------------------- 1 | dsinfo = bq.get_dataset('bigquery-public-data.london_bicycles') -------------------------------------------------------------------------------- /ch05/10.py: -------------------------------------------------------------------------------- 1 | dsinfo = bq.get_dataset('ch04') -------------------------------------------------------------------------------- /ch05/11.py: -------------------------------------------------------------------------------- 1 | print(dsinfo.dataset_id) 2 | print(dsinfo.created) -------------------------------------------------------------------------------- /ch05/12.py: -------------------------------------------------------------------------------- 1 | print('{} created on {} in {}'.format( 2 | dsinfo.dataset_id, dsinfo.created, dsinfo.location)) -------------------------------------------------------------------------------- /ch05/13.py: -------------------------------------------------------------------------------- 1 | for access in dsinfo.access_entries: 2 | if access.role == 'READER': 3 | print(access) 4 | -------------------------------------------------------------------------------- /ch05/14.py: -------------------------------------------------------------------------------- 1 | dataset_id = "{}.ch05".format(PROJECT) 2 | ds = bq.create_dataset(dataset_id, exists_ok=True) 3 | -------------------------------------------------------------------------------- /ch05/15.py: -------------------------------------------------------------------------------- 1 | dataset_id = "{}.ch05eu".format(PROJECT) 2 | dsinfo = bigquery.Dataset(dataset_id) 3 | dsinfo.location = 'EU' 4 | ds = bq.create_dataset(dsinfo, exists_ok=True) 5 | -------------------------------------------------------------------------------- /ch05/16.py: 
-------------------------------------------------------------------------------- 1 | bq.delete_dataset('ch05', not_found_ok=True) 2 | -------------------------------------------------------------------------------- /ch05/17.py: -------------------------------------------------------------------------------- 1 | bq.delete_dataset('{}.ch05'.format(PROJECT), not_found_ok=True) 2 | -------------------------------------------------------------------------------- /ch05/18.py: -------------------------------------------------------------------------------- 1 | dsinfo = bq.get_dataset("ch05") 2 | print(dsinfo.description) 3 | dsinfo.description = "Chapter 5 of BigQuery: The Definitive Guide" 4 | dsinfo = bq.update_dataset(dsinfo, ['description']) 5 | print(dsinfo.description) -------------------------------------------------------------------------------- /ch05/19.py: -------------------------------------------------------------------------------- 1 | dsinfo = bq.get_dataset("ch05") 2 | entry = bigquery.AccessEntry( 3 | role="READER", 4 | entity_type="userByEmail", 5 | entity_id="xyz@google.com", 6 | ) 7 | if entry not in dsinfo.access_entries: 8 | entries = list(dsinfo.access_entries) 9 | entries.append(entry) 10 | dsinfo.access_entries = entries 11 | dsinfo = bq.update_dataset(dsinfo, ["access_entries"]) # API request 12 | else: 13 | print('{} already has access'.format(entry.entity_id)) 14 | print(dsinfo.access_entries) 15 | -------------------------------------------------------------------------------- /ch05/20.py: -------------------------------------------------------------------------------- 1 | tables = bq.list_tables("bigquery-public-data.london_bicycles") 2 | for table in tables: 3 | print(table.table_id) 4 | -------------------------------------------------------------------------------- /ch05/21.py: -------------------------------------------------------------------------------- 1 | table = bq.get_table( "bigquery-public-data.london_bicycles.cycle_stations") 2 | print('{} rows in {}'.format(table.num_rows, table.table_id)) -------------------------------------------------------------------------------- /ch05/22.py: -------------------------------------------------------------------------------- 1 | table = bq.get_table("bigquery-public-data.london_bicycles.cycle_stations") 2 | for field in table.schema: 3 | if 'count' in field.name: 4 | print(field) 5 | -------------------------------------------------------------------------------- /ch05/23.py: -------------------------------------------------------------------------------- 1 | bq.delete_table('ch05.temp_table', not_found_ok=True) -------------------------------------------------------------------------------- /ch05/24.sh: -------------------------------------------------------------------------------- 1 | bq --location=US cp ch05.temp_table@1418864998000 ch05.temp_table2 -------------------------------------------------------------------------------- /ch05/25.py: -------------------------------------------------------------------------------- 1 | table_id = '{}.ch05.temp_table'.format(PROJECT) 2 | table = bq.create_table(table_id, exists_ok=True) 3 | -------------------------------------------------------------------------------- /ch05/26.py: -------------------------------------------------------------------------------- 1 | schema = [ 2 | bigquery.SchemaField("chapter", "INTEGER", mode="REQUIRED"), 3 | bigquery.SchemaField("title", "STRING", mode="REQUIRED"), 4 | ] 5 | table_id = '{}.ch05.temp_table'.format(PROJECT) 6 | table = 
bq.get_table(table_id)
7 | print(table.etag)
8 | table.schema = schema
9 | table = bq.update_table(table, ["schema"])
10 | print(table.schema)
11 | print(table.etag)
12 |
--------------------------------------------------------------------------------
/ch05/27.py:
--------------------------------------------------------------------------------
1 | rows = [
2 | (1, u'What is BigQuery?'),
3 | (2, u'Query essentials'),
4 | ]
5 | errors = bq.insert_rows(table, rows)
6 |
--------------------------------------------------------------------------------
/ch05/28.py:
--------------------------------------------------------------------------------
1 | rows = [
2 | ('3', u'Operating on data types'),
3 | ('wont work', u'This will fail'),
4 | ('4', u'Loading data into BigQuery'),
5 | ]
6 | errors = bq.insert_rows(table, rows)
7 | print(errors)
8 |
--------------------------------------------------------------------------------
/ch05/29.py:
--------------------------------------------------------------------------------
1 | rows = [
2 | (1, u'What is BigQuery?'),
3 | (2, u'Query essentials'),
4 | ]
5 | print(table.table_id, table.num_rows)
6 | errors = bq.insert_rows(table, rows)
7 | print(errors)
8 | table = bq.get_table(table_id)
9 | print(table.table_id, table.num_rows) # DELAYED
10 |
--------------------------------------------------------------------------------
/ch05/30.py:
--------------------------------------------------------------------------------
1 | schema = [
2 | bigquery.SchemaField("chapter", "INTEGER", mode="REQUIRED"),
3 | bigquery.SchemaField("title", "STRING", mode="REQUIRED"),
4 | ]
5 | table_id = '{}.ch05.temp_table2'.format(PROJECT)
6 | table = bigquery.Table(table_id, schema)
7 | table = bq.create_table(table, exists_ok=True)
8 | print('{} created on {}'.format(table.table_id, table.created))
9 | print(table.schema)
10 |
--------------------------------------------------------------------------------
/ch05/31.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | data = [
4 | (1, u'What is BigQuery?'),
5 | (2, u'Query essentials'),
6 | ]
7 | df = pd.DataFrame(data, columns=['chapter', 'title'])
8 |
--------------------------------------------------------------------------------
/ch05/32.py:
--------------------------------------------------------------------------------
1 | table_id = '{}.ch05.temp_table3'.format(PROJECT)
2 | job = bq.load_table_from_dataframe(df, table_id)
3 | job.result() # blocks and waits
4 | print("Loaded {} rows into {}".format(job.output_rows, table_id))
--------------------------------------------------------------------------------
/ch05/33.py:
--------------------------------------------------------------------------------
1 | from google.cloud.bigquery.job import LoadJobConfig, WriteDisposition, CreateDisposition
2 | load_config = LoadJobConfig(
3 | create_disposition=CreateDisposition.CREATE_IF_NEEDED,
4 | write_disposition=WriteDisposition.WRITE_TRUNCATE)
5 | job = bq.load_table_from_dataframe(df, table_id, job_config=load_config)
6 |
--------------------------------------------------------------------------------
/ch05/34.py:
--------------------------------------------------------------------------------
1 | job_config = bigquery.LoadJobConfig()
2 | job_config.autodetect = True
3 | job_config.source_format = bigquery.SourceFormat.CSV
4 | job_config.null_marker = 'NULL'
5 | uri = "gs://bigquery-oreilly-book/college_scorecard.csv"
6 | table_id =
'{}.ch05.college_scorecard_gcs'.format(PROJECT)
7 | job = bq.load_table_from_uri(uri, table_id, job_config=job_config)
--------------------------------------------------------------------------------
/ch05/35.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | while not job.done():
4 | print('.', end='', flush=True)
5 | time.sleep(0.1)
6 | print('Done')
7 | table = bq.get_table(table_id)
8 | print("Loaded {} rows into {}.".format(table.num_rows, table.table_id))
9 |
--------------------------------------------------------------------------------
/ch05/36.py:
--------------------------------------------------------------------------------
1 | import gzip
2 |
3 | with gzip.open('../04_load/college_scorecard.csv.gz') as fp:
4 | job = bq.load_table_from_file(fp, table_id, job_config=job_config)
5 |
--------------------------------------------------------------------------------
/ch05/37.py:
--------------------------------------------------------------------------------
1 | source_tbl = 'bigquery-public-data.london_bicycles.cycle_stations'
2 | dest_tbl = '{}.ch05eu.cycle_stations_copy'.format(PROJECT)
3 | job = bq.copy_table(source_tbl, dest_tbl, location='EU')
4 | job.result() # blocks and waits
5 | dest_table = bq.get_table(dest_tbl)
6 | print(dest_table.num_rows)
--------------------------------------------------------------------------------
/ch05/38.py:
--------------------------------------------------------------------------------
1 | source_tbl = 'bigquery-public-data.london_bicycles.cycle_stations'
2 | dest_uri = 'gs://{}/tmp/exported/cycle_stations'.format(BUCKET)
3 | config = bigquery.job.ExtractJobConfig(
4 | destination_format=bigquery.job.DestinationFormat.NEWLINE_DELIMITED_JSON)
5 | job = bq.extract_table(source_tbl, dest_uri,
6 | location='EU',job_config=config)
7 | job.result() # blocks and waits
8 |
--------------------------------------------------------------------------------
/ch05/39.py:
--------------------------------------------------------------------------------
1 | table_id = 'bigquery-public-data.london_bicycles.cycle_stations'
2 | table = bq.get_table(table_id)
3 | rows = bq.list_rows(table,
4 | start_index=0,
5 | max_results=5)
6 |
--------------------------------------------------------------------------------
/ch05/40.py:
--------------------------------------------------------------------------------
1 | rows = bq.list_rows(table)
--------------------------------------------------------------------------------
/ch05/41.py:
--------------------------------------------------------------------------------
1 | page_size = 10000
2 | row_iter = bq.list_rows(table,
3 | page_size=page_size)
4 | for page in row_iter.pages:
5 | rows = list(page)
6 | # do whatever is needed with the loaded rows
7 | print(len(rows)) 8 | -------------------------------------------------------------------------------- /ch05/42.py: -------------------------------------------------------------------------------- 1 | fields = [field for field in table.schema if 'count' in field.name or field.name == 'id'] 2 | rows = bq.list_rows(table, 3 | start_index=300, 4 | max_results=5, 5 | selected_fields=fields) 6 | -------------------------------------------------------------------------------- /ch05/43.py: -------------------------------------------------------------------------------- 1 | fmt = '{!s:<10} ' * len(rows.schema) 2 | print(fmt.format(*[field.name for field in rows.schema])) 3 | for row in rows: 4 | print(fmt.format(*row)) 5 | -------------------------------------------------------------------------------- /ch05/44.py: -------------------------------------------------------------------------------- 1 | query = """ 2 | SELECT 3 | start_station_name 4 | , AVG(duration) as duration 5 | , COUNT(duration) as num_trips 6 | FROM `bigquery-public-data`.london_bicycles.cycle_hire 7 | GROUP BY start_station_name 8 | ORDER BY num_trips DESC 9 | LIMIT 10 10 | """ -------------------------------------------------------------------------------- /ch05/45.py: -------------------------------------------------------------------------------- 1 | config = bigquery.QueryJobConfig() 2 | config.dry_run = True 3 | job = bq.query(query, location='EU', job_config=config) 4 | print("This query will process {} bytes." 5 | .format(job.total_bytes_processed)) -------------------------------------------------------------------------------- /ch05/46.py: -------------------------------------------------------------------------------- 1 | job = bq.query(query, location='EU') 2 | fmt = '{!s:<40} {:>10d} {:>10d}' 3 | for row in job: 4 | fields = (row['start_station_name'], 5 | (int)(0.5 + row['duration']), 6 | row['num_trips']) 7 | print(fmt.format(*fields)) 8 | -------------------------------------------------------------------------------- /ch05/47.py: -------------------------------------------------------------------------------- 1 | query = """ 2 | SELECT 3 | start_station_name 4 | , AVG(duration) as duration 5 | , COUNT(duration) as num_trips 6 | FROM `bigquery-public-data`.london_bicycles.cycle_hire 7 | GROUP BY start_station_name 8 | """ 9 | df = bq.query(query, location='EU').to_dataframe() 10 | print(df.describe()) -------------------------------------------------------------------------------- /ch05/48.py: -------------------------------------------------------------------------------- 1 | query2 = """ 2 | SELECT 3 | start_station_name 4 | , COUNT(duration) as num_trips 5 | FROM `bigquery-public-data`.london_bicycles.cycle_hire 6 | WHERE duration >= @min_duration 7 | GROUP BY start_station_name 8 | ORDER BY num_trips DESC 9 | LIMIT 10 10 | """ -------------------------------------------------------------------------------- /ch05/49.py: -------------------------------------------------------------------------------- 1 | query2 = """ 2 | SELECT 3 | start_station_name 4 | , COUNT(duration) as num_trips 5 | FROM `bigquery-public-data`.london_bicycles.cycle_hire 6 | WHERE duration >= {} 7 | GROUP BY start_station_name 8 | ORDER BY num_trips DESC 9 | LIMIT 10 10 | """.format(min_duration) 11 | -------------------------------------------------------------------------------- /ch05/50.py: -------------------------------------------------------------------------------- 1 | config = bigquery.QueryJobConfig() 2 | 
config.query_parameters = [ 3 | bigquery.ScalarQueryParameter('min_duration', "INT64", 600) 4 | ] 5 | job = bq.query(query2, location='EU', job_config=config) 6 | -------------------------------------------------------------------------------- /ch05/51.py: -------------------------------------------------------------------------------- 1 | fmt = '{!s:<40} {:>10d}' 2 | for row in job: 3 | fields = (row['start_station_name'], row['num_trips']) 4 | print(fmt.format(*fields)) 5 | -------------------------------------------------------------------------------- /ch05/52.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gcloud beta notebooks instances create bqhook \ 4 | --location=us-west1-a \ 5 | --vm-image-family=tf-latest-cpu \ 6 | --vm-image-project=deeplearning-platform-release \ 7 | --machine-type=n1-standard-2 8 | -------------------------------------------------------------------------------- /ch05/53.sh: -------------------------------------------------------------------------------- 1 | !pip install google-cloud-bigquery 2 | %load_ext google.cloud.bigquery -------------------------------------------------------------------------------- /ch05/54.sql: -------------------------------------------------------------------------------- 1 | %%bigquery --project $PROJECT 2 | SELECT 3 | start_station_name 4 | , AVG(duration) as duration 5 | , COUNT(duration) as num_trips 6 | FROM `bigquery-public-data`.london_bicycles.cycle_hire 7 | GROUP BY start_station_name 8 | ORDER BY num_trips DESC 9 | LIMIT 5 -------------------------------------------------------------------------------- /ch05/55.sql: -------------------------------------------------------------------------------- 1 | %%bigquery df --project $PROJECT 2 | SELECT 3 | start_station_name 4 | , AVG(duration) as duration 5 | , COUNT(duration) as num_trips 6 | FROM `bigquery-public-data`.london_bicycles.cycle_hire 7 | GROUP BY start_station_name 8 | ORDER BY num_trips DESC -------------------------------------------------------------------------------- /ch05/56.sql: -------------------------------------------------------------------------------- 1 | %%bigquery badtrips --project $PROJECT 2 | 3 | WITH all_bad_trips AS ( 4 | SELECT 5 | start_station_name 6 | , COUNTIF(duration < 600 AND start_station_name = end_station_name) AS bad_trips 7 | , COUNT(*) AS num_trips 8 | FROM `bigquery-public-data`.london_bicycles.cycle_hire 9 | WHERE EXTRACT(YEAR FROM start_date) = 2015 10 | GROUP BY start_station_name 11 | HAVING num_trips > 10 12 | ) 13 | SELECT *, bad_trips / num_trips AS fraction_bad FROM all_bad_trips 14 | ORDER BY fraction_bad DESC -------------------------------------------------------------------------------- /ch05/57.py: -------------------------------------------------------------------------------- 1 | badtrips.describe() -------------------------------------------------------------------------------- /ch05/58.py: -------------------------------------------------------------------------------- 1 | badtrips.plot.scatter('num_trips', 'fraction_bad'); -------------------------------------------------------------------------------- /ch05/59.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | ax = sns.regplot(badtrips['num_trips'],badtrips['fraction_bad']); 3 | ax.set_ylim(0, 0.05); -------------------------------------------------------------------------------- /ch05/60.py: 
--------------------------------------------------------------------------------
1 | stations_to_examine = []
2 | for band in range(1, 5):
3 | min_trips = badtrips['num_trips'].quantile(0.2 * (band))
4 | max_trips = badtrips['num_trips'].quantile(0.2 * (band + 1))
5 | query = 'num_trips >= {} and num_trips < {}'.format(min_trips, max_trips)
6 | print(query) # find the band
7 | stations = badtrips.query(query)
8 | stations = stations.sort_values(
9 | by=['fraction_bad'],
10 | ascending=False)[:5]
11 | print(stations) # top 5 by fraction of bad trips
12 | stations_to_examine.append(stations)
13 | print()
14 |
--------------------------------------------------------------------------------
/ch05/61.py:
--------------------------------------------------------------------------------
1 | from google.cloud.bigquery.table import TableReference
2 |
3 | stations_to_examine = pd.concat(stations_to_examine)
4 | bq = bigquery.Client(project=PROJECT)
5 | tblref = TableReference.from_string(
6 | '{}.ch05eu.bad_bikes'.format(PROJECT))
7 | job = bq.load_table_from_dataframe(stations_to_examine, tblref)
8 | job.result() # blocks and waits
--------------------------------------------------------------------------------
/ch05/62.sh:
--------------------------------------------------------------------------------
1 | %%bigquery stations_to_examine --project $PROJECT
2 | SELECT
3 | start_station_name AS station_name , num_trips
4 | , fraction_bad
5 | , latitude
6 | , longitude
7 | FROM ch05eu.bad_bikes AS bad
8 | JOIN `bigquery-public-data`.london_bicycles.cycle_stations AS s
9 | ON bad.start_station_name = s.name
--------------------------------------------------------------------------------
/ch05/63.py:
--------------------------------------------------------------------------------
1 | import folium
2 | map_pts = folium.Map(location=[51.5, -0.15], zoom_start=12)
3 | for idx, row in stations_to_examine.iterrows():
4 | folium.Marker(location=[row['latitude'], row['longitude']],
5 | popup=row['station_name']).add_to(map_pts)
6 |
--------------------------------------------------------------------------------
/ch05/64.r:
--------------------------------------------------------------------------------
1 | install.packages("bigrquery", dependencies=TRUE)
--------------------------------------------------------------------------------
/ch05/65.r:
--------------------------------------------------------------------------------
1 | library(bigrquery)
2 | billing <- 'cloud-training-demos' # your project name
3 | sql <- "
4 | SELECT
5 | start_station_name
6 | , AVG(duration) as duration
7 | , COUNT(duration) as num_trips
8 | FROM `bigquery-public-data`.london_bicycles.cycle_hire
9 | GROUP BY start_station_name
10 | ORDER BY num_trips DESC
11 | LIMIT 5
12 | "
13 | tbl <- bq_project_query(billing, sql)
14 | bq_table_download(tbl, max_results=100)
15 | grid.tbl(tbl)
--------------------------------------------------------------------------------
/ch05/66.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "language_info": {
4 | "codemirror_mode": {
5 | "name": "ipython",
6 | "version": 3
7 | },
8 | "file_extension": ".py",
9 | "mimetype": "text/x-python",
10 | "name": "python",
11 | "nbconvert_exporter": "python",
12 | "pygments_lexer": "ipython3",
13 | "version": 3
14 | },
15 | "orig_nbformat": 2
16 | },
17 | "nbformat": 4,
18 | "nbformat_minor": 2,
19 | "cells": [
20 | {
21 | "cell_type": "code",
22 | "execution_count": null,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "!conda install rpy2\n",
27 | "%load_ext rpy2.ipython"
28 | ]
29 | }
30 | ]
31
| } -------------------------------------------------------------------------------- /ch05/67.sql: -------------------------------------------------------------------------------- 1 | %%bigquery docks --project $PROJECT 2 | SELECT 3 | docks_count, latitude, longitude 4 | FROM `bigquery-public-data`.london_bicycles.cycle_stations 5 | WHERE bikes_count > 0 6 | -------------------------------------------------------------------------------- /ch05/68.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "language_info": { 4 | "codemirror_mode": { 5 | "name": "ipython", 6 | "version": 3 7 | }, 8 | "file_extension": ".py", 9 | "mimetype": "text/x-python", 10 | "name": "python", 11 | "nbconvert_exporter": "python", 12 | "pygments_lexer": "ipython3", 13 | "version": 3 14 | }, 15 | "orig_nbformat": 2 16 | }, 17 | "nbformat": 4, 18 | "nbformat_minor": 2, 19 | "cells": [ 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "%%R -i docks\n", 27 | "mod <- lm(docks ~ latitude + longitude)\n", 28 | "summary(mod)" 29 | ] 30 | } 31 | ] 32 | } -------------------------------------------------------------------------------- /ch05/69.py: -------------------------------------------------------------------------------- 1 | from scipy import stats 2 | ag,bg,cg = stats.gamma.fit(df['duration']) -------------------------------------------------------------------------------- /ch05/70.py: -------------------------------------------------------------------------------- 1 | opts = beam.pipeline.PipelineOptions(flags=[], **options) 2 | RUNNER = 'DataflowRunner' 3 | query = """ 4 | SELECT start_station_id, ARRAY_AGG(duration) AS duration_array 5 | FROM `bigquery-public-data.london_bicycles.cycle_hire` 6 | GROUP BY start_station_id 7 | """ 8 | 9 | with beam.Pipeline(RUNNER, options=opts) as p: 10 | (p 11 | | 'read_bq' >> beam.io.Read(beam.io.BigQuerySource(query=query)) 12 | | 'compute_fit' >> beam.Map(compute_fit) 13 | | 'write_bq' >> beam.io.gcp.bigquery.WriteToBigQuery('ch05eu.station_stats', 14 | schema='station_id:string,ag:FLOAT64,bg:FLOAT64,cg:FLOAT64') 15 | ) 16 | -------------------------------------------------------------------------------- /ch05/71.py: -------------------------------------------------------------------------------- 1 | def compute_fit(row): 2 | from scipy import stats 3 | result = {} 4 | result['station_id'] = row['start_station_id'] 5 | durations = row['duration_array'] 6 | ag, bg, cg = stats.gamma.fit(durations) 7 | result['ag'] = ag 8 | result['bg'] = bg 9 | result['cg'] = cg 10 | return result 11 | -------------------------------------------------------------------------------- /ch05/72.js: -------------------------------------------------------------------------------- 1 | function createBigQueryPresentation() { 2 | var spreadsheet = runQuery(); 3 | Logger.log('Results spreadsheet created: %s', spreadsheet.getUrl()); 4 | var chart = createColumnChart(spreadsheet); 5 | var deck = createSlidePresentation(spreadsheet, chart); 6 | Logger.log('Results slide deck created: %s', deck.getUrl()); 7 | } -------------------------------------------------------------------------------- /ch05/73.js: -------------------------------------------------------------------------------- 1 | var queryResults = BigQuery.Jobs.query(request, PROJECT_ID); 2 | var rows = queryResults.rows; 3 | while (queryResults.pageToken) { 4 | queryResults = BigQuery.Jobs.getQueryResults( 5 | 
PROJECT_ID,
6 | jobId,
7 | {
8 | pageToken: queryResults.pageToken
9 | }
10 | );
11 | rows = rows.concat(queryResults.rows);
12 | }
--------------------------------------------------------------------------------
/ch05/74.sh:
--------------------------------------------------------------------------------
1 | bq mk --location=US \
2 | --default_table_expiration 3600 \
3 | --description "Chapter 5 of BigQuery Book." \
4 | ch05
5 |
--------------------------------------------------------------------------------
/ch05/75.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | bq_safe_mk() {
3 | dataset=$1
4 | exists=$(bq ls --datasets | grep -w $dataset)
5 | if [ -n "$exists" ]; then
6 | echo "Not creating $dataset since it already exists"
7 | else
8 | echo "Creating $dataset"
9 | bq mk $dataset
10 | fi
11 | }
12 | # How to call this function
13 | bq_safe_mk ch05
--------------------------------------------------------------------------------
/ch05/76.sh:
--------------------------------------------------------------------------------
1 | bq mk --location=US \
2 | --default_table_expiration 3600 \
3 | --description "Chapter 5 of BigQuery Book." \
4 | projectname:ch05
5 |
--------------------------------------------------------------------------------
/ch05/77.sh:
--------------------------------------------------------------------------------
1 | bq mk --table \
2 | --expiration 3600 \
3 | --description "One hour of data" \
4 | --label persistence:volatile \
5 | ch05.rentals_last_hour rental_id:STRING,duration:FLOAT
--------------------------------------------------------------------------------
/ch05/78.sh:
--------------------------------------------------------------------------------
1 | bq mk --table \
2 | --expiration 3600 \
3 | --description "One hour of data" \
4 | --label persistence:volatile \
5 | ch05.rentals_last_hour schema.json
6 |
--------------------------------------------------------------------------------
/ch05/79.sh:
--------------------------------------------------------------------------------
1 | bq cp ch04.old_table ch05.new_table
--------------------------------------------------------------------------------
/ch05/80.sh:
--------------------------------------------------------------------------------
1 | bq wait --fail_on_error job_id
--------------------------------------------------------------------------------
/ch05/81.sh:
--------------------------------------------------------------------------------
1 | bq wait --fail_on_error job_id 600
--------------------------------------------------------------------------------
/ch05/82.sh:
--------------------------------------------------------------------------------
1 | bq mk --location eu ch10eu
2 |
--------------------------------------------------------------------------------
/ch05/83.sh:
--------------------------------------------------------------------------------
1 | bq mk --transfer_config --data_source=cross_region_copy \
2 | --params='{"source_dataset_id": "iowa_liquor_sales", "source_project_id": "bigquery-public-data"}' \
3 | --target_dataset=ch10eu --display_name=liquor \
4 | --schedule_end_time="$(date -v +1H -u +%Y-%m-%dT%H:%M:%SZ)"
--------------------------------------------------------------------------------
/ch05/84.sh:
--------------------------------------------------------------------------------
1 | bq query \
2 | --use_legacy_sql=false \
3 | 'SELECT MAX(duration) FROM
4 | `bigquery-public-data`.london_bicycles.cycle_hire'
5 |
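6 | # (Added note:) the single quotes keep the shell from treating the SQL
7 | # backticks as command substitution; inside double quotes they would have
8 | # to be escaped as \` (compare 90.sh).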
--------------------------------------------------------------------------------
/ch05/84.sql:
--------------------------------------------------------------------------------
1 | CREATE OR REPLACE TABLE dataset.table_restored
2 | AS
3 | SELECT *
4 | FROM dataset.table
5 | FOR SYSTEM_TIME AS OF
6 | TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL -1 DAY)
--------------------------------------------------------------------------------
/ch05/85.sh:
--------------------------------------------------------------------------------
1 | bq show --schema dataset.table # schema.json
2 | bq --format=json show dataset.table # tabledef.json
3 | bq extract --destination_format=AVRO \
4 | dataset.table gs://.../data_*.avro # AVRO files
--------------------------------------------------------------------------------
/ch05/86.sh:
--------------------------------------------------------------------------------
1 | bq load --source_format=AVRO \
2 | --time_partitioning_expiration … \
3 | --time_partitioning_field … \
4 | --time_partitioning_type … \
5 | --clustering_fields … \
6 | --schema … \
7 | todataset.table_name \
8 | gs://.../data_*.avro
--------------------------------------------------------------------------------
/ch05/87.sh:
--------------------------------------------------------------------------------
1 | bq insert ch05.rentals_last_hour data.json
--------------------------------------------------------------------------------
/ch05/88.sh:
--------------------------------------------------------------------------------
1 | bq extract --destination_format=NEWLINE_DELIMITED_JSON ch05.bad_bikes gs://bad_bikes.json
--------------------------------------------------------------------------------
/ch05/89.sh:
--------------------------------------------------------------------------------
1 | bq query \
2 | --use_legacy_sql=false \
3 | 'SELECT MAX(duration) FROM
4 | `bigquery-public-data`.london_bicycles.cycle_hire'
--------------------------------------------------------------------------------
/ch05/90.sh:
--------------------------------------------------------------------------------
1 | echo "SELECT MAX(duration) FROM \
2 | \`bigquery-public-data\`.london_bicycles.cycle_hire" \
3 | | bq query --use_legacy_sql=false
--------------------------------------------------------------------------------
/ch05/91.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | read -d '' QUERY_TEXT << EOF
3 | SELECT
4 | start_station_name
5 | , AVG(duration) as duration
6 | , COUNT(duration) as num_trips
7 | FROM \`bigquery-public-data\`.london_bicycles.cycle_hire
8 | GROUP BY start_station_name
9 | ORDER BY num_trips DESC
10 | LIMIT 5
11 | EOF
12 | bq query --project_id=some_project --use_legacy_sql=false "$QUERY_TEXT"
--------------------------------------------------------------------------------
/ch05/92.bigqueryrc:
--------------------------------------------------------------------------------
1 | --location=EU
2 | --project_id=some_project
3 | [mk]
4 | --expiration=3600
5 | [query]
6 | --use_legacy_sql=false
--------------------------------------------------------------------------------
/ch05/93.sh:
--------------------------------------------------------------------------------
1 | bq head -n 10 ch05.bad_bikes
--------------------------------------------------------------------------------
/ch05/94.sh:
--------------------------------------------------------------------------------
1 | bq head -s 10 -n 10 ch05.bad_bikes
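2 | # (Added note:) -s skips the first 10 rows and -n prints the next 10, so
3 | # this shows rows 10-19 of ch05.bad_bikes.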
--------------------------------------------------------------------------------
/ch05/95.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | read -d '' QUERY_TEXT << EOF
3 | SELECT
4 | start_station_name
5 | , duration/60 AS duration_minutes
6 | FROM \`bigquery-public-data\`.london_bicycles.cycle_hire
7 | EOF
8 | bq mk --view="$QUERY_TEXT" ch05.rental_duration
--------------------------------------------------------------------------------
/ch05/96.sh:
--------------------------------------------------------------------------------
1 | bq update --description "Bikes that need repair" ch05.bad_bikes
--------------------------------------------------------------------------------
/ch05/97.sh:
--------------------------------------------------------------------------------
1 | bq update \
2 | --view "SELECT ..." \
3 | ch05.rental_duration
--------------------------------------------------------------------------------
/ch05/98.sh:
--------------------------------------------------------------------------------
1 | bq update --reservation --location=US \
2 | --project_id=some_project \
3 | --reservation_size=2000000000
--------------------------------------------------------------------------------
/ch05/bq_query.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | read -d '' QUERY_TEXT << EOF
4 | SELECT
5 | start_station_name
6 | , AVG(duration) as duration
7 | , COUNT(duration) as num_trips
8 | FROM \`bigquery-public-data\`.london_bicycles.cycle_hire
9 | GROUP BY start_station_name
10 | ORDER BY num_trips DESC
11 | LIMIT 5
12 | EOF
13 |
14 | bq query --use_legacy_sql=false "$QUERY_TEXT"
15 |
--------------------------------------------------------------------------------
/ch05/find_url.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | IMAGE=--image-family=tf-latest-cpu
4 | INSTANCE_NAME=dlvm
5 | MAIL=vlakshmanan@google.com # CHANGE THIS
6 |
7 | echo "Looking for Jupyter URL on $INSTANCE_NAME"
8 | while true; do
9 | proxy=$(gcloud compute instances describe ${INSTANCE_NAME} 2> /dev/null | grep dot-datalab-vm)
10 | if [ -z "$proxy" ]
11 | then
12 | echo -n "."
13 | sleep 1
14 | else
15 | echo "done!"
16 | echo "$proxy" 17 | break 18 | fi 19 | done 20 | -------------------------------------------------------------------------------- /ch05/launch_notebook.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gcloud beta notebooks instances create bqhook \ 4 | --location=us-west1-a \ 5 | --vm-image-family=tf-latest-cpu \ 6 | --vm-image-project=deeplearning-platform-release \ 7 | --machine-type=n1-standard-2 8 | -------------------------------------------------------------------------------- /ch05/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | -------------------------------------------------------------------------------- /ch05/rest_list.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT=$(gcloud config get-value project) 4 | 5 | access_token=$(gcloud auth application-default print-access-token) 6 | 7 | curl -H "Authorization: Bearer $access_token" \ 8 | -H "Content-Type: application/json" \ 9 | -X GET "https://www.googleapis.com/bigquery/v2/projects/$PROJECT/datasets/ch04/tables" 10 | 11 | -------------------------------------------------------------------------------- /ch06/01.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | COUNT(*) 3 | , start_station_name 4 | FROM 5 | `bigquery-public-data`.london_bicycles.cycle_hire 6 | GROUP BY 2 7 | ORDER BY 1 DESC 8 | LIMIT 10 -------------------------------------------------------------------------------- /ch06/02.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | COUNT(*) 3 | , starts.start_station_id AS point_a 4 | , ends.start_station_id AS point_b 5 | FROM 6 | `bigquery-public-data`.london_bicycles.cycle_hire starts, 7 | `bigquery-public-data`.london_bicycles.cycle_hire ends 8 | WHERE 9 | starts.start_station_id = ends.end_station_id 10 | AND ends.start_station_id = starts.end_station_id 11 | AND starts.start_station_id <> ends.start_station_id 12 | AND starts.start_date = ends.start_date 13 | GROUP BY 2, 3 14 | ORDER BY 1 DESC 15 | LIMIT 10 -------------------------------------------------------------------------------- /ch06/03.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) as c 2 | FROM `bigquery-public-data`.new_york_taxi_trips.tlc_yellow_trips_2017 3 | WHERE passenger_count > 5 -------------------------------------------------------------------------------- /ch06/04.sh: -------------------------------------------------------------------------------- 1 | bq --format=prettyjson show -j \ 2 | | grep completedParallelInputs -------------------------------------------------------------------------------- /ch06/05.sql: -------------------------------------------------------------------------------- 1 | SELECT title, COUNT(title) AS c 2 | FROM `bigquery-samples.wikipedia_benchmark.Wiki1B` 3 | WHERE title LIKE "G%o%o%g%l%e" 4 | GROUP BY title 5 | ORDER BY c DESC -------------------------------------------------------------------------------- /ch06/06.sql: -------------------------------------------------------------------------------- 1 | SELECT title, COUNT(title) AS c 2 | FROM `bigquery-samples.wikipedia_benchmark.Wiki1B` 3 | GROUP BY title 4 | ORDER BY c DESC -------------------------------------------------------------------------------- /ch06/07.sql: 
-------------------------------------------------------------------------------- 1 | WITH 2 | repo_commits AS ( 3 | SELECT repos AS repo_name, author.name AS author 4 | FROM `bigquery-public-data.github_repos.commits` c, c.repo_name repos 5 | WHERE author.name IN ("Valliappa Lakshmanan", "Jordan Tigani") 6 | GROUP BY repos, author 7 | ), 8 | repo_languages AS ( 9 | SELECT lang.name AS lang, lang.bytes AS lang_bytes, repos.repo_name AS repo_name 10 | FROM `bigquery-public-data.github_repos.languages` repos, repos.LANGUAGE AS lang 11 | ) 12 | 13 | SELECT lang, author, SUM(lang_bytes) AS total_bytes 14 | FROM repo_languages 15 | JOIN repo_commits USING (repo_name) 16 | GROUP BY lang, author 17 | ORDER BY total_bytes DESC -------------------------------------------------------------------------------- /ch06/08.sql: -------------------------------------------------------------------------------- 1 | SELECT lang, author, SUM(lang_bytes) AS total_bytes 2 | FROM repo_languages 3 | JOIN repo_commits USING (repo_name) 4 | GROUP BY lang, author 5 | ORDER BY total_bytes DESC -------------------------------------------------------------------------------- /ch06/09.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | repo_commits AS ( 3 | SELECT repos AS repo_name, author.name AS author 4 | FROM `bigquery-public-data.github_repos.commits` c, c.repo_name repos 5 | -- WHERE author.name IN ("Valliappa Lakshmanan", "Jordan Tigani") 6 | GROUP BY repos, author 7 | ), 8 | repo_languages AS ( 9 | SELECT lang.name AS lang, lang.bytes AS lang_bytes, repos.repo_name AS repo_name 10 | FROM `bigquery-public-data.github_repos.languages` repos, repos.LANGUAGE AS lang 11 | ) 12 | 13 | SELECT lang, author, SUM(lang_bytes) AS total_bytes 14 | FROM repo_languages 15 | JOIN repo_commits USING (repo_name) 16 | GROUP BY lang, author 17 | ORDER BY total_bytes DESC 18 | LIMIT 100 -------------------------------------------------------------------------------- /ch06/10.sql: -------------------------------------------------------------------------------- 1 | SELECT orders.order_id 2 | FROM retail.orders AS orders JOIN retail.customers 3 | ON orders.customer_id = customers.customer_id 4 | WHERE customers.customer_name = 'Jordan Tigani' 5 | -------------------------------------------------------------------------------- /ch07/01.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | job_id 3 | , query 4 | , user_email 5 | , total_bytes_processed 6 | , total_slot_ms 7 | FROM `some-project`.INFORMATION_SCHEMA.JOBS_BY_PROJECT 8 | WHERE EXTRACT(YEAR FROM creation_time) = 2019 9 | ORDER BY total_bytes_processed DESC 10 | LIMIT 5 -------------------------------------------------------------------------------- /ch07/02.sh: -------------------------------------------------------------------------------- 1 | read -d '' QUERY_TEXT << EOF 2 | SELECT 3 | start_station_name 4 | , AVG(duration) as duration 5 | , COUNT(duration) as num_trips 6 | FROM \`bigquery-public-data\`.london_bicycles.cycle_hire 7 | GROUP BY start_station_name 8 | ORDER BY num_trips DESC 9 | LIMIT 5 10 | EOF 11 | 12 | read -d '' request << EOF 13 | { 14 | "useLegacySql": false, 15 | "useQueryCache": false, 16 | "query": \"${QUERY_TEXT}\" 17 | } 18 | EOF 19 | request=$(echo "$request" | tr '\n' ' ') -------------------------------------------------------------------------------- /ch07/03.sh: -------------------------------------------------------------------------------- 1 | 
access_token=$(gcloud auth application-default print-access-token) 2 | PROJECT=$(gcloud config get-value project) -------------------------------------------------------------------------------- /ch07/04.sh: -------------------------------------------------------------------------------- 1 | NUM_TIMES=10 2 | time for i in $(seq 1 $NUM_TIMES); do 3 | echo -en "\r ... $i / $NUM_TIMES ..." 4 | curl --silent \ 5 | -H "Authorization: Bearer $access_token" \ 6 | -H "Content-Type: application/json" \ 7 | -X POST \ 8 | -d "$request" \ 9 | "https://www.googleapis.com/bigquery/v2/projects/$PROJECT/queries" > /dev/null 10 | done -------------------------------------------------------------------------------- /ch07/05.sh: -------------------------------------------------------------------------------- 1 | read -d '' request << EOF 2 | { 3 | "useLegacySql": false, 4 | "useQueryCache": true, 5 | "query": \"${QUERY_TEXT}\" 6 | } 7 | EOF -------------------------------------------------------------------------------- /ch07/06.sh: -------------------------------------------------------------------------------- 1 | git clone https://github.com/GoogleCloudPlatform/pontem.git 2 | cd pontem/BigQueryWorkloadTester 3 | gradle clean :BigQueryWorkloadTester:build -------------------------------------------------------------------------------- /ch07/07.sh: -------------------------------------------------------------------------------- 1 | cat <<EOF > queries/busystations.sql 2 | SELECT 3 | start_station_name 4 | , AVG(duration) as duration 5 | , COUNT(duration) as num_trips 6 | FROM \`bigquery-public-data\`.london_bicycles.cycle_hire 7 | GROUP BY start_station_name 8 | ORDER BY num_trips DESC 9 | LIMIT 5 10 | EOF -------------------------------------------------------------------------------- /ch07/08.sh: -------------------------------------------------------------------------------- 1 | cat <<EOF > ./config.yaml 2 | concurrencyLevel: 1 3 | isRatioBasedBenchmark: true 4 | benchmarkRatios: [1.0, 2.0] 5 | outputFileFolder: $OUTDIR 6 | workloads: 7 | - name: "Busy stations" 8 | projectId: $PROJECT 9 | queryFiles: 10 | - queries/busystations.sql 11 | outputFileName: busystations.json 12 | EOF 13 | -------------------------------------------------------------------------------- /ch07/09.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | start_station_name, 3 | AVG(duration) AS duration, 4 | COUNT(duration) AS num_trips 5 | FROM 6 | `bigquery-public-data`.london_bicycles.cycle_hire 7 | GROUP BY 8 | start_station_name 9 | ORDER BY 10 | num_trips DESC 11 | LIMIT 12 | 5 -------------------------------------------------------------------------------- /ch07/10.sh: -------------------------------------------------------------------------------- 1 | JOBID=8adbf3fd-e310-44bb-9c6e-88254958ccac # CHANGE 2 | access_token=$(gcloud auth application-default print-access-token) 3 | PROJECT=$(gcloud config get-value project) 4 | curl --silent \ 5 | -H "Authorization: Bearer $access_token" \ 6 | -X GET \ 7 | "https://www.googleapis.com/bigquery/v2/projects/$PROJECT/jobs/$JOBID" -------------------------------------------------------------------------------- /ch07/11.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | bike_id 3 | , duration 4 | FROM 5 | `bigquery-public-data`.london_bicycles.cycle_hire 6 | ORDER BY duration DESC 7 | LIMIT 1 -------------------------------------------------------------------------------- /ch07/12.sql:
-------------------------------------------------------------------------------- 1 | SELECT 2 | * 3 | FROM 4 | `bigquery-public-data`.london_bicycles.cycle_hire 5 | ORDER BY duration DESC 6 | LIMIT 1 -------------------------------------------------------------------------------- /ch07/13.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | MIN(start_station_name) AS start_station_name 3 | , MIN(end_station_name) AS end_station_name 4 | , APPROX_QUANTILES(duration, 10)[OFFSET(5)] AS typical_duration 5 | , COUNT(duration) AS num_trips 6 | FROM 7 | `bigquery-public-data`.london_bicycles.cycle_hire 8 | WHERE 9 | start_station_id != end_station_id 10 | GROUP BY 11 | start_station_id, end_station_id 12 | ORDER BY num_trips DESC 13 | LIMIT 10 -------------------------------------------------------------------------------- /ch07/14.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | start_station_name 3 | , end_station_name 4 | , APPROX_QUANTILES(duration, 10)[OFFSET(5)] AS typical_duration 5 | , COUNT(duration) AS num_trips 6 | FROM 7 | `bigquery-public-data`.london_bicycles.cycle_hire 8 | WHERE 9 | start_station_name != end_station_name 10 | GROUP BY 11 | start_station_name, end_station_name 12 | ORDER BY num_trips DESC 13 | LIMIT 10 -------------------------------------------------------------------------------- /ch07/15.sql: -------------------------------------------------------------------------------- 1 | WITH trip_distance AS ( 2 | SELECT 3 | bike_id 4 | , ST_Distance(ST_GeogPoint(s.longitude, s.latitude), 5 | ST_GeogPoint(e.longitude, e.latitude)) AS distance 6 | FROM 7 | `bigquery-public-data`.london_bicycles.cycle_hire, 8 | `bigquery-public-data`.london_bicycles.cycle_stations s, 9 | `bigquery-public-data`.london_bicycles.cycle_stations e 10 | WHERE 11 | start_station_id = s.id 12 | AND end_station_id = e.id 13 | ) 14 | 15 | SELECT 16 | bike_id 17 | , SUM(distance)/1000 AS total_distance 18 | FROM trip_distance 19 | GROUP BY bike_id 20 | ORDER BY total_distance DESC 21 | LIMIT 5 -------------------------------------------------------------------------------- /ch07/16.sql: -------------------------------------------------------------------------------- 1 | WITH stations AS ( 2 | SELECT 3 | s.id AS start_id 4 | , e.id AS end_id 5 | , ST_Distance(ST_GeogPoint(s.longitude, s.latitude), 6 | ST_GeogPoint(e.longitude, e.latitude)) AS distance 7 | FROM 8 | `bigquery-public-data`.london_bicycles.cycle_stations s, 9 | `bigquery-public-data`.london_bicycles.cycle_stations e 10 | ), 11 | 12 | trip_distance AS ( 13 | SELECT 14 | bike_id 15 | , distance 16 | FROM 17 | `bigquery-public-data`.london_bicycles.cycle_hire, 18 | stations 19 | WHERE 20 | start_station_id = start_id 21 | AND end_station_id = end_id 22 | ) 23 | 24 | SELECT 25 | bike_id 26 | , SUM(distance)/1000 AS total_distance 27 | FROM trip_distance 28 | GROUP BY bike_id 29 | ORDER BY total_distance DESC 30 | LIMIT 5 -------------------------------------------------------------------------------- /ch07/17.sql: -------------------------------------------------------------------------------- 1 | WITH oneway AS ( 2 | SELECT EXTRACT(date FROM start_date) AS rental_date, 3 | duration, start_station_name, end_station_name 4 | FROM 5 | ch07eu.cycle_hire 6 | WHERE start_station_name != end_station_name 7 | ) 8 | 9 | SELECT 10 | rental_date, AVG(duration) AS avg_duration, 11 | start_station_name, end_station_name 12 | FROM oneway 13 | WHERE rental_date 
BETWEEN '2015-01-01' AND '2015-01-07' 14 | GROUP BY rental_date, start_station_name, end_station_name -------------------------------------------------------------------------------- /ch07/18.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE MATERIALIZED VIEW ch07eu.oneway_rentals 2 | AS 3 | 4 | WITH oneway AS ( 5 | SELECT EXTRACT(date FROM start_date) AS rental_date, 6 | duration, start_station_name, end_station_name 7 | FROM 8 | ch07eu.cycle_hire 9 | WHERE start_station_name != end_station_name 10 | ) 11 | 12 | SELECT 13 | rental_date, AVG(duration) AS avg_duration, 14 | start_station_name, end_station_name 15 | FROM oneway 16 | GROUP BY rental_date, start_station_name, end_station_name -------------------------------------------------------------------------------- /ch07/19.sql: -------------------------------------------------------------------------------- 1 | WITH typical_trip AS ( 2 | SELECT 3 | start_station_name 4 | , end_station_name 5 | , APPROX_QUANTILES(duration, 10)[OFFSET(5)] AS typical_duration 6 | , COUNT(duration) AS num_trips 7 | FROM 8 | `bigquery-public-data`.london_bicycles.cycle_hire 9 | GROUP BY 10 | start_station_name, end_station_name 11 | ) -------------------------------------------------------------------------------- /ch07/20.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE TABLE ch07eu.typical_trip AS 2 | SELECT 3 | start_station_name 4 | , end_station_name 5 | , APPROX_QUANTILES(duration, 10)[OFFSET(5)] AS typical_duration 6 | , COUNT(duration) AS num_trips 7 | FROM 8 | `bigquery-public-data`.london_bicycles.cycle_hire 9 | GROUP BY 10 | start_station_name, end_station_name -------------------------------------------------------------------------------- /ch07/21.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | EXTRACT (DATE FROM start_date) AS trip_date 3 | , APPROX_QUANTILES(duration / typical_duration, 10)[OFFSET(5)] AS ratio 4 | , COUNT(*) AS num_trips_on_day 5 | FROM 6 | `bigquery-public-data`.london_bicycles.cycle_hire AS hire 7 | JOIN typical_trip AS trip 8 | ON 9 | hire.start_station_name = trip.start_station_name 10 | AND hire.end_station_name = trip.end_station_name 11 | AND num_trips > 10 12 | GROUP BY trip_date 13 | HAVING num_trips_on_day > 10 14 | ORDER BY ratio DESC 15 | LIMIT 10 -------------------------------------------------------------------------------- /ch07/22.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | EXTRACT (DATE FROM start_date) AS trip_date 3 | , APPROX_QUANTILES(duration / typical_duration, 10)[OFFSET(5)] AS ratio 4 | , COUNT(*) AS num_trips_on_day 5 | FROM 6 | `bigquery-public-data`.london_bicycles.cycle_hire AS hire 7 | JOIN ch07eu.typical_trip AS trip 8 | ON 9 | hire.start_station_name = trip.start_station_name 10 | AND hire.end_station_name = trip.end_station_name 11 | AND num_trips > 10 12 | GROUP BY trip_date 13 | HAVING num_trips_on_day > 10 14 | ORDER BY ratio DESC 15 | LIMIT 10 -------------------------------------------------------------------------------- /ch07/23.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE TABLE ch07eu.london_bicycles_denorm AS 2 | SELECT 3 | start_station_id 4 | , s.latitude AS start_latitude 5 | , s.longitude AS start_longitude 6 | , end_station_id 7 | , e.latitude AS end_latitude 8 | , e.longitude AS end_longitude
9 | FROM 10 | `bigquery-public-data`.london_bicycles.cycle_hire AS h 11 | JOIN 12 | `bigquery-public-data`.london_bicycles.cycle_stations AS s 13 | ON 14 | h.start_station_id = s.id 15 | JOIN 16 | `bigquery-public-data`.london_bicycles.cycle_stations AS e 17 | ON 18 | h.end_station_id = e.id -------------------------------------------------------------------------------- /ch07/24.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | name 3 | , number AS num_babies 4 | FROM `bigquery-public-data`.usa_names.usa_1910_current 5 | WHERE gender = 'M' AND year = 2015 AND state = 'MA' 6 | ORDER BY num_babies DESC 7 | LIMIT 5 -------------------------------------------------------------------------------- /ch07/25.sql: -------------------------------------------------------------------------------- 1 | WITH male_babies AS ( 2 | SELECT 3 | name 4 | , number AS num_babies 5 | FROM `bigquery-public-data`.usa_names.usa_1910_current 6 | WHERE gender = 'M' 7 | ), 8 | female_babies AS ( 9 | SELECT 10 | name 11 | , number AS num_babies 12 | FROM `bigquery-public-data`.usa_names.usa_1910_current 13 | WHERE gender = 'F' 14 | ), 15 | both_genders AS ( 16 | SELECT 17 | name 18 | , SUM(m.num_babies) + SUM(f.num_babies) AS num_babies 19 | , SUM(m.num_babies) / (SUM(m.num_babies) + SUM(f.num_babies)) AS frac_male 20 | FROM male_babies AS m 21 | JOIN female_babies AS f 22 | USING (name) 23 | GROUP BY name 24 | ) 25 | 26 | SELECT * FROM both_genders 27 | WHERE frac_male BETWEEN 0.3 AND 0.7 28 | ORDER BY num_babies DESC 29 | LIMIT 5 -------------------------------------------------------------------------------- /ch07/26.sql: -------------------------------------------------------------------------------- 1 | WITH all_babies AS ( 2 | SELECT 3 | name 4 | , SUM(IF(gender = 'M', number, 0)) AS male_babies 5 | , SUM(IF(gender = 'F', number, 0)) AS female_babies 6 | FROM `bigquery-public-data.usa_names.usa_1910_current` 7 | GROUP BY name 8 | ), 9 | 10 | both_genders AS ( 11 | SELECT 12 | name 13 | , (male_babies + female_babies) AS num_babies 14 | , SAFE_DIVIDE(male_babies, male_babies + female_babies) AS frac_male 15 | FROM all_babies 16 | WHERE male_babies > 0 AND female_babies > 0 17 | ) 18 | 19 | SELECT * FROM both_genders 20 | WHERE frac_male BETWEEN 0.3 and 0.7 21 | ORDER BY num_babies DESC 22 | limit 5 -------------------------------------------------------------------------------- /ch07/27.sql: -------------------------------------------------------------------------------- 1 | with all_names AS ( 2 | SELECT name, gender, SUM(number) AS num_babies 3 | FROM `bigquery-public-data`.usa_names.usa_1910_current 4 | GROUP BY name, gender 5 | ), 6 | 7 | male_names AS ( 8 | SELECT name, num_babies 9 | FROM all_names 10 | WHERE gender = 'M' 11 | ), 12 | 13 | female_names AS ( 14 | SELECT name, num_babies 15 | FROM all_names 16 | WHERE gender = 'F' 17 | ), 18 | 19 | ratio AS ( 20 | SELECT 21 | name 22 | , (f.num_babies + m.num_babies) AS num_babies 23 | , m.num_babies / (f.num_babies + m.num_babies) AS frac_male 24 | FROM male_names AS m 25 | JOIN female_names AS f 26 | USING (name) 27 | ) 28 | 29 | SELECT * from ratio 30 | WHERE frac_male BETWEEN 0.3 and 0.7 31 | ORDER BY num_babies DESC 32 | LIMIT 5 -------------------------------------------------------------------------------- /ch07/28.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | bike_id 3 | , start_date 4 | , end_date 5 | , TIMESTAMP_DIFF( 6 | start_date, 7 | 
LAG(end_date) OVER (PARTITION BY bike_id ORDER BY start_date), 8 | SECOND) AS time_at_station 9 | FROM `bigquery-public-data`.london_bicycles.cycle_hire 10 | LIMIT 5 -------------------------------------------------------------------------------- /ch07/29.sql: -------------------------------------------------------------------------------- 1 | WITH unused AS ( 2 | SELECT 3 | bike_id 4 | , start_station_name 5 | , start_date 6 | , end_date 7 | , TIMESTAMP_DIFF(start_date, LAG(end_date) OVER (PARTITION BY bike_id ORDER BY start_date), SECOND) AS time_at_station 8 | FROM `bigquery-public-data`.london_bicycles.cycle_hire 9 | ) 10 | 11 | SELECT 12 | start_station_name 13 | , AVG(time_at_station) AS unused_seconds 14 | FROM unused 15 | GROUP BY start_station_name 16 | ORDER BY unused_seconds ASC 17 | LIMIT 5 -------------------------------------------------------------------------------- /ch07/32.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE TABLE ch07eu.cycle_hire AS 2 | SELECT 3 | start_station_name 4 | , end_station_name 5 | , ST_DISTANCE(ST_GeogPoint(s1.longitude, s1.latitude), 6 | ST_GeogPoint(s2.longitude, s2.latitude)) AS distance 7 | , duration 8 | FROM 9 | `bigquery-public-data`.london_bicycles.cycle_hire AS h 10 | JOIN 11 | `bigquery-public-data`.london_bicycles.cycle_stations AS s1 12 | ON h.start_station_id = s1.id 13 | JOIN 14 | `bigquery-public-data`.london_bicycles.cycle_stations AS s2 15 | ON h.end_station_id = s2.id -------------------------------------------------------------------------------- /ch07/33.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | rental_id 3 | , ROW_NUMBER() OVER(ORDER BY end_date) AS rental_number 4 | FROM `bigquery-public-data`.london_bicycles.cycle_hire 5 | ORDER BY rental_number ASC 6 | LIMIT 5 -------------------------------------------------------------------------------- /ch07/34.sql: -------------------------------------------------------------------------------- 1 | WITH rentals_on_day AS ( 2 | SELECT 3 | rental_id 4 | , end_date 5 | , EXTRACT(DATE FROM end_date) AS rental_date 6 | FROM `bigquery-public-data.london_bicycles.cycle_hire` 7 | ) 8 | 9 | SELECT 10 | rental_id 11 | , rental_date 12 | , ROW_NUMBER() OVER(PARTITION BY rental_date ORDER BY end_date) AS 13 | rental_number_on_day 14 | FROM rentals_on_day 15 | ORDER BY rental_date ASC, rental_number_on_day ASC 16 | LIMIT 5 -------------------------------------------------------------------------------- /ch07/35.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | repo_name 3 | , ARRAY_AGG(STRUCT(author, committer, subject, message, trailer, difference, encoding) ORDER BY author.date.seconds) 4 | FROM `bigquery-public-data.github_repos.commits`, UNNEST(repo_name) AS repo_name 5 | GROUP BY repo_name -------------------------------------------------------------------------------- /ch07/36.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | author.tz_offset, ARRAY_AGG(STRUCT(author, committer, subject, message, trailer, difference, encoding) ORDER BY author.date.seconds) 3 | FROM `bigquery-public-data.github_repos.commits` 4 | GROUP BY author.tz_offset -------------------------------------------------------------------------------- /ch07/37.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | author.tz_offset, ARRAY_AGG(STRUCT(author, 
committer, subject, message, trailer, difference, encoding) ORDER BY author.date.seconds LIMIT 1000) 3 | FROM `bigquery-public-data.github_repos.commits` 4 | GROUP BY author.tz_offset -------------------------------------------------------------------------------- /ch07/38.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | repo_name, author.tz_offset 3 | , ARRAY_AGG(STRUCT(author, committer, subject, message, trailer, difference, encoding) ORDER BY author.date.seconds) 4 | FROM `bigquery-public-data.github_repos.commits`, UNNEST(repo_name) AS repo_name 5 | GROUP BY repo_name, author.tz_offset -------------------------------------------------------------------------------- /ch07/39.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | COUNT(DISTINCT repo_name) AS num_repos 3 | FROM `bigquery-public-data`.github_repos.commits, UNNEST(repo_name) AS repo_name -------------------------------------------------------------------------------- /ch07/40.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | APPROX_COUNT_DISTINCT(repo_name) AS num_repos 3 | FROM `bigquery-public-data`.github_repos.commits, UNNEST(repo_name) AS repo_name -------------------------------------------------------------------------------- /ch07/41.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | COUNT(DISTINCT bike_id) AS num_bikes 3 | FROM `bigquery-public-data`.london_bicycles.cycle_hire -------------------------------------------------------------------------------- /ch07/42.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | APPROX_COUNT_DISTINCT(bike_id) AS num_bikes 3 | FROM `bigquery-public-data`.london_bicycles.cycle_hire -------------------------------------------------------------------------------- /ch07/43.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | APPROX_TOP_COUNT(bike_id, 5) AS num_bikes 3 | FROM `bigquery-public-data`.london_bicycles.cycle_hire -------------------------------------------------------------------------------- /ch07/44.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | APPROX_TOP_SUM(start_station_name, duration, 5) AS num_bikes 3 | FROM `bigquery-public-data`.london_bicycles.cycle_hire 4 | WHERE duration > 0 -------------------------------------------------------------------------------- /ch07/45.sql: -------------------------------------------------------------------------------- 1 | WITH sketch AS ( 2 | SELECT 3 | HLL_COUNT.INIT(start_station_name) AS hll_start 4 | , HLL_COUNT.INIT(end_station_name) AS hll_end 5 | FROM `bigquery-public-data`.london_bicycles.cycle_hire 6 | ) 7 | 8 | SELECT 9 | HLL_COUNT.MERGE(hll_start) AS distinct_start 10 | , HLL_COUNT.MERGE(hll_end) AS distinct_end 11 | , HLL_COUNT.MERGE(hll_both) AS distinct_station 12 | FROM sketch, UNNEST([hll_start, hll_end]) AS hll_both -------------------------------------------------------------------------------- /ch07/46.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | APPROX_COUNT_DISTINCT(start_station_name) AS distinct_start 3 | , APPROX_COUNT_DISTINCT(end_station_name) AS distinct_end 4 | , APPROX_COUNT_DISTINCT(both_stations) AS distinct_station 5 | FROM 6 | `bigquery-public-data`.london_bicycles.cycle_hire 7 | , 
UNNEST([start_station_name, end_station_name]) AS both_stations -------------------------------------------------------------------------------- /ch07/47.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO approx_distinct_users_agg 2 | SELECT date, product, country, HLL_COUNT.INIT(user_id) AS sketch 3 | FROM user_events -- FROM clause added; source table name is a placeholder 4 | GROUP BY date, product, country -------------------------------------------------------------------------------- /ch07/48.sql: -------------------------------------------------------------------------------- 1 | SELECT date, HLL_COUNT.MERGE(sketch) 2 | FROM approx_distinct_users_agg 3 | GROUP BY date -------------------------------------------------------------------------------- /ch07/49.sh: -------------------------------------------------------------------------------- 1 | JOBSURL="https://www.googleapis.com/bigquery/v2/projects/$PROJECT/jobs" 2 | FIELDS="statistics(query(queryPlan(steps)))" 3 | curl --silent \ 4 | -H "Authorization: Bearer $access_token" \ 5 | -H "Accept-Encoding: gzip" \ 6 | -H "User-Agent: get_job_details (gzip)" \ 7 | -X GET \ 8 | "${JOBSURL}/${JOBID}?fields=${FIELDS}" \ 9 | | zcat -------------------------------------------------------------------------------- /ch07/50.sh: -------------------------------------------------------------------------------- 1 | # The five most recently succeeded jobs 2 | JOBS=$(bq ls -j -n 50 | grep SUCCESS | head -5 | awk '{print $1}') 3 | 4 | BATCHURL="https://www.googleapis.com/batch/bigquery/v2" 5 | JOBSPATH="/projects/$PROJECT/jobs" 6 | FIELDS="statistics(query(queryPlan(steps)))" 7 | 8 | request="" 9 | for JOBID in $JOBS; do 10 | read -d '' part << EOF 11 | --batch_part_starts_here 12 | GET ${JOBSPATH}/${JOBID}?fields=${FIELDS} 13 | EOF 14 | request=$(echo "$request"; echo "$part") 15 | done 16 | 17 | curl --silent \ 18 | -H "Authorization: Bearer $access_token" \ 19 | -H "Content-Type: multipart/mixed; boundary=batch_part_starts_here" \ 20 | -X POST \ 21 | -d "$request" \ 22 | "${BATCHURL}" -------------------------------------------------------------------------------- /ch07/51.py: -------------------------------------------------------------------------------- 1 | %%bigquery df --use_bqstorage_api --project $PROJECT 2 | SELECT 3 | start_station_name 4 | , end_station_name 5 | , start_date 6 | , duration 7 | FROM `bigquery-public-data`.london_bicycles.cycle_hire -------------------------------------------------------------------------------- /ch07/52.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | sid, number, basin, name, 3 | ARRAY_AGG(STRUCT(iso_time, usa_latitude, usa_longitude, usa_wind) ORDER BY 4 | usa_wind DESC LIMIT 1)[OFFSET(0)].* 5 | FROM 6 | `bigquery-public-data`.noaa_hurricanes.hurricanes 7 | WHERE 8 | season = '2018' 9 | GROUP BY 10 | sid, number, basin, name 11 | ORDER BY number ASC -------------------------------------------------------------------------------- /ch07/54.sql: -------------------------------------------------------------------------------- 1 | WITH hurricane_detail AS ( 2 | SELECT sid, season, number, basin, name, 3 | ARRAY_AGG( 4 | STRUCT( 5 | iso_time, 6 | nature, 7 | usa_sshs, 8 | STRUCT(usa_latitude AS latitude, usa_longitude AS longitude, usa_wind AS wind, usa_pressure AS pressure) AS usa, 9 | STRUCT(tokyo_latitude AS latitude, tokyo_longitude AS longitude, tokyo_wind AS wind, tokyo_pressure AS pressure) AS tokyo 10 | ) ORDER BY iso_time ASC ) AS obs 11 | FROM
`bigquery-public-data`.noaa_hurricanes.hurricanes 12 | GROUP BY sid, season, number, basin, name 13 | ) 14 | 15 | SELECT 16 | COUNT(sid) AS count_of_storms, 17 | season 18 | FROM hurricane_detail 19 | GROUP BY season 20 | ORDER BY season DESC -------------------------------------------------------------------------------- /ch07/55.sql: -------------------------------------------------------------------------------- 1 | SELECT name, zipcode 2 | FROM `bigquery-public-data`.utility_us.zipcode_area 3 | JOIN `bigquery-public-data`.utility_us.us_cities_area 4 | ON ST_INTERSECTS(ST_GeogFromText(zipcode_geom), city_geom) 5 | WHERE name LIKE '%Santa Fe%' -------------------------------------------------------------------------------- /ch07/56.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE TABLE ch07.zipcode_area AS 2 | SELECT 3 | * REPLACE(ST_GeogFromText(zipcode_geom) AS zipcode_geom) 4 | FROM 5 | `bigquery-public-data`.utility_us.zipcode_area -------------------------------------------------------------------------------- /ch07/57.sql: -------------------------------------------------------------------------------- 1 | SELECT name, zipcode 2 | FROM ch07.zipcode_area 3 | JOIN `bigquery-public-data`.utility_us.us_cities_area 4 | ON ST_INTERSECTS(zipcode_geom, city_geom) 5 | WHERE name LIKE '%Santa Fe%' -------------------------------------------------------------------------------- /ch07/58.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | start_station_name 3 | , AVG(duration) AS avg_duration 4 | FROM `bigquery-public-data`.london_bicycles.cycle_hire 5 | WHERE EXTRACT(YEAR FROM start_date) = 2015 6 | GROUP BY start_station_name 7 | ORDER BY avg_duration DESC 8 | LIMIT 5 -------------------------------------------------------------------------------- /ch07/59.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE TABLE ch07eu.cycle_hire_2015 AS ( 2 | SELECT * FROM `bigquery-public-data`.london_bicycles.cycle_hire 3 | WHERE EXTRACT(YEAR FROM start_date) = 2015 4 | ) -------------------------------------------------------------------------------- /ch07/60.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | start_station_name 3 | , AVG(duration) AS avg_duration 4 | FROM ch07eu.cycle_hire_2015 5 | GROUP BY start_station_name 6 | ORDER BY avg_duration DESC 7 | LIMIT 5 -------------------------------------------------------------------------------- /ch07/61.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | start_station_name 3 | , AVG(duration) AS avg_duration 4 | FROM `ch07eu.cycle_hire_*` 5 | WHERE _TABLE_SUFFIX BETWEEN '2015' AND '2016' 6 | GROUP BY start_station_name 7 | ORDER BY avg_duration DESC 8 | LIMIT 5 -------------------------------------------------------------------------------- /ch07/62.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE TABLE ch07eu.cycle_hire_partitioned 2 | PARTITION BY DATE(start_date) AS 3 | SELECT * FROM `bigquery-public-data`.london_bicycles.cycle_hire -------------------------------------------------------------------------------- /ch07/63.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | start_station_name 3 | , AVG(duration) AS avg_duration 4 | FROM ch07eu.cycle_hire_partitioned 5 | WHERE
start_date BETWEEN '2015-01-01' AND '2015-12-31' 6 | GROUP BY start_station_name 7 | ORDER BY avg_duration DESC 8 | LIMIT 5 -------------------------------------------------------------------------------- /ch07/64.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | start_station_name 3 | , AVG(duration) AS avg_duration 4 | FROM ch07eu.cycle_hire_partitioned 5 | WHERE EXTRACT(YEAR FROM start_date) = 2015 6 | GROUP BY start_station_name 7 | ORDER BY avg_duration DESC 8 | LIMIT 5 -------------------------------------------------------------------------------- /ch07/65.sh: -------------------------------------------------------------------------------- 1 | bq mk --range_partitioning=bike_id,0,1000,5 \ 2 | ch07eu.partition_cycle_hire schema.json -------------------------------------------------------------------------------- /ch07/66.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE TABLE ch07eu.cycle_hire_clustered 2 | PARTITION BY DATE(start_date) 3 | CLUSTER BY start_station_name, end_station_name 4 | AS ( 5 | SELECT * FROM `bigquery-public-data`.london_bicycles.cycle_hire 6 | ) -------------------------------------------------------------------------------- /ch07/67.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | start_station_name 3 | , end_station_name 4 | , AVG(duration) AS duration 5 | FROM ch07eu.cycle_hire_clustered 6 | WHERE 7 | start_station_name LIKE '%Kennington%' 8 | AND end_station_name LIKE '%Hyde%' 9 | GROUP BY start_station_name, end_station_name -------------------------------------------------------------------------------- /ch07/68.sql: -------------------------------------------------------------------------------- 1 | SELECT title, SUM(views) AS views 2 | FROM `fh-bigquery.wikipedia_v3.pageviews_2017` 3 | WHERE DATE(datehour) BETWEEN '2017-06-01' AND '2017-06-30' 4 | AND wiki = 'en' 5 | AND title LIKE '%Liberia%' 6 | GROUP BY title -------------------------------------------------------------------------------- /ch07/69.sql: -------------------------------------------------------------------------------- 1 | MERGE ch07eu.cycle_hire_clustered all_hires 2 | USING ch07eu.cycle_hire_corrections some_month 3 | ON all_hires.start_station_name = some_month.start_station_name 4 | AND DATE(all_hires._PARTITIONTIME) = DATE(some_month.start_date) 5 | WHEN NOT MATCHED THEN 6 | INSERT (rental_id, duration, ...) 7 | VALUES (rental_id, duration, ...) -------------------------------------------------------------------------------- /ch07/70.sql: -------------------------------------------------------------------------------- 1 | SELECT o.* 2 | FROM orders o 3 | JOIN customers c USING (customer_id) 4 | WHERE c.name = "Changying Bao" -------------------------------------------------------------------------------- /ch07/71.py: -------------------------------------------------------------------------------- 1 | BigQueryIO.writeTableRows() 2 | .to("project-id:dataset-id.table-id") 3 | .withCreateDisposition( 4 | BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED) 5 | .withMethod(Method.FILE_LOADS) 6 | .withTriggeringFrequency(Duration.standardSeconds(600)) 7 | .withNumFileShards(10) 8 | .withSchema(new TableSchema()...)
9 | .withoutValidation() -------------------------------------------------------------------------------- /ch07/get_job_details.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2019 Google LLC. 4 | # SPDX-License-Identifier: Apache-2.0 5 | 6 | JOBID=Pontem_BigQuery_WorkloadTester_8adbf3fd-e310-44bb-9c6e-88254958ccac # CHANGE 7 | 8 | access_token=$(gcloud auth application-default print-access-token) 9 | PROJECT=$(gcloud config get-value project) 10 | 11 | curl --silent \ 12 | -H "Authorization: Bearer $access_token" \ 13 | -X GET \ 14 | "https://www.googleapis.com/bigquery/v2/projects/$PROJECT/jobs/${JOBID}" 15 | 16 | -------------------------------------------------------------------------------- /ch07/get_job_details_compressed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2019 Google LLC. 4 | # SPDX-License-Identifier: Apache-2.0 5 | 6 | JOBID=Pontem_BigQuery_WorkloadTester_8adbf3fd-e310-44bb-9c6e-88254958ccac # CHANGE 7 | 8 | access_token=$(gcloud auth application-default print-access-token) 9 | PROJECT=$(gcloud config get-value project) 10 | JOBSURL="https://www.googleapis.com/bigquery/v2/projects/$PROJECT/jobs" 11 | FIELDS="statistics(query(queryPlan(steps)))" 12 | 13 | curl --silent \ 14 | -H "Authorization: Bearer $access_token" \ 15 | -H "Accept-Encoding: gzip" \ 16 | -H "User-Agent: get_job_details (gzip)" \ 17 | -X GET \ 18 | "${JOBSURL}/${JOBID}?fields=${FIELDS}" \ 19 | | zcat 20 | -------------------------------------------------------------------------------- /ch07/google_analytics.sql: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT 2 | visitId 3 | , totals.pageviews 4 | , totals.timeOnSite 5 | , trafficSource.source 6 | , device.browser 7 | , device.isMobile 8 | , h.page.pageTitle 9 | FROM 10 | `bigquery-public-data`.google_analytics_sample.ga_sessions_20170801, 11 | UNNEST(hits) AS h 12 | WHERE 13 | totals.timeOnSite IS NOT NULL AND h.page.pageTitle = 'Shopping Cart' 14 | ORDER BY pageviews DESC 15 | LIMIT 10 16 | 17 | -------------------------------------------------------------------------------- /ch07/install_workload_tester.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #sudo apt-get -y install gradle 4 | git clone https://github.com/GoogleCloudPlatform/pontem.git 5 | cd pontem/BigQueryWorkloadTester 6 | gradle clean :BigQueryWorkloadTester:build 7 | 8 | -------------------------------------------------------------------------------- /ch08/01.py: -------------------------------------------------------------------------------- 1 | query = """ 2 | SELECT 3 | start_station_name 4 | , AVG(duration) as avg_duration 5 | FROM 6 | `bigquery-public-data`.london_bicycles.cycle_hire 7 | WHERE 8 | start_station_name LIKE CONCAT('%', @STATION, '%') 9 | AND duration BETWEEN @MIN_DURATION AND @MAX_DURATION 10 | GROUP BY start_station_name 11 | """ 12 | -------------------------------------------------------------------------------- /ch08/02.py: -------------------------------------------------------------------------------- 1 | query_params = [ 2 | bigquery.ScalarQueryParameter( 3 | "STATION", "STRING", station_name), 4 | bigquery.ScalarQueryParameter( 5 | "MIN_DURATION", "FLOAT64", min_duration), 6 | bigquery.ScalarQueryParameter( 7 | "MAX_DURATION", "FLOAT64", max_duration), 8 | ]
9 | 10 | job_config = bigquery.QueryJobConfig() 11 | job_config.query_parameters = query_params 12 | query_job = client.query( 13 | query, 14 | location="EU", 15 | job_config=job_config, 16 | ) 17 | for row in query_job: 18 | print("{}: \t{}".format( 19 | row.start_station_name, row.avg_duration)) 20 | -------------------------------------------------------------------------------- /ch08/03.py: -------------------------------------------------------------------------------- 1 | def print_query_results(client, 2 | station_name, 3 | min_duration=0, 4 | max_duration=84000): 5 | -------------------------------------------------------------------------------- /ch08/04.py: -------------------------------------------------------------------------------- 1 | client = bigquery.Client() 2 | print_query_results(client, 'Kennington', 300) 3 | print_query_results(client, 'Hyde Park', 600, 6000) 4 | -------------------------------------------------------------------------------- /ch08/05.py: -------------------------------------------------------------------------------- 1 | from google.cloud import bigquery 2 | from datetime import datetime 3 | from datetime import timedelta 4 | import pytz 5 | 6 | 7 | def print_query_results(client, mid_time): 8 | start_time = mid_time - timedelta(minutes=30) 9 | end_time = mid_time + timedelta(minutes=30) 10 | -------------------------------------------------------------------------------- /ch08/06.py: -------------------------------------------------------------------------------- 1 | query = """ 2 | SELECT 3 | AVG(duration) as avg_duration 4 | FROM 5 | `bigquery-public-data`.london_bicycles.cycle_hire 6 | WHERE 7 | start_date BETWEEN @START_TIME AND @END_TIME 8 | """ 9 | query_params = [ 10 | bigquery.ScalarQueryParameter( 11 | "START_TIME", "TIMESTAMP", start_time), 12 | bigquery.ScalarQueryParameter( 13 | "END_TIME", "TIMESTAMP", end_time), 14 | ] 15 | job_config = bigquery.QueryJobConfig() 16 | job_config.query_parameters = query_params 17 | query_job = client.query( 18 | query, 19 | location="EU", 20 | job_config=job_config, 21 | ) 22 | for row in query_job: 23 | print(row.avg_duration) 24 | print("______________________") 25 | -------------------------------------------------------------------------------- /ch08/07.py: -------------------------------------------------------------------------------- 1 | client = bigquery.Client() 2 | print_query_results(client, datetime(2015, 12, 25, 15, 0, tzinfo=pytz.UTC)) 3 | -------------------------------------------------------------------------------- /ch08/08.py: -------------------------------------------------------------------------------- 1 | def print_query_results(client, params): 2 | query = """ 3 | SELECT 4 | start_station_name 5 | , AVG(duration) as avg_duration 6 | FROM 7 | `bigquery-public-data`.london_bicycles.cycle_hire 8 | WHERE 9 | start_station_name LIKE CONCAT('%', ?, '%') 10 | AND duration BETWEEN ? AND ? 
11 | GROUP BY start_station_name 12 | """ 13 | 14 | query_params = [ 15 | bigquery.ScalarQueryParameter( 16 | None, "STRING", params[0]), 17 | bigquery.ScalarQueryParameter( 18 | None, "FLOAT64", params[1]), 19 | bigquery.ScalarQueryParameter( 20 | None, "FLOAT64", params[2]), 21 | ] 22 | -------------------------------------------------------------------------------- /ch08/09.py: -------------------------------------------------------------------------------- 1 | query = """ 2 | SELECT 3 | start_station_id 4 | , COUNT(*) AS num_trips 5 | FROM 6 | `bigquery-public-data`.london_bicycles.cycle_hire 7 | WHERE 8 | start_station_id IN UNNEST(@STATIONS) 9 | AND duration BETWEEN @MIN_DURATION AND @MAX_DURATION 10 | GROUP BY start_station_id 11 | """ 12 | query_params = [ 13 | bigquery.ArrayQueryParameter( 14 | 'STATIONS', "INT64", ids), 15 | bigquery.ScalarQueryParameter( 16 | 'MIN_DURATION', "FLOAT64", min_duration), 17 | bigquery.ScalarQueryParameter( 18 | 'MAX_DURATION', "FLOAT64", max_duration), 19 | ] 20 | -------------------------------------------------------------------------------- /ch08/10.py: -------------------------------------------------------------------------------- 1 | print_query_results(client, [270, 235, 62, 149], 300, 600) -------------------------------------------------------------------------------- /ch08/11.py: -------------------------------------------------------------------------------- 1 | bigquery.StructQueryParameter( 2 | "bicycle_trip", 3 | bigquery.ScalarQueryParameter("start_station_id", "INT64", 62), 4 | bigquery.ScalarQueryParameter("end_station_id", "INT64", 421), 5 | ) 6 | -------------------------------------------------------------------------------- /ch08/12.sql: -------------------------------------------------------------------------------- 1 | CREATE TEMPORARY FUNCTION dayOfWeek(x TIMESTAMP) AS 2 | ( 3 | ['Sun','Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'] 4 | [ORDINAL(EXTRACT(DAYOFWEEK FROM x))] 5 | ); 6 | CREATE TEMPORARY FUNCTION getDate(x TIMESTAMP) AS 7 | ( 8 | EXTRACT(DATE FROM x) 9 | ); -------------------------------------------------------------------------------- /ch08/13.sql: -------------------------------------------------------------------------------- 1 | WITH overnight_trips AS ( 2 | SELECT 3 | duration 4 | , dayOfWeek(start_date) AS start_day 5 | FROM 6 | `bigquery-public-data`.london_bicycles.cycle_hire 7 | WHERE 8 | getDate(start_date) != getDate(end_date) 9 | ) -------------------------------------------------------------------------------- /ch08/14.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | start_day 3 | , COUNT(*) AS num_overnight_rentals 4 | , AVG(duration)/3600 AS avg_duration_hours 5 | FROM 6 | overnight_trips 7 | GROUP BY 8 | start_day 9 | ORDER BY num_overnight_rentals DESC -------------------------------------------------------------------------------- /ch08/15.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE FUNCTION ch08eu.dayOfWeek(x TIMESTAMP) AS 2 | ( 3 | ['Sun','Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'] 4 | [ORDINAL(EXTRACT(DAYOFWEEK FROM x))] 5 | ); -------------------------------------------------------------------------------- /ch08/16.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE FUNCTION fhoffa.x.median (arr ANY TYPE) AS (( 2 | SELECT IF (MOD(ARRAY_LENGTH(arr), 2) = 0, 3 | ( arr[OFFSET(DIV(ARRAY_LENGTH(arr), 2) - 1)] + 4 | 
arr[OFFSET(DIV(ARRAY_LENGTH(arr), 2))] ) / 2, 5 | arr[OFFSET(DIV(ARRAY_LENGTH(arr), 2))] 6 | ) 7 | FROM (SELECT ARRAY_AGG(x ORDER BY x) AS arr FROM UNNEST(arr) AS x) 8 | )); -------------------------------------------------------------------------------- /ch08/17.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | start_station_name 3 | , COUNT(*) AS num_trips 4 | , fhoffa.x.median(ARRAY_AGG(tripduration)) AS typical_duration 5 | FROM `bigquery-public-data`.new_york_citibike.citibike_trips 6 | GROUP BY start_station_name 7 | HAVING num_trips > 1000 8 | ORDER BY typical_duration DESC 9 | LIMIT 5 -------------------------------------------------------------------------------- /ch08/18.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | start_station_name 3 | , COUNT(*) AS num_trips 4 | , bqutil.fn.median(ARRAY_AGG(tripduration)) AS typical_duration 5 | FROM `bigquery-public-data`.new_york_citibike.citibike_trips 6 | GROUP BY start_station_name 7 | HAVING num_trips > 1000 8 | ORDER BY typical_duration DESC 9 | LIMIT 5 -------------------------------------------------------------------------------- /ch08/19.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | start_date, 3 | COUNT(*) AS num_long_trips 4 | FROM -- "the first FROM clause" 5 | (SELECT 6 | start_station_name 7 | , duration 8 | , EXTRACT(DATE FROM start_date) AS start_date 9 | FROM 10 | `bigquery-public-data`.london_bicycles.cycle_hire 11 | WHERE 12 | start_station_name = end_station_name) AS roundtrips 13 | WHERE -- "the outer WHERE" 14 | duration > 2*( 15 | SELECT 16 | AVG(duration) AS avg_duration 17 | FROM 18 | `bigquery-public-data`.london_bicycles.cycle_hire 19 | WHERE 20 | start_station_name = end_station_name 21 | AND roundtrips.start_station_name = start_station_name 22 | ) 23 | GROUP BY start_date 24 | ORDER BY num_long_trips DESC 25 | LIMIT 5 -------------------------------------------------------------------------------- /ch08/20.sql: -------------------------------------------------------------------------------- 1 | WITH roundtrips AS ( 2 | SELECT 3 | start_station_name 4 | , duration 5 | , EXTRACT(DATE FROM start_date) AS start_date 6 | FROM 7 | `bigquery-public-data`.london_bicycles.cycle_hire 8 | WHERE 9 | start_station_name = end_station_name 10 | ), 11 | station_avg AS ( 12 | SELECT 13 | start_station_name 14 | , AVG(duration) as avg_duration 15 | FROM 16 | roundtrips 17 | GROUP BY start_station_name 18 | ) -------------------------------------------------------------------------------- /ch08/21.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | start_date, 3 | COUNT(*) AS num_long_trips 4 | FROM 5 | roundtrips 6 | JOIN station_avg USING(start_station_name) 7 | WHERE duration > 2*avg_duration 8 | GROUP BY start_date 9 | ORDER BY num_long_trips DESC 10 | LIMIT 5 -------------------------------------------------------------------------------- /ch08/22.sql: -------------------------------------------------------------------------------- 1 | WITH params AS ( 2 | SELECT 600 AS DURATION_THRESH 3 | ) 4 | SELECT 5 | start_station_name 6 | , COUNT(duration) AS num_trips 7 | FROM 8 | `bigquery-public-data`.london_bicycles.cycle_hire 9 | , params 10 | WHERE duration >= DURATION_THRESH 11 | GROUP BY start_station_name 12 | ORDER BY num_trips DESC 13 | LIMIT 5 --------------------------------------------------------------------------------
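NOTE: the params CTE in 22.sql acts as a lightweight stand-in for a query parameter. A sketch extending the same trick to two constants (the station filter here is illustrative, not from the repo):

WITH params AS (
  SELECT 600 AS DURATION_THRESH, '%Hyde%' AS STATION_FILTER
)
SELECT
  start_station_name
  , COUNT(duration) AS num_trips
FROM
  `bigquery-public-data`.london_bicycles.cycle_hire
  , params
WHERE duration >= DURATION_THRESH
  AND start_station_name LIKE STATION_FILTER
GROUP BY start_station_name
ORDER BY num_trips DESC
LIMIT 5
--------------------------------------------------------------------------------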
/ch08/23.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | bike_id, 3 | COUNT(*) AS num_trips 4 | FROM 5 | `bigquery-public-data`.london_bicycles.cycle_hire 6 | GROUP BY 7 | bike_id 8 | ORDER BY 9 | num_trips DESC 10 | LIMIT 11 | 100 -------------------------------------------------------------------------------- /ch08/24.sql: -------------------------------------------------------------------------------- 1 | WITH numtrips AS ( 2 | SELECT 3 | bike_id AS id, 4 | COUNT(*) AS num_trips 5 | FROM 6 | `bigquery-public-data`.london_bicycles.cycle_hire 7 | GROUP BY 8 | bike_id 9 | ) 10 | 11 | SELECT 12 | ARRAY_AGG(STRUCT(id,num_trips) ORDER BY num_trips DESC LIMIT 100) AS bike 13 | FROM 14 | numtrips -------------------------------------------------------------------------------- /ch08/25.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | ein 3 | , ARRAY_AGG(STRUCT(elf, tax_pd, subseccd)) AS filing 4 | FROM `bigquery-public-data`.irs_990.irs_990_2015 5 | WHERE ein BETWEEN '390' AND '399' 6 | GROUP BY ein 7 | LIMIT 3 -------------------------------------------------------------------------------- /ch08/26.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | GENERATE_DATE_ARRAY('2019-06-23', '2019-08-22', INTERVAL 10 DAY) AS summer -------------------------------------------------------------------------------- /ch08/27.sql: -------------------------------------------------------------------------------- 1 | WITH days AS ( 2 | SELECT 3 | GENERATE_DATE_ARRAY('2019-06-23', '2019-08-22', INTERVAL 10 DAY) AS summer 4 | ) 5 | SELECT summer_day 6 | FROM days, UNNEST(summer) AS summer_day -------------------------------------------------------------------------------- /ch08/28.sql: -------------------------------------------------------------------------------- 1 | SELECT ['Lak', 'Jordan', 'Graham'] AS minions -------------------------------------------------------------------------------- /ch08/29.sql: -------------------------------------------------------------------------------- 1 | WITH days AS ( 2 | SELECT 3 | GENERATE_DATE_ARRAY('2019-06-23', '2019-08-22', INTERVAL 10 DAY) AS summer, 4 | ['Lak', 'Jordan', 'Graham'] AS minions 5 | ) 6 | 7 | SELECT 8 | summer[ORDINAL(dayno)] AS summer_day 9 | , minions[OFFSET(MOD(dayno, ARRAY_LENGTH(minions)))] AS minion 10 | FROM 11 | days, UNNEST(GENERATE_ARRAY(1,ARRAY_LENGTH(summer),1)) dayno 12 | ORDER BY summer_day ASC -------------------------------------------------------------------------------- /ch08/30.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | ARRAY_CONCAT( 3 | GENERATE_DATE_ARRAY('2019-03-23', '2019-06-22', INTERVAL 20 DAY) 4 | , GENERATE_DATE_ARRAY('2019-08-23', '2019-11-22', INTERVAL 20 DAY) 5 | ) AS shoulder_season -------------------------------------------------------------------------------- /ch08/31.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | ARRAY_TO_STRING(['A', 'B', NULL, 'D'], '*', 'na') AS arr -------------------------------------------------------------------------------- /ch08/32.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | MAX(duration) AS longest_duration 3 | , COUNT(*) AS num_trips 4 | , AVG(duration) AS average_duration 5 | FROM 6 | `bigquery-public-data`.london_bicycles.cycle_hire 
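--------------------------------------------------------------------------------
NOTE: 32.sql computes its aggregates over the whole table; 33.sql onward recasts the same AVG as an analytic (window) function. For contrast, a sketch of the same three aggregates grouped per station (column names as in cycle_hire above):

SELECT
  start_station_name
  , MAX(duration) AS longest_duration
  , COUNT(*) AS num_trips
  , AVG(duration) AS average_duration
FROM
  `bigquery-public-data`.london_bicycles.cycle_hire
GROUP BY start_station_name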
-------------------------------------------------------------------------------- /ch08/33.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | AVG(duration) 3 | OVER(ORDER BY start_date ASC 4 | ROWS BETWEEN 100 PRECEDING AND 1 PRECEDING) AS average_duration 5 | FROM 6 | `bigquery-public-data`.london_bicycles.cycle_hire 7 | LIMIT 5 -------------------------------------------------------------------------------- /ch08/34.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | AVG(duration) 3 | OVER(PARTITION BY start_station_id 4 | ORDER BY start_date ASC 5 | ROWS BETWEEN 100 PRECEDING AND 1 PRECEDING) AS average_duration 6 | FROM 7 | `bigquery-public-data`.london_bicycles.cycle_hire 8 | LIMIT 5 -------------------------------------------------------------------------------- /ch08/35.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | AVG(duration) 3 | OVER(PARTITION BY start_station_id 4 | ORDER BY UNIX_SECONDS(start_date) ASC 5 | RANGE BETWEEN 3600 PRECEDING AND CURRENT ROW) AS average_duration 6 | FROM 7 | `bigquery-public-data`.london_bicycles.cycle_hire 8 | LIMIT 5 -------------------------------------------------------------------------------- /ch08/36.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | start_date 3 | , end_date 4 | , LAST_VALUE(start_date) 5 | OVER(PARTITION BY bike_id 6 | ORDER BY start_date ASC 7 | ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING) AS next_rental_start 8 | FROM 9 | `bigquery-public-data`.london_bicycles.cycle_hire 10 | LIMIT 5 -------------------------------------------------------------------------------- /ch08/37.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | start_date 3 | , end_date 4 | , LEAD(start_date, 1) 5 | OVER(PARTITION BY bike_id 6 | ORDER BY start_date ASC) AS next_rental_start 7 | FROM 8 | `bigquery-public-data`.london_bicycles.cycle_hire 9 | LIMIT 5 -------------------------------------------------------------------------------- /ch08/38.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | start_station_id 3 | , duration 4 | , RANK() 5 | OVER(PARTITION BY start_station_id ORDER BY duration DESC) AS nth_longest 6 | FROM 7 | `bigquery-public-data`.london_bicycles.cycle_hire 8 | LIMIT 5 -------------------------------------------------------------------------------- /ch08/39.sql: -------------------------------------------------------------------------------- 1 | WITH longest_trips AS ( 2 | SELECT 3 | start_station_id 4 | , duration 5 | , RANK() 6 | OVER(PARTITION BY start_station_id ORDER BY duration DESC) AS nth_longest 7 | FROM 8 | `bigquery-public-data`.london_bicycles.cycle_hire 9 | ) 10 | SELECT 11 | start_station_id 12 | , ARRAY_AGG(duration ORDER BY nth_longest LIMIT 3) AS durations 13 | FROM 14 | longest_trips 15 | GROUP BY start_station_id 16 | LIMIT 5 -------------------------------------------------------------------------------- /ch08/40.sql: -------------------------------------------------------------------------------- 1 | WITH example AS ( 2 | SELECT 'A' AS name, 32 AS age 3 | UNION ALL SELECT 'B', 32 4 | UNION ALL SELECT 'C', 33 5 | UNION ALL SELECT 'D', 33 6 | UNION ALL SELECT 'E', 34 7 | ) 8 | SELECT 9 | name 10 | , age 11 | , RANK() OVER(ORDER BY age) AS rank 12 | , DENSE_RANK() OVER(ORDER BY age) AS dense_rank 13 | , ROW_NUMBER() 
/ch08/41.sql:
--------------------------------------------------------------------------------
SELECT
  ein
  , ARRAY_AGG(STRUCT(elf, tax_pd, subseccd)) AS filing
FROM `bigquery-public-data`.irs_990.irs_990_2015
GROUP BY ein
--------------------------------------------------------------------------------
/ch08/42.sql:
--------------------------------------------------------------------------------
SELECT column_name
FROM `bigquery-public-data`.irs_990.INFORMATION_SCHEMA.COLUMNS
WHERE table_name = 'irs_990_2015'
--------------------------------------------------------------------------------
/ch08/43.sql:
--------------------------------------------------------------------------------
WITH columns AS (
  SELECT column_name
  FROM `bigquery-public-data`.irs_990.INFORMATION_SCHEMA.COLUMNS
  WHERE table_name = 'irs_990_2015' AND column_name != 'ein'
)

SELECT CONCAT(
  'SELECT ein, ARRAY_AGG(STRUCT(',
  ARRAY_TO_STRING(ARRAY(SELECT column_name FROM columns), ',\n '),
  -- two closing parentheses: one for STRUCT, one for ARRAY_AGG
  '\n)) AS filing\nFROM `bigquery-public-data`.irs_990.irs_990_2015\n',
  'GROUP BY ein')
--------------------------------------------------------------------------------
/ch08/44.sh:
--------------------------------------------------------------------------------
bq update --set_label costcenter:abc342 ch08eu
--------------------------------------------------------------------------------
/ch08/45.sh:
--------------------------------------------------------------------------------
bq update --set_label costcenter:def456 ch08eu
--------------------------------------------------------------------------------
/ch08/46.sh:
--------------------------------------------------------------------------------
bq ls --filter 'labels.costcenter:def456'
--------------------------------------------------------------------------------
/ch08/47.sql:
--------------------------------------------------------------------------------
SELECT
  *
FROM `bigquery-public-data`.london_bicycles.cycle_stations
  FOR SYSTEM_TIME AS OF
  TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 6 HOUR)
--------------------------------------------------------------------------------
/ch08/48.sql:
--------------------------------------------------------------------------------
CREATE OR REPLACE TABLE ch08eu.hydepark_stations AS
SELECT
  * EXCEPT(longitude, latitude)
  , ST_GeogPoint(longitude, latitude) AS location
FROM `bigquery-public-data`.london_bicycles.cycle_stations
WHERE name LIKE '%Hyde%'
--------------------------------------------------------------------------------
/ch08/49.sql:
--------------------------------------------------------------------------------
CREATE OR REPLACE TABLE ch08eu.hydepark_stations
OPTIONS(
  expiration_timestamp=TIMESTAMP "2020-01-01 00:00:00 UTC",
  description="Stations with Hyde Park in the name",
  labels=[("cost_center", "abc123")]
) AS
SELECT
  * EXCEPT(longitude, latitude)
  , ST_GeogPoint(longitude, latitude) AS location
FROM `bigquery-public-data.london_bicycles.cycle_stations`
WHERE name LIKE '%Hyde%'
--------------------------------------------------------------------------------
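Once hydepark_stations exists, its GEOGRAPHY column can feed BigQuery's spatial functions; a minimal sketch (not a numbered listing, and the reference point coordinates are illustrative, not from the book):

SELECT
  name
  -- ST_Distance returns the distance in meters between two GEOGRAPHY values
  , ST_Distance(location, ST_GeogPoint(-0.17, 51.51)) AS meters_away
FROM ch08eu.hydepark_stations
ORDER BY meters_away ASC
LIMIT 5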
/ch08/50.sql:
--------------------------------------------------------------------------------
CREATE OR REPLACE TABLE ch08eu.hydepark_rides
(
  start_time TIMESTAMP,
  duration INT64,
  start_station_id INT64,
  start_station_name STRING,
  end_station_id INT64,
  end_station_name STRING
)
PARTITION BY DATE(start_time)
CLUSTER BY start_station_id
--------------------------------------------------------------------------------
/ch08/51.sql:
--------------------------------------------------------------------------------
ALTER TABLE ch08eu.hydepark_rides
SET OPTIONS(
  expiration_timestamp=TIMESTAMP "2021-01-01 00:00:00 UTC",
  require_partition_filter=True,
  labels=[("cost_center", "def456")]
)
--------------------------------------------------------------------------------
/ch08/52.sql:
--------------------------------------------------------------------------------
INSERT ch08eu.hydepark_rides
SELECT
  start_date AS start_time
  , duration
  , start_station_id
  , start_station_name
  , end_station_id
  , end_station_name
FROM
  `bigquery-public-data`.london_bicycles.cycle_hire
WHERE
  start_station_name LIKE '%Hyde%'
--------------------------------------------------------------------------------
/ch08/53.sql:
--------------------------------------------------------------------------------
WITH rides_in_year AS (
  SELECT
    EXTRACT(MONTH FROM start_time) AS month
    , duration
  FROM ch08eu.hydepark_rides
  WHERE
    DATE(start_time) BETWEEN '2016-01-01' AND '2016-12-31'
    AND start_station_id = 300
    AND end_station_id = 303
)
SELECT
  month
  , AVG(duration)/60 AS avg_duration_minutes
FROM rides_in_year
GROUP BY month
ORDER BY avg_duration_minutes DESC
LIMIT 5
--------------------------------------------------------------------------------
/ch08/54.sql:
--------------------------------------------------------------------------------
INSERT ch08eu.hydepark_rides (
  start_time
  , duration
  , start_station_id
  , start_station_name
  , end_station_id
  , end_station_name
)
VALUES
  ('2016-02-18 17:21:00 UTC', 720, 300,
   'Serpentine Car Park, Hyde Park', 303, 'Albert Gate, Hyde Park'),
  ('2016-02-18 16:30:00 UTC', 1320, 300,
   'Serpentine Car Park, Hyde Park', 303, 'Albert Gate, Hyde Park')
--------------------------------------------------------------------------------
/ch08/55.sql:
--------------------------------------------------------------------------------
-- Scalar SQL UDF; a temporary function lives only for the duration of the
-- script or session in which it is defined.
CREATE TEMPORARY FUNCTION stationName(stationId INT64) AS (
  (SELECT name FROM
    `bigquery-public-data`.london_bicycles.cycle_stations
   WHERE id = stationId)
);
--------------------------------------------------------------------------------
/ch08/56.sql:
--------------------------------------------------------------------------------
DELETE ch08eu.hydepark_rides
WHERE
  start_time > '2016-12-01' AND
  (duration IS NULL OR duration = 0)
--------------------------------------------------------------------------------
/ch08/57.sql:
--------------------------------------------------------------------------------
-- Illustrative only: hydepark_rides as created in 50.sql has no userId column;
-- this shows the shape of a user-scoped DELETE.
DELETE ch08eu.hydepark_rides
WHERE
  userId = 3452123
--------------------------------------------------------------------------------
/ch08/58.sql:
--------------------------------------------------------------------------------
UPDATE ch08eu.hydepark_rides
SET duration = duration * 60
WHERE
  start_time > '2016-12-01' AND
  start_station_id = 303
--------------------------------------------------------------------------------
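Because the temporary function in 55.sql disappears when the script ends, it must be invoked in the same request that defines it; a minimal usage sketch (not a numbered listing):

CREATE TEMPORARY FUNCTION stationName(stationId INT64) AS (
  (SELECT name FROM `bigquery-public-data`.london_bicycles.cycle_stations
   WHERE id = stationId)
);
-- Resolve each ride's start station id to its human-readable name
SELECT
  duration
  , stationName(start_station_id) AS start_station
FROM `bigquery-public-data`.london_bicycles.cycle_hire
LIMIT 5;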
/ch08/59.sql:
--------------------------------------------------------------------------------
UPDATE ch08eu.stations_table
SET maintenance = ARRAY_CONCAT(maintenance,
  ARRAY_STRUCT